In [13]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout


In [2]:
# load data

alldata = pd.read_csv('alldata2.csv')
alldata.drop(columns = ['Unnamed: 0'], inplace = True)
X = alldata.iloc[: , 1:]
y = alldata.iloc[: , 0]

In [3]:
class CustomFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self # nothing to fit
    
    def transform(self, X, y=None):
        # Z2, Loan loss provision / net interest revenue
        LLPNIR = X.ELNLOS / X.NIM

        # Z3, Impaired loans / gross loans
        ILGL = X.EAMINTAN / X.LNLSGRS

        # Z4, Net charge off / Gross loans
        NCOGL = X.NTLNLS / X.LNLSGRS

        # Z5, Impaired loans / equity
        ILEQ = X.EAMINTAN / X.EQ

        # Z9, Equity / Net Loans
        EQNL = X.EQ / X.LNLSNET
        
        # no data for Z10

        # Z11, Equity / Liabilities
        EQLIAB = X.EQ / X.LIAB

        # Z13, Total capital / Net Loans
        TCNL = X.LIABEQ / X.LNLSNET

        # Z14, Total capital / deposits
        TCDEP = X.LIABEQ / X.DEP

        # Z15, Total capital / Liabilities
        TCLIAB = X.LIABEQ / X.LIAB

        # no data for Z18

        # Z21, taxes / avg assets
        TAVAST = X.ITAX / X.ASSET5

        # Z24, net income / equity
        NIEQ = X.NETINC / X.EQ

        # no data for Z25 and Z26, use Efficiency ratio instead

        # Z27 , net loans / total assets
        NLTA = X.LNLSNET / X.ASSET

        # not sure how Z28 and Z29 are different, use net loans / total deposits
        NLTD = X.LNLSNET / X.DEP

        # not sure how Z30 and 31 are different, use liquid assets / total deposits = (assets - fixed assets) / total deposits
        LATD = (X.ASSET - X.BKPREM) / X.DEP

        # construct final table
        finalTable = pd.concat([X.LNATRESR, LLPNIR, ILGL, NCOGL, ILEQ, X.IDT1RWAJR, X.EQTOTR, X.EQV, EQNL, EQLIAB, X.LIABEQR, TCNL, TCDEP, TCLIAB, X.NIMY, X.NIMR, X.NONIXR, X.PTAXNETINCR, TAVAST, X.ROA, X.ROE, NIEQ, X.EEFFR, NLTA, NLTD, LATD, X.ACTIVE], axis = 1)
        
        
        # drop rows that have resulted in division by zero
        finalTable.replace([np.inf, -np.inf], np.nan, inplace=True)
        finalTable.dropna(inplace = True)

        finalTable.columns = ['LNATRESR', 'LLPNIR', 'ILGL', 'NCOGL', 'ILEQ', 'IDT1RWAJR', 'EQTOTR', 'EQV', 'EQNL', 'EQLIAB', 'LIABEQR', 'TCNL', 'TCDEP', 'TCLIAB', 'NIMY', 'NIMR', 'NONIXR', 'PTAXNETINCR', 'TAVAST', 'ROA', 'ROE', 'NIEQ', 'EEFFR', 'NLTA', 'NLTD', 'LATD', 'ACTIVE']
        return finalTable

In [4]:
# this pipeline should be run before the data is split into features and targets or into training/test sets. 
# there is no fitting involved, only feature engineering in deriving features from other features.
# some of these ratios lead to division by zero, so these rows AND THEIR CORRESPONDING TARGETS must be dropped from the dataset before any splitting is done

pre_split_pipeline = Pipeline([
                    ('feature transformer', CustomFeatures())
                    ])

transformed_data = pre_split_pipeline.fit_transform(alldata)

In [5]:
# split data
X = transformed_data.drop(columns = 'ACTIVE')
y = transformed_data.ACTIVE
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle = True)

In [6]:
# post split pipeline

post_split_pipeline = Pipeline([
                                ('scaler', StandardScaler())
                                ])

# assign scaled pipeline values back to dataframes
X_train.values[:] = post_split_pipeline.fit_transform(X_train)
X_test.values[:] = post_split_pipeline.transform(X_test)

In [7]:
model6 = Sequential([
    Dense(128, activation = 'relu'),
    Dropout(0.2),
    Dense(64, activation = 'relu'),
    Dropout(0.2),
    Dense(32, activation = 'relu'),
    Dropout(0.2),
    Dense(1, activation = 'sigmoid')
])

# Compile the model
model6.compile(
    loss = tf.keras.losses.BinaryCrossentropy(),
    optimizer = tf.keras.optimizers.legacy.Adam(0.01),
    metrics = ['accuracy']
)

# Train the model
history6 = model6.fit(X_train,
                    y_train,
                    epochs=150,
                    validation_data=(X_test, y_test),
                    verbose=1)

2023-12-03 20:35:39.646971: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-12-03 20:35:39.674284: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-12-03 20:35:39.674339: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-12-03 20:35:39.675378: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other ope

Epoch 1/150
 12/225 [>.............................] - ETA: 2s - loss: 0.6607 - accuracy: 0.6198

2023-12-03 20:35:43.101965: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:630] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78/150
Epoch 7

In [8]:
model6.save('transformed_model.h5')

In [None]:
# from keras.models import load_model
# new_model = load_model('transformed_model.h5')

In [9]:
pred = model6.predict(X_test)



In [10]:
pred

array([[0.00134376],
       [0.7330341 ],
       [0.7240183 ],
       ...,
       [0.99973375],
       [0.71446896],
       [0.0512053 ]], dtype=float32)

In [30]:
def metrics(y_true, y_pred):
    print(f'accuracy: {accuracy_score(y_true, y_pred >= 0.5) * 100 : .2f}%')
    print(f'recall: {recall_score(y_true, y_pred >= 0.5) * 100 : .2f}%')
    print(f'precision: {precision_score(y_true, y_pred >= 0.5) * 100 : .2f}%')
    print(f'f1: {f1_score(y_true, y_pred >= 0.5) * 100 : .2f}%')

In [31]:
metrics(y_test, pred)

accuracy:  83.00%
recall:  92.08%
precision:  78.47%
f1:  84.73%


In [40]:
from sklearn.neighbors import KNeighborsClassifier

knnmodel = KNeighborsClassifier(n_neighbors = 9)
knnmodel.fit(X_train, y_train)


In [41]:
knnpred = knnmodel.predict(X_test)

In [35]:
knnpred

array([0, 0, 1, ..., 1, 1, 0])

In [42]:
metrics(y_test, knnpred)

accuracy:  77.50%
recall:  81.56%
precision:  76.19%
f1:  78.78%
