In [2]:
# Benchmark models
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from preprocessing import complete_vars
from preprocessing import ratios
from preprocessing import breakdown_vars
from preprocessing import dummies_ohe
from preprocessing import Xy
from preprocessing import std_z
from imblearn.under_sampling import RandomUnderSampler

In [3]:
# Load the pooled dataset and reduce it to the ratio predictors plus the 'event' target.
df_train = pd.read_csv("Datapooled.csv")

# Raw financial-statement columns that are handed to complete_vars() / ratios() below.
VARS = ['Ganancia bruta', 'Ganancia (pérdida)','Ingresos de actividades ordinarias' , 'Costo de ventas', 'Patrimonio total',
     'Total pasivos', 'Total de activos', 'Ganancias acumuladas',  'Pasivos corrientes totales',  'Activos corrientes totales']

df_train = df_train.rename(columns={'Clasificación Industrial Internacional Uniforme Versión 4 A.C':'Sector'})

# .copy() gives an independent frame, so the column assignment below does not write
# into a slice of the frame that was read from disk (avoids SettingWithCopyWarning).
df_train = df_train[VARS + ['event', 'Sector']].copy()

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly instead of being wrapped in print() (which would also print "None").
df_train[df_train['event'] == 1].info()

df_train['complete-vars'] = complete_vars(df_train)  # 1 marks rows that have all variables
# Drop firms without financial information; copy so later steps work on an independent frame.
df_train = df_train[df_train['complete-vars'] == 1].copy()
df_train[df_train['event'] == 1].info()

df_train = ratios(df_train)
predictors = ['GPM', 'NPM', 'ROE', 'ROA', 'IR', 'DER', 'RSL', 'CR', 'Ax1', 'Ax2', 'Sector']
df_train[df_train['event'] == 1].info()

# Turn +/-inf into NaN so that dropna() removes those rows as well, then discard
# the helper flag column. Reassignment replaces the original inplace=True calls.
df_train = (
    df_train
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
    .drop(columns=['complete-vars'])
)
df_train = df_train[predictors + ['event']]

<class 'pandas.core.frame.DataFrame'>
Index: 771 entries, 4 to 26499
Data columns (total 12 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Ganancia bruta                      353 non-null    float64
 1   Ganancia (pérdida)                  353 non-null    float64
 2   Ingresos de actividades ordinarias  353 non-null    float64
 3   Costo de ventas                     330 non-null    float64
 4   Patrimonio total                    353 non-null    float64
 5   Total pasivos                       353 non-null    float64
 6   Total de activos                    353 non-null    float64
 7   Ganancias acumuladas                351 non-null    float64
 8   Pasivos corrientes totales          352 non-null    float64
 9   Activos corrientes totales          353 non-null    float64
 10  event                               771 non-null    float64
 11  Sector                              771 non-null

In [4]:
from preprocessing import standardize_X_test

# Build X / y with 'event' as the target column, then pass the columns reported as
# categorical by breakdown_vars() to dummies_ohe() (presumably one-hot encoding -- confirm
# in preprocessing.py).
X, y = Xy(df_train, 'event')
cat, binaries, nonormal, normal = breakdown_vars(X)
nums = nonormal + normal
X = dummies_ohe(X, cat)

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=666,
    stratify=y,
)

# Only the training split is undersampled; the test split is left untouched.
rus = RandomUnderSampler(random_state=123)
X_train, y_train = rus.fit_resample(X_train, y_train)

# X_test is standardized with the mean and std taken from X_train. This call must run
# before X_train itself is overwritten by std_z() on the next line.
X_test = standardize_X_test(X_train, X_test)
X_train = std_z(nonormal + normal, X_train)

# Final feature subset fed to the models.
# NOTE(review): the name `vars` shadows the Python builtin; it is kept because it is a
# module-level name that other cells may rely on.
vars = [
    'ROE', 'ROA',
    'IR', 'DER',
    'RSL', 'CR',
    'Sector_C', 'Sector_I',
    'Sector_K', 'Sector_L',
    'Sector_O', 'Sector_Q',
    'Sector_R', 'Sector_U',
]

X_train = X_train.loc[:, vars]
X_test = X_test.loc[:, vars]

GPM
NPM
ROE
ROA
IR
DER
RSL
CR
Ax1
Ax2




In [5]:
# Per-column count of missing values in the test features after scaling and column selection.
X_test.isnull().sum()

ROE         0
ROA         0
IR          0
DER         0
RSL         0
CR          0
Sector_C    0
Sector_I    0
Sector_K    0
Sector_L    0
Sector_O    0
Sector_Q    0
Sector_R    0
Sector_U    0
dtype: int64

In [6]:
# Class counts of the training target after RandomUnderSampler was applied to the training split.
y_train.value_counts()

event
0.0    254
1.0    254
Name: count, dtype: int64

In [7]:
# Class counts of the test target; the test split was not resampled.
y_test.value_counts()

event
0.0    3296
1.0      64
Name: count, dtype: int64

In [8]:
# Per-column count of missing values in the training features after scaling and column selection.
X_train.isnull().sum()

ROE         0
ROA         0
IR          0
DER         0
RSL         0
CR          0
Sector_C    0
Sector_I    0
Sector_K    0
Sector_L    0
Sector_O    0
Sector_Q    0
Sector_R    0
Sector_U    0
dtype: int64

In [9]:
# Per-column count of missing values in the test features (same expression as an earlier cell).
X_test.isnull().sum()

ROE         0
ROA         0
IR          0
DER         0
RSL         0
CR          0
Sector_C    0
Sector_I    0
Sector_K    0
Sector_L    0
Sector_O    0
Sector_Q    0
Sector_R    0
Sector_U    0
dtype: int64

In [10]:
# Logistic Regression
def grid_lr(X_train, y_train, performance_metric='f1'):
    """Grid-search a logistic regression and return the best estimator found.

    Parameters
    ----------
    X_train : feature matrix handed to ``GridSearchCV.fit``.
    y_train : target vector handed to ``GridSearchCV.fit``.
    performance_metric : scoring name passed to ``GridSearchCV``; defaults to
        ``'f1'``, the metric this function always used.

    Returns
    -------
    The ``best_estimator_`` attribute of the fitted ``GridSearchCV``.

    Raises
    ------
    Any exception raised while fitting a candidate, because the search is
    configured with ``error_score='raise'``.
    """
    model = LogisticRegression(random_state=666, max_iter=1500)
    solvers = ['lbfgs']
    c_values = [1000, 100, 10, 1.0, 0.1, 0.01, 0.001, 0.0001, 0.00001]
    # C is the inverse regularization strength, so it only matters when a penalty is
    # applied. Crossing penalty=None with every C would refit the same unpenalized
    # model once per C value, so the unpenalized model gets its own one-candidate sub-grid.
    grid = [
        dict(solver=solvers, penalty=['l2'], C=c_values),
        dict(solver=solvers, penalty=[None]),
    ]
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                               scoring=performance_metric, error_score='raise')
    grid_result = grid_search.fit(X_train, y_train)
    return grid_result.best_estimator_



# Disabled random-forest grid search: the definition below sits inside a bare string
# literal, so it is never executed and grid_RandomForest is not defined by this cell.
# RandomForestClassifier is also absent from the notebook's visible import cell.
"""
def grid_RandomForest(X_train, y_train):
  model = RandomForestClassifier(random_state=666)
  n_estimators =  [100, 300, 500, 800]
  criterion = ['gini', 'entropy', 'log_loss']
  max_depth  =  [None, 5, 10, 30]
  min_samples_split =  [2, 5, 10, 15]
  min_samples_leaf  =[1, 2, 4, 7]
  max_features = ['sqrt', 'log2']


  grid = dict(n_estimators = n_estimators, criterion = criterion,  
              min_samples_split = min_samples_split,  
              max_features=max_features,
              max_depth = max_depth,
              min_samples_leaf = min_samples_leaf
              )
  cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
  grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                            scoring='f1',error_score='raise')
  grid_result = grid_search.fit(X_train, y_train)
  return  grid_result.best_estimator_
"""



# Support Vector Machine
def grid_SVM(X_train, y_train, performance_metric='f1', resultsGrid=False):
    """Grid-search a support vector classifier.

    Parameters
    ----------
    X_train : feature matrix handed to ``GridSearchCV.fit``.
    y_train : target vector handed to ``GridSearchCV.fit``.
    performance_metric : scoring name passed to ``GridSearchCV`` (default ``'f1'``).
    resultsGrid : when truthy, return the full ``cv_results_`` of the search
        instead of the best estimator.

    Returns
    -------
    ``cv_results_`` of the fitted search if ``resultsGrid`` is truthy, otherwise
    its ``best_estimator_``.

    Raises
    ------
    Any exception raised while fitting a candidate, because the search is
    configured with ``error_score='raise'``.
    """
    model = SVC(random_state=666)
    # 100 evenly spaced C values between 1e-6 and 100, crossed with every kernel/gamma pair.
    grid = dict(
        C=np.linspace(0.000001, 100, 100),
        kernel=['poly', 'rbf', 'linear'],
        gamma=['scale', 'auto'],
    )
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
    grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv,
                               scoring=performance_metric, error_score='raise')
    grid_result = grid_search.fit(X_train, y_train)
    # Plain truthiness test instead of comparing against True with `==`.
    if resultsGrid:
        return grid_result.cv_results_
    return grid_result.best_estimator_

In [11]:
# Tune the logistic regression on the training split and evaluate it on the test split.
model_lr = grid_lr(X_train, y_train)
lr_predict = model_lr.predict(X_test)
print(classification_report(y_test, lr_predict))

# Keep the first two columns of the report frame for the comparison table built later.
lr_table = pd.DataFrame(
    classification_report(y_test, lr_predict, output_dict=True)
).iloc[:, :2]


              precision    recall  f1-score   support

         0.0       0.99      0.73      0.84      3296
         1.0       0.05      0.80      0.10        64

    accuracy                           0.73      3360
   macro avg       0.52      0.76      0.47      3360
weighted avg       0.98      0.73      0.83      3360





In [12]:
# Disabled random-forest run: the statements below are wrapped in a string literal, so
# nothing is fitted here; the cell merely evaluates to (and displays) that string.
"""
model_random_forest  = grid_RandomForest(X_train, y_train)
random_forest_predict = model_random_forest.predict(X_test)
print(classification_report(y_test, random_forest_predict))
random_forest_table  = pd.DataFrame(classification_report(y_test,  random_forest_predict, output_dict=True)).iloc[:,0:2]
"""

'\nmodel_random_forest  = grid_RandomForest(X_train, y_train)\nrandom_forest_predict = model_random_forest.predict(X_test)\nprint(classification_report(y_test, random_forest_predict))\nrandom_forest_table  = pd.DataFrame(classification_report(y_test,  random_forest_predict, output_dict=True)).iloc[:,0:2]\n'

In [13]:
# Tune the support vector classifier on the training split and evaluate it on the test split.
model_SVM = grid_SVM(X_train, y_train)
SVM_predict = model_SVM.predict(X_test)
print(classification_report(y_test, SVM_predict))

# Keep the first two columns of the report frame for the comparison table built later.
SVM_table = pd.DataFrame(
    classification_report(y_test, SVM_predict, output_dict=True)
).iloc[:, :2]


              precision    recall  f1-score   support

         0.0       0.99      0.72      0.83      3296
         1.0       0.05      0.81      0.10        64

    accuracy                           0.72      3360
   macro avg       0.52      0.77      0.47      3360
weighted avg       0.98      0.72      0.82      3360



In [14]:
from sklearn.neural_network import MLPClassifier

def grid_MLP(X_train, y_train):
    """Grid-search an SGD-trained MLP classifier and return the best estimator.

    The network shape ((5, 5) hidden units), the logistic activation and the
    'sgd' solver are fixed; the search varies the regularization strength, the
    learning-rate schedule and initial value, the momentum and the iteration cap.

    Parameters
    ----------
    X_train : feature matrix handed to ``GridSearchCV.fit``; its row count is
        also used as the single ``batch_size`` candidate.
    y_train : target vector handed to ``GridSearchCV.fit``.

    Returns
    -------
    The ``best_estimator_`` attribute of the fitted ``GridSearchCV``.
    """
    param_grid = {
        'hidden_layer_sizes': [(5, 5)],
        'solver': ['sgd'],
        'alpha': [0.00001, 0.0001, 0.001, 0.01, 1],
        'max_iter': [500, 700, 1000, 1500, 2000],
        'activation': ['logistic'],
        # One batch as large as the whole training matrix passed in.
        'batch_size': [X_train.shape[0]],
        'learning_rate_init': [0.00001, 0.0001, 0.001, 0.01, 1],
        'momentum': [0.5, 0.8, 0.9, 1],
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
    }
    folds = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
    search = GridSearchCV(
        estimator=MLPClassifier(random_state=123),
        param_grid=param_grid,
        n_jobs=-1,
        cv=folds,
        scoring='f1',
        error_score='raise',
    )
    fitted = search.fit(X_train, y_train)
    return fitted.best_estimator_

In [15]:
# Tune the MLP on the training split and evaluate it on the test split.
model_mlp = grid_MLP(X_train, y_train)
NN_predict = model_mlp.predict(X_test)
print(classification_report(y_test, NN_predict))

# Keep the first two columns of the report frame for the comparison table built later.
MLP_table = pd.DataFrame(
    classification_report(y_test, NN_predict, output_dict=True)
).iloc[:, :2]

              precision    recall  f1-score   support

         0.0       1.00      0.75      0.86      3296
         1.0       0.06      0.81      0.11        64

    accuracy                           0.75      3360
   macro avg       0.53      0.78      0.48      3360
weighted avg       0.98      0.75      0.84      3360



In [16]:
# Put the three per-model report tables side by side under a (model, class) column header.
models_tab = pd.concat([lr_table, SVM_table, MLP_table], axis=1)
# Class labels are spelled identically for every model ('Deafult' and 'No-default' were
# typos that ended up in the rendered table and in the exported LaTeX file).
cols_names = pd.MultiIndex.from_tuples([
    ('Logistic regression', 'No-Default'), ('Logistic regression', 'Default'),
    ('Support vector machine', 'No-Default'), ('Support vector machine', 'Default'),
    ('Backpropagation NN', 'No-Default'), ('Backpropagation NN', 'Default'),
])
models_tab.columns = cols_names
# From here on models_tab is a Styler: centered header cells, values shown with 2 decimals.
models_tab = models_tab.style.set_table_styles([
   {'selector': 'th','props': [('text-align', 'center')]}]).format(precision=2)
models_tab.to_latex("benchmark-models.tex")
models_tab

Unnamed: 0_level_0,Logistic regression,Logistic regression,Support vector machine,Support vector machine,Backpropagation NN,Backpropagation NN
Unnamed: 0_level_1,No-Default,Deafult,No-Default,Default,No-default,Default
precision,0.99,0.05,0.99,0.05,1.0,0.06
recall,0.73,0.8,0.72,0.81,0.75,0.81
f1-score,0.84,0.1,0.83,0.1,0.86,0.11
support,3296.0,64.0,3296.0,64.0,3296.0,64.0
