## Classification Models

In [30]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

from imblearn.over_sampling import SMOTE

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [21]:
X_train = pd.read_csv("../Data/X_train_default.csv").iloc[:,2:176]
y_train = pd.read_csv("../Data/y_train_default.csv").iloc[:,1]
X_test = pd.read_csv("../Data/X_test_default.csv").iloc[:,2:176]
y_test = pd.read_csv("../Data/y_test_default.csv").iloc[:,1]

In [22]:
X_train = X_train.drop(columns=['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'])
X_test = X_test.drop(columns=['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3'])

In [23]:
smote = SMOTE(sampling_strategy='auto', random_state=42, n_jobs=-1)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
X_test_resampled, y_test_resampled = smote.fit_resample(X_test, y_test)

### Logistic Regression

In [5]:
parameters_logistic_regression = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
    'max_iter': [1000]
}

grid_search = GridSearchCV(LogisticRegression(), parameters_logistic_regression, cv=10, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train_resampled, y_train_resampled)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_predictions = best_model.predict(X_test_resampled)
best_conf_matrix = confusion_matrix(y_test_resampled, best_predictions)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_

In [28]:
roc_auc_score(y_test_resampled, best_predictions)

0.9180549660441426

In [29]:
best_score

0.9504311606372637

### CART

In [32]:
parameters_RF = {
    'max_depth': [5, 10, 20],
    'min_samples_split': [10,20,30],
    'min_samples_leaf': [10,20,30],
    'n_estimators': [50,100,200]
}

grid_search_RF = GridSearchCV(RandomForestClassifier(), parameters_RF, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search_RF.fit(X_train_resampled, y_train_resampled)
best_model = grid_search_RF.best_estimator_
best_params = grid_search_RF.best_params_
best_score = grid_search_RF.best_score_
best_predictions = best_model.predict(X_test_resampled)
best_conf_matrix = confusion_matrix(y_test_resampled, best_predictions)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and arra

In [33]:
best_score

0.9674876419078314

In [34]:
for variable, importance in zip(X_train.columns, best_model.feature_importances_):
    print(f"{variable}: {importance}")

CNT_CHILDREN: 0.010224093152488472
AMT_INCOME_TOTAL: 0.006610591003671415
AMT_CREDIT: 0.008073613996900743
AMT_ANNUITY: 0.00689886979482112
AMT_GOODS_PRICE: 0.008028006848018772
REGION_POPULATION_RELATIVE: 0.012021396394640194
OWN_CAR_AGE: 0.03227507359780442
FLAG_MOBIL: 0.0
FLAG_EMP_PHONE: 0.0
FLAG_WORK_PHONE: 0.006035945458028175
FLAG_CONT_MOBILE: 0.0
FLAG_PHONE: 0.030610598721753885
FLAG_EMAIL: 0.003027359299778459
CNT_FAM_MEMBERS: 0.05110438524588299
REGION_RATING_CLIENT: 0.004364098773908917
REGION_RATING_CLIENT_W_CITY: 0.002656406476096286
HOUR_APPR_PROCESS_START: 0.007427523002734957
REG_REGION_NOT_LIVE_REGION: 5.165657628765128e-05
REG_REGION_NOT_WORK_REGION: 0.001762282134537489
LIVE_REGION_NOT_WORK_REGION: 0.0010348389608117402
REG_CITY_NOT_LIVE_CITY: 0.002800754465932025
REG_CITY_NOT_WORK_CITY: 0.006887113490656321
LIVE_CITY_NOT_WORK_CITY: 0.012381431870457009
FLAG_DOCUMENT_2: 0.0
FLAG_DOCUMENT_3: 0.002110895575968489
FLAG_DOCUMENT_4: 0.0
FLAG_DOCUMENT_5: 5.516598861084423e-