In [165]:
#   Random forest: https://www.datacamp.com/tutorial/random-forests-classifier-python
#   XGBoost: https://www.datacamp.com/tutorial/xgboost-in-python

In [166]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pyplot

from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.preprocessing import OrdinalEncoder
from sklearn import datasets
from sklearn import metrics  # Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

In [167]:
df_app = pd.read_csv("data/application_train.csv", encoding='utf-8')
df_bur = pd.read_csv("data/bureau.csv", encoding='utf-8')
df_bur_bal = pd.read_csv("data/bureau_balance.csv", encoding='utf-8')

In [168]:
df_bur['AMT_CREDIT_MAX_OVERDUE'].fillna(0, inplace=True)
creditMax_df = df_bur.groupby(['SK_ID_CURR'])['AMT_CREDIT_MAX_OVERDUE'].mean()

In [169]:
#df_bur['CREDIT_DAY_OVERDUE']
result = df_bur[df_bur.CREDIT_DAY_OVERDUE != 0]
creditDay_df = df_bur.groupby(['SK_ID_CURR'])['CREDIT_DAY_OVERDUE'].max()


In [170]:
df_bur_bal.dtypes
status_mapping = {'C': 0, 'X': 0, '0': 0, '1': 1, '2': 2, '5': 5}
df_bur_bal['STATUS'] = df_bur_bal['STATUS'].map(status_mapping)

In [171]:
status_df = df_bur_bal.groupby(['SK_ID_BUREAU'])['STATUS'].max()

In [172]:
df_bur2 = df_bur.merge(status_df, on='SK_ID_BUREAU', how='left')
df_bur2['STATUS'].fillna(0, inplace=True)

In [173]:
status2_df = df_bur2.groupby(['SK_ID_CURR'])['STATUS'].max()

In [174]:
df_app2 = df_app.merge(creditMax_df, on='SK_ID_CURR', how='left')
df_app2 = df_app2.merge(creditDay_df, on='SK_ID_CURR', how='left')
df_app2 = df_app2.merge(status2_df, on='SK_ID_CURR', how='left')

In [175]:
col_to_ignore = []
for col in df_app2.columns:
    if df_app2[col].isna().sum().sum() > 100000:
        col_to_ignore.append(col)
        # print(col, df_app2[col].isna().sum().sum())
        
df_app2.drop(columns=col_to_ignore, inplace=True)


In [176]:
# split dataset in features and target variable
feature_cols = df_app2.columns[2:]
X = df_app2[feature_cols] # Features
y = df_app2.TARGET # Target variable

In [177]:
df_app2.dtypes

SK_ID_CURR                      int64
TARGET                          int64
NAME_CONTRACT_TYPE             object
CODE_GENDER                    object
FLAG_OWN_CAR                   object
                               ...   
AMT_REQ_CREDIT_BUREAU_QRT     float64
AMT_REQ_CREDIT_BUREAU_YEAR    float64
AMT_CREDIT_MAX_OVERDUE        float64
CREDIT_DAY_OVERDUE            float64
STATUS                        float64
Length: 76, dtype: object

In [178]:
cols = [column for column in df_app2.columns if df_app2[column].dtype == 'object']

In [179]:
enc = OrdinalEncoder()
enc.fit(df_app2[cols])
X[cols] = enc.transform(df_app2[cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[cols] = enc.transform(df_app2[cols])


In [180]:
X.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.fillna(0, inplace=True)


In [186]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=21, stratify=y) # 70% training and 30% test

In [200]:
#Create a Gaussian Classifier
clf=RandomForestClassifier(class_weight="balanced")

grid_values = {'n_estimators': list(range(20, 101, 40)), 'max_depth': list(range(5, 16, 5)), 'min_samples_split': list(range(2, 9, 2))}
grid_clf = GridSearchCV(clf, param_grid=grid_values, scoring='roc_auc')

#Train the model using the training sets y_pred=clf.predict(X_test)
grid_clf.fit(X_train,y_train)

y_pred=grid_clf.predict(X_test)

In [201]:
# Model Accuracy, how often is the classifier correct?
print("ROC AUC score:", roc_auc_score(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall score:", recall_score(y_test, y_pred))
print("F1 score:", f1_score(y_test, y_pred))


ROC AUC score: 0.668927686502746
Accuracy: 0.7196467129738027
Precision: 0.16489803048168042
Recall score: 0.6084434418304866
F1 score: 0.2594743171276413
