# Going to use 27-Nov-2023 dataset for all 3 RFs

In [1]:
# Good idea from Uncle Steve to do this as it puts a timestamp on the last run
import datetime
print(datetime.datetime.now())

2023-12-17 16:01:27.318250


In [2]:
import pandas as pd
pd.options.display.max_columns = 50
import numpy as np

import matplotlib.pyplot as plt

import itertools
import scipy

from sklearn.preprocessing import LabelEncoder


# Apparently, this display the full output, not just the last result in Jupyter
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Also a good idea to show the version that was used in the last run, in case of conflicting results due to different versions
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 1.3.1.


# Google Drive Things

In [4]:
# If you want to import data from Google Drive
#from google.colab import drive
#drive.mount('/content/drive')

# Set the Google Drive path you want to read from
# Andre's Path
#path = "/content/drive/My Drive/Education/Notes/MMAI 869 - Machine Learning and AI Technology/Assignments/Team/MMAI 869 Team Assignment Shared Google Folder/Data Cleaning/"

# Read Data

In [5]:
# If using Google Drive in Colab
#train_aggregated_df = pd.read_csv(path + "train_aggregated_df.csv")
#test_aggregated_df = pd.read_csv(path + "test_aggregated_df.csv")

# If using local computer
train_aggregated_df = pd.read_csv("train_aggregated_df_27-Nov-2023.csv")
test_aggregated_df = pd.read_csv("test_aggregated_df_27-Nov-2023.csv")

train_aggregated_df.head()
train_aggregated_df.info()

Unnamed: 0,customerid,systemloanid_x,loannumber_x,approveddate_x,creationdate_x,loanamount_x,totaldue_x,termdays_x,referredby_x,good_bad_flag_x,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,employment_status_clients,level_of_education_clients,Age,loanamount_sum,loanamount_mean,loanamount_median,loanamount_max,loanamount_min,loannumber_count,totaldue_sum,totaldue_mean,termdays_max,termdays_min,termdays_mean,average_interest,average_repayment_time,EarlyRepaymentFlag,LatePaymentFlag
0,8a2a81a74ce8c05d014cfb32a0da1049,301994762,12,2017-07-25 08:22:56,2017-07-25 07:22:47,30000.0,34500.0,30,0,1,Other,3.43201,6.433055,Diamond Bank,Permanent,Post-Graduate,45,200000.0,18181.818182,20000.0,30000.0,10000.0,11.0,242900.0,22081.818182,30.0,30.0,30.0,3900.0,29.964184,0.636364,0.363636
1,8a8588f35438fe12015444567666018e,301966580,7,2017-07-06 14:52:57,2017-07-06 13:52:51,20000.0,22250.0,15,0,1,Other,11.13935,10.292041,EcoBank,Permanent,Others,33,60000.0,10000.0,10000.0,10000.0,10000.0,6.0,70500.0,11750.0,30.0,15.0,17.5,1750.0,18.65892,0.833333,0.5
2,8a85890754145ace015429211b513e16,301999343,3,2017-07-27 19:00:41,2017-07-27 18:00:35,10000.0,11500.0,15,0,1,Savings,3.98577,7.491708,First Bank,Permanent,Others,40,20000.0,10000.0,10000.0,10000.0,10000.0,2.0,24500.0,12250.0,30.0,15.0,22.5,2250.0,31.841991,0.0,1.0
3,8a858970548359cc0154883481981866,301962360,9,2017-07-03 23:42:45,2017-07-03 22:42:39,40000.0,44000.0,30,0,1,Other,7.457913,9.076574,GT Bank,Permanent,Primary,31,150000.0,18750.0,20000.0,30000.0,10000.0,8.0,188400.0,23550.0,60.0,30.0,37.5,4800.0,27.564702,1.0,0.0
4,8a8589f35451855401546b0738c42524,301986516,8,2017-07-19 21:46:24,2017-07-19 20:46:18,30000.0,39000.0,60,0,1,Savings,3.311981,6.633271,Access Bank,Permanent,Others,45,100000.0,14285.714286,10000.0,20000.0,10000.0,7.0,124800.0,17828.571429,30.0,30.0,30.0,3542.857143,25.202556,0.571429,0.428571


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4359 entries, 0 to 4358
Data columns (total 32 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   customerid                  4359 non-null   object 
 1   systemloanid_x              4359 non-null   int64  
 2   loannumber_x                4359 non-null   int64  
 3   approveddate_x              4359 non-null   object 
 4   creationdate_x              4359 non-null   object 
 5   loanamount_x                4359 non-null   float64
 6   totaldue_x                  4359 non-null   float64
 7   termdays_x                  4359 non-null   int64  
 8   referredby_x                4359 non-null   int64  
 9   good_bad_flag_x             4359 non-null   int64  
 10  bank_account_type           4359 non-null   object 
 11  longitude_gps               4359 non-null   float64
 12  latitude_gps                4359 non-null   float64
 13  bank_name_clients           4359 

# Select the Training Features to use and assign to X and y

In [6]:
train_aggregated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4359 entries, 0 to 4358
Data columns (total 32 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   customerid                  4359 non-null   object 
 1   systemloanid_x              4359 non-null   int64  
 2   loannumber_x                4359 non-null   int64  
 3   approveddate_x              4359 non-null   object 
 4   creationdate_x              4359 non-null   object 
 5   loanamount_x                4359 non-null   float64
 6   totaldue_x                  4359 non-null   float64
 7   termdays_x                  4359 non-null   int64  
 8   referredby_x                4359 non-null   int64  
 9   good_bad_flag_x             4359 non-null   int64  
 10  bank_account_type           4359 non-null   object 
 11  longitude_gps               4359 non-null   float64
 12  latitude_gps                4359 non-null   float64
 13  bank_name_clients           4359 

In [7]:
# Define your feature columns (independent variables)
X = train_aggregated_df.drop(columns=['good_bad_flag_x','customerid','systemloanid_x', 'approveddate_x', 'creationdate_x'], axis=1)
# Define your target variable (dependent variable)
y = train_aggregated_df['good_bad_flag_x']

## Set up to create do the predictions for submission

In [8]:
# test_aggregated_df was already read above, so let's just look at it to start

test_aggregated_df.head()
test_aggregated_df.info()

customer_id_predict = test_aggregated_df['customerid']

Unnamed: 0,customerid,systemloanid_x,loannumber_x,approveddate_x,creationdate_x,loanamount_x,totaldue_x,termdays_x,referredby_x,bank_account_type,longitude_gps,latitude_gps,bank_name_clients,employment_status_clients,level_of_education_clients,Age,loanamount_sum,loanamount_mean,loanamount_median,loanamount_max,loanamount_min,loannumber_count,totaldue_sum,totaldue_mean,termdays_max,termdays_min,termdays_mean,average_interest,average_repayment_time,EarlyRepaymentFlag,LatePaymentFlag
0,8a858899538ddb8e015390510b321f08,301998974,4,40:48.0,39:35.0,10000,12250.0,30,0,Other,5.835219,8.27473,First Bank,Permanent,Others,34,26000.0,8666.666667,10000.0,10000.0,6000.0,3.0,32900.0,10966.666667,30.0,15.0,25.0,2300.0,28.617388,0.333333,0.666667
1,8a858959537a097401537a4e316e25f7,301963615,10,43:40.0,42:34.0,40000,44000.0,30,0,Savings,3.245768,6.601778,First Bank,Permanent,Others,31,215000.0,23888.888889,30000.0,40000.0,5000.0,9.0,248400.0,27600.0,60.0,15.0,31.666667,3711.111111,24.689053,0.777778,0.222222
2,8a8589c253ace09b0153af6ba58f1f31,301982236,6,15:11.0,15:04.0,20000,24500.0,30,0,Savings,3.227945,6.586668,UBA,Permanent,Others,36,65000.0,13000.0,10000.0,20000.0,10000.0,5.0,79675.0,15935.0,30.0,15.0,27.0,2935.0,31.73919,0.4,0.6
3,8a858e095aae82b7015aae86ca1e030b,301971730,8,00:54.0,00:49.0,30000,34500.0,30,0,Savings,6.106486,7.287064,Zenith Bank,Permanent,Others,40,120000.0,17142.857143,20000.0,30000.0,10000.0,7.0,135400.0,19342.857143,30.0,15.0,17.142857,2200.0,13.837419,1.0,0.0
4,8a858e225a28c713015a30db5c48383d,301959177,4,04:33.0,04:27.0,20000,24500.0,30,0,Savings,5.248368,13.059864,UBA,Permanent,Others,42,30000.0,10000.0,10000.0,10000.0,10000.0,3.0,37500.0,12500.0,30.0,15.0,25.0,2500.0,22.252338,1.0,0.333333


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1450 entries, 0 to 1449
Data columns (total 31 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   customerid                  1450 non-null   object 
 1   systemloanid_x              1450 non-null   int64  
 2   loannumber_x                1450 non-null   int64  
 3   approveddate_x              1450 non-null   object 
 4   creationdate_x              1450 non-null   object 
 5   loanamount_x                1450 non-null   int64  
 6   totaldue_x                  1450 non-null   float64
 7   termdays_x                  1450 non-null   int64  
 8   referredby_x                1450 non-null   int64  
 9   bank_account_type           1450 non-null   object 
 10  longitude_gps               1450 non-null   float64
 11  latitude_gps                1450 non-null   float64
 12  bank_name_clients           1450 non-null   object 
 13  employment_status_clients   1450 

## Select Prediction Features, make sure we select the same as Training Features

In [9]:
# Select the features for the submission data set, the same as the training data set above
X_predict = test_aggregated_df.iloc[:, [2,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]]
X_predict.shape

(1450, 27)

1. Pipeline (Target encoding and GridSearchCV)

In [10]:
#Pipeline for target encoding + GridSearchCV

from types import NoneType
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from category_encoders import TargetEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


# Define categorical and numerical features
categorical_features = ['bank_account_type', 'bank_name_clients', 'employment_status_clients',
                        'level_of_education_clients']
numerical_features = X.columns.difference(categorical_features)

# Create a column transformer with target encoding for categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', TargetEncoder(), categorical_features)
    ])


# Create a pipeline with preprocessor and classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])


# Define the parameter grid for GridSearchCV
param_grid = {
   'classifier__n_estimators': [300], #steve recoomends big number, don't tune
   #'classifier__max_depth': [none], #steve recommends, dont tune, tune other hyperparameters to control size
   'classifier__max_samples': [0.5, 0.6, 0.7], #recommendation - usually want 0.5-0.7 OK to tune
   'classifier__min_samples_split': [2,5,10,30,50],  #min number of instances in node to consider splitting --higher = less overfitting. good to tune!
   'classifier__class_weight':["balanced"]




}
#other features that you can consider tuning
#max_features='sqrt', #recommends done tune, default is good
#min_samples_split=2,
    #random_state = 0


# Create GridSearchCV object with error_score set to 'raise' for debugging
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, error_score='raise')

# Fit the model
try:
    grid_search.fit(X, y)
except ValueError as e:
    print("Error during fitting: ", e)
    # Optionally add more diagnostic print statements here

# If the fitting is successful, print best parameters and score
if grid_search.best_score_:
    print("Best parameters found: ", grid_search.best_params_)
    print("Best score found: ", grid_search.best_score_)

Best parameters found:  {'classifier__class_weight': 'balanced', 'classifier__max_samples': 0.6, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 300}
Best score found:  0.7900902158227915


Best parameters found:  {'classifier__class_weight': 'balanced', 'classifier__max_samples': 0.6, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 300}
Best score found:  0.7934959541390455

# Splitting the (Train) Data

In [11]:
# We will work with train_aggregated_df here. We will save test_aggregated_df for final prediction later
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

type(X_train)
print()
type(y_train)


pandas.core.frame.DataFrame




pandas.core.series.Series

In [12]:
X.size

117693

# Target Encoding with A Multi-label Target

In [13]:
import category_encoders as ce
from category_encoders.wrapper import PolynomialWrapper

enc = PolynomialWrapper(ce.target_encoder.TargetEncoder(min_samples_leaf=1, smoothing=0, return_df=True))
enc.fit(X_train[['bank_account_type']], y_train.tolist())
X_train['bank_account_type'] = enc.transform(X_train[['bank_account_type']])
X_test['bank_account_type'] = enc.transform(X_test[['bank_account_type']])
X_predict['bank_account_type'] = enc.transform(X_predict[['bank_account_type']])
enc.fit(X_train[['bank_name_clients']], y_train.tolist())
X_train['bank_name_clients'] = enc.transform(X_train[['bank_name_clients']])
X_test['bank_name_clients'] = enc.transform(X_test[['bank_name_clients']])
X_predict['bank_name_clients'] = enc.transform(X_predict[['bank_name_clients']])
enc.fit(X_train[['employment_status_clients']], y_train.tolist())
X_train['employment_status_clients'] = enc.transform(X_train[['employment_status_clients']])
X_test['employment_status_clients'] = enc.transform(X_test[['employment_status_clients']])
X_predict['employment_status_clients'] = enc.transform(X_predict[['employment_status_clients']])
enc.fit(X_train[['level_of_education_clients']], y_train.tolist())
X_train['level_of_education_clients'] = enc.transform(X_train[['level_of_education_clients']])
X_test['level_of_education_clients'] = enc.transform(X_test[['level_of_education_clients']])
X_predict['level_of_education_clients'] = enc.transform(X_predict[['level_of_education_clients']])





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_predict['bank_account_type'] = enc.transform(X_predict[['bank_account_type']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_predict['bank_name_clients'] = enc.transform(X_predict[['bank_name_clients']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_predict['employment_status_clients'] = en

# Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

clf_rf1 = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    random_state=0
)
clf_rf1.fit(X_train, y_train)
    
clf_rf2 = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    max_features='sqrt',
    max_samples=0.7,
    min_samples_split=2,
    random_state=0
)
clf_rf2.fit(X_train, y_train)

# Best model from grid search
clf_rf3 = grid_search.best_estimator_


classifiers = [('rf1', clf_rf1), ('rf2', clf_rf2), ('rf3', clf_rf3)]

cclf = VotingClassifier(estimators=classifiers, voting="soft")
cclf = cclf.fit(X_train, y_train)








## Feature Importances

In [15]:
#clf_rf.feature_importances_

In [16]:
#feature_names = train_aggregated_df.columns[[2,5,6,7,8, 10, 11,12,13,14,15, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]]
#values = sorted(zip(feature_names, clf_rf.feature_importances_), key=lambda x: x[1] * -1)
#values

In [17]:
#from yellowbrick.model_selection import ValidationCurve

#viz = ValidationCurve(RandomForestClassifier(max_depth=None, min_samples_split=2, random_state=0), param_name="n_estimators", param_range=np.arange(1, 100), cv=5, scoring="roc_auc")
#%time viz.fit(X, y)
#viz.poof(outpath='out/default-rf-n_estimators.png')
#viz.poof()

## Predict the Test Split

In [18]:
y_pred_rf = cclf.predict(X_test)

# Performance metrics



## Model Performance

In [19]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred_rf))

[[ 47 161]
 [ 23 641]]


In [20]:
from sklearn.metrics import classification_report
#class_names = [str(x) for x in ccl1.classes_]
#print(classification_report(y1_2_test, y1_pred_rf, target_names=class_names))
#print(classification_report(y1_2_test, y2_pred_rf, target_names=class_names))

print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.67      0.23      0.34       208
           1       0.80      0.97      0.87       664

    accuracy                           0.79       872
   macro avg       0.74      0.60      0.61       872
weighted avg       0.77      0.79      0.75       872



In [21]:
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, log_loss
print("Accuracy = {:.2f}".format(accuracy_score(y_test, y_pred_rf)))
print("Kappa = {:.2f}".format(cohen_kappa_score(y_test, y_pred_rf)))
print("F1 Score = {:.2f}".format(f1_score(y_test, y_pred_rf)))
print("Log Loss = {:.2f}".format(log_loss(y_test, y_pred_rf)))

Accuracy = 0.79
Kappa = 0.25
F1 Score = 0.87
Log Loss = 7.61


## Make the Predictions

In [22]:
# make the predictions

#y1_2_predict = eclf.fit(X1_2, y1_2).predict(X1_2_predict)
#y3_predict = eclf.fit(X3, y3).predict(X3_test_tosubmit)

#y1_2_predict = eclf.fit(X1_2, y1_2).predict(X1_2_predict)
y_predict = cclf.predict(X_predict)

## Create the submission

In [23]:
submission_df = pd.DataFrame({'customer_id':customer_id_predict, 'Good_Bad_flag':y_predict})
submission_df

Unnamed: 0,customer_id,Good_Bad_flag
0,8a858899538ddb8e015390510b321f08,1
1,8a858959537a097401537a4e316e25f7,1
2,8a8589c253ace09b0153af6ba58f1f31,1
3,8a858e095aae82b7015aae86ca1e030b,1
4,8a858e225a28c713015a30db5c48383d,1
...,...,...
1445,8a858fb45bb59c21015bb88a191f58f2,1
1446,8a858fcb5b00cc54015b0253ced26a5f,1
1447,8a858fde56eb02280156f59b976d46c0,1
1448,8a858e10570f2d65015717fcfec44996,1


In [24]:
submission_df.Good_Bad_flag.value_counts()

1    1323
0     127
Name: Good_Bad_flag, dtype: int64

## Save the submission to CSV

In [25]:
submissionPath = "EnsembleVoting_RFx3_submission_2_Team_Bay.csv"
print(submissionPath)
submission_df.to_csv(submissionPath, index=False)

EnsembleVoting_RFx3_submission_2_Team_Bay.csv
