In [1]:
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from joblib import load, dump

In [2]:
# Read data
# Load the loan dataset from a CSV path relative to the notebook's working directory.
df_data = pd.read_csv('../data/train_data.csv')

In [3]:
# List the column names of the loaded frame.
df_data.columns

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [4]:
# Drop Loan_ID so it is not part of the model inputs.
# The result is assigned back to df_data, so without errors='ignore' a second
# run of this cell would raise KeyError because the column is already gone.
df_data = df_data.drop(columns='Loan_ID', errors='ignore')

#### Model Building

In [5]:
# Train-test split
# Keep 70% of the rows for training; stratifying on Loan_Status preserves the
# class ratio in both parts, and the fixed seed makes the split repeatable.
features = df_data.drop(columns='Loan_Status')
target = df_data['Loan_Status']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    train_size=0.7,
    stratify=target,
    random_state=123,
)

In [6]:
# Show the shapes of the four pieces produced by the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((429, 11), (185, 11), (429,), (185,))

In [7]:
# Peek at the first three rows of the training features.
X_train.head(3)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
433,Male,Yes,0,Graduate,No,2425,2340.0,143.0,360.0,1.0,Semiurban
69,Female,No,0,Graduate,No,4300,0.0,136.0,360.0,0.0,Semiurban
49,Female,No,0,Graduate,No,4000,2275.0,144.0,360.0,1.0,Semiurban


In [8]:
# Summary statistics of the numeric training columns; a count below the number
# of training rows means that column has missing values.
X_train.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,429.0,429.0,412.0,420.0,395.0
mean,5343.016317,1519.843357,146.550971,341.428571,0.827848
std,5614.656513,2323.436844,82.931654,65.307184,0.377991
min,150.0,0.0,9.0,36.0,0.0
25%,2929.0,0.0,104.0,360.0,1.0
50%,3859.0,1260.0,130.0,360.0,1.0
75%,5780.0,2253.0,167.25,360.0,1.0
max,63337.0,33837.0,700.0,480.0,1.0


In [9]:
# Check categorical columns
# Print how often each category occurs in the training features, one column at
# a time, with a separator line after each column.
categorical_columns = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
]
for column in categorical_columns:
    print(f"Column: {column}", X_train[column].value_counts(), "="*50, sep="\n")

Column: Gender
Gender
Male      345
Female     74
Name: count, dtype: int64
Column: Married
Married
Yes    278
No     149
Name: count, dtype: int64
Column: Dependents
Dependents
0     240
1      76
2      75
3+     29
Name: count, dtype: int64
Column: Education
Education
Graduate        333
Not Graduate     96
Name: count, dtype: int64
Column: Self_Employed
Self_Employed
No     345
Yes     58
Name: count, dtype: int64
Column: Property_Area
Property_Area
Semiurban    166
Urban        137
Rural        126
Name: count, dtype: int64


In [10]:
# Create categorical to continuous maps
# Each known category label gets an integer code; indexing the map with a label
# that is not listed returns -1 through the defaultdict factory.
categorical_continuous_map = defaultdict(
    lambda: -1,
    {
        'Male': 1, 'Female': 0,
        'Yes': 1, 'No': 0,
        '0': 0, '1': 1, '2': 2, '3+': 3,
        'Graduate': 1, 'Not Graduate': 0,
        'Semiurban': 0, 'Urban': 1, 'Rural': 2,
    },
)

In [11]:
# Fill nan's and convert categorical to continuous
def _encode_category(value):
    """Return the integer code for one cell of a categorical column.

    Known labels are translated through categorical_continuous_map and any
    other string becomes -1. Non-string values (the -1 fill value, or codes
    produced by an earlier run of this cell) are returned unchanged, which
    keeps the cell safe to re-run.
    """
    if isinstance(value, str):
        # .get() instead of indexing: lookups never insert keys into the map.
        return categorical_continuous_map.get(value, -1)
    return value


# Missing values become -1 in every column.
X_train = X_train.fillna(-1)
X_test = X_test.fillna(-1)
# Encode only the categorical columns. DataFrame.replace() with a dict swaps
# just the listed keys and never uses the defaultdict fallback, so a label
# missing from the map would stay a raw string in the model input; mapping
# each value sends such labels to -1 instead.
for column in categorical_columns:
    X_train[column] = X_train[column].map(_encode_category)
    X_test[column] = X_test[column].map(_encode_category)

In [12]:
# Continuous columns
# Names of the numeric feature columns.
continuous_columns = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
]

In [13]:
# Fit the scaler on the training features only, then apply that same fitted
# transformation to the continuous columns of both splits.
scaler_model = StandardScaler().fit(X_train[continuous_columns])
for split in (X_train, X_test):
    split[continuous_columns] = scaler_model.transform(split[continuous_columns])

In [14]:
# Model building
# Cross-validation scheme: 5 stratified shuffles, each holding out 30% of the
# rows passed to fit().
ss_split = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=123)

# Hyper-parameter grid; every combination is evaluated by the search.
params_map = {
    'n_estimators': [5, 10, 30, 50, 100, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 3, 5, 7, 10],
    'class_weight': ['balanced', None],
}

randomforest_model = RandomForestClassifier(random_state=123)

# Candidates are ranked by weighted F1; refit=True retrains the best
# configuration so the search object can be used directly for prediction.
gridsearchcv = GridSearchCV(
    estimator=randomforest_model,
    param_grid=params_map,
    scoring='f1_weighted',
    cv=ss_split,
    n_jobs=-1,
    refit=True,
    return_train_score=True,
)

In [15]:
# Model training
# Run the grid search, then report timings and scores. Each figure is the mean
# over every hyper-parameter combination in the grid, not just the best one.
gridsearchcv.fit(X_train, y_train)
gridsearchcv_result = gridsearchcv.cv_results_
for template, key in (
    ("Mean fit time : %.3fs", 'mean_fit_time'),
    ("Mean test time : %.3fs", 'mean_score_time'),
    ("Mean train score : %.3f", 'mean_train_score'),
    ("Mean CV score : %.3f", 'mean_test_score'),
):
    print(template % gridsearchcv_result[key].mean())

Mean fit time : 0.305s
Mean test time : 0.026s
Mean train score : 0.834
Mean CV score : 0.734


In [16]:
# Weighted F1 of the refitted best estimator on the training split
y_train_prediction = gridsearchcv.predict(X_train)
train_f1 = f1_score(y_train, y_train_prediction, average='weighted')
print("Best Train Score : %.3f" % train_f1)

# Weighted F1 of the same estimator on the held-out test split
y_prediction = gridsearchcv.predict(X_test)
test_f1 = f1_score(y_test, y_prediction, average='weighted')
print("Best Test Score  : %.3f" % test_f1)

# Hyper-parameters selected by the grid search
print("Best params : ", gridsearchcv.best_params_)

Best Train Score : 0.822
Best Test Score  : 0.745
Best params :  {'class_weight': None, 'criterion': 'gini', 'max_depth': 5, 'n_estimators': 30}


In [17]:
# Classification report
# Per-class precision, recall and F1 for the test-split predictions.
print(classification_report(y_true=y_test, y_pred=y_prediction))

              precision    recall  f1-score   support

           N       0.95      0.33      0.49        58
           Y       0.76      0.99      0.86       127

    accuracy                           0.78       185
   macro avg       0.86      0.66      0.68       185
weighted avg       0.82      0.78      0.75       185



In [18]:
# Save models
# Persist the three artefacts needed to score new rows: the refitted best
# estimator, the fitted scaler and the category-to-code mapping.
dump(gridsearchcv.best_estimator_, '../models/randomforest_model.model')
dump(scaler_model, '../models/scaler_model.model')
# Save the mapping object that was used for training rather than a second
# hand-typed copy that could drift from it. dict() strips the defaultdict
# wrapper, whose lambda factory cannot be pickled.
dump(dict(categorical_continuous_map), '../models/categorical_continuous_map.dict')

['../models/categorical_continuous_map.dict']

#### Model Testing

In [5]:
# Peek at the first two rows of the frame (Loan_ID already dropped).
df_data.head(2)

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N


In [6]:
# Reload the estimator, scaler and category mapping from disk.
# NOTE: joblib.load unpickles these files, so only load artefacts from a
# trusted location.
randomforest_model = load('../models/randomforest_model.model')
scaler_model = load('../models/scaler_model.model')
# default_factory is called with no arguments, so it must be a zero-argument
# callable; a one-argument lambda raises TypeError on the first missing-key
# lookup instead of returning -1.
categorical_continuous_map = defaultdict(lambda: -1)
categorical_continuous_map.update(load('../models/categorical_continuous_map.dict'))

In [7]:
# Draw a reproducible 10-row sample to exercise the reloaded artefacts.
# NOTE(review): the sample comes from the same frame that was split for
# training, so it can contain rows the model was fitted on; scores computed on
# it are a smoke test, not a held-out estimate.
df_data_sample = df_data.sample(10, random_state=123)

In [8]:
# Column groups used by the preprocessing below: label-valued features that
# are integer-encoded, and numeric features that are passed through the scaler.
categorical_columns = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
]
continuous_columns = [
    'ApplicantIncome',
    'CoapplicantIncome',
    'LoanAmount',
    'Loan_Amount_Term',
    'Credit_History',
]

In [9]:
# Separate the sampled rows into features and target.
X_test = df_data_sample.drop(columns='Loan_Status')
y_test = df_data_sample['Loan_Status']
# Missing values become -1 in every column.
X_test = X_test.fillna(-1)
# Encode only the categorical columns. DataFrame.replace() with a dict swaps
# just the listed keys and never uses the defaultdict fallback, so a label
# missing from the map would reach the model as a raw string. Here known
# labels are looked up, other strings become -1, and non-string values (the -1
# fill value) pass through unchanged. .get() is used so the lookup does not
# depend on the map's default factory.
for column in categorical_columns:
    X_test[column] = X_test[column].map(
        lambda value: categorical_continuous_map.get(value, -1)
        if isinstance(value, str) else value
    )
# Scale the continuous columns with the already-fitted scaler (no re-fitting).
X_test[continuous_columns] = scaler_model.transform(X_test[continuous_columns])

In [10]:
# Predict Loan_Status for the preprocessed sample with the reloaded model.
y_prediction = randomforest_model.predict(X_test)

In [11]:
# Weighted F1 of the predictions on the 10-row sample.
f1_score(y_test, y_prediction, average='weighted')

0.8

In [12]:
# Per-class precision, recall and F1 for the sample predictions.
print(classification_report(y_true=y_test, y_pred=y_prediction))

              precision    recall  f1-score   support

           N       0.50      0.50      0.50         2
           Y       0.88      0.88      0.88         8

    accuracy                           0.80        10
   macro avg       0.69      0.69      0.69        10
weighted avg       0.80      0.80      0.80        10

