In [47]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv('../dataset/Statlog (German Credit Data) Data Set/german.data', sep=' ', header=None)
df.columns = ['Status_of_existing_checking_account', 'Duration_in_month', 'Credit_history',
             'Purpose', 'Credit_amount', 'Savings_account', 'Present_employment_since',
             'Installment_rate_in_percentage_of_disposable_income', 'Personal_status_and_sex',
             'Other_debtors', 'Present_residence_since', 'Property', 'Age_in_years',
             'Other_installment_plans', 'Housing', 'Number_of_existing_credits_at_this_bank',
             'Job', 'Number_of_people_being_liable_to_provide_maintenance_for', 'Telephone', 'Foreign worker', 'class']

In [3]:
df.head()

Unnamed: 0,Status_of_existing_checking_account,Duration_in_month,Credit_history,Purpose,Credit_amount,Savings_account,Present_employment_since,Installment_rate_in_percentage_of_disposable_income,Personal_status_and_sex,Other_debtors,...,Property,Age_in_years,Other_installment_plans,Housing,Number_of_existing_credits_at_this_bank,Job,Number_of_people_being_liable_to_provide_maintenance_for,Telephone,Foreign worker,class
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,1
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,2
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,1
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,1
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,2


In [4]:
df['class'] = df['class'] - 1

Creation of dummy variables for categorical data

In [7]:
list(df.columns.values)

['Status_of_existing_checking_account',
 'Duration_in_month',
 'Credit_history',
 'Purpose',
 'Credit_amount',
 'Savings_account',
 'Present_employment_since',
 'Installment_rate_in_percentage_of_disposable_income',
 'Personal_status_and_sex',
 'Other_debtors',
 'Present_residence_since',
 'Property',
 'Age_in_years',
 'Other_installment_plans',
 'Housing',
 'Number_of_existing_credits_at_this_bank',
 'Job',
 'Number_of_people_being_liable_to_provide_maintenance_for',
 'Telephone',
 'Foreign worker',
 'class']

In [8]:
discrete_columns, continuous_columns = [], []
for i, j in enumerate(df.dtypes[:-1]): ## select columns up till class
    if j == object:
        discrete_columns.append(df.columns[i])
    else:
        continuous_columns.append(df.columns[i])

In [11]:
discrete_columns

['Status_of_existing_checking_account',
 'Credit_history',
 'Purpose',
 'Savings_account',
 'Present_employment_since',
 'Personal_status_and_sex',
 'Other_debtors',
 'Property',
 'Other_installment_plans',
 'Housing',
 'Job',
 'Telephone',
 'Foreign worker']

In [13]:
continuous_columns

['Duration_in_month',
 'Credit_amount',
 'Installment_rate_in_percentage_of_disposable_income',
 'Present_residence_since',
 'Age_in_years',
 'Number_of_existing_credits_at_this_bank',
 'Number_of_people_being_liable_to_provide_maintenance_for']

In [15]:
dummy_stseca = pd.get_dummies(df['Status_of_existing_checking_account'], prefix='status_exs_accnt')
dummy_ch = pd.get_dummies(df['Credit_history'], prefix='cred_hist')
dummy_purpose = pd.get_dummies(df['Purpose'], prefix='purpose')
dummy_savacc = pd.get_dummies(df['Savings_account'], prefix='sav_acc')
dummy_presc = pd.get_dummies(df['Present_employment_since'], prefix='pre_emp_snc')
dummy_perssx = pd.get_dummies(df['Personal_status_and_sex'], prefix='per_stat_sx')
dummy_othdts = pd.get_dummies(df['Other_debtors'], prefix='oth_debtors')
dummy_property = pd.get_dummies(df['Property'], prefix='property')
dummy_othinstpln = pd.get_dummies(df['Other_installment_plans'], prefix='oth_inst_plan')
dummy_housing = pd.get_dummies(df['Housing'], prefix='housing')
dummy_job = pd.get_dummies(df['Job'], prefix='job')
dummy_telephn = pd.get_dummies(df['Telephone'], prefix='telephn')
dummy_forgnwrkr = pd.get_dummies(df['Foreign worker'], prefix='forgn_wrkr')

In [16]:
credit_continuous = df[continuous_columns]

In [18]:
df_new = pd.concat([dummy_stseca, dummy_ch, dummy_purpose, dummy_savacc, dummy_presc,
                   dummy_perssx, dummy_othdts, dummy_property, dummy_othdts, dummy_property,
                   dummy_othinstpln, dummy_housing, dummy_job, dummy_telephn, dummy_forgnwrkr, credit_continuous,
                   df['class']], axis=1)

In [20]:
df_new.columns

Index(['status_exs_accnt_A11', 'status_exs_accnt_A12', 'status_exs_accnt_A13',
       'status_exs_accnt_A14', 'cred_hist_A30', 'cred_hist_A31',
       'cred_hist_A32', 'cred_hist_A33', 'cred_hist_A34', 'purpose_A40',
       'purpose_A41', 'purpose_A410', 'purpose_A42', 'purpose_A43',
       'purpose_A44', 'purpose_A45', 'purpose_A46', 'purpose_A48',
       'purpose_A49', 'sav_acc_A61', 'sav_acc_A62', 'sav_acc_A63',
       'sav_acc_A64', 'sav_acc_A65', 'pre_emp_snc_A71', 'pre_emp_snc_A72',
       'pre_emp_snc_A73', 'pre_emp_snc_A74', 'pre_emp_snc_A75',
       'per_stat_sx_A91', 'per_stat_sx_A92', 'per_stat_sx_A93',
       'per_stat_sx_A94', 'oth_debtors_A101', 'oth_debtors_A102',
       'oth_debtors_A103', 'property_A121', 'property_A122', 'property_A123',
       'property_A124', 'oth_debtors_A101', 'oth_debtors_A102',
       'oth_debtors_A103', 'property_A121', 'property_A122', 'property_A123',
       'property_A124', 'oth_inst_plan_A141', 'oth_inst_plan_A142',
       'oth_inst_plan_A1

In [28]:
X = df_new.iloc[:, :-1]
y = df_new['class']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

The random forest ML model is applied with assumed hyperparameter values, as follows:
-  Number of trees is 1000
-  Criterion of slitting is gini
-  Maximum depth each decision tree can grow is 100
-  Minimum observations required at each not to be eligible for splitting is 3
-  Minimum number of observations in tree node should be 2

In [42]:
rf_fit = RandomForestClassifier(n_estimators=1000, criterion='gini', max_depth=100, min_samples_split=3, min_samples_leaf=2)
rf_fit.fit(X_train, y_train)
print('Random Forest - Train Confusion Matrix\n\n', pd.crosstab(y_train, rf_fit.predict(X_train),
                                                             rownames=['Actuall'], colnames=['Predicted']), sep="")
print('\nRandom Forest - Train accuracy', round(accuracy_score(y_train, rf_fit.predict(X_train)), 3))

print('\nRandom Forest - Test Confusion Matrix\n\n', pd.crosstab(y_test, rf_fit.predict(X_test),
                                                                rownames=['Actual'], colnames=['Predicted']), sep="")
print('\nRandom Forest - Test accuracy', round(accuracy_score(y_test, rf_fit.predict(X_test)), 3))

Random Forest - Train Confusion Matrix

Predicted    0    1
Actuall            
0          491    0
1           30  179

Random Forest - Train accuracy 0.957

Random Forest - Test Confusion Matrix

Predicted    0   1
Actual            
0          198  11
1           63  28

Random Forest - Test accuracy 0.753


Tuning and optimizing hyperparameters with grid search

-  Number of tree is (1000, 2000, 3000)
-  Maximum depth is (100, 200, 300)
-  Minimum samples per split are (2, 3)
-  Minimum samples in leaf node are (1,2)

In [43]:
from sklearn.pipeline import Pipeline #
from sklearn.model_selection import train_test_split, GridSearchCV

In [46]:
# The Pipeline function creates the combinations which will be applied one by one 
# sequentially to determine the best possible combination:
pipeline = Pipeline([('clf', RandomForestClassifier(criterion='gini'))])
parameters = {
    'clf__n_estimators': (1000, 2000, 3000),
    'clf__max_depth': (100, 200, 300),
    'clf__min_samples_split': (2, 3),
    'clf__min_samples_leaf': (1, 2)
}

In the following, grid search utilizes cross-validation of five to ensure robustness in the
model, which is the ML way of creating two-point validation of the model:

In [48]:
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, cv=5, verbose=1, scoring='accuracy')
grid_search.fit(X_train, y_train)

print('Best Training score: %0.3f' % grid_search.best_score_)
print('Best parameters set:')
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print('\t%s: %r' % (param_name, best_parameters[param_name]))

predictions = grid_search.predict(X_test)
print("Testing accuracy:", round(accuracy_score(y_test, predictions), 4))
print("\nComplete report of Testing data\n", classification_report(y_test, predictions))

print("\n\nRandom Forest Grid Search - Test Confusion Matrix\n\n",
     pd.crosstab(y_test, predictions, rownames=['Actuall'], colnames=['Predicted']), sep="")

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   23.4s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  1.9min finished


Best Training score: 0.773
Best parameters set:
	clf__max_depth: 200
	clf__min_samples_leaf: 2
	clf__min_samples_split: 3
	clf__n_estimators: 1000
Testing accuracy: 0.7467

Complete report of Testing data
              precision    recall  f1-score   support

          0       0.76      0.94      0.84       209
          1       0.68      0.31      0.42        91

avg / total       0.73      0.75      0.71       300



Random Forest Grid Search - Test Confusion Matrix

Predicted    0   1
Actuall           
0          196  13
1           63  28


In [None]:
# References and credits to
# Statistics for Machine Learning