## Random Forest Classifier 




In [1]:
import pandas as pd
import numpy as np 
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from collections import Counter
%matplotlib inline
from pathlib import Path

# Needed for decision tree visualization

# Try using oversampling to help balance the data more
import pydotplus
from IPython.display import Image
# model.feature_importances
# target 1 is payment difficulties
import gc

## Encoding 

In [2]:
from sklearn.preprocessing import LabelEncoder

In [3]:
def encode(X):
return X.apply(lambda x: X[x.name] if X[x.name].dtype == 'int64' or X[x.name].dtype == 'float64' else LabelEncoder().fit_transform(X[x.name].astype(str)))

In [4]:
from google.colab import drive
drive.mount('/content/drive')
relative_path = '/content/drive/My Drive/Colab Notebooks'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Import Data

In [5]:
application_train = pd.read_csv(Path('/content/Data/application_train.csv'))
application_train['DAYS_EMPLOYED'] = application_train['DAYS_EMPLOYED'].replace(365243, np.nan)
application_train['OWN_CAR_AGE'] = application_train['OWN_CAR_AGE'].fillna(0)
application_train = encode(application_train)
application_train.shape

(307511, 122)

In [6]:
application_test = pd.read_csv(Path('/content/Data/application_test.csv'))
application_test['DAYS_EMPLOYED'] = application_test['DAYS_EMPLOYED'].replace(365243, np.nan)
application_test['OWN_CAR_AGE'] = application_test['OWN_CAR_AGE'].fillna(0)
application_test = encode(application_test)
application_test.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,...,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,FONDKAPREMONT_MODE,HOUSETYPE_MODE,TOTALAREA_MODE,WALLSMATERIAL_MODE,EMERGENCYSTATE_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100001,0,0,0,1,0,135000.0,568800.0,20560.5,450000.0,6,6,1,1,1,0.01885,-19241,-2329.0,-5170.0,-812,0.0,1,1,0,1,0,1,18,2.0,2,2,5,18,0,0,0,0,0,0,28,...,,0.0514,,,0,0,0.0392,5,0,0.0,0.0,0.0,0.0,-1740.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
1,100005,0,1,0,1,0,99000.0,222768.0,17370.0,180000.0,6,6,4,1,1,0.035792,-18064,-4469.0,-9118.0,-1623,0.0,1,1,0,1,0,0,9,2.0,2,2,0,9,0,0,0,0,0,0,42,...,,,,,0,1,,7,2,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
2,100013,0,1,1,1,0,202500.0,663264.0,69777.0,630000.0,7,6,1,1,1,0.019101,-20038,-4458.0,-2175.0,-3503,5.0,1,1,0,1,0,0,4,2.0,2,2,1,14,0,0,0,0,0,0,54,...,,,,,0,1,,7,2,0.0,0.0,0.0,0.0,-856.0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,1.0,4.0
3,100028,0,0,0,1,2,315000.0,1575000.0,49018.5,1575000.0,6,6,4,1,1,0.026392,-13976,-1866.0,-2000.0,-4208,0.0,1,1,0,1,1,0,14,4.0,2,2,6,11,0,0,0,0,0,0,5,...,0.2446,0.3739,0.0388,0.0817,3,0,0.37,4,0,0.0,0.0,0.0,0.0,-1805.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
4,100038,0,1,1,0,1,180000.0,625500.0,32067.0,625500.0,6,6,4,1,1,0.010032,-13040,-2191.0,-4000.0,-4262,16.0,1,1,1,1,0,0,18,3.0,2,2,0,5,0,0,0,0,1,1,5,...,,,,,0,1,,7,2,0.0,0.0,0.0,0.0,-821.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,,,,,


In [7]:
# use helper function to assist in grouping data
def get_agg(app, agg_list, group_by_column='SK_ID_CURR'):
  app = encode(app)
  app = app.groupby(group_by_column).agg(agg_list)
  app.columns = ['_'.join(col) for col in app.columns.values]
  app.reset_index(inplace=True)
  return app

In [8]:
credit_card_balance = pd.read_csv(Path('/content/Data/credit_card_balance.csv'))
credit_card_balance.drop(columns=['SK_ID_PREV'], inplace=True)
credit_card_balance.head()

Unnamed: 0,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,AMT_PAYMENT_TOTAL_CURRENT,AMT_RECEIVABLE_PRINCIPAL,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,378907,-6,56.97,135000,0.0,877.5,0.0,877.5,1700.325,1800.0,1800.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,35.0,Active,0.0,0.0
1,363914,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,2250.0,2250.0,60175.08,64875.555,64875.555,1.0,1.0,0.0,0.0,69.0,Active,0.0,0.0
2,371185,-7,31815.225,450000,0.0,0.0,0.0,0.0,2250.0,2250.0,2250.0,26926.425,31460.085,31460.085,0.0,0.0,0.0,0.0,30.0,Active,0.0,0.0
3,337855,-4,236572.11,225000,2250.0,2250.0,0.0,0.0,11795.76,11925.0,11925.0,224949.285,233048.97,233048.97,1.0,1.0,0.0,0.0,10.0,Active,0.0,0.0
4,126868,-1,453919.455,450000,0.0,11547.0,0.0,11547.0,22924.89,27000.0,27000.0,443044.395,453919.455,453919.455,0.0,1.0,0.0,1.0,101.0,Active,0.0,0.0


In [9]:
credit_card_balance = get_agg(credit_card_balance, ['min', 'max', 'mean', 'sum', 'var'])
credit_card_balance.head()

Unnamed: 0,SK_ID_CURR,MONTHS_BALANCE_min,MONTHS_BALANCE_max,MONTHS_BALANCE_mean,MONTHS_BALANCE_sum,MONTHS_BALANCE_var,AMT_BALANCE_min,AMT_BALANCE_max,AMT_BALANCE_mean,AMT_BALANCE_sum,AMT_BALANCE_var,AMT_CREDIT_LIMIT_ACTUAL_min,AMT_CREDIT_LIMIT_ACTUAL_max,AMT_CREDIT_LIMIT_ACTUAL_mean,AMT_CREDIT_LIMIT_ACTUAL_sum,AMT_CREDIT_LIMIT_ACTUAL_var,AMT_DRAWINGS_ATM_CURRENT_min,AMT_DRAWINGS_ATM_CURRENT_max,AMT_DRAWINGS_ATM_CURRENT_mean,AMT_DRAWINGS_ATM_CURRENT_sum,AMT_DRAWINGS_ATM_CURRENT_var,AMT_DRAWINGS_CURRENT_min,AMT_DRAWINGS_CURRENT_max,AMT_DRAWINGS_CURRENT_mean,AMT_DRAWINGS_CURRENT_sum,AMT_DRAWINGS_CURRENT_var,AMT_DRAWINGS_OTHER_CURRENT_min,AMT_DRAWINGS_OTHER_CURRENT_max,AMT_DRAWINGS_OTHER_CURRENT_mean,AMT_DRAWINGS_OTHER_CURRENT_sum,AMT_DRAWINGS_OTHER_CURRENT_var,AMT_DRAWINGS_POS_CURRENT_min,AMT_DRAWINGS_POS_CURRENT_max,AMT_DRAWINGS_POS_CURRENT_mean,AMT_DRAWINGS_POS_CURRENT_sum,AMT_DRAWINGS_POS_CURRENT_var,AMT_INST_MIN_REGULARITY_min,AMT_INST_MIN_REGULARITY_max,AMT_INST_MIN_REGULARITY_mean,AMT_INST_MIN_REGULARITY_sum,...,CNT_DRAWINGS_ATM_CURRENT_min,CNT_DRAWINGS_ATM_CURRENT_max,CNT_DRAWINGS_ATM_CURRENT_mean,CNT_DRAWINGS_ATM_CURRENT_sum,CNT_DRAWINGS_ATM_CURRENT_var,CNT_DRAWINGS_CURRENT_min,CNT_DRAWINGS_CURRENT_max,CNT_DRAWINGS_CURRENT_mean,CNT_DRAWINGS_CURRENT_sum,CNT_DRAWINGS_CURRENT_var,CNT_DRAWINGS_OTHER_CURRENT_min,CNT_DRAWINGS_OTHER_CURRENT_max,CNT_DRAWINGS_OTHER_CURRENT_mean,CNT_DRAWINGS_OTHER_CURRENT_sum,CNT_DRAWINGS_OTHER_CURRENT_var,CNT_DRAWINGS_POS_CURRENT_min,CNT_DRAWINGS_POS_CURRENT_max,CNT_DRAWINGS_POS_CURRENT_mean,CNT_DRAWINGS_POS_CURRENT_sum,CNT_DRAWINGS_POS_CURRENT_var,CNT_INSTALMENT_MATURE_CUM_min,CNT_INSTALMENT_MATURE_CUM_max,CNT_INSTALMENT_MATURE_CUM_mean,CNT_INSTALMENT_MATURE_CUM_sum,CNT_INSTALMENT_MATURE_CUM_var,NAME_CONTRACT_STATUS_min,NAME_CONTRACT_STATUS_max,NAME_CONTRACT_STATUS_mean,NAME_CONTRACT_STATUS_sum,NAME_CONTRACT_STATUS_var,SK_DPD_min,SK_DPD_max,SK_DPD_mean,SK_DPD_sum,SK_DPD_var,SK_DPD_DEF_min,SK_DPD_DEF_max,SK_DPD_DEF_mean,SK_DPD_DEF_sum,SK_DPD_DEF_var
0,100006,-6,-1,-3.5,-21,3.5,0.0,0.0,0.0,0.0,0.0,270000,270000,270000.0,1620000,0.0,,,,0.0,,0.0,0.0,0.0,0.0,0.0,,,,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,...,,,,0.0,,0.0,0.0,0.0,0.0,0.0,,,,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100011,-74,-2,-37.618182,-2069,473.351515,0.0,184568.85,54078.017727,2974290.975,4519606000.0,90000,180000,162000.0,8910000,1320000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9000.0,4105.530818,225804.195,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,33.0,25.563636,1406.0,108.472727,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100013,-96,-1,-44.657534,-3260,770.867199,0.0,161420.22,16917.706233,1234992.555,1680734000.0,45000,157500,125136.986301,9135000,2629495000.0,0.0,157500.0,5978.571429,418500.0,826271300.0,0.0,157500.0,5732.876712,418500.0,793271400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7875.0,1283.905435,88589.475,...,0.0,7.0,0.214286,15.0,1.127329,0.0,7.0,0.205479,15.0,1.082192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,22.0,19.318841,1333.0,28.896846,0,0,0.0,0,0.0,0.0,1.0,0.013699,1.0,0.013699,0.0,1.0,0.013699,1.0,0.013699
3,100021,-18,-2,-9.466667,-142,25.266667,0.0,0.0,0.0,0.0,0.0,675000,675000,675000.0,10125000,0.0,,,,0.0,,0.0,0.0,0.0,0.0,0.0,,,,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,...,,,,0.0,,0.0,0.0,0.0,0.0,0.0,,,,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0,2,1.2,18,1.028571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,100023,-11,-4,-7.714286,-54,6.571429,0.0,0.0,0.0,0.0,0.0,45000,225000,122142.857143,855000,9257143000.0,,,,0.0,,0.0,0.0,0.0,0.0,0.0,,,,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,...,,,,0.0,,0.0,0.0,0.0,0.0,0.0,,,,0.0,,,,,0.0,,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Merge data and split this way to avoid overfitting
merged = pd.merge(left=application_train, right=credit_card_balance, how='left', left_on='SK_ID_CURR', right_on='SK_ID_CURR')
merged_test = pd.merge(left=application_test, right=credit_card_balance, how='left', left_on='SK_ID_CURR', right_on='SK_ID_CURR')

del application_train
del credit_card_balance
gc.collect()

15

In [11]:
# import csv file 
prev_app = pd.read_csv(Path('/content/Data/previous_application.csv'))
prev_app.replace(365243, np.nan, inplace= True)
prev_app.drop(columns=['SK_ID_PREV'], inplace=True) 
prev_app.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,DAYS_DECISION,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,271877.0,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,Y,1,0.0,0.182832,0.867336,XAP,Approved,-73,Cash through the bank,XAP,,Repeater,Mobile,POS,XNA,Country-wide,35.0,Connectivity,12.0,middle,POS mobile with interest,,-42.0,300.0,-42.0,-37.0,0.0
1,108129.0,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,Y,1,,,,XNA,Approved,-164,XNA,XAP,Unaccompanied,Repeater,XNA,Cash,x-sell,Contact center,-1.0,XNA,36.0,low_action,Cash X-Sell: low,,-134.0,916.0,,,1.0
2,122040.0,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,Y,1,,,,XNA,Approved,-301,Cash through the bank,XAP,"Spouse, partner",Repeater,XNA,Cash,x-sell,Credit and cash offices,-1.0,XNA,12.0,high,Cash X-Sell: high,,-271.0,59.0,,,1.0
3,176158.0,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,Y,1,,,,XNA,Approved,-512,Cash through the bank,XAP,,Repeater,XNA,Cash,x-sell,Credit and cash offices,-1.0,XNA,12.0,middle,Cash X-Sell: middle,,-482.0,-152.0,-182.0,-177.0,1.0
4,202054.0,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,Y,1,,,,Repairs,Refused,-781,Cash through the bank,HC,,Repeater,XNA,Cash,walk-in,Credit and cash offices,-1.0,XNA,24.0,high,Cash Street: high,,,,,,


In [12]:
prev_app = get_agg(prev_app, ['min', 'max', 'mean'])
prev_app.head()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE_min,NAME_CONTRACT_TYPE_max,NAME_CONTRACT_TYPE_mean,AMT_ANNUITY_min,AMT_ANNUITY_max,AMT_ANNUITY_mean,AMT_APPLICATION_min,AMT_APPLICATION_max,AMT_APPLICATION_mean,AMT_CREDIT_min,AMT_CREDIT_max,AMT_CREDIT_mean,AMT_DOWN_PAYMENT_min,AMT_DOWN_PAYMENT_max,AMT_DOWN_PAYMENT_mean,AMT_GOODS_PRICE_min,AMT_GOODS_PRICE_max,AMT_GOODS_PRICE_mean,WEEKDAY_APPR_PROCESS_START_min,WEEKDAY_APPR_PROCESS_START_max,WEEKDAY_APPR_PROCESS_START_mean,HOUR_APPR_PROCESS_START_min,HOUR_APPR_PROCESS_START_max,HOUR_APPR_PROCESS_START_mean,FLAG_LAST_APPL_PER_CONTRACT_min,FLAG_LAST_APPL_PER_CONTRACT_max,FLAG_LAST_APPL_PER_CONTRACT_mean,NFLAG_LAST_APPL_IN_DAY_min,NFLAG_LAST_APPL_IN_DAY_max,NFLAG_LAST_APPL_IN_DAY_mean,RATE_DOWN_PAYMENT_min,RATE_DOWN_PAYMENT_max,RATE_DOWN_PAYMENT_mean,RATE_INTEREST_PRIMARY_min,RATE_INTEREST_PRIMARY_max,RATE_INTEREST_PRIMARY_mean,RATE_INTEREST_PRIVILEGED_min,RATE_INTEREST_PRIVILEGED_max,RATE_INTEREST_PRIVILEGED_mean,...,NAME_PORTFOLIO_mean,NAME_PRODUCT_TYPE_min,NAME_PRODUCT_TYPE_max,NAME_PRODUCT_TYPE_mean,CHANNEL_TYPE_min,CHANNEL_TYPE_max,CHANNEL_TYPE_mean,SELLERPLACE_AREA_min,SELLERPLACE_AREA_max,SELLERPLACE_AREA_mean,NAME_SELLER_INDUSTRY_min,NAME_SELLER_INDUSTRY_max,NAME_SELLER_INDUSTRY_mean,CNT_PAYMENT_min,CNT_PAYMENT_max,CNT_PAYMENT_mean,NAME_YIELD_GROUP_min,NAME_YIELD_GROUP_max,NAME_YIELD_GROUP_mean,PRODUCT_COMBINATION_min,PRODUCT_COMBINATION_max,PRODUCT_COMBINATION_mean,DAYS_FIRST_DRAWING_min,DAYS_FIRST_DRAWING_max,DAYS_FIRST_DRAWING_mean,DAYS_FIRST_DUE_min,DAYS_FIRST_DUE_max,DAYS_FIRST_DUE_mean,DAYS_LAST_DUE_1ST_VERSION_min,DAYS_LAST_DUE_1ST_VERSION_max,DAYS_LAST_DUE_1ST_VERSION_mean,DAYS_LAST_DUE_min,DAYS_LAST_DUE_max,DAYS_LAST_DUE_mean,DAYS_TERMINATION_min,DAYS_TERMINATION_max,DAYS_TERMINATION_mean,NFLAG_INSURED_ON_APPROVAL_min,NFLAG_INSURED_ON_APPROVAL_max,NFLAG_INSURED_ON_APPROVAL_mean
0,100001.0,1,1,1.0,3951.0,3951.0,3951.0,24835.5,24835.5,24835.5,23787.0,23787.0,23787.0,2520.0,2520.0,2520.0,24835.5,24835.5,24835.5,0,0,0.0,13,13,13.0,1,1,1.0,1,1,1.0,0.104326,0.104326,0.104326,,,,,,,...,3.0,0,0,0.0,4,4,4.0,23.0,23.0,23.0,2,2,2.0,8.0,8.0,8.0,1,1,1.0,13,13,13.0,,,,-1709.0,-1709.0,-1709.0,-1499.0,-1499.0,-1499.0,-1619.0,-1619.0,-1619.0,-1612.0,-1612.0,-1612.0,0.0,0.0,0.0
1,100002.0,1,1,1.0,9251.775,9251.775,9251.775,179055.0,179055.0,179055.0,179055.0,179055.0,179055.0,0.0,0.0,0.0,179055.0,179055.0,179055.0,2,2,2.0,9,9,9.0,1,1,1.0,1,1,1.0,0.0,0.0,0.0,,,,,,,...,3.0,0,0,0.0,7,7,7.0,500.0,500.0,500.0,0,0,0.0,24.0,24.0,24.0,3,3,3.0,15,15,15.0,,,,-565.0,-565.0,-565.0,125.0,125.0,125.0,-25.0,-25.0,-25.0,-17.0,-17.0,-17.0,0.0,0.0,0.0
2,100003.0,0,1,0.666667,6737.31,98356.995,56553.99,68809.5,900000.0,435436.5,68053.5,1035882.0,484191.0,0.0,6885.0,3442.5,68809.5,900000.0,435436.5,0,3,1.666667,12,17,14.666667,1,1,1.0,1,1,1.0,0.0,0.100061,0.05003,,,,,,,...,2.666667,0,2,0.666667,4,7,5.333333,-1.0,1400.0,533.0,4,10,6.333333,6.0,12.0,10.0,3,4,3.666667,7,11,9.0,,,,-2310.0,-716.0,-1274.333333,-1980.0,-386.0,-1004.333333,-1980.0,-536.0,-1054.333333,-1976.0,-527.0,-1047.333333,0.0,1.0,0.666667
3,100004.0,1,1,1.0,5357.25,5357.25,5357.25,24282.0,24282.0,24282.0,20106.0,20106.0,20106.0,4860.0,4860.0,4860.0,24282.0,24282.0,24282.0,0,0,0.0,5,5,5.0,1,1,1.0,1,1,1.0,0.212008,0.212008,0.212008,,,,,,,...,3.0,0,0,0.0,6,6,6.0,30.0,30.0,30.0,2,2,2.0,4.0,4.0,4.0,4,4,4.0,14,14,14.0,,,,-784.0,-784.0,-784.0,-694.0,-694.0,-694.0,-724.0,-724.0,-724.0,-714.0,-714.0,-714.0,0.0,0.0,0.0
4,100005.0,0,1,0.5,4813.2,4813.2,4813.2,0.0,44617.5,22308.75,0.0,40153.5,20076.75,4464.0,4464.0,4464.0,44617.5,44617.5,44617.5,0,4,2.0,10,11,10.5,1,1,1.0,1,1,1.0,0.108964,0.108964,0.108964,,,,,,,...,3.5,0,0,0.0,4,5,4.5,-1.0,37.0,18.0,2,10,6.0,12.0,12.0,12.0,0,1,0.5,2,13,7.5,,,,-706.0,-706.0,-706.0,-376.0,-376.0,-376.0,-466.0,-466.0,-466.0,-460.0,-460.0,-460.0,0.0,0.0,0.0


In [13]:
# Merge data and split this way to avoid overfitting
merged = pd.merge(left=merged, right=prev_app, how='left', left_on='SK_ID_CURR', right_on='SK_ID_CURR')
merged_test = pd.merge(left=merged_test, right=prev_app, how='left', left_on='SK_ID_CURR', right_on='SK_ID_CURR')

del prev_app
gc.collect()

0

In [14]:
# import csv files
bureau = pd.read_csv('/content/Data/bureau.csv')
bureau_balance = pd.read_csv('/content/Data/bureau_balance.csv')

In [15]:
bureau_balance = get_agg(bureau_balance, ['min', 'max', 'mean'], group_by_column='SK_ID_BUREAU')
bureau_all = pd.merge(left=bureau, right=bureau_balance, how='left', left_on='SK_ID_BUREAU', right_on='SK_ID_BUREAU')
bureau_all.drop(columns=['SK_ID_BUREAU'], inplace=True)
bureau_all = get_agg(bureau_all, ['min', 'max', 'mean'])

In [16]:
# Merge data and split this way to avoid overfitting
merged = pd.merge(left=merged, right=bureau_all, how='left', left_on='SK_ID_CURR', right_on='SK_ID_CURR')
merged_test = pd.merge(left=merged_test, right=bureau_all, how='left', left_on='SK_ID_CURR', right_on='SK_ID_CURR')

del bureau
del bureau_balance
del bureau_all

gc.collect()

0

In [17]:
# import csv files
installments_payment = pd.read_csv('/content/Data/installments_payments.csv')
installments_payment.drop(columns=['SK_ID_PREV'], inplace=True)
installments_payment.head()

Unnamed: 0,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,161674,1.0,6,-1180.0,-1187.0,6948.36,6948.36
1,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,193053,2.0,1,-63.0,-63.0,25425.0,25425.0
3,199697,1.0,3,-2418.0,-2426.0,24350.13,24350.13
4,167756,1.0,2,-1383.0,-1366.0,2165.04,2160.585


In [18]:
installments_payment = get_agg(installments_payment, ['max', 'mean', 'sum', 'var'])

In [19]:
# Merge data and split this way to avoid overfitting
merged = pd.merge(left=merged, right=installments_payment, how='left', left_on='SK_ID_CURR', right_on='SK_ID_CURR')
merged_test = pd.merge(left=merged_test, right=installments_payment, how='left', left_on='SK_ID_CURR', right_on='SK_ID_CURR')

del installments_payment
gc.collect()

0

In [20]:
pos_cash_balance = pd.read_csv('/content/Data/POS_CASH_balance.csv')
pos_cash_balance.drop(columns=['SK_ID_PREV'], inplace=True)
pos_cash_balance.head()

Unnamed: 0,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,182943,-31.0,48.0,45.0,Active,0.0,0.0
1,367990,-33.0,36.0,35.0,Active,0.0,0.0
2,397406,-32.0,12.0,9.0,Active,0.0,0.0
3,269225,-35.0,48.0,42.0,Active,0.0,0.0
4,334279,-35.0,36.0,35.0,Active,0.0,0.0


In [21]:
pos_cash_balance = get_agg(pos_cash_balance, ['max', 'mean', 'size'])

In [22]:
# Merge data and split this way to avoid overfitting
merged = pd.merge(left=merged, right=pos_cash_balance, how='left', left_on='SK_ID_CURR', right_on='SK_ID_CURR')
merged_test = pd.merge(left=merged_test, right=pos_cash_balance, how='left', left_on='SK_ID_CURR', right_on='SK_ID_CURR')

del pos_cash_balance
gc.collect()

0

In [23]:
merged.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,...,NUM_INSTALMENT_VERSION_sum,NUM_INSTALMENT_VERSION_var,NUM_INSTALMENT_NUMBER_max,NUM_INSTALMENT_NUMBER_mean,NUM_INSTALMENT_NUMBER_sum,NUM_INSTALMENT_NUMBER_var,DAYS_INSTALMENT_max,DAYS_INSTALMENT_mean,DAYS_INSTALMENT_sum,DAYS_INSTALMENT_var,DAYS_ENTRY_PAYMENT_max,DAYS_ENTRY_PAYMENT_mean,DAYS_ENTRY_PAYMENT_sum,DAYS_ENTRY_PAYMENT_var,AMT_INSTALMENT_max,AMT_INSTALMENT_mean,AMT_INSTALMENT_sum,AMT_INSTALMENT_var,AMT_PAYMENT_max,AMT_PAYMENT_mean,AMT_PAYMENT_sum,AMT_PAYMENT_var,MONTHS_BALANCE_max_y,MONTHS_BALANCE_mean_y,MONTHS_BALANCE_size,CNT_INSTALMENT_max,CNT_INSTALMENT_mean,CNT_INSTALMENT_size,CNT_INSTALMENT_FUTURE_max,CNT_INSTALMENT_FUTURE_mean,CNT_INSTALMENT_FUTURE_size,NAME_CONTRACT_STATUS_max,NAME_CONTRACT_STATUS_mean,NAME_CONTRACT_STATUS_size,SK_DPD_max_y,SK_DPD_mean_y,SK_DPD_size,SK_DPD_DEF_max_y,SK_DPD_DEF_mean_y,SK_DPD_DEF_size
0,100002,1,0,1,0,1,0,202500.0,406597.5,24700.5,351000.0,6,7,4,3,1,0.018801,-9461,-637.0,-3648.0,-2120,0.0,1,1,0,1,1,0,8,1.0,2,2,6,10,0,0,0,0,0,0,...,20.0,0.052632,19.0,10.0,190.0,31.666667,-25.0,-295.0,-5605.0,28500.0,-49.0,-315.421053,-5993.0,29604.25731,53093.745,11559.247105,219625.695,101164100.0,53093.745,11559.247105,219625.695,101164100.0,-1.0,-11.181818,11.0,24.0,24.0,11.0,24.0,16.181818,11.0,0.0,0.0,11.0,0.0,0.0,11.0,0.0,0.0,11.0
1,100003,0,0,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,1,4,1,1,1,0.003541,-16765,-1188.0,-1186.0,-291,0.0,1,1,0,1,1,0,3,2.0,1,1,1,11,0,0,0,0,0,0,...,26.0,0.04,12.0,5.08,127.0,9.826667,-536.0,-1378.16,-34454.0,573735.89,-544.0,-1385.32,-34633.0,573541.81,560835.36,64754.586,1618864.65,12219660000.0,560835.36,64754.586,1618864.65,12219660000.0,-18.0,-43.192308,26.0,12.0,10.192308,26.0,12.0,5.615385,26.0,4.0,0.307692,26.0,0.0,0.0,26.0,0.0,0.0,26.0
2,100004,0,1,1,1,1,0,67500.0,135000.0,6750.0,135000.0,6,7,4,3,1,0.010032,-19046,-225.0,-4260.0,-2531,26.0,1,1,1,1,1,0,8,1.0,2,2,1,9,0,0,0,0,0,0,...,4.0,0.333333,3.0,2.0,6.0,1.0,-724.0,-754.0,-2262.0,900.0,-727.0,-761.666667,-2285.0,1157.333333,10573.965,7096.155,21288.465,9071372.0,10573.965,7096.155,21288.465,9071372.0,-24.0,-25.5,4.0,4.0,3.75,4.0,4.0,2.25,4.0,4.0,1.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0
3,100006,0,0,0,0,1,0,135000.0,312682.5,29686.5,297000.0,6,7,4,0,1,0.008019,-19005,-3039.0,-9833.0,-2437,0.0,1,1,0,1,0,0,8,2.0,2,2,6,17,0,0,0,0,0,0,...,18.0,0.116667,10.0,4.4375,71.0,8.395833,-11.0,-252.25,-4036.0,31817.0,-12.0,-271.625,-4346.0,39222.116667,691786.89,62947.088438,1007153.415,28256810000.0,691786.89,62947.088438,1007153.415,28256810000.0,-1.0,-10.470588,17.0,48.0,12.4375,17.0,48.0,9.75,17.0,6.0,0.588235,17.0,0.0,0.0,17.0,0.0,0.0,17.0
4,100007,0,0,1,0,1,0,121500.0,513000.0,21865.5,513000.0,6,7,4,3,1,0.028663,-19932,-3038.0,-4311.0,-3458,0.0,1,1,0,1,0,0,3,1.0,2,2,4,11,0,0,0,0,1,1,...,77.0,0.141026,17.0,7.045455,465.0,16.751748,-14.0,-1028.606061,-67888.0,465297.965501,-14.0,-1032.242424,-68128.0,461933.971096,22678.785,12666.444545,835985.34,61668210.0,22678.785,12214.060227,806127.975,64771280.0,-1.0,-32.887097,62.0,24.0,15.580645,62.0,24.0,9.225806,62.0,7.0,0.306452,62.0,0.0,0.0,62.0,0.0,0.0,62.0


## Split data into training and testing sets

In [24]:
y = merged["TARGET"].values.reshape(-1, 1)
y[:5]

array([[1],
       [0],
       [0],
       [0],
       [0]])

In [25]:
X = merged.drop(columns=['SK_ID_CURR', 'TARGET'])
X.head()

Unnamed: 0,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,NAME_TYPE_SUITE,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,ORGANIZATION_TYPE,EXT_SOURCE_1,...,NUM_INSTALMENT_VERSION_sum,NUM_INSTALMENT_VERSION_var,NUM_INSTALMENT_NUMBER_max,NUM_INSTALMENT_NUMBER_mean,NUM_INSTALMENT_NUMBER_sum,NUM_INSTALMENT_NUMBER_var,DAYS_INSTALMENT_max,DAYS_INSTALMENT_mean,DAYS_INSTALMENT_sum,DAYS_INSTALMENT_var,DAYS_ENTRY_PAYMENT_max,DAYS_ENTRY_PAYMENT_mean,DAYS_ENTRY_PAYMENT_sum,DAYS_ENTRY_PAYMENT_var,AMT_INSTALMENT_max,AMT_INSTALMENT_mean,AMT_INSTALMENT_sum,AMT_INSTALMENT_var,AMT_PAYMENT_max,AMT_PAYMENT_mean,AMT_PAYMENT_sum,AMT_PAYMENT_var,MONTHS_BALANCE_max_y,MONTHS_BALANCE_mean_y,MONTHS_BALANCE_size,CNT_INSTALMENT_max,CNT_INSTALMENT_mean,CNT_INSTALMENT_size,CNT_INSTALMENT_FUTURE_max,CNT_INSTALMENT_FUTURE_mean,CNT_INSTALMENT_FUTURE_size,NAME_CONTRACT_STATUS_max,NAME_CONTRACT_STATUS_mean,NAME_CONTRACT_STATUS_size,SK_DPD_max_y,SK_DPD_mean_y,SK_DPD_size,SK_DPD_DEF_max_y,SK_DPD_DEF_mean_y,SK_DPD_DEF_size
0,0,1,0,1,0,202500.0,406597.5,24700.5,351000.0,6,7,4,3,1,0.018801,-9461,-637.0,-3648.0,-2120,0.0,1,1,0,1,1,0,8,1.0,2,2,6,10,0,0,0,0,0,0,5,0.083037,...,20.0,0.052632,19.0,10.0,190.0,31.666667,-25.0,-295.0,-5605.0,28500.0,-49.0,-315.421053,-5993.0,29604.25731,53093.745,11559.247105,219625.695,101164100.0,53093.745,11559.247105,219625.695,101164100.0,-1.0,-11.181818,11.0,24.0,24.0,11.0,24.0,16.181818,11.0,0.0,0.0,11.0,0.0,0.0,11.0,0.0,0.0,11.0
1,0,0,0,0,0,270000.0,1293502.5,35698.5,1129500.0,1,4,1,1,1,0.003541,-16765,-1188.0,-1186.0,-291,0.0,1,1,0,1,1,0,3,2.0,1,1,1,11,0,0,0,0,0,0,39,0.311267,...,26.0,0.04,12.0,5.08,127.0,9.826667,-536.0,-1378.16,-34454.0,573735.89,-544.0,-1385.32,-34633.0,573541.81,560835.36,64754.586,1618864.65,12219660000.0,560835.36,64754.586,1618864.65,12219660000.0,-18.0,-43.192308,26.0,12.0,10.192308,26.0,12.0,5.615385,26.0,4.0,0.307692,26.0,0.0,0.0,26.0,0.0,0.0,26.0
2,1,1,1,1,0,67500.0,135000.0,6750.0,135000.0,6,7,4,3,1,0.010032,-19046,-225.0,-4260.0,-2531,26.0,1,1,1,1,1,0,8,1.0,2,2,1,9,0,0,0,0,0,0,11,,...,4.0,0.333333,3.0,2.0,6.0,1.0,-724.0,-754.0,-2262.0,900.0,-727.0,-761.666667,-2285.0,1157.333333,10573.965,7096.155,21288.465,9071372.0,10573.965,7096.155,21288.465,9071372.0,-24.0,-25.5,4.0,4.0,3.75,4.0,4.0,2.25,4.0,4.0,1.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0
3,0,0,0,1,0,135000.0,312682.5,29686.5,297000.0,6,7,4,0,1,0.008019,-19005,-3039.0,-9833.0,-2437,0.0,1,1,0,1,0,0,8,2.0,2,2,6,17,0,0,0,0,0,0,5,,...,18.0,0.116667,10.0,4.4375,71.0,8.395833,-11.0,-252.25,-4036.0,31817.0,-12.0,-271.625,-4346.0,39222.116667,691786.89,62947.088438,1007153.415,28256810000.0,691786.89,62947.088438,1007153.415,28256810000.0,-1.0,-10.470588,17.0,48.0,12.4375,17.0,48.0,9.75,17.0,6.0,0.588235,17.0,0.0,0.0,17.0,0.0,0.0,17.0
4,0,1,0,1,0,121500.0,513000.0,21865.5,513000.0,6,7,4,3,1,0.028663,-19932,-3038.0,-4311.0,-3458,0.0,1,1,0,1,0,0,3,1.0,2,2,4,11,0,0,0,0,1,1,37,,...,77.0,0.141026,17.0,7.045455,465.0,16.751748,-14.0,-1028.606061,-67888.0,465297.965501,-14.0,-1032.242424,-68128.0,461933.971096,22678.785,12666.444545,835985.34,61668210.0,22678.785,12214.060227,806127.975,64771280.0,-1.0,-32.887097,62.0,24.0,15.580645,62.0,24.0,9.225806,62.0,7.0,0.306452,62.0,0.0,0.0,62.0,0.0,0.0,62.0


In [26]:
# Use this helper function to assit in dropping null values and use the mean ti fill the gap
def fill_in(data):
    for i in data.columns:
      if data[i].dtype == "object":
          data[i] = data[i].fillna("")
      elif (data[i].dtype == "int64" or data[i].dtype == "float64"):
             data[i] = data[i].fillna(data[i].mean())
      else:
            pass
    return data

In [27]:
X = fill_in(X)

In [28]:
X.isnull().sum().sum()

0

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    random_state=78)

In [31]:
y_train[:-50]

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

## Scale the data

In [32]:
scaler = StandardScaler()

In [33]:
# fit the scaler to the training data
scaler = scaler.fit(X_train)

In [34]:
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [35]:
X_train_scaled.shape

(230633, 435)

## Build RandomForest Model

In [36]:
model = RandomForestClassifier(n_estimators = 1000, random_state = 78, max_depth=3, min_samples_split=4)

In [37]:
model = model.fit(X_train_scaled, y_train )

  """Entry point for launching an IPython kernel.


In [38]:
# We need to make predictions based on the data
predictions = model.predict(X_test_scaled)

In [39]:
y_test.shape

(76878, 1)

In [40]:
# Model Evaluation
# create a confusion matrix to easily read the data evaluation
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual Approve", "Actual Deny"], columns=["Predicted Approve", "Predicted Deny"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [41]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted Approve,Predicted Deny
Actual Approve,70749,0
Actual Deny,6129,0


Accuracy Score : 0.9202762819011941
Classification Report
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     70749
           1       0.00      0.00      0.00      6129

    accuracy                           0.92     76878
   macro avg       0.46      0.50      0.48     76878
weighted avg       0.85      0.92      0.88     76878



  _warn_prf(average, modifier, msg_start, len(result))


## Do oversampling 

In [42]:
from imblearn.over_sampling import SMOTE

X_resampled, y_resampled = SMOTE(random_state =78, sampling_strategy = 1).fit_resample(X_train_scaled, y_train)

from collections import Counter

Counter(y_resampled)

  y = column_or_1d(y, warn=True)


Counter({0: 211937, 1: 211937})

In [97]:
model_2 = RandomForestClassifier(n_estimators = 1500, random_state = 78, max_depth=2, min_samples_split=4 )

In [98]:
model_2 = model_2.fit(X_resampled, y_resampled)

In [99]:
predictions_2 = model_2.predict(X_test_scaled)

In [100]:
cm = confusion_matrix(y_test, predictions_2)
cm_df = pd.DataFrame(
    cm, index=["Actual Approve", "Actual Deny"], columns=["Predicted Approve", "Predicted Deny"]
)

# Calculating the accuracy score
acc_score_2 = accuracy_score(y_test, predictions_2)

In [101]:
#important_features = model.feature_importances_
importances_sorted = sorted(zip(model_2.feature_importances_, X.columns), reverse=True)
#importances_sorted

In [102]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions_2))

Confusion Matrix


Unnamed: 0,Predicted Approve,Predicted Deny
Actual Approve,59122,11627
Actual Deny,4605,1524


Accuracy Score : 0.9202762819011941
Classification Report
              precision    recall  f1-score   support

           0       0.93      0.84      0.88     70749
           1       0.12      0.25      0.16      6129

    accuracy                           0.79     76878
   macro avg       0.52      0.54      0.52     76878
weighted avg       0.86      0.79      0.82     76878



In [108]:
# Use SelectFromModel to use the the top important feautres

In [103]:
 from sklearn.feature_selection import SelectFromModel

In [104]:
model_3 = SelectFromModel(model_2, prefit=True)

In [105]:
new_df = model_3.transform(X_resampled)
xtest_transformed = model_3.transform(X_test_scaled)

In [52]:
new_df[0]

array([-7.24017494e-01, -7.20354933e-01, -6.55316371e-01,  6.24047461e-01,
       -1.77981453e-04, -6.24815196e-01,  1.17106660e+00, -6.10366829e-02,
        1.18488236e+00, -2.81876915e-01,  5.14638145e-01,  1.41180308e-03,
       -5.49771919e-04, -7.00034850e-04, -6.57016678e-01,  8.95096743e-01,
        8.23772522e-01,  1.04924042e+00, -1.55969602e+00, -8.15217529e-02,
       -6.74300803e-02, -1.80902630e-01,  8.58791036e-01, -3.37750479e-01,
        2.69935510e-01, -1.86044208e+00,  1.12844894e+00, -4.98124920e-01,
        3.63934971e-01,  2.72362700e-01, -1.15909152e-02, -6.29807099e-02,
       -4.96544021e-02, -1.08390621e-02, -5.78952935e-02, -4.20434095e-02,
       -4.86417556e-01, -1.24958485e-02, -3.57761623e-02, -2.07669551e-02,
       -3.30710277e-02, -1.78927668e-01, -1.27712279e-01, -1.83114615e-02,
       -3.25496327e-01, -1.08326930e-01, -3.55857084e-01, -2.78951633e-01,
       -5.74134988e-02, -2.91411047e-01, -2.15502934e-01, -3.78428462e-03,
       -1.35760379e-02, -

In [53]:
y_resampled.shape

(423874,)

## Refit new model


In [119]:
rf_model = RandomForestClassifier(n_estimators = 1100, random_state = 78, max_depth=2, min_samples_split=4, min_samples_leaf=2)

In [120]:
rf_model = rf_model.fit(new_df, y_resampled)

In [121]:
predictions_3 = rf_model.predict(xtest_transformed)

In [122]:
predictions_3.shape

(76878,)

In [123]:
cm = confusion_matrix(y_test, predictions_3)
cm_df_3 = pd.DataFrame(
    cm, index=["Actual Approve", "Actual Deny"], columns=["Predicted Approve", "Predicted Deny"]
)

# Calculating the accuracy score
acc_score_3 = accuracy_score(y_test, predictions_3)

In [124]:
# Displaying results
print("Confusion Matrix")
display(cm_df_3)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions_3))

Confusion Matrix


Unnamed: 0,Predicted Approve,Predicted Deny
Actual Approve,59342,11407
Actual Deny,4265,1864


Accuracy Score : 0.9202762819011941
Classification Report
              precision    recall  f1-score   support

           0       0.93      0.84      0.88     70749
           1       0.14      0.30      0.19      6129

    accuracy                           0.80     76878
   macro avg       0.54      0.57      0.54     76878
weighted avg       0.87      0.80      0.83     76878

