In [4]:
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, recall_score

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

In [5]:
df = pd.read_excel('cc_default.xlsx').drop(['Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25'], axis=1).set_index('cust')
df

Unnamed: 0_level_0,ncard,outst,limit,balance,tusage,tcash,tretail,unpaid,branch,payrat,...,usage3,payrat3,util6,usage6,payrat6,balpcard,unpaidplmt,tuseplmt,length,default
cust,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,7691430,7000000,6463709,0,0,0,7398073,5,0.00,...,0.070000,13.04,0.930000,0.080000,26.66,2.466024e+06,1.060000,0.000000,1.000000,1
2,2,5309091,5000000,5309314,0,0,0,5310555,1,9.17,...,0.020000,7.43,0.960000,0.270000,14.26,2.655278e+06,1.060000,0.000000,2.830000,1
3,2,22533915,20000000,22419126,0,0,0,22442690,1,10.51,...,0.100000,7.49,1.130000,0.110000,12.20,1.122134e+07,1.120000,0.000000,0.920000,1
4,2,55083,8000000,23120,0,0,0,0,6,0.00,...,0.000000,0.00,0.020000,0.000000,0.00,0.000000e+00,0.000000,0.000000,0.750000,1
5,3,5747546,6000000,5754698,1300000,1300000,0,5757744,9,42.09,...,0.270000,75.02,0.550000,0.010000,99.94,1.919248e+06,0.960000,0.220000,11.830000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17703,2,41197,30000000,46050,0,0,0,0,1,49.13,...,0.000000,29.21,0.001052,0.000317,71.39,0.000000e+00,0.000000,0.000000,11.083333,0
17704,4,19886398,66000000,12028808,2250000,0,2250000,0,9,100.00,...,0.117773,100.34,0.184462,0.119284,100.15,0.000000e+00,0.000000,0.034091,9.166667,0
17705,2,72918,5000000,89245,0,0,0,0,5,0.00,...,0.005655,25.00,0.013833,0.006800,0.00,0.000000e+00,0.000000,0.000000,3.333333,0
17706,2,32174762,29000000,10059326,0,0,0,31624615,1,0.00,...,0.244776,50.03,1.149329,0.334362,22.26,1.581231e+07,1.090504,0.000000,12.583333,0


In [6]:
df.default.value_counts()

default
0    16143
1     1564
Name: count, dtype: int64

In [7]:
df.columns

Index(['ncard', 'outst', 'limit', 'balance', 'tusage', 'tcash', 'tretail',
       'unpaid', 'branch', 'payrat', 'percol', 'util3', 'usage3', 'payrat3',
       'util6', 'usage6', 'payrat6', 'balpcard', 'unpaidplmt', 'tuseplmt',
       'length', 'default'],
      dtype='object')

In [8]:
X, y = df.drop('default', axis=1), df['default']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[3212,   25],
       [ 289,   16]])

In [10]:
recall_score(y_test, y_pred)

0.05245901639344262

# Oversampling with the SMOTE

In [11]:
sm = SMOTE(random_state=42, k_neighbors=5)
X_res, y_res = sm.fit_resample(X, y)
print(y_res.value_counts())
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

default
1    16143
0    16143
Name: count, dtype: int64


In [13]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[2966,  289],
       [ 172, 3031]])

In [14]:
recall_score(y_test, y_pred)

0.9463003434280363

In [15]:
0.9463003434280363 / 0.05245901639344262

18.03885029659694

That's an 18x on the recall score for Oversampling using SMOTE.

# Undersamping with NearMiss algorithm

In [16]:
nm = NearMiss()
X_res, y_res = nm.fit_resample(X, y)
print(y_res.value_counts())
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

default
0    1564
1    1564
Name: count, dtype: int64


In [17]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

confusion_matrix(y_test, y_pred)

array([[288,  33],
       [ 65, 240]])

In [18]:
recall_score(y_test, y_pred)

0.7868852459016393

In [19]:
0.7868852459016393 / 0.05245901639344262

15.0

And a 15x on the recall score for Undersampling using NearMiss.