In [97]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [98]:
df = pd.read_csv('../Data/cleaned_cc_data.csv')
df.head()

cc_data_scaled = StandardScaler().fit_transform(df[["LIMIT_BAL", "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6", "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6", "PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6", "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6"]])
# Use original column names for the scaled data and concatenate with the columns left out from scaling
cc_data_scaled = pd.DataFrame(cc_data_scaled, columns=["LIMIT_BAL", "PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6", "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6", "PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6", "BILL_AMT1", "BILL_AMT2", "BILL_AMT3", "BILL_AMT4", "BILL_AMT5", "BILL_AMT6"])
cc_data_scaled = pd.concat([cc_data_scaled, df[["LIMIT_BAL","SEX","EDUCATION","MARRIAGE","AGE","default.payment.next.month"]]], join='inner',  axis=1)
cc_data_scaled.head()



Unnamed: 0,LIMIT_BAL,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,...,BILL_AMT3.1,BILL_AMT4,BILL_AMT5,BILL_AMT6,LIMIT_BAL.1,SEX,EDUCATION,MARRIAGE,AGE,default.payment.next.month
0,-1.135512,1.791873,1.776654,-0.697265,-0.666863,-1.527923,-1.486255,-0.641203,-0.646339,-0.667135,...,-0.667135,-0.67174,-0.662877,-0.652876,20000.0,2,2,1,24,1
1,-0.365937,-0.876018,1.776654,0.136226,0.186234,0.232334,1.985406,-0.657981,-0.665755,-0.638302,...,-0.638302,-0.620771,-0.605957,-0.598087,120000.0,2,2,2,26,1
2,-0.59681,0.013279,0.109462,0.136226,0.186234,0.232334,0.249576,-0.296016,-0.492299,-0.480945,...,-0.480945,-0.448499,-0.41661,-0.391631,90000.0,2,2,2,34,0
3,-0.90464,0.013279,0.109462,0.136226,0.186234,0.232334,0.249576,-0.054075,-0.009999,0.035991,...,0.035991,-0.23068,-0.18578,-0.156445,50000.0,2,2,1,37,0
4,-0.90464,-0.876018,0.109462,-0.697265,0.186234,0.232334,0.249576,-0.577089,-0.610131,-0.158677,...,-0.158677,-0.345548,-0.347449,-0.331449,50000.0,1,2,1,57,0


In [99]:
cc_data_scaled.value_counts('default.payment.next.month')

default.payment.next.month
0    22996
1     6605
Name: count, dtype: int64

In [100]:
# Split the features and target using default.payment.next.month as the target
X = df.drop('default.payment.next.month', axis=1)
y = df['default.payment.next.month']
target_names = ['No Default', 'Default']


In [101]:
# Prepare the data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)


In [102]:
# Import an Extremely Random Trees classifier
from sklearn.ensemble import ExtraTreesClassifier

In [103]:
# Train the ExtraTreesClassifier model
clf = ExtraTreesClassifier(n_estimators=100, random_state=1).fit(X_train, y_train)

# Evaluate the model
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')

Training Score: 0.9994144144144144
Testing Score: 0.8155654641264694


In [104]:
# Import Gradient Boosting classifier .
from sklearn.ensemble import GradientBoostingClassifier

In [105]:
# Train the Gradient Boosting classifier
clf = GradientBoostingClassifier(random_state=1).fit(X_train, y_train)

# Evaluate the model
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')

Training Score: 0.8238288288288288
Testing Score: 0.8262396973381976


In [106]:
# Import an Adaptive Boosting classifier
from sklearn.ensemble import AdaBoostClassifier

In [107]:
# Train the AdaBoostClassifier
clf = AdaBoostClassifier(random_state=1).fit(X_train, y_train)

# Evaluate the model
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')

Training Score: 0.8144144144144144
Testing Score: 0.8200243210376976


In [108]:
# Import SMOTE from imblearn
from imblearn.over_sampling import SMOTE

# Instantiate SMOTE
# Set sampling strategy to auto to balance the dataset
smote = SMOTE(sampling_strategy='auto', random_state=1)


In [109]:
# Fit the original training data to the SMOTE model
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [110]:
# Cout the resampled target data
y_resampled.value_counts()

default.payment.next.month
0    17240
1    17240
Name: count, dtype: int64

In [111]:
# Instantiate a extra trees classifier
smote_extra_trees = ExtraTreesClassifier()

smote_extra_trees.fit(X_resampled, y_resampled)


In [112]:
# Predict the labels for resampled test data
smote_predictions = smote_extra_trees.predict(X_test)

In [113]:
# Print classifcation report
# Evaluate the model
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')

Training Score: 0.8144144144144144
Testing Score: 0.8200243210376976
