In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE



In [2]:
# Read the training table and the test-feature table from CSV files
# that are expected to sit in the current working directory.
train_csv, test_csv = 'train_data.csv', 'test_features.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [3]:
def clean_credit_frame(frame, drop_columns=('ID',)):
    """Return a recoded copy of ``frame`` with bookkeeping columns removed.

    The same recoding is applied to both the training and the test table so
    the two cannot drift apart:

    * ``EDUCATION`` codes 6 and 0 are folded into code 5.
    * ``MARRIAGE`` code 0 is folded into code 3.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing the ``EDUCATION`` and ``MARRIAGE`` columns.
    drop_columns : iterable of str
        Column labels to remove after recoding.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``frame`` itself is not modified.

    Raises
    ------
    KeyError
        If any label in ``drop_columns`` is not a column of ``frame``.
    """
    recoded = frame.replace({
        'EDUCATION': {6: 5, 0: 5},
        'MARRIAGE': {0: 3},
    })
    return recoded.drop(columns=list(drop_columns))


# NOTE: `train` / `test` are rebound here, so re-running this cell without
# re-loading the CSVs raises KeyError (the dropped columns are already gone).
train = clean_credit_frame(train)

# The test table additionally carries an 'Unnamed: 0' column that must go.
test = clean_credit_frame(test, drop_columns=('ID', 'Unnamed: 0'))

In [4]:
# Separate the label column from the remaining feature columns.
label_column = 'default payment next month'
y_train = train[label_column]
X_train = train.drop(columns=[label_column])

In [5]:
# Columns that are one-hot encoded further down instead of being used as
# plain numbers. Note that the PAY_* group has no 'PAY_1' entry.
profile_columns = ['SEX', 'EDUCATION', 'MARRIAGE']
pay_columns = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
categorical_feature_names = profile_columns + pay_columns

In [6]:
# Fit the encoder on the training rows only. handle_unknown="ignore" means a
# category that was not seen during fit is encoded as all zeros at transform
# time instead of raising.
ohe = OneHotEncoder(categories="auto", handle_unknown="ignore")
ohe.fit(X_train[categorical_feature_names])
feature_array_train = ohe.transform(X_train[categorical_feature_names]).toarray()

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` when it exists and fall back to the old
# method on older releases so the cell runs on either.
ohe_column_names = getattr(ohe, "get_feature_names_out", None) or ohe.get_feature_names
ohe_df_train = pd.DataFrame(
    feature_array_train,
    columns=ohe_column_names(categorical_feature_names),
    index=X_train.index,
)

# Swap the raw categorical columns for their one-hot encoded counterparts.
X_train = X_train.drop(categorical_feature_names, axis=1)
X_train = pd.concat([X_train, ohe_df_train], axis=1)

In [7]:
# Encode the test rows with the encoder that was fitted on the training data
# (no re-fitting here, so both tables get the same one-hot columns).
feature_array = ohe.transform(test[categorical_feature_names]).toarray()

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out` when it exists and fall back to the old
# method on older releases so the cell runs on either.
ohe_test_column_names = getattr(ohe, "get_feature_names_out", None) or ohe.get_feature_names
ohe_df = pd.DataFrame(
    feature_array,
    columns=ohe_test_column_names(categorical_feature_names),
    index=test.index,
)

# drop the old feature from X and concat the new one-hot encoded df
X_test = test.drop(categorical_feature_names, axis=1)
X_test = pd.concat([X_test, ohe_df], axis=1)

# Put the columns in exactly the order the model is trained on, so a
# different column order in the test CSV cannot silently feed features into
# the wrong positions. Raises KeyError if a training column is missing.
X_test = X_test[X_train.columns]

In [8]:
# Entropy-split random forest with a fixed seed for reproducible fits.
# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' was
# deprecated in scikit-learn 1.1 and removed in 1.3, while 'sqrt' is accepted
# by old and new releases alike, so the fitted model is unchanged.
rf = RandomForestClassifier(
    class_weight=None,
    criterion='entropy',
    max_depth=10,
    max_features='sqrt',
    n_estimators=100,
    random_state=42,
)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [9]:
# Predict a class label for every row of the encoded test features.
y_preds = rf.predict(X=X_test)

In [10]:
# Persist the predictions as a single-column CSV; the row index is omitted.
predictions_frame = pd.DataFrame(y_preds)
predictions_frame.to_csv('credit_default_preds_hmdcus.csv', index=False)

In [11]:
# The answers file is read without a header row, so its columns are labelled
# 0, 1, ...; the column labelled 1 is used as the true labels.
answers = pd.read_csv('chisea_bakeoff_classification_answers.csv', header=None)
y_test = answers[1]

In [12]:
# classification_report returns a multi-line string, so print it to keep the
# table layout instead of showing the escaped repr.
report_text = classification_report(y_test, y_preds)
print(report_text)

              precision    recall  f1-score   support

           0       0.84      0.96      0.89      4673
           1       0.69      0.36      0.47      1328

    accuracy                           0.82      6001
   macro avg       0.77      0.66      0.68      6001
weighted avg       0.81      0.82      0.80      6001

