In [5]:
import pandas as pd
import numpy as np
from collections import Counter

import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
sns.set_style('ticks')

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score
from sklearn import datasets, linear_model
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
import statsmodels.api as sm
from scipy import stats

import warnings
warnings.filterwarnings('ignore')

In [59]:
df_train = pd.read_csv('dataset.csv', index_col=0)

df_test = pd.read_csv('validation_set.csv', index_col=0)

In [60]:
def df_cleaner(df_train):
    df_train = df_train.astype(float)
    df_train.SEX = np.where(df_train.SEX == 2, 0, df_train.SEX)

    conditions_mar = [df_train.MARRIAGE == 3, df_train.MARRIAGE == 2]
    choices_mar = [0, 0]
    df_train.MARRIAGE = np.select(conditions_mar, choices_mar, df_train.MARRIAGE)

    conditions_edu = [df_train.EDUCATION == 5, df_train.EDUCATION == 6, df_train.EDUCATION == 0]
    choices_edu = [4, 4, 4]
    df_train.EDUCATION = np.select(conditions_edu, choices_edu, df_train.EDUCATION)
    
    df_train.rename(columns={'default payment next month': 'default', 'PAY_0': 'PAY_1'}, inplace=True)

    col_dict = {col: col.lower() for col in df_train.columns}
    df_train.rename(columns=col_dict, inplace = True) 
    
    to_dummy = ['education', 'pay_1', 'pay_2', 'pay_3', 'pay_4', 'pay_5', 'pay_6']
    df_train = pd.get_dummies(df_train, columns=to_dummy, prefix=to_dummy, drop_first=True, dtype=float)

    df_train.columns = [col.replace('.0', '').replace('-', 'minus_') for col in df_train.columns]
    
    return df_train

In [61]:
df_train = df_cleaner(df_train)
df_test = df_cleaner(df_test)

In [62]:
train_drop = list(np.setdiff1d(list(df_train.columns),list(df_test.columns)))
# yields the elements in `train columns` that are NOT in `test columns`
train_drop.remove('default')
train_drop

['pay_5_8', 'pay_6_8']

In [63]:
df_train = df_train.drop(columns=train_drop, axis=1)
df_train.head()

Unnamed: 0,limit_bal,sex,marriage,age,bill_amt1,bill_amt2,bill_amt3,bill_amt4,bill_amt5,bill_amt6,...,pay_5_6,pay_5_7,pay_6_minus_1,pay_6_0,pay_6_2,pay_6_3,pay_6_4,pay_6_5,pay_6_6,pay_6_7
0,200000.0,0.0,1.0,30.0,140327.0,143137.0,145374.0,147273.0,149244.0,151973.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,200000.0,0.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,20000.0,0.0,0.0,28.0,2937.0,5696.0,4144.0,8168.0,6894.0,11424.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,50000.0,1.0,0.0,23.0,51274.0,50474.0,49724.0,48437.0,18712.0,19129.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,20000.0,1.0,1.0,47.0,390.0,780.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## FEATURE CREATION/SELECTION

In [64]:
def df_features(df_train):
    df_train['avg_bill'] = (df_train['bill_amt1'] + df_train['bill_amt2'] + df_train['bill_amt3'] + df_train['bill_amt4'] + df_train['bill_amt5'] + df_train['bill_amt6']).apply(lambda x: x/6)
    df_train['avg_pay'] = (df_train['pay_amt1'] + df_train['pay_amt2'] + df_train['pay_amt3'] + df_train['pay_amt4'] + df_train['pay_amt5'] + df_train['pay_amt6']).apply(lambda x: x/6)
    df_train['bill_pct'] = df_train['avg_bill'] / df_train['limit_bal']

    to_drop = ['bill_amt1', 'bill_amt2', 'bill_amt3', 'bill_amt4', 'bill_amt5', 'bill_amt6']
    df_train.drop(columns=to_drop, inplace=True)

    corr_drop = ['pay_1_8', 'pay_4_8', 'pay_5_7', 'pay_2_7', 'pay_1_7', 'pay_2_0', 'pay_6_7']
    df_train.drop(columns=corr_drop, inplace=True)
    
    return df_train

In [65]:
df_train = df_features(df_train)
df_test = df_features(df_test)

X_train = df_train.drop(columns=['default'], axis=1)
y_train = df_train['default']

X_test = df_test.copy()

In [66]:
def find_extremes(df):
    '''Takes in a dataframe and returns a list of columns with values farther than 4 standard deviations from the mean.'''
    extreme_list = []
    for column in list(df.columns):
        if (df[column].max() > (df[column].mean() + 4*df[column].std())) or (df[column].min() < (df[column].mean() - 4*df[column].std())):
            extreme_list.append(column)
    return extreme_list

def rein_extremes(df, columns):
    '''Takes in a dataframe and a list of columns and changes any values farther than 4 standard deviations from the mean
    to 4 standard deviations from the mean.
    Overwrites the input column!'''
    for column in columns:
        mean = df[column].mean()
        std = df[column].std()
        conditions = [df[column] > mean + 4*std,
                      df[column] < mean - 4*std]
        choices = [mean + 4*std,
                   mean - 4*std]
        df[column] = np.select(conditions, choices, df[column])

In [67]:
['limit_bal',
 'age',
 'pay_amt1',
 'pay_amt2',
 'pay_amt3',
 'pay_amt4',
 'pay_amt5',
 'pay_amt6',
 'avg_bill',
 'avg_pay',
 'bill_pct']

['limit_bal',
 'age',
 'pay_amt1',
 'pay_amt2',
 'pay_amt3',
 'pay_amt4',
 'pay_amt5',
 'pay_amt6',
 'avg_bill',
 'avg_pay',
 'bill_pct']

In [68]:
extreme_cols = find_extremes(X_train)
extremes = extreme_cols[:8]
extremes.extend(extreme_cols[-3:])
extremes

['limit_bal',
 'age',
 'pay_amt1',
 'pay_amt2',
 'pay_amt3',
 'pay_amt4',
 'pay_amt5',
 'pay_amt6',
 'avg_bill',
 'avg_pay',
 'bill_pct']

In [69]:
rein_extremes(X_train, extremes)
rein_extremes(X_test, extremes)

In [74]:
X_sm = sm.add_constant(X_train)
est = sm.OLS(y, X_sm)
est2 = est.fit()

p = est2.pvalues
high_p = p[p > .05].keys()

X_train = X_train.drop(columns=high_p, axis=1)
X_test = X_test.drop(columns=high_p, axis=1)


In [128]:
est2.summary()

0,1,2,3
Dep. Variable:,default,R-squared:,0.208
Model:,OLS,Adj. R-squared:,0.206
Method:,Least Squares,F-statistic:,92.26
Date:,"Fri, 22 May 2020",Prob (F-statistic):,0.0
Time:,13:40:21,Log-Likelihood:,-9512.4
No. Observations:,22500,AIC:,19150.0
Df Residuals:,22435,BIC:,19680.0
Df Model:,64,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.1567,0.015,10.484,0.000,0.127,0.186
limit_bal,-1.617e-07,3e-08,-5.398,0.000,-2.2e-07,-1.03e-07
sex,0.0170,0.005,3.329,0.001,0.007,0.027
marriage,0.0169,0.006,2.990,0.003,0.006,0.028
age,0.0008,0.000,2.640,0.008,0.000,0.001
pay_amt1,-1.865e-06,3.5e-07,-5.331,0.000,-2.55e-06,-1.18e-06
pay_amt2,-1.614e-06,3.13e-07,-5.158,0.000,-2.23e-06,-1e-06
pay_amt3,-1.01e-06,3.4e-07,-2.972,0.003,-1.68e-06,-3.44e-07
pay_amt4,-5.219e-07,3.67e-07,-1.423,0.155,-1.24e-06,1.97e-07

0,1,2,3
Omnibus:,3728.195,Durbin-Watson:,1.998
Prob(Omnibus):,0.0,Jarque-Bera (JB):,5865.191
Skew:,1.187,Prob(JB):,0.0
Kurtosis:,3.789,Cond. No.,53000000.0


In [75]:
print(X_train.shape)
print(X_test.shape)

(22500, 32)
(7500, 33)


In [78]:
test_drop = list(np.setdiff1d(list(X_test.columns),list(X_train.columns)))
# yields the elements in `test columns` that are NOT in `train columns`
test_drop

['pay_2_8']

In [79]:
X_test = X_test.drop(columns=test_drop, axis=1)
X_test.shape

(7500, 32)

## SCALE

In [80]:
scaler = StandardScaler()  
scaler.fit(X_train)

X_train = scaler.transform(X_train)  
X_test = scaler.transform(X_test)

### Model

In [116]:
params = { 
    'penalty': ['l1', 'l2', 'elasticnet'],
    'C': [round(0.00125+(.00005*i), 5) for i in range(10)],
    'solver': ['sag', 'saga', 'liblinear', 'lbfgs']
}

#create a grid search object and fit it to the data
CV_weighted = GridSearchCV(LogisticRegression(class_weight='balanced', random_state=42),
                      params, n_jobs=-1, scoring='f1', verbose=2)
CV_weighted.fit(X_train, y_train)

Fitting 5 folds for each of 120 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done 124 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    7.9s finished


GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight='balanced',
                                          dual=False, fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=42, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.00125, 0.0013, 0.00135, 0.0014, 0.00145,
                               0.0015, 0.00155, 0.0016, 0.00165, 0.0017],
                         'penalty': ['l1', 'l2', 'elasticnet'],
                         'solver': ['sag', 'saga', 'liblinear', 'lbfgs']},
             pre_dispatch='2*n_jobs', refit=True, return_tra

In [117]:
### Identify the best params 
print(CV_weighted.best_estimator_)

#Identify the best score during fitting with cross-validation
print(CV_weighted.best_score_)

LogisticRegression(C=0.0014, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)
0.531238041870105


In [118]:
y_preds = CV_weighted.best_estimator_.predict(X_test)
y_preds

array([0., 0., 0., ..., 0., 0., 0.])

In [132]:
pd.DataFrame(y_preds, columns=['default']).to_csv('predictions_CF_JS.csv')

In [133]:
predictions = pd.read_csv('predictions_CF_JS.csv', index_col=0)

In [135]:
predictions['default'].value_counts(normalize=True)

0.0    0.738133
1.0    0.261867
Name: default, dtype: float64