In [2]:
import pandas as pd
import numpy as np
import os
import statsmodels.api as sm

%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.datasets.samples_generator import make_blobs
from sklearn.cluster import KMeans
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LassoCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
import itertools
import category_encoders as ce
import datetime

  from pandas.core import datetools
  from numpy.core.umath_tests import inner1d


In [4]:
os.chdir(r"C:\Users\kredfield\Documents\Berkeley\W207\FinalProject\Inputs")

# Importing Data

First, we import the relevant datasets. These datasets were downloaded as a zip from https://www.kaggle.com/c/9120/download-all. You can download the data and assign your os.chdir to the download location to run this kernel.

In [119]:
bureau = pd.read_csv("./bureau.csv")
bureau_balance = pd.read_csv("./bureau_balance.csv")

In [7]:
start = datetime.datetime.now()
print("began at {}".format(datetime.datetime.now()))

train_raw = pd.read_csv("./application_train.csv")
dev_data = pd.read_csv("./application_test.csv")
credit_card_balance = pd.read_csv("./credit_card_balance.csv")
cash_balance = pd.read_csv("./POS_CASH_balance.csv")

print("complete at {}".format(datetime.datetime.now()))
print("total runtime: {}".format(datetime.datetime.now() - start))

began at 2018-08-21 08:19:14.581880
complete at 2018-08-21 08:19:47.722676
total runtime: 0:00:33.140796


In [8]:
train_labels = train_raw["TARGET"]
train = train_raw.drop(labels="TARGET", axis=1)

train_data, test_data, train_labels, test_labels = train_test_split(train,train_labels)
for df in [train_data, test_data, train_labels, test_labels]:
    print(df.shape)

(230633, 121)
(76878, 121)
(230633,)
(76878,)


In [122]:
start = datetime.datetime.now()
print("began at {}".format(datetime.datetime.now()))

#create mini sets just for debugging
mini_train_data   = train_data[:10000]
mini_train_labels = train_labels[:10000]
mini_test_data    = test_data[:10000]
mini_test_labels  = test_labels[:10000]


#make mini versions of bureau and bureau balance for debugging
mini_bureau         = bureau[:100000]
mini_bureau_balance = bureau_balance[:100000]
mini_cash           = cash_balance[:100000]
mini_credit         = credit_card_balance[:100000]

print("complete at {}".format(datetime.datetime.now()))
print("total runtime: {}".format(datetime.datetime.now() - start))

began at 2018-08-20 16:12:09.337189
complete at 2018-08-20 16:12:09.581857
total runtime: 0:00:00.244668


# Data Cleaning Functions

The data requires a few main cleaning functions: 
    1. categorizing string variables
    2. replacing infinitea and na variables
    3. scaling the data to 0 mean and unit variance

In [123]:
def handle_missing_inf(df,only_num = False):
    if only_num in [True,False]:
        print("Entered original loop for replace_na")
        mini_df = df[:10]
        g = mini_df.columns.to_series().groupby(mini_df.dtypes).groups
        type_dict = {k.name: v for k, v in g.items()}
        num_cols = []
        for x in mini_df.columns:        
            try:
                if x in type_dict['float64'] or x in type_dict['int64']:
                    num_cols.append(x)
            except:
                pass
        
        for x in num_cols:
            df[x] = np.where(np.isinf(df[x]),np.nan,df[x])
            df[x] = df[x].fillna(df[x].mean())
            
        if only_num is True:
            print("removing all non-numeric columns")
            return df[num_cols]
        
        else:
            return df
    else:
        print('only_num parameter may only be True or False')
        raise

#deprecated
def replace_inf(df):
    if isinstance(df, pd.DataFrame):
        for clm in df.columns:
            if not df[clm].dtype == "O":
                df[clm] = np.where(np.isinf(df[clm]),np.nan,df[clm])
    return df

def merge_bureau(df):
    df = pd.merge(df,bureau,on="SK_ID_CURR", how='left', indicator=True)
    df = df[df["_merge"] != "right_only"]
    df["credit_history"] = np.where(df._merge == "left_only",0,1)
    del df["_merge"]
    
    return df

def categorize_string_vars(df):
    str_cols = []
    mini_df = df[:10]
#     for clm in mini_df.columns:
#             if mini_df[clm].dtype == 'O':
#                 if mini_df[clm].nunique() < 60:
#                     str_cols.append(clm)
    encoder = ce.OneHotEncoder()
    df = encoder.fit_transform(df)
    
    return df

scaler = StandardScaler()

# Define Confusion Matrix Plotting Function

We'll need to plot a confusion matrix for later classification functions. We'll define it here.

In [124]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

# Feature Engineering

Now, we're going to create some additional features out of the observation's credit histories. Importantly, these features map well to the features used in constructing the FICO credit score. First, we look at their history in repaying their loans.

In [125]:
start = datetime.datetime.now()
print("began at {}".format(datetime.datetime.now()))

mini_bureau_balance = bureau_balance[:10000]
mini_bureau = bureau[:10000]


#replace string vars with negative numbers, we're just trying to pull out the dpd loans
bureau_balance["STATUS_encode"] = bureau_balance.STATUS.replace("C",-1)
bureau_balance["STATUS_encode"] = bureau_balance.STATUS_encode.replace("X",0)

#then cast that new column as numeric
bureau_balance["STATUS_encode"] = bureau_balance["STATUS_encode"].astype('int64')

#conver months_balance, which is negative, to months_ago, which makes more intuitive sense
bureau_balance["months_ago"] = bureau_balance["MONTHS_BALANCE"] *-1

#perform operations by group since the data is long
grp = bureau_balance.groupby("SK_ID_BUREAU")

#get the max number of months of credit for this loan
max_credit_months = grp.apply(lambda x: np.amax(x["months_ago"]))
max_credit_months.name = "max_credit_months"

#get the max value for dpd, which maps to how late the person ever was. 5 is the worst, 0 is the best
max_dpd = grp.apply(lambda x: np.amax(x["STATUS_encode"]))
max_dpd.name = "max_dpd"

#get the number of times the person was ever dpd
ever_dpd_count = grp.apply(lambda x: x[x["STATUS_encode"]>0]["SK_ID_BUREAU"].count())
ever_dpd_count.name = "ever_dpd_count"

#and then merge each one back onto the original file

for feature in [max_credit_months, max_dpd, ever_dpd_count]:
    bureau_balance = pd.merge(mini_bureau_balance,pd.DataFrame(feature).reset_index(),on="SK_ID_BUREAU")
    bureau_balance.rename(columns={0:feature.name}, inplace=True)
    
#clean for np.nan
sk = bureau_balance["SK_ID_BUREAU"]
bureau_balance = pd.get_dummies(bureau_balance,prefix="cat",dummy_na=True)
bureau_balance = handle_missing_inf(bureau_balance, True)
bureau_balance["SK_ID_BUREAU"] = sk

bureau_balance.drop_duplicates("SK_ID_BUREAU",inplace=True)
bureau_balance = bureau_balance.loc[:,["SK_ID_BUREAU","max_dpd","max_credit_months"]]
#and then put the cleaned file onto bureau iteself
# bureau = pd.merge(bureau,bureau_balance,on="SK_ID_BUREAU")


print("complete at {}".format(datetime.datetime.now()))
print("total runtime: {}".format(datetime.datetime.now() - start))

mini_bureau_balance

began at 2018-08-20 16:12:09.712192
Entered original loop for replace_na
removing all non-numeric columns
complete at 2018-08-20 16:24:08.866850
total runtime: 0:11:59.154658


Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C
3,5715448,-3,C
4,5715448,-4,C
5,5715448,-5,C
6,5715448,-6,C
7,5715448,-7,C
8,5715448,-8,C
9,5715448,-9,0


# Feature Engineering (continued)

Now, we'll do some additional engineering on the high level information from each loan the people have had before.

In [126]:
start = datetime.datetime.now()
print("began at {}".format(datetime.datetime.now()))

grp = bureau.groupby("SK_ID_CURR")

#get the max number of months of credit for this loan
max_credit_days = grp.apply(lambda x: np.amin(x["DAYS_CREDIT"]))
max_credit_days.name = "max_credit_days"

#get the max value for dpd, which maps to how late the person ever was. 5 is the worst, 0 is the best
future_end_credit_day = grp.apply(lambda x: np.amax(x["DAYS_CREDIT_ENDDATE"]))
future_end_credit_day.name = "future_end_credit_day"

#are you dpd on anything at the time of application?
curr_dpd = grp.apply(lambda x: np.amax(x["CREDIT_DAY_OVERDUE"]))
curr_dpd.name = "curr_dpd"

#whats your total current credit obligation
curr_obligation = grp.apply(lambda x: np.sum(x["AMT_CREDIT_SUM_DEBT"]))
curr_obligation.name = "curr_obligation"

#what's your current total credit limit
curr_limit = grp.apply(lambda x: np.sum(x["AMT_CREDIT_SUM"]))
curr_limit.name = "curr_limit"

#what's your current total utilization ratio
curr_util_ratio = curr_obligation/curr_limit
curr_util_ratio.name = "curr_util_ratio"

#how many loans have you ever had
num_total_loans = grp.apply(lambda x: x["SK_ID_CURR"].nunique())
num_total_loans.name = "num_total_loans"

#how many of those are active
active_loans = grp.apply(lambda x: x[x["CREDIT_ACTIVE"]=='Active']["SK_ID_CURR"].nunique())
active_loans.name = "active_loans"

#how much has the person ever prolonged credit
tot_prolonged = grp.apply(lambda x: np.sum(x["CNT_CREDIT_PROLONG"]))
tot_prolonged.name = "tot_prolonged"

#put these features in the dataset
for feature in [max_credit_days, future_end_credit_day, curr_dpd, curr_obligation, curr_limit, curr_util_ratio, num_total_loans, active_loans ,tot_prolonged]:
    bureau = pd.merge(bureau,pd.DataFrame(feature).reset_index(),on="SK_ID_CURR")
    bureau.rename(columns={0:feature.name}, inplace=True)

sk = bureau["SK_ID_CURR"]

bureau = pd.get_dummies(bureau,prefix="cat",dummy_na=True)
bureau = handle_missing_inf(bureau, True)
bureau["SK_ID_CURR"] = sk

bureau.drop_duplicates("SK_ID_CURR",inplace=True)

print("complete at {}".format(datetime.datetime.now()))
print("total runtime: {}".format(datetime.datetime.now() - start))


began at 2018-08-20 16:24:08.954093
Entered original loop for replace_na
removing all non-numeric columns
complete at 2018-08-20 16:32:46.886252
total runtime: 0:08:37.932159


# Clean all datasets

Now, we'll apply those cleaning functions that we developed above. It will first categorize all string variables in the dataset. Then, it will turn replace all the numerical columns with a unit adjusted column (with np.nans replaced with the mean of the column).

In [127]:
start = datetime.datetime.now()
print("began at {}".format(datetime.datetime.now()))

#clean categorical variables and replace na values

train_data = merge_bureau(train_data)
train_data = categorize_string_var(train_data)
train_data = handle_missing_inf(train_data,True)
train_cols = train_data.columns
scaler.fit(train_data)
train_data = scaler.transform(train_data)
train_data = pd.DataFrame(train_data, columns=train_cols)
train_data.to_csv("./train_data.csv")

test_data = merge_bureau(test_data)
test_data = categorize_string_var(test_data)
test_data = handle_missing_inf(test_data,True)
test_cols = test_data.columns
test_data = scaler.transform(test_data)
test_data = pd.DataFrame(test_data, columns=train_cols)
test_data.to_csv("./test_data.csv")

dev_data = merge_bureau(dev_data)
dev_data = categorize_string_var(dev_data)
dev_data = handle_missing_inf(dev_data,True)
dev_cols = dev_data.columns
dev_data = scaler.transform(dev_data)
dev_data = pd.DataFrame(dev_data, columns=train_cols)
dev_data.to_csv("./dev_data.csv")

print("complete at {}".format(datetime.datetime.now()))
print("total runtime: {}".format(datetime.datetime.now() - start))

began at 2018-08-20 16:32:46.906307
Entered original loop for replace_na
removing all non-numeric columns
Entered original loop for replace_na
removing all non-numeric columns
Entered original loop for replace_na
removing all non-numeric columns
complete at 2018-08-20 16:34:48.514637
total runtime: 0:02:01.609334


# Prep for Nueral Net

Now, we'll do final prep for the dataset to be fed to the nueral net, which means putting the bureau data on and stripping off the identifier, which is decidedly not a feature in the final dataset.

In [44]:
start = datetime.datetime.now()
print("began at {}".format(datetime.datetime.now()))

train_data = pd.read_csv("./train_data.csv")
test_data = pd.read_csv("./test_data.csv")
dev_data = pd.read_csv("./dev_data.csv")

# train_data_nn = merge_bureau(train_data)
train_data.drop(labels="SK_ID_CURR", axis=1, inplace=True)

# test_data_nn = merge_bureau(test_data)
test_data.drop(labels="SK_ID_CURR", axis=1, inplace=True)

# dev_data_nn = merge_bureau(dev_data)
dev_data.drop(labels="SK_ID_CURR", axis=1, inplace=True)

print("complete at {}".format(datetime.datetime.now()))
print("total runtime: {}".format(datetime.datetime.now() - start))

began at 2018-08-21 14:32:11.780157
complete at 2018-08-21 14:32:31.295685
total runtime: 0:00:19.516030


In [22]:
# We use the base estimator LassoCV since the L1 norm promotes sparsity of features.
lasso = LassoCV(max_iter = 10000, alphas = [x*.1 for x in range(1,100)], random_state = 42, n_jobs = -1)
selector = RFE(lasso,15)

selector.fit(train_data,train_labels)
train_to_model = train_data.iloc[:,selector.support_]
test_to_model = test_data.iloc[:,selector.support_]
dev_to_model = dev_data.iloc[:,selector.support_]


# Train Random Forest

First, we're going to train a random forest ensemble classifier as a baseline result.

In [131]:
#train and fit random forest regression
rf = RandomForestRegressor(n_estimators = 50, max_leaf_nodes = 10, n_jobs = -1)
rf.fit(train_to_model, train_labels)
test_predict = rf.predict(test_to_model)


NameError: name 're' is not defined

In [None]:
#predict the results for test_labels
test_predict = np.where(test_predict > .5, 1, 0)
cfmx = confusion_matrix(test_labels, test_predict)
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(cfmx,['no-default','default'])

print("F1 Score is: {}".format(f1_score(test_labels, test_predict)))

# Train GBM

Next, we'll train a gradient boosted tree to improve on the previous resulta and compare the result.

In [None]:
#train and fit a Gradient Boosted Tree
gbt = GradientBoostingClassifier()
gbt.fit(train_to_model, train_labels)
test_predict = gbt.predict(test_data)


In [None]:
#predict results for trest_predict
test_predict = np.where(test_predict > .5, 1, 0)
cfmx = confusion_matrix(test_labels, test_predict)
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(cfmx,['no-default','default'])

print("F1 Score is: {}".format(f1_score(test_labels, test_predict)))

# Train SVM

Next, we'll train a support vector machine with some autotuning to see the accuracy of this alogorithm.

In [None]:
# svm = SVC()
# svm.fit(train_data, train_labels)
# test_predict = svm.predict(test_data)


In [None]:
# test_predict = np.where(test_predict > .5, 1, 0)
# cfmx = confusion_matrix(test_labels, test_predict)
# np.set_printoptions(precision=2)
# plt.figure()
# plot_confusion_matrix(cfmx,['no-default','default'])

# print("F1 Score is: {}".format(f1_score(test_labels, test_predict)))

# Train Nueral Net

Now, we'll train a nueral net using the prepared data. It will use a sigmoid function so that we can predict probabilities from the result.

In [46]:
start = datetime.datetime.now()
print("began at {}".format(datetime.datetime.now()))

#bring in required packages
import tensorflow
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()

# Create a 20 neuron hidden layer with Linear Rectified activation function.
model.add(Dense(20, input_dim=train_data.shape[1], init='uniform', activation='relu'))

# Create a 8 neuron hidden layer.
model.add(Dense(20, init='uniform', activation='relu'))

# Adding a output layer with sigmoid activation
model.add(Dense(1, init='uniform', activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy',
              optimizer='adam', metrics=['accuracy'])

# fit the model on teh train data
model.fit(train_data, train_labels, epochs=25, batch_size=5)

print("complete at {}".format(datetime.datetime.now()))
print("total runtime: {}".format(datetime.datetime.now() - start))

began at 2018-08-21 14:33:18.471216




Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
complete at 2018-08-21 14:55:36.019068
total runtime: 0:22:17.547852


In [24]:
#get the predicted classes on the test data
test_predict = model.predict(test_to_model)
dev_predict = model.predict(dev_to_model)

cfmx = confusion_matrix(test_labels, test_predict)
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(cfmx,['no-default','default'])

print("F1 Score is: {}".format(f1_score(test_labels, test_predict)))

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [48]:
submission = pd.read_csv("./application_test.csv")
submission = pd.DataFrame(submission.loc[:,"SK_ID_CURR"])
dev_predict = model.predict(dev_data)
predict = pd.DataFrame(dev_predict, columns = ["TARGET"])
submission["TARGET"] = predict["TARGET"]

submission.to_csv("./submission.csv")