In [1]:
# Import necessary modules

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, confusion_matrix, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV

print('done')

  from numpy.core.umath_tests import inner1d


done


In [2]:
%%time

# Load csv Data as pandas dataframe
df = pd.read_csv('Intermediate_data\Trimmed.csv',
                 parse_dates=['earliest_cr_line',
                              'issue_d', 'last_pymnt_d',
                              'next_pymnt_d', 'last_credit_pull_d'])

df['history'] = (df.issue_d - df.earliest_cr_line).dt.days

df.drop(['Unnamed: 0'], axis=1, inplace=True)

print('done')

done
Wall time: 5min 39s


In [3]:
# Drop unnecessary columns and the columns that leak the
# information. For example, non-zero value in
# 'collection recovery fee' column already mean
# that the loan is default. Also remove some columns that have
# very little or zero predictive power

df.drop(['last_pymnt_amnt', 'collection_recovery_fee',
         'recoveries', 'out_prncp_inv', 'out_prncp',
         'total_rec_prncp', 'int_rate', 'total_pymnt',
         'total_pymnt_inv', 'total_rec_late_fee',
         'total_rec_int', 'emp_title', 'debt_settlement_flag',
         'addr_state', 'purpose', 'delinq_amnt',
         'hardship_flag', 'pymnt_plan', 'collections_12_mths_ex_med',
         'chargeoff_within_12_mths', 'acc_now_delinq',
         'earliest_cr_line', 'issue_d', 'last_pymnt_d',
         'next_pymnt_d', 'last_credit_pull_d'],
        axis=1, inplace=True)

print('done')

done


In [4]:
# Some columns are categorical. I converted
# them to categorical dummies (0 or 1)

cate_list = list(df.dtypes[df.dtypes == 'object'].index)

df_cate = df[cate_list]

df_cat_dum = pd.get_dummies(df_cate, drop_first=True)
df_cat_dum.drop('loan_status_Late/Charged Off', axis=1, inplace=True)

print('done')

done


In [5]:
# Create the subset of numerical columns.

df_num_col = df.dtypes[df.dtypes == 'float64']

df_num = df[list(df_num_col.index)]

y = 1 - pd.get_dummies(df.loan_status, drop_first=True)

print('done')

done


In [6]:
# Some of the numerical columns contain missing values. I
# filled the missing values using
# the mean value of the column.

for col in df_num.columns:
    df_num[col].fillna(df_num[col].mean(), inplace=True)

# Combine categorical and numerical columns
df_com = pd.concat([df_num, df_cat_dum, y], axis=1)

# The majority of the data is contains non-default loans.
# Making equal population is
# essential to make an unbiased model.

df0 = df_com[df_com['Late/Charged Off'] == 0]
df1 = df_com[df_com['Late/Charged Off'] == 1].sample(n=df0.shape[0])

# Combine both subsample of default or non-default loans
df_combined = pd.concat([df0, df1], axis=0)
df_com = df_combined.sample(frac=1.0)

y = df_com['Late/Charged Off']
X = df_com.drop(['Late/Charged Off'], axis=1).values

# Scale the data to their standard values.
X = StandardScaler().fit_transform(X)

print('done')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


done


In [7]:
# Create parameter grid to perform randomized search
param_grid = {'max_features': np.arange(0.05, 0.5, 0.1),
              'n_estimators': np.arange(5, 2000, 500),
              'min_samples_leaf': np.arange(2, 1000, 300),
              'min_samples_split': np.arange(5, 2000, 500),
              'max_depth': np.arange(2, 2000, 500),
              'min_weight_fraction_leaf': np.arange(0.05, 0.3, 0.1)}

# Make a randomized search model
random_search = RandomizedSearchCV(GradientBoostingClassifier(),
                                   param_distributions=param_grid,
                                   cv=3, refit=True, n_jobs=-2,
                                   random_state=77)

# Perform a fit
random_search.fit(X, y)

# Print the tuned parameters and score
print("Randomized search best parameters: {}".
      format(random_search.best_params_))
print("Best score is {}".format(random_search.best_score_))

Randomized search best parameters: {'n_estimators': 1505, 'min_weight_fraction_leaf': 0.05, 'min_samples_split': 1505, 'min_samples_leaf': 302, 'max_features': 0.45000000000000007, 'max_depth': 1002}
Best score is 0.6694960822050899
