# ML Classifier State Farm 
## Data Science Position Challenge

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import numpy as np
import matplotlib.pyplot as pl
%matplotlib inline
l_encoder = LabelEncoder()

## STEP 1 Uploading the data

The data comes from a single training file, `data_files/exercise_01_train.csv`.

Load it into a pandas data frame.

In [2]:
# Load the training data into a single dataframe
data_loans = pd.read_csv('data_files/exercise_01_train.csv', low_memory=False)

In [3]:
# Show the frame's dimensions, then its first and last rows
display(data_loans.shape, 'HEAD', data_loans.head())
display('TAIL', data_loans.tail())

(40000, 101)

'HEAD'

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x91,x92,x93,x94,x95,x96,x97,x98,x99,y
0,10.142889,-15.67562,3.583176,-22.397489,27.221894,-34.110924,-0.072829,-0.544444,0.997601,-2.691778,...,1.916575,5.24082,euorpe,2.43117,0.454074,-18.572032,-14.291524,0.178579,18.11017,0
1,-52.21463,5.847135,-10.902843,-14.132351,20.588574,36.107322,0.115023,0.276093,-0.699168,-0.972708,...,0.370941,-3.794542,asia,2.592326,31.921833,3.317139,10.037003,-1.93087,-3.486898,0
2,67.7185,2.064334,12.394186,-18.667102,47.465504,-50.373658,0.253707,1.068968,2.939713,2.691218,...,1.449817,12.470532,asia,7.143821,9.40149,-10.604968,7.643215,-0.842198,-79.358236,0
3,-28.003111,8.565128,-8.592092,5.91896,-3.224154,78.315783,-0.879845,1.176889,-2.414752,0.589646,...,-3.274733,3.48445,asia,-4.998195,-20.31281,14.818524,-9.180674,1.356972,14.475681,0
4,80.703016,30.736353,-30.101857,-21.20114,-91.946233,-47.469246,-0.646831,-0.578398,0.980849,-1.426112,...,-0.644261,4.082783,asia,-0.012556,-29.334324,1.734433,-12.262072,-0.043228,-19.003881,0


'TAIL'

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x91,x92,x93,x94,x95,x96,x97,x98,x99,y
39995,20.844737,-33.785846,-0.346804,-3.406866,34.771517,-57.951056,-0.288205,1.37194,3.335447,1.76567,...,-2.985304,7.312132,asia,5.964857,-13.061671,-8.062604,16.618593,-3.609543,0.631066,0
39996,1.666154,16.241028,12.62309,-6.16854,-10.650748,69.840299,-0.965011,-4.321631,3.071324,-0.20994,...,-5.842786,3.5636,asia,4.895863,-1.342384,-10.275539,14.04699,-0.32044,46.051387,0
39997,1.795836,-15.706685,1.009672,-0.887671,-11.580529,3.237055,0.541397,2.56231,-0.623586,3.300388,...,7.503255,-11.064043,america,6.783607,15.293008,-6.194035,-4.725605,-1.321478,27.83663,0
39998,50.168318,-4.272643,2.409248,-11.697615,39.234827,31.353302,1.416008,1.825775,2.027886,-3.753114,...,-1.411384,-17.587621,america,6.278226,-18.743967,-8.067506,5.258203,-2.623772,-15.550075,0
39999,-8.653274,10.572796,1.377445,-21.472814,-42.686853,28.89336,3.379456,-1.241659,-0.040278,0.612898,...,7.622624,-6.473851,asia,0.05573,-6.506186,12.434701,-6.001283,-5.340633,18.276723,1


In [4]:
# Distinct values of the target column `y`
data_loans.y.unique()

array([0, 1])

## STEP 2 Data Cleaning

Count the NaN values in each column, then remove the rows that contain any NaN.

In [5]:
# Per-column NaN counts as a two-column frame (col_name, num_NaN), ascending
use_col = (
    data_loans.isnull()
    .sum()
    .sort_values()
    .rename_axis('col_name')
    .reset_index(name='num_NaN')
)
use_col['num_NaN'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])

There are not that many NAN's for a data set with 40K rows. 
____

#### NAN Distribution
Counting number of columns with NAN's

In [6]:
# For each threshold, print how many columns have fewer NaNs than the threshold
# (cumulative total) and how many were added since the previous threshold.
numbers = [5, 10, 15, 20, 25, 50, 75, 100]
print('Num_NANs\tNum_of_NANcols_Total\tNum_of_NANcols_range')
# The per-column NaN counts do not change inside the loop, so compute them once.
nan_counts = data_loans.isnull().sum()
previous_total = 0
for num in numbers:
    total_below = int((nan_counts < num).sum())
    print('\t', num, '\t\t', total_below, '\t\t\t', total_below - previous_total)
    previous_total = total_below

Num_NANs	Num_of_NANcols_Total	Num_of_NANcols_range
	 5 		 13 			 13
	 10 		 69 			 56
	 15 		 100 			 31
	 20 		 101 			 1
	 25 		 101 			 0
	 50 		 101 			 0
	 75 		 101 			 0
	 100 		 101 			 0


In [7]:
# Drop every row that contains a NaN and report how many rows were lost
data_loans2 = data_loans.dropna(axis='index')
print('Original\tNoNAN')
print(data_loans.shape, data_loans2.shape)
rows_before, rows_after = data_loans.shape[0], data_loans2.shape[0]
print(20*'#', '\npct_chage = {}'.format(np.round(rows_after/rows_before - 1, 2)))

Original	NoNAN
(40000, 101) (39199, 101)
#################### 
pct_chage = -0.02


It seems that the NaNs are not that widespread in the data, only affecting 801 (~2%) rows. 

The impact is similar to both 'y' categories. 

In [8]:
# Class counts before and after dropping NaN rows, side by side.
# `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
impact = pd.concat([data_loans.y.value_counts(), data_loans2.y.value_counts()], axis=1)
impact.columns = ['pre', 'post']
# Relative change in the number of rows for each class
impact.loc[:, 'pct_change'] = round(impact.post/impact.pre - 1, 4)
impact

Unnamed: 0,pre,post,pct_change
0,31953,31318,-0.0199
1,8047,7881,-0.0206


In [9]:
# Confirm that no NaN is left in the cleaned dataframe
use_col2 = (
    data_loans2.isnull()
    .sum()
    .sort_values()
    .rename_axis('col_name')
    .reset_index(name='num_NaN')
)
use_col2['num_NaN'].unique()

array([0])

In [10]:
# Work on a copy so the following edits do not touch data_loans2
n_loan_df = data_loans2.copy()

## Feature cleaning and selection

### Target feature (y)


In [11]:
# Distinct values of the target column after NaN removal
n_loan_df.y.unique()

array([0, 1])

### Other features

In [12]:
# Custom functions needed later
def types_of_columns(df):
    '''
    Split the columns of df by dtype.

    Returns a tuple (non_num_cols, float_cols) of two Series taken from
    df.dtypes: the first holds the columns whose dtype is 'object', the
    second holds every other column. Both are indexed by column name.
    '''
    dtypes = df.dtypes
    is_object = dtypes == 'object'
    return dtypes[is_object], dtypes[~is_object]

def column_checker (df1, df2, min_len):
    '''
    Print the distinct values of selected columns of df1.

    df2 names the columns to inspect through its index (for example one of
    the Series returned by types_of_columns). For every such column with
    more than min_len distinct values, print the column name, the number
    of distinct values, and the array of those values.
    '''
    for name in df2.index:
        distinct = df1[name].unique()
        n_distinct = len(distinct)
        if n_distinct <= min_len:
            continue
        print('\n{}\n{}\n{}'.format(name, n_distinct, distinct))

Checking data by column type

In [13]:
# Tuple of (object-dtype columns, all other columns) for the cleaned frame
col_types_original = types_of_columns(n_loan_df)

#### Non-Numeric Features

In [14]:
# First element of the tuple: the columns with dtype 'object'
nn_col_list = col_types_original[0]
nn_col_list

x34    object
x35    object
x41    object
x45    object
x68    object
x93    object
dtype: object

In [15]:
# Print the distinct values of every object column that has more than one value
column_checker(n_loan_df, nn_col_list, 1)


x34
10
['bmw' 'nissan' 'Honda' 'Toyota' 'volkswagon' 'tesla' 'chrystler' 'ford'
 'mercades' 'chevrolet']

x35
8
['wed' 'thur' 'thurday' 'wednesday' 'friday' 'tuesday' 'monday' 'fri']

x41
37116
['$-54.1' '$-229.32' '$243.68' ... '$1215.91' '$-723.78' '$-426.49']

x45
10
['0.0%' '0.01%' '-0.01%' '0.02%' '-0.02%' '-0.0%' '-0.03%' '0.03%' '0.04%'
 '-0.04%']

x68
12
['Jun' 'July' 'May' 'Aug' 'Apr' 'Mar' 'Oct' 'sept.' 'Nov' 'Feb' 'Dev'
 'January']

x93
3
['euorpe' 'asia' 'america']


From the above print out:

Six columns are currently labeled as non-numeric. 

Of these, x41 appears to be a price and should be converted to numeric. 

x45 contains numbers, but it appears to hold categories of a range and does not appear to need corrections.

The other are categorical, but there are errors that need to be corrected. For example:

Column x35 appears to be days of the week, and 'friday' and 'fri' should be the same. 

All columns need to be corrected for these errors.

In [16]:
# Converting x41 to numeric
# regex=False strips the literal dollar sign; as a regular expression '$' is the
# end-of-string anchor and would leave the character in place.
# Assigning with n_loan_df['x41'] replaces the column so it takes the float
# dtype; `.loc[:, 'x41'] = ...` can write into the existing object column
# and leave it typed as object.
n_loan_df['x41'] = n_loan_df['x41'].str.replace('$', '', regex=False).astype(float)

Converting dates and other numbers into actual numeric variables

In [None]:
#### Fixing 'typos'
# x34: keep the first three letters in upper case
n_loan_df.loc[:, 'x34'] = n_loan_df['x34'].str[:3].str.upper()

# x35 and x68: three-letter upper-case codes, so that spellings such as
# 'friday' and 'fri' end up as the same value
for short_col in ('x35', 'x68'):
    n_loan_df.loc[:, short_col] = n_loan_df[short_col].str[:3].str.title().str.upper()
# 'Dev' was shortened to 'DEV' above; map it to 'DEC'
n_loan_df.loc[:, 'x68'] = np.where(n_loan_df['x68'] == 'DEV', 'DEC', n_loan_df['x68'])

# x93: fix the misspelled 'euorpe'; every other value is only upper-cased
is_misspelled = n_loan_df['x93'] == 'euorpe'
n_loan_df.loc[:, 'x93'] = np.where(is_misspelled, 'EUROPE', n_loan_df['x93'].str.upper())



In [None]:
# Re-check every object column (min_len=0 prints all of them) after the fixes
column_checker(n_loan_df, types_of_columns(n_loan_df)[0], 0)

In [None]:
# Column labels of the working dataframe
n_loan_df.columns

In [None]:
# One count plot per object column, split by the target `y`
nn_col_list = types_of_columns(n_loan_df)[0].index

row = 2
col = 3

fig, axs = pl.subplots(row, col, figsize=(20,10))
fig.subplots_adjust(wspace=0.5, hspace=0.25)

# Fill the grid in reading order. Indexing with (num % row, num % col) only
# reaches distinct axes when row and col share no common factor.
flat_axes = axs.ravel()
# NOTE: zip stops at the shorter input, so at most row*col columns are drawn.
for ax, name in zip(flat_axes, nn_col_list):
    graph = sns.countplot(y = n_loan_df[name], hue = n_loan_df['y'], ax = ax, linewidth = 1.5)
    graph.set_xlabel(name, fontsize=15)
    graph.set_ylabel('', fontsize=1)
    graph.tick_params(labelsize=12)

# Hide whichever axes were left without a plot
for ax in flat_axes[len(nn_col_list):]:
    ax.axis('off')



In [None]:
# For each object column, print the number of rows per (category, y) pair
for col in nn_col_list:
    series = n_loan_df.groupby([col, 'y'])['y'].count()
    print ('\n',series)

### Numeric Features

In [None]:
# Number of non-object columns (this count still includes the target `y`)
len(types_of_columns(n_loan_df)[1].index)

In [None]:
# Print the number of distinct values of every non-object column except `y`
num_col_list = types_of_columns(n_loan_df)[1].index.drop(['y'])
for col in num_col_list:
    print (col, n_loan_df[col].nunique(dropna=False))

In [None]:
# Frame holding only the non-object feature columns (target excluded).
# Selecting the columns in one step avoids growing a frame by repeated concat.
num_col_list = types_of_columns(n_loan_df)[1].index.drop('y')
cor_df = n_loan_df[num_col_list].copy()

In [None]:
# Heat map of the pairwise correlations between the numeric features
f, ax = pl.subplots(figsize=(20, 15))
corr = cor_df.corr()
# The mask is all False, so no cell is hidden. The builtin `bool` is used
# because the `np.bool` alias was removed in NumPy 1.24.
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool), cmap=sns.diverging_palette(250, 15, as_cmap=True),
            square=True, ax=ax, annot=True, vmin=0, vmax=1, linewidths=2, xticklabels=1, yticklabels=True)

In [None]:
# For every feature, record its largest and smallest correlation with any
# other feature (the feature's own row is dropped first).
# The rows are collected in a list and turned into a frame once, which keeps
# max_corr / min_corr numeric and avoids the positional `axis` argument of
# pd.concat that pandas 2.0 no longer accepts.
corr_records = []
for col in corr.columns:
    off_diagonal = corr[col].drop(col)
    corr_records.append({'column': col,
                         'max_corr': off_diagonal.max(),
                         'min_corr': off_diagonal.min()})
cor_summary = pd.DataFrame(corr_records, columns = ['column', 'max_corr', 'min_corr'])
cor_summary.describe()

Removing Outliers

In [None]:
from collections import Counter

# Reset to a 0..n-1 index so labels and positions coincide below
n_loan_df = n_loan_df.reset_index(drop = True)
# `columns=` keyword: pandas 2.0 no longer accepts the axis positionally
n_loans = n_loan_df.drop(columns='y')

# Count, per row, in how many non-object features the value lies more than
# 2 * IQR below the first quartile or above the third quartile.
out_count = Counter()
for column in types_of_columns(n_loans)[1].index:
    Q1 = np.percentile(n_loans[column], 25)
    Q3 = np.percentile(n_loans[column], 75)
    step = (Q3 - Q1)*2
    within_range = (n_loans[column] >= Q1 - step) & (n_loans[column] <= Q3 + step)
    # Counting index labels directly keeps them integers; np.append with an
    # empty list would turn the accumulated array into floats.
    out_count.update(n_loans.index[~within_range])

# Only rows flagged in more than one feature are removed
outliers = sorted(int(customer) for customer, count in out_count.items() if count > 1)

print("%d outliers found in more than one feature will be remove" % len(outliers))
n_loan_no_out = n_loan_df.drop(n_loan_df.index[outliers])
print(n_loan_df.shape, n_loan_no_out.shape)

In [None]:
# Numeric feature columns of the outlier-free frame. The values are taken
# from n_loan_no_out (not n_loan_df) so the removed rows stay out of them.
num_col_list = types_of_columns(n_loan_no_out)[1].index.drop('y')
cor_df = n_loan_no_out[num_col_list].copy()

In [None]:
# Recompute the correlation matrix from the current cor_df; without this the
# summary would repeat the matrix calculated before the outlier step.
corr = cor_df.corr()

# Largest and smallest correlation of each feature with any other feature.
# Rows are collected in a list and converted once, which keeps the columns
# numeric and avoids the positional `axis` argument of pd.concat.
corr_records = []
for col in corr.columns:
    off_diagonal = corr[col].drop(col)
    corr_records.append({'column': col,
                         'max_corr': off_diagonal.max(),
                         'min_corr': off_diagonal.min()})
cor_summary = pd.DataFrame(corr_records, columns = ['column', 'max_corr', 'min_corr'])
cor_summary.describe()

Encoding categorical columns

In [None]:
# Replace each categorical column by integer codes. The shared encoder is
# re-fitted on every column, so it only remembers the last column's classes.
for cat_col in nn_col_list:
    encoded = l_encoder.fit_transform(n_loan_no_out[cat_col])
    n_loan_no_out[cat_col] = encoded
n_loan_no_out.head()

In [None]:
# Dimensions and first ten rows of the encoded, outlier-free frame
print(n_loan_no_out.shape)
n_loan_no_out.head(10)

### Building the model
#### Splitting the data for the model

Splitting into features and target

In [None]:
# Separate the target column from the remaining columns as arrays
target = n_loan_no_out['y'].values
features = n_loan_no_out.drop(columns=['y']).values
n_rows = n_loan_no_out.shape[0]
print ('Loans original data %d, target: %d, features: %d' % (n_rows, len(target), len(features)))

### Building a raw model

In [None]:
# FUNCTION FROM SCIKIT-LEARN DOCUMENTATION WILL BE USED FOR THE CONFUSION MATRIX PLOT
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=pl.cm.Blues):
    """ FROM SCIKIT LEARN DOCUMENTATION
    Print a one-line notice and draw the confusion matrix on the current figure.

    cm        : 2-D array of counts, indexed as cm[true, predicted].
    classes   : tick labels, used for both axes.
    normalize : when True, every row is divided by its sum before plotting.
    title     : figure title.
    cmap      : colormap passed to imshow.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        # Row-wise normalization: each row is divided by its own total
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
        print("Normalized confusion matrix")

    pl.imshow(cm, interpolation='nearest', cmap=cmap)
    pl.title(title)
    pl.colorbar()
    positions = np.arange(len(classes))
    pl.xticks(positions, classes, rotation=45)
    pl.yticks(positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text, the others black
    half_max = cm.max() / 2.
    for row_idx in range(cm.shape[0]):
        for col_idx in range(cm.shape[1]):
            value = cm[row_idx, col_idx]
            text_color = "white" if value > half_max else "black"
            pl.text(col_idx, row_idx, format(value, cell_format),
                    horizontalalignment="center",
                    color=text_color)

    pl.tight_layout()
    pl.ylabel('True label')
    pl.xlabel('Predicted label')

#### Split data into training, validation, and testing subsets

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=1)
# ... then 20% of the remaining rows as the validation set
# (overall about 60% train / 15% validation / 25% test)
X_train, X_val, y_train, y_val   = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training rows only, then apply it to all three subsets
sc = StandardScaler()
sc.fit(X_train)
X_train, X_val, X_test = (sc.transform(subset) for subset in (X_train, X_val, X_test))

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation

In [None]:
# Two hidden ReLU layers of 14 units and a single sigmoid output unit.
# The input width is read from the training matrix instead of being
# hard-coded, so the model still builds if the number of features changes.
classifier = Sequential()
classifier.add(Dense(units = 14, kernel_initializer = 'uniform', activation = 'relu', input_dim = X_train.shape[1]))
classifier.add(Dense(units = 14, kernel_initializer = 'uniform', activation = 'relu'))
classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

### Fitting the model with the training set

In [None]:
# Training with training set (10 epochs, mini-batches of 10 rows)
classifier.fit(X_train, y_train, batch_size = 10, epochs = 10)

### Crossvalidation

In [None]:
# Cross-validation on the training set (10 folds)
from keras.wrappers.scikit_learn import KerasClassifier # keras wrapper for sklearn
from sklearn.model_selection import cross_val_score

def build_classifier():
    """Return a newly compiled network: two 14-unit ReLU layers and a sigmoid output."""
    hidden_layer = dict(units = 14, kernel_initializer = 'uniform', activation = 'relu')
    model = Sequential()
    model.add(Dense(input_dim = 100, **hidden_layer))
    model.add(Dense(**hidden_layer))
    model.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
    model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    return model

# 10-fold cross-validation; a new model is built for every fold
classifier_cv = KerasClassifier(build_fn = build_classifier, batch_size = 10, epochs = 10)
accuracies = cross_val_score(estimator = classifier_cv, X = X_train, y = y_train, cv = 10)

# NOTE: despite its name, `variance` holds the standard deviation of the scores
mean, variance = accuracies.mean(), accuracies.std()

In [None]:
# Mean fold accuracy in percent; the +/- term is accuracies.std() (stored in `variance`)
print('Mean cv accuracy = {:.4f}% +/- {:.4f}'.format(mean *100, variance *100) )

### Validation and optimization

In [None]:
# Accuracy on the validation set
# evaluate() yields the loss followed by the metric given at compile time (accuracy)
loss, accuracy = classifier.evaluate(X_val, y_val,batch_size=128, verbose=0)
print("Accuracy = {:.4f}%, Loss = {:.4f}".format(accuracy* 100, loss))

In [None]:
# Optimization
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
def build_classifier_imp(optimizer):
    """Return the same 14-14-1 network, compiled with the given optimizer."""
    hidden_layer = dict(units = 14, kernel_initializer = 'uniform', activation = 'relu')
    model = Sequential()
    model.add(Dense(input_dim = 100, **hidden_layer))
    model.add(Dense(**hidden_layer))
    model.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
    model.compile(optimizer = optimizer, loss = 'binary_crossentropy', metrics = ['accuracy'])
    return model

classifier_op = KerasClassifier(build_fn = build_classifier_imp)

# Grid of 2 x 2 x 2 = 8 settings, each scored with 10-fold cross-validation
parameters = {
    'batch_size': [10, 30],
    'epochs': [10, 20],
    'optimizer': ['adam', 'rmsprop'],
}

grid_search = GridSearchCV(
    estimator = classifier_op,
    param_grid = parameters,
    scoring = 'accuracy',
    cv = 10,
)
grid_search = grid_search.fit(X_train, y_train)

best_parameters, best_accuracy = grid_search.best_params_, grid_search.best_score_

In [None]:
# Best setting found by the grid search and its cross-validated accuracy
print (best_parameters, best_accuracy )

### Fitting and prediction with optimized model

In [None]:
# Training with the optimized settings found by the grid search.
# A new model is built so that training does not continue from the weights of
# the earlier fit, and the tuned optimizer, batch size and number of epochs are
# taken from best_parameters instead of being hard-coded.
classifier = build_classifier_imp(best_parameters['optimizer'])
classifier.fit(X_train, y_train,
               batch_size = best_parameters['batch_size'],
               epochs = best_parameters['epochs'])

In [None]:
from keras.models import load_model
# Persist the trained model to disk
classifier.save('data_files/classifier_opt.hdf5') 

In [None]:
# Accuracy on the validation set (after the second round of training)
loss, accuracy = classifier.evaluate(X_val, y_val,batch_size=128, verbose=0)
print("Accuracy = {:.4f}%, Loss = {:.4f}".format(accuracy* 100, loss))

### Implementation

In [None]:
# Predicting on the test set
# Outputs above 0.5 become True (class 1), the rest False (class 0)
y_pred = classifier.predict(X_test) > 0.5

In [None]:
# Accuracy on the test set
# Note: this overwrites the `loss` / `accuracy` values from the validation cell
loss, accuracy = classifier.evaluate(X_test, y_test,batch_size=128, verbose=0)
print("Accuracy = {:.4f}%, Loss = {:.4f}".format(accuracy* 100, loss))

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix on the test set; labels=[0, 1] fixes the row/column order
cnf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
cnf_matrix

In [None]:
pl.figure(figsize=(8,8))
# The frame has no `loan_status` column (the target is `y`). The tick labels
# are given in the same [0, 1] order that was used to build cnf_matrix.
plot_confusion_matrix(cnf_matrix, classes=[0, 1],
                      cmap=pl.cm.Reds, normalize=False)


pl.savefig('data_files/Conf_matrix_2017.png', dpi = 300)

In [None]:
from sklearn.metrics import f1_score
# y_pred is already a boolean array (thresholded at 0.5), so rounding adds
# nothing; cast it to a flat integer array of 0/1 labels instead.
# presumably y_pred has shape (n, 1) from the single output unit; ravel() flattens it
f1_score(y_test, y_pred.astype(int).ravel(), average='binary')

In [None]:
# The target column of this data set is `y`; there is no `loan_status` column
n_loan_df['y'].unique()