Data Exploration

In [None]:
#Import packages
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels
import scipy
import matplotlib.pyplot as plt

#Import Data, store into data frame df_original
df_original = pd.read_csv('bank-full.csv',sep=";")

# check top 5 attributes
df_original.head()

In [None]:
# Summary of data frame
# 45211 entries, 17 features including 1 class feature
# Besides, all these 16 features are either categorical or integer types

df_original.info()

In [None]:
# checking NA data
missing_data = df_original.isnull().mean()*100

# 0.0 ,no missing data 
missing_data.sum()

# no empty/NA data exists, however, we will examine the "unknown" values in the following sections

In [None]:
# Summary for categorical data
for column in df_original.select_dtypes(include='object').columns:
    df_original[column] = df_original[column].astype('category')
    print(column)
    print(df_original[column].unique())
    
def PrintDataframeCategoricalSummary(df):
    for column in df.dtypes[df.dtypes == 'category'].index:
        print(df[column].name, df[column].unique())
    
# Check categorical features with "unknown" value
for column in df_original.dtypes[df_original.dtypes == 'category'].index:
    num_of_unknown = df_original[column].str.contains('unknown').sum()
    print(df_original[column].name, 'unknown: ', num_of_unknown)
    
# we can see that job, education, contact and poutcome these four features have unknow values
# we will deal with the unknown data handling in the following sections

In [None]:
# Categorical Attributes Exploration

# only for categorical data and excluding class feature
cat_columns = df_original.columns[df_original.dtypes == 'category']
cat_columns = cat_columns.drop(['y'])

fig, axs = plt.subplots(3, 3, sharex=False, sharey=False, figsize=(20, 15))

counter = 0
for column in cat_columns:
    value_counts = df_original[column].value_counts()
    trace_x = counter // 3
    trace_y = counter % 3
    x_pos = np.arange(0, len(value_counts))
    
    axs[trace_x, trace_y].bar(x_pos, value_counts.values, tick_label = value_counts.index)
    axs[trace_x, trace_y].set_title(column)
    
    for tick in axs[0, 0].get_xticklabels():
        tick.set_rotation(90)
    
    counter += 1

plt.show()

#For now,nothing Special for the categorical attributes

In [None]:
# Summary for continous data 
df_original.describe()

In [None]:
# Continous Attributes Exploration
def NumericalHistPlot(df):
    num_columns = df.columns[~(df.dtypes == 'category')]

    fig, axs = plt.subplots(3, 3, sharex=False, sharey=False, figsize=(20, 15))

    counter = 0
    for num_column in num_columns:
        trace_x = counter // 3
        trace_y = counter % 3
    
        axs[trace_x, trace_y].hist(df[num_column])
        axs[trace_x, trace_y].set_title(num_column)
    
        counter += 1
        
    plt.show()

NumericalHistPlot(df_original)


In [None]:
# Categorical: Handling of unknown data 

## job unknown:  288
## marital unknown:  0
## education unknown:  1857
## default unknown:  0
## housing unknown:  0
## loan unknown:  0
## contact unknown:  13020
## month unknown:  0
## poutcome unknown:  36959
## y unknown:  0

# poutcome has 36959 out of 45211 unknown values, we decide to drop the feature
df_clean = df_original.drop(columns=['poutcome'], errors='ignore')

# job and education, given the small number of unknown records, we are dropping the unknown records
#[TODO]df_clean.drop(df_clean[df_clean.job == 'unknown'].index, inplace=True)
df_clean.drop(df_clean[df_clean.education == 'unknown'].index, inplace=True)

PrintDataframeCategoricalSummary(df_clean)

# for contact, we will just leave the unknown as one of the categories for now
# [TODO, how to impute]

In [None]:
# Categorical: Feature Engineering

# we are mainly concerned with the number of job categories provided and observe that some of simply duplicate 
# or there are overlaps among different job categories by common sense

for jobcat in df_clean.job.unique():
    print (jobcat)
    
## management
## technician
## entrepreneur
## retired
## admin.
## services
## blue-collar
## self-employed
## unemployed
## housemaid
## student

# merge entrepreneur into self_employed 
df_clean.job.replace(['entrepreneur', 'self-employed'], 'self-employed', inplace=True)

# merge admin. into management
df_clean.job.replace(['admin.', 'management'], 'management', inplace=True)

# merge technician into blue-collar
df_clean.job.replace(['blue-collar', 'technician'], 'blue-collar', inplace=True)

# merge housemaid into services
df_clean.job.replace(['services', 'housemaid'], 'services', inplace=True)

    
# after the merging, we have reduced the number of job categories to 7
#for jobcat in df_clean.job.unique():
#    print (jobcat)
    
PrintDataframeCategoricalSummary(df_clean)
df_clean.info()

## management
## blue-collar
## self-employed
## unemployed
## retired
## services
## student

In [None]:
# Numberical/Continuous: Feature Engineering

# numerical correlation matrix among numerical features
correlations = df_clean.corr()

print(correlations)

plt.figure(figsize=(15,10)) 

ax = sns.heatmap(
    correlations, 
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 320, n=200),
    square=True,
    linewidths=1,
    annot=True
)

ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);


In [None]:
# Numerical/Continuous: Feature Engineering

## for these 7 numercial featuers, we have done the following modifications
## a) new categorical feature balanceGroup from balance feature, and balance will be removed
## b) removal of outliers for campaign, pdays and previous
## c) apply normalization to all numerical values 

# replot the numerical features here for ease of reference
NumericalHistPlot(df_clean)


## creation of categorical balanceGroup and removal of balance
df_clean.loc[df_clean['balance'] < 0,'balanceGroup'] = 'negative'
df_clean.loc[(df_clean['balance'] >= 0) & (df_clean['balance'] < 1000),'balanceGroup'] = 'low'
df_clean.loc[(df_clean['balance'] >= 1000) & (df_clean['balance'] < 5000),'balanceGroup'] = 'medium'
df_clean.loc[df_clean['balance'] >= 5000,'balanceGroup'] = 'high'
df_clean['balanceGroup'] = df_clean['balanceGroup'].astype('category')
df_clean = df_clean.drop(columns=['balance'], errors='ignore')


## campaign > 38 consider as outliers
## pdays >435 consider as outliers
## previous >28 consider as outliers
df_clean.drop(df_clean[df_clean.campaign > 38].index, inplace=True)
#df_clean.drop(df_clean[df_clean.pdays > 435].index, inplace=True)
df_clean.drop(df_clean[df_clean.previous > 28].index, inplace=True)


PrintDataframeCategoricalSummary(df_clean)

## age, day, duration untouched

## [TODO] scaling minmax or standard, review later

## plot after feature engineering
NumericalHistPlot(df_clean)


In [None]:
#Data engineering using tableau
#Age 
#Age range with y

#Duration
##Duration with y

# Occupation 
##Number of Occupation
##Occupation with balance
##Occupation with y

# Marital Status
##Marital with y

In [None]:
# checking whether class is balanced or imbalanced
df_clean.y.value_counts()/df_clean.y.count()


# indeed, there is a very imbalanced dataset, we will apply upsampling or downsampling later for the modeling

In [None]:
# Replace class feature y's 'yes' and 'no' with numerical 1 and 0 so that it won't be dummied at later stage
df_clean.y.replace(['yes'], 1, inplace=True)
df_clean.y.replace(['no'], 0, inplace=True)

# save and read it back to drop some data frame cache issue
df_clean.to_csv('cleandata.csv', index=False)
df_model = pd.read_csv('cleandata.csv')


In [None]:
# start modeling

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import make_pipeline, Pipeline

from sklearn.metrics import matthews_corrcoef, roc_curve, roc_auc_score, auc, accuracy_score, confusion_matrix, classification_report

from sklearn.utils import resample


# convert categorical features to numerical dummy variables
df_model = pd.get_dummies(df_model)


In [None]:
df_model_1 = df_model.copy();

# separate dependent features and class feature
targets = df_model_1['y']
df_model_1.drop(columns=['y'], inplace=True)

inputs = df_model_1



In [None]:
# seperate data into training and testing data 

test_size = 0.2
seed = 1337

train, test, train_labels, test_labels = train_test_split(inputs, targets, test_size=test_size, random_state=seed)


In [None]:
# upsampling minority for those with 'yes' label

from sklearn.utils import resample

def Run_SimpleResample():

    train_with_labels = pd.concat([train, train_labels], axis=1)

    # separate minority and majority classes
    train_yes = train_with_labels[train_with_labels.y==1]
    train_no = train_with_labels[train_with_labels.y==0]

    # upsample ''yes''
    yes_upsampled = resample(train_yes,
                          replace=True,            # with replacement
                          n_samples=len(train_no), # match number of 'no' class
                          random_state=0)          # make it reproducible

    # combine
    after_resample = pd.concat([train_no, yes_upsampled])

    train_labels_1 = after_resample.y
    train_1 = after_resample.drop('y', axis=1)

    # check new class counts
    after_resample.y.value_counts()/after_resample.y.count()
    
    return (train_1, train_labels_1)

In [None]:
# another resampling from imblearn.ensemble

from imblearn.over_sampling import SMOTE, ADASYN

def Run_OverSampling(model):
    train_2, train_labels_2 = model.fit_resample(train, train_labels)
    after_resample_2 = (pd.concat([pd.DataFrame(data = train_2, columns = train.columns.values), 
                               pd.DataFrame(data = train_labels_2, columns=['y'])], axis=1))
    after_resample_2.y.value_counts()/after_resample_2.y.count()
    
    after_resample_train_labels = after_resample_2.y
    after_resample_train = after_resample_2.drop('y', axis=1)

    return (after_resample_train, after_resample_train_labels)

In [None]:
resample_train_x_0, resample_train_y_0 = Run_SimpleResample()

In [None]:
resample_train_x_1, resample_train_y_1 = Run_OverSampling(SMOTE(random_state=0))

In [None]:
resample_train_x_2, resample_train_y_2 = Run_OverSampling(ADASYN(random_state=0))

In [None]:
# make pipelines

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPClassifier

def Make_Pipelines():
    pipelines = list()
    pipelines.append(('LogisticRegression', Pipeline([('Scaler', StandardScaler()),('LR', LogisticRegression())])))
    #pipelines.append(('ScaledLASSO', Pipeline([('Scaler', StandardScaler()),('LASSO', Lasso())])))
    #pipelines.append(('ScaledRIDGE', Pipeline([('Scaler', StandardScaler()),('RIDGE', Ridge())])))
    #pipelines.append(('ScaledEN', Pipeline([('Scaler', StandardScaler()),('EN', ElasticNet())])))
    #pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()),('KNN', KNeighborsRegressor())])))
    #pipelines.append(('ScaledCART', Pipeline([('Scaler', StandardScaler()),('CART', DecisionTreeRegressor())])))
    #pipelines.append(('NN', Pipeline([('Scaler', StandardScaler()),('MLP', MLPClassifier())])))
    
    return pipelines

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import matthews_corrcoef

import collections

def Run_Pipelines(pipelines, train_x, train_y):
    for name, model in pipelines:
        print("Running {0}{1}".format(name, "\n"))
        ten_fold = KFold(n_splits=10, random_state=seed) # using 10-fold
        model.fit(train_x, train_y)
        
        cross_validation = cross_val_score(model, train_x, train_y, cv=ten_fold, scoring='accuracy')

        accuracy_score = model.score(test, test_labels);
        pred = model.predict(test)
        pred_freq = collections.Counter(pred)

        print(pred_freq)
        print('Test accuracy: %.3f' % accuracy_score)
        print(confusion_matrix(test_labels, pred))
        print(matthews_corrcoef(test_labels, pred))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import matthews_corrcoef

import collections

def Tune_LogisticRegression(train_x, train_y):
    
    params=[{'penalty':['l1','l2']}]
    
    LR = GridSearchCV(LogisticRegression(tol=1e-4), params, cv=10)
    LR.fit(train_x,train_y)
    print('Best parameters set found:',LR.best_params_)
    
    return LR
    


def Run_LogisticRegression(train_x, train_y):
    
    # go with a Linear Regression Model
    five_fold = KFold(n_splits=5, random_state=seed)

    # define default StandardScaler
    sc = StandardScaler()

    LR = Tune_LogisticRegression(train_x, train_y) 
    #pipeline_LR = Pipeline([('Scaler', sc),
    #                    ('LR', LogisticRegression(random_state=seed))])
    
    pipeline_LR = Pipeline([('Scaler', sc),
                        ('LR', LR)])

                        
    pipeline_LR.fit(train_x, train_y)

    # [TODO] check later on the roc_auc_score
    cross_validation = cross_val_score(pipeline_LR, train_x, train_y, cv=five_fold, scoring='accuracy')

    accuracy_score = pipeline_LR.score(test, test_labels);
    pred = pipeline_LR.predict(test)
    pred_freq = collections.Counter(pred)
    

    print(pred_freq)
    print('Test accuracy: %.3f' % accuracy_score)

    print(confusion_matrix(test_labels, pred))

    print(matthews_corrcoef(test_labels, pred))


In [None]:
Run_LogisticRegression(resample_train_x_0, resample_train_y_0)

In [155]:
Run_LogisticRegression(resample_train_x_1, resample_train_y_1)



Counter({0: 8177, 1: 488})
Test accuracy: 0.899
[[7478  180]
 [ 699  308]]
0.3925181020680494


In [148]:
Run_LogisticRegression(resample_train_x_2, resample_train_y_2)



Counter({0: 8175, 1: 490})
Test accuracy: 0.899
[[7477  181]
 [ 698  309]]
0.39296080412684636


In [191]:
pipelines = Make_Pipelines()
Run_Pipelines(pipelines,resample_train_x_0, resample_train_y_0)

Running LogisticRegression





Counter({0: 6556, 1: 2109})
Test accuracy: 0.828
[[6361 1297]
 [ 195  812]]
0.4757155691320589
Running NN





KeyboardInterrupt: 

In [188]:
pipelines = Make_Pipelines()
Run_Pipelines(pipelines,resample_train_x_1, resample_train_y_1)

Running LogisticRegression





Counter({0: 8177, 1: 488})
Test accuracy: 0.899
[[7478  180]
 [ 699  308]]
0.3925181020680494
Running ScaledCART

Counter({0.0: 7621, 1.0: 1044})
Test accuracy: -0.208
[[7102  556]
 [ 519  488]]
0.4056183067300137


In [189]:
pipelines = Make_Pipelines()
Run_Pipelines(pipelines,resample_train_x_2, resample_train_y_2)

Running LogisticRegression





Counter({0: 8175, 1: 490})
Test accuracy: 0.899
[[7477  181]
 [ 698  309]]
0.39296080412684636
Running ScaledCART

Counter({0.0: 7590, 1.0: 1075})
Test accuracy: -0.229
[[7077  581]
 [ 513  494]]
0.4031613455791261


In [119]:
from sklearn.metrics import matthews_corrcoef

print(pred_freq)
print('Test accuracy: %.3f' % accuracy_score)

print(confusion_matrix(test_labels, pred))

print(matthews_corrcoef(test_labels, pred))


Counter({0: 8175, 1: 490})
Test accuracy: 0.899
[[7477  181]
 [ 698  309]]
0.39296080412684636


In [78]:
##[TODO]
1. Handling unknown data with mean...
2. outlier handling, might just keep 2, and without dropping
3. imbalanced up/down sampling
4. more models (LR, NN, DTR)
5. ensembling, automated param tuning GradientSearchCV + pipeline + cross_score_var
6. slides 
7. report (background, highlights, tableau file, python)

SyntaxError: invalid syntax (<ipython-input-78-ca1b76fa7697>, line 2)