CODE APPENDIX

#### Dataset I - Bank.csv

In [None]:
#Load Packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn import metrics
from sklearn import preprocessing
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import roc_curve, auc, plot_confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [None]:
#Read the semicolon-separated bank file into the raw DataFrame `bank`
#NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists
bank_csv_path = r'C:\Users\Helen\Desktop\MoA\COMP809\bank.csv'
bank = pd.read_csv(bank_csv_path, sep=';')

In [None]:
#Preview the first five rows of the raw data
bank.head(n=5)

In [None]:
#Summarise the raw frame: column names, non-null counts and dtypes
bank.info()

In [None]:
#Draw the pairwise scatter-plot matrix of the raw data on a 20x20 inch figure
matrix_size = (20, 20)
scatter = pd.plotting.scatter_matrix(bank, figsize=matrix_size)
plt.show()

In [None]:
#Values examination: descriptive statistics of the raw frame
bank.describe()

#### a) Data Pre-processing

In [None]:
#Work on an independent copy so the raw frame `bank` stays untouched during pre-processing
bank1 = bank.copy(deep=True)

In [None]:
#Check for null values in dataset: number of missing entries per column.
#The mask is taken from bank1 itself (the frame being pre-processed), and
#isnull().sum() counts the nulls directly; the previous .count() on the
#filtered rows counted non-null cells and so under-reported the very
#columns that contain the missing values.
bank1.isnull().sum()

In [None]:
#Remove the columns excluded from the analysis in a single in-place drop:
#'duration', 'contact', 'day' and 'month'
dropped_columns = ['duration', 'contact', 'day', 'month']
bank1.drop(columns=dropped_columns, inplace=True)

In [None]:
#For 'poutcome' - combine 'other' and 'unknown'.
#Only the last expression of a cell is displayed automatically, so the
#"before" counts are printed explicitly; previously they were computed and
#silently discarded.
print(bank1['poutcome'].value_counts()) #Check initial 'poutcome' entries
bank1['poutcome'] = bank1['poutcome'].replace(['other'], 'unknown')
bank1['poutcome'].value_counts() #Check 'poutcome' entries after combining.

In [None]:
#Report how many rows carry pdays == -1 (customers not previously contacted)
#and the largest pdays value, then recode -1 to a large sentinel.
#NOTE(review): 100000 is presumably chosen to sit above every real pdays value - confirm.
#NOTE: not idempotent - on a second run no -1 rows remain and the maximum is the sentinel.
NOT_CONTACTED_PDAYS = 100000
not_contacted = bank1['pdays'] == -1
print("Customers not previously contacted:", len(bank1[not_contacted]))
print("Maximum values on pdays:", bank1['pdays'].max())
bank1.loc[not_contacted, 'pdays'] = NOT_CONTACTED_PDAYS

In [None]:
#Preview the first five rows after the column drops and recoding
bank1.head(n=5)

In [None]:
#Check Outlier presence with Box Plots
#One panel per numeric feature, with the target 'y' passed as hue.
#NOTE(review): some seaborn versions may ignore `hue` when only `x` is given -
#confirm the boxes are actually split by 'y' in the rendered figure.
numeric_features = ['age', 'balance', 'campaign', 'pdays', 'previous']
panel_titles = ['Distribution of Age',
                'Distribution of balance',
                'Distribution of campaign',
                'Distribution of pdays',
                'Distribution of previous']

fig, axarr = plt.subplots(2, 3, figsize=(17,10), dpi=300, facecolor='w', edgecolor='k')
sns.set(style="white")

#Walk the 2x3 grid row by row, one feature per panel
panels = axarr.ravel()
for ax, feature, title in zip(panels, numeric_features, panel_titles):
    sns.boxplot(x=feature, hue='y', data=bank1, ax=ax, palette="viridis")
    ax.set_title(title)

#The grid has six panels but only five features: hide the unused one
#instead of rendering an empty axes frame
for unused_ax in panels[len(numeric_features):]:
    unused_ax.set_visible(False)

fig.suptitle('Box plot of Numeric Feature Variables vs Target Variable', fontsize=16);
plt.show()

In [None]:
#Histograms of the five numeric features on a 15x15 inch figure
hist_columns = ['age', 'balance', 'campaign', 'pdays', 'previous']
bank1.hist(column=hist_columns, figsize=(15, 15))

In [None]:
#Rescale the numeric columns with RobustScaler: fit on the columns, then
#overwrite them with the transformed values
num_cols = ['age', 'balance', 'campaign', 'pdays', 'previous']
Rscaler = RobustScaler()
Rscaler.fit(bank1[num_cols])
bank1[num_cols] = Rscaler.transform(bank1[num_cols])
bank1.head()

In [None]:
#Apply StandardScaler to the same numeric columns (already passed through
#RobustScaler in the previous cell) and write the result back into bank1
num_cols = ['age', 'balance', 'campaign', 'pdays', 'previous']
Sscaler = StandardScaler()
standardized_values = Sscaler.fit_transform(bank1[num_cols])
bank1[num_cols] = standardized_values
bank1.head()

In [None]:
# Import LabelEncoder
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encode categorical columns to integer codes with LabelEncoder.
# The codes are fitted on bank1 (the pre-processed frame), not on the raw
# `bank` frame: encoding from `bank` re-introduced the 'other' category of
# 'poutcome' that was merged into 'unknown' earlier in pre-processing.
# The single encoder is re-fitted for every column, so after the loop `le`
# holds the classes of the last column only ('y').
le = preprocessing.LabelEncoder()
CatCols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'poutcome','y']
for i in CatCols:
    bank1[i] = le.fit_transform(bank1[i].values)

bank1.head()

In [None]:
#Report (rows, columns) of the fully pre-processed frame
frame_shape = bank1.shape
print('Shape of dataframe:', frame_shape)

**End of Pre-processing**

**b) Top 5 Classification Feature Selection**

In [None]:
#Compute the correlation matrix of bank1 and draw it as an annotated heatmap
corr = bank1.corr()

plt.figure(figsize=(11, 11))
cmap = sns.diverging_palette(240, 10, n=9, as_cmap=True)

#Same column labels on both axes; colour scale centred on 0 and capped at 0.3
axis_labels = corr.columns.values
heatmap_options = dict(
    xticklabels=axis_labels,
    yticklabels=axis_labels,
    cmap=cmap,
    center=0,
    vmax=.3,
    linewidths=.5,
    square=True,
    cbar_kws={"shrink": .82},
    annot=True,
)
sns.heatmap(corr, **heatmap_options)
plt.title('Correlation Matrix Heatmap')
plt.show()


In [None]:
#Preview the first five rows of the correlation matrix
corr.head(n=5)

In [None]:
#Take the 'y' (target variable) column of the correlation matrix, without
#its own entry, and print the five features with the largest absolute
#correlation with 'y'
target_corr = corr['y'].drop('y')
corr_y = pd.DataFrame(target_corr)
corr_final = corr_y.sort_values(by='y', ascending=False).abs()
top5 = corr_final.nlargest(5, 'y')
print(top5)

**c) Build Decision Tree Model and adjust two parameters**

In [None]:
#Set Features (based on the top 5 most influential features from the previous step)
X = bank1[['pdays','previous','housing','poutcome','loan']]

#Set Target
y = bank1['y']

#Prepare Training and Testing Data (20% test data); random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.2, random_state=42)

#Display Training and Testing Data
print('Shape of training feature:', X_train.shape)
print('Shape of testing feature:', X_test.shape)
print('Shape of training label:', y_train.shape)
#This line reports y_test, so it is labelled as the testing label
print('Shape of testing label:', y_test.shape)

In [None]:
#Baseline tree: default hyper-parameters, fixed seed
dt = DecisionTreeClassifier(random_state=42)

#Fit on the training split; fit() returns the fitted estimator
tdt = dt.fit(X_train, y_train)

#Draw the fitted (untuned) tree at 300 dpi
plt.figure(dpi=300)
tree.plot_tree(tdt)
plt.show()


In [None]:
#10-fold cross-validation of the current `dt` on the full X / y:
#print the per-fold scores, then their mean
cv = cross_val_score(dt, X, y, cv=10)
print(cv)
mean_cv_score = np.mean(cv)
print("Averaged 10-Fold CV Score:{}".format(mean_cv_score))

In [None]:
#Tune 'max_depth' over 1..5.
#For each depth: fit on the training split, predict the test split,
#run 10-fold CV on the full X / y, and record mean CV score and node count.
maxdepth_cv = []
node_counts = []

for depth in range(1, 6):
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42)
    dt.fit(X_train, y_train)
    predict = dt.predict(X_test)
    cv = cross_val_score(dt, X, y, cv=10)
    nodecount = dt.tree_.node_count
    mean_cv_score = np.mean(cv)
    print("max_depth={}".format(depth),
          "Average 10-Fold CV Score:{}".format(mean_cv_score),
          "Node count:{}".format(nodecount))
    maxdepth_cv.append(mean_cv_score)
    node_counts.append(nodecount)

In [None]:
#Line plot of the mean 10-fold CV score for each tried max_depth (1..5)
k = range(1, 6, 1)
fig, axes = plt.subplots(figsize=(8, 5))
axes.set_xticks(k)
axes.plot(k, maxdepth_cv)
axes.set_xlabel("max_depth")
axes.set_ylabel("Averaged 10-fold CV score")
plt.show()

In [None]:
#Fit a tree limited to max_depth=2 on the training split and draw it
dt_depth2 = DecisionTreeClassifier(max_depth=2, random_state=42)
tdt_depth2 = dt_depth2.fit(X_train, y_train)

plt.figure(dpi=300)
tree.plot_tree(tdt_depth2)
plt.title("Decision Tree Diagram (max_depth = 2)")
plt.show()

In [None]:
#Tune 'max_leaf_nodes' over 2..10.
#For each setting: fit on the training split, predict the test split,
#run 10-fold CV on the full X / y, and record mean CV score and node count.
maxleaf_cv = []
node_counts = []

for leaf_limit in range(2, 11):
    dt = DecisionTreeClassifier(max_leaf_nodes=leaf_limit, random_state=42)
    dt.fit(X_train, y_train)
    predict = dt.predict(X_test)
    cv = cross_val_score(dt, X, y, cv=10)
    nodecount = dt.tree_.node_count
    mean_cv_score = np.mean(cv)
    print("max_leaf_nodes={}".format(leaf_limit),
          "Average 10-Fold CV Score:{}".format(mean_cv_score),
          "Node count:{}".format(nodecount))
    maxleaf_cv.append(mean_cv_score)
    node_counts.append(nodecount)

In [None]:
#Line plot of the mean 10-fold CV score for each tried max_leaf_nodes (2..10)
k = range(2, 11, 1)
fig, axes = plt.subplots(figsize=(8, 5))
axes.set_xticks(k)
axes.plot(k, maxleaf_cv)
axes.set_xlabel("max_leaf_nodes")
axes.set_ylabel("Averaged 10-fold CV score")
plt.show()

In [None]:
#Fit a tree limited to max_leaf_nodes=5 on the training split and draw it
dt_leaf5 = DecisionTreeClassifier(max_leaf_nodes=5, random_state=42)
tdt_leaf5 = dt_leaf5.fit(X_train, y_train)

plt.figure(dpi=300)
tree.plot_tree(tdt_leaf5)
plt.title("Decision Tree Diagram (max_leaf_nodes = 5)")
plt.show()

In [None]:
#Re-create and fit the max_leaf_nodes=5 tree so this cell can run on its own
dt_leaf5 = DecisionTreeClassifier(max_leaf_nodes=5, random_state=42)
tdt_leaf5 = dt_leaf5.fit(X_train, y_train)

#Confusion matrix on the test split, normalised over all cells
#NOTE(review): plot_confusion_matrix may be unavailable in newer scikit-learn releases - confirm the installed version
plot_confusion_matrix(dt_leaf5, X_test, y_test, normalize='all')
plt.show()



In [None]:
#Class Imbalance Issue Visualized
#Histogram of the '1' and '0' values in the encoded target 'y'
bank1.hist(column=['y'])
#Title spelling corrected ("Historgraph" -> "Histogram")
plt.title("Histogram of 1 and 0 in 'y'")
plt.show()


In [None]:
#Exact number of rows per class of the target 'y'
bank1['y'].value_counts()