# Data Exploration

### Imports

In [3]:
# Imports
import numpy as np
import pandas as pd

# Import train-test split function
from sklearn.model_selection import train_test_split

# Import the RandomForestClassifier and DecisionTreeClassifier
from sklearn import tree
from sklearn import ensemble
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Import metrics for evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, precision_score
from sklearn.utils import resample

# Import Visualization Modules
import matplotlib.pyplot as plt
import plotly_express as px

# Data sources: the full bank-marketing table used for training/evaluation and
# the small hold-out sample published alongside it.
BANK_CSV_URL = "https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank.csv"
HOLDOUT_MINI_CSV_URL = "https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank_holdout_test_mini.csv"

df = pd.read_csv(BANK_CSV_URL)
mini = pd.read_csv(HOLDOUT_MINI_CSV_URL)


# Create Models
# One-hot encoding, data splitting, model training and testing

# Predictors: every column except the target "y", expanded into indicator
# columns; the "default_yes" indicator is then discarded.
X = (
    pd.get_dummies(df.drop(columns="y"))
    .drop(columns="default_yes")
)
y = df["y"]

# Hold out 15% of the rows for evaluation (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.15,
    random_state=42,
)

# Hyperparameters
md = 85         # depth cap for the single decision tree
min_samp = 150  # min_samples_split shared by the tree and the forest
# Despite the name, this is one weight per training row (it is passed as
# sample_weight below): rows labelled "no" get 0.15, all other rows get 1.0.
class_weight = np.where(y_train.eq("no"), 0.15, 1.0)

t_model = DecisionTreeClassifier(
    max_depth=md,
    min_samples_split=min_samp,
    random_state=42,
)
f_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=17,
    min_samples_split=min_samp,
    random_state=42,
)

# Only the tree is trained with the row weights; the forest is fit unweighted.
t_model.fit(X_train, y_train, sample_weight=class_weight)
f_model.fit(X_train, y_train)

tree_predict = t_model.predict(X_test)
for_predict = f_model.predict(X_test)

# Score both models on the held-out rows; "yes" is treated as the positive class.
tree_acc, for_acc = (
    accuracy_score(y_test, predicted)
    for predicted in (tree_predict, for_predict)
)
tree_recall, for_recall = (
    recall_score(y_test, predicted, pos_label='yes')
    for predicted in (tree_predict, for_predict)
)
tree_pre, for_pre = (
    precision_score(y_test, predicted, pos_label='yes')
    for predicted in (tree_predict, for_predict)
)

# Shown in pairs (tree first, then forest): accuracy, recall, precision.
display(tree_acc, for_acc, tree_recall, for_recall, tree_pre, for_pre)


0.8131631001618413

0.8948030929688905

0.6511278195488722

0.23458646616541354

0.34919354838709676

0.6724137931034483

# Pickling

In [10]:
from pickle import dump

# Persist each fitted model to its own pickle file (pickle protocol 5).
for pkl_name, fitted_model in (
    ("Decision Tree.pkl", t_model),
    ("Random Forest.pkl", f_model),
):
    with open(pkl_name, "wb") as f:
        dump(fitted_model, f, protocol=5)

In [11]:
from pickle import load

# Round-trip check: reload the pickled decision tree and score it on the
# held-out split, so the numbers below describe the saved file rather than the
# model still in memory.
# NOTE: pickle.load can execute arbitrary code -- only load files that this
# notebook (or another trusted source) produced.
with open("Decision Tree.pkl", "rb") as f:
    clf = load(f)

# Predict with the reloaded estimator. Using t_model here (as before) would
# pass even if the pickle file were stale or corrupt.
tree_predict = clf.predict(X_test)
tree_acc = accuracy_score(y_test, tree_predict)

# "yes" is the positive class for recall and precision.
tree_recall = recall_score(y_test, tree_predict, pos_label='yes')
tree_pre = precision_score(y_test, tree_predict, pos_label='yes')

display(tree_acc)
display(tree_recall)
display(tree_pre)

0.8131631001618413

0.6511278195488722

0.34919354838709676

### Exploring Correlations

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric
# columns of the full dataset.
df.describe()

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,37069.0,37069.0,37069.0,37069.0,37069.0,37069.0,37069.0,37069.0,37069.0
mean,40.025493,2.564407,962.221803,0.17373,0.081526,93.576551,-40.494829,3.621945,5167.01065
std,10.435288,2.764084,187.531477,0.496159,1.572287,0.579339,4.628895,1.734496,72.294476
min,17.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


In [7]:
# Summary statistics restricted to the rows whose outcome "y" is "yes".
df.loc[df["y"].eq("yes")].describe()


Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,4208.0,4208.0,4208.0,4208.0,4208.0,4208.0,4208.0,4208.0,4208.0
mean,40.948432,2.05347,789.912548,0.495722,-1.240162,93.356961,-39.824097,2.113721,5094.514924
std,13.868956,1.647362,404.917402,0.862267,1.618691,0.675806,6.147387,1.738969,87.617082
min,17.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,31.0,1.0,999.0,0.0,-1.8,92.893,-46.2,0.849,5017.5
50%,37.0,2.0,999.0,0.0,-1.8,93.2,-40.4,1.266,5099.1
75%,50.0,2.0,999.0,1.0,-0.1,93.918,-36.1,4.223,5191.0
max,98.0,17.0,999.0,6.0,1.4,94.767,-26.9,5.045,5228.1


In [9]:
# Summary statistics restricted to the rows whose outcome "y" is "no".
df.loc[df["y"].eq("no")].describe()

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,32861.0,32861.0,32861.0,32861.0,32861.0,32861.0,32861.0,32861.0,32861.0
mean,39.907307,2.629835,984.286784,0.132497,0.250774,93.604671,-40.580719,3.815079,5176.294057
std,9.904101,2.869389,119.964157,0.409307,1.483529,0.559621,4.389456,1.636452,64.449486
min,17.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.405,5099.1
50%,38.0,2.0,999.0,0.0,1.1,93.918,-41.8,4.857,5195.8
75%,47.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.962,5228.1
max,95.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1
