# Data Exploration

### Imports

In [2]:
# Imports
import numpy as np
import pandas as pd

# Import train-test split function
from sklearn.model_selection import train_test_split

# Import the RandomForestClassifier and DecisionTreeClassifier
from sklearn import tree
from sklearn import ensemble
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Import metrics for evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, recall_score, precision_score
from sklearn.utils import resample

# Import Visualization Modules
import matplotlib.pyplot as plt
import plotly_express as px

# Load the bank dataset and the small holdout test file from the course repository.
df = pd.read_csv("https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank.csv")
mini = pd.read_csv("https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank_holdout_test_mini.csv")


# Create Models
# One-hot encode the features, split the data, then train and evaluate both models.

# Single seed shared by the split and both estimators.
SEED = 42

# One-hot encode the non-numeric feature columns, then discard the "default_yes" indicator.
X = pd.get_dummies(df.drop(columns="y")).drop(columns="default_yes")
y = df["y"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, random_state=SEED)

# Parameters
md = 85
min_samp = 150
# Despite its name this is a per-row weight array aligned with y_train (passed to
# fit() as sample_weight), not an estimator-level class_weight setting:
# rows labelled "no" weigh 0.15, every other row weighs 1.0.
class_weight = np.where(y_train == "no", 0.15, 1.0)

t_model = DecisionTreeClassifier(random_state=SEED, max_depth=md, min_samples_split=min_samp)
f_model = RandomForestClassifier(
    random_state=SEED,
    max_depth=17,
    min_samples_split=min_samp,
    n_estimators=100,
)

# NOTE(review): only the tree is fitted with the row weights; the forest is trained
# unweighted, so the two sets of scores are not a like-for-like comparison.
# Confirm this asymmetry is intended.
t_model.fit(X_train, y_train, sample_weight=class_weight)
f_model.fit(X_train, y_train)

tree_predict = t_model.predict(X_test)
for_predict = f_model.predict(X_test)

# Recall and precision are computed with "yes" as the positive label.
tree_acc = accuracy_score(y_test, tree_predict)
for_acc = accuracy_score(y_test, for_predict)
tree_recall = recall_score(y_test, tree_predict, pos_label='yes')
for_recall = recall_score(y_test, for_predict, pos_label='yes')
tree_pre = precision_score(y_test, tree_predict, pos_label='yes')
for_pre = precision_score(y_test, for_predict, pos_label='yes')


# Output order: accuracy (tree, forest), recall (tree, forest), precision (tree, forest).
display(tree_acc)
display(for_acc)
display(tree_recall)
display(for_recall)
display(tree_pre)
display(for_pre)


0.8131631001618413

0.8948030929688905

0.6511278195488722

0.23458646616541354

0.34919354838709676

0.6724137931034483

### Exploring Correlations

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,37069.0,37069.0,37069.0,37069.0,37069.0,37069.0,37069.0,37069.0,37069.0
mean,40.025493,2.564407,962.221803,0.17373,0.081526,93.576551,-40.494829,3.621945,5167.01065
std,10.435288,2.764084,187.531477,0.496159,1.572287,0.579339,4.628895,1.734496,72.294476
min,17.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


In [None]:
# Rows whose target is positive. The "y" column holds the strings "yes"/"no",
# so the filter compares against "yes"; comparing against True matches no rows.
df[df["y"] == "yes"]

In [6]:
# Display the frame; the rendered output is truncated to the first and last rows.
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37064,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
37065,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
37066,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
37067,44,technician,married,professional.course,no,no,no,cellular,nov,fri,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
