# Import Data and Modules

In [6]:
# Imports
import numpy as np
import pandas as pd
from pickle import dump

# Import train-test split function
from sklearn.model_selection import train_test_split

# Import the RandomForestClassifier and DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Import metrics for evaluation
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score

# Import Visualization Modules
import matplotlib.pyplot as plt
import plotly_express as px

In [4]:
df = pd.read_csv("https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank.csv")
mini = pd.read_csv("https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank_holdout_test_mini.csv")

# Data Exploration

### Exploring unknown values

In the categorical columns there are missing values labeled as 'unknown'. Here we find them so we can figure out what to do with them.

In [None]:
unknown_mask = df.isin(['unknown'])

# Filter the DataFrame to show rows with 'unknown' values
unknown_rows = df[unknown_mask.any(axis=1)]
unknown_rows.shape

(9592, 20)

Going a little deeper, here are the counts of unkown values in each categorical column

In [None]:
# Define a custom function to count "unknown" values
def count_unknown(series):
    return (series == 'unknown').sum()

# Apply the function to each column using agg
unknown_counts = df.agg(count_unknown).sort_values(ascending=False).reset_index()
unknown_counts = unknown_counts[unknown_counts[0]> 0]
unknown_counts

Unnamed: 0,index,0
0,default,7725
1,education,1535
2,housing,894
3,loan,894
4,job,294
5,marital,69


## Making connections

Here we show some percentages to try and make connections between who has a term deposit and what key demographics they have

### Percentage of people who have term deposits, by job

In [None]:
((df[df['y'] == 'yes']['job'].value_counts() / df['job'].value_counts()) * 100).sort_values(ascending=False)

job
student          31.399748
retired          25.228162
unemployed       14.676889
admin.           13.127660
unknown          11.904762
management       11.442598
technician       10.863831
self-employed    10.753532
housemaid         9.979210
entrepreneur      8.639144
services          8.258174
blue-collar       6.819546
Name: count, dtype: float64

### Percentage of people who have term deposits, by age


In [5]:
((df[df['y'] == 'yes']['age'].value_counts() / df['age'].value_counts()) * 100).sort_values(ascending=False)

age
98    100.000000
89    100.000000
87    100.000000
92     75.000000
86     71.428571
         ...    
49      6.710526
47      6.521739
91           NaN
94           NaN
95           NaN
Name: count, Length: 78, dtype: float64

# Data Cleaning

### Map y values to 0 and 1

In [8]:
df['y'] = df['y'].map({"no": 0, "yes": 1})
df['y'].value_counts()

y
0    31406
1     4176
Name: count, dtype: int64

### Drop duplicates

In [7]:
print('Before', len(df))
df = df.drop_duplicates()
print('After', len(df))

Before 37069
After 35582


### Reduce distinct values for Eurob3r

Eurob3r has too many distinct values. Reducing the amount of these through rounding will help us to lessen the number of columns we have to deal with when we one-hot encode our data.

In [None]:
print('Rounding euribor3m')
print('Distinct vaues before:', len(df['euribor3m'].value_counts()))
df['euribor3m'] = df['euribor3m'].round(1)
print('Distinct values after:', len(df['euribor3m'].value_counts()))

Rounding euribor3m
Distinct vaues before: 314
Distinct values after: 31


# Predict Mini Holdout

In [15]:
# Do same transformations as on the training set
test = pd.get_dummies(mini)
mini_predictions = t_model.predict(test)

# Convert the predictions to a dataframe and label the column 'y'
my_predictions = pd.DataFrame(mini_predictions, columns = ['y'])

# Replace no with 0 and yes with 1
my_predictions = my_predictions["y"].replace({"no":0, "yes":1})

# Replace PUTTEAMNUMBERHERE with your team
my_predictions.to_csv("lilJiimy's-module2-predictions.csv",index=False)

# Large Eval

In [17]:
# full = []
# i = 0
# for test in np.arange(.1, .4, .05):
#     for samp_size in range(50, 500, 100):
#         for md in range(10, 200, 25):
#             for wght in np.arange(.0, .55, .05):
#                 acc, recall, pre = get_nums(test, md, samp_size, wght)
#                 nummies = [test, samp_size, md, wght, acc, recall, pre]
#                 full.append(nummies)
#                 print(i)
#                 i +=1

# big = pd.DataFrame(full, columns=["test", "samp_size", "md", "wght", "acc", "recall", "pre"])
# big.to_csv("nummies.csv")


In [18]:
# full = []
# for test in np.arange(.1, .4, .05):
#     for samp_size in range(50, 500, 100):
#         for md in range(10, 200, 25):
#             for wght in np.arange(.0, .55, .05):
#                 nummies = [test, samp_size, md, wght]
#                 full.append(nummies)

# pd.DataFrame(full, columns=["test", "samp_size", "md", "wght"])



In [20]:
# big[(big["recall"] > .7) & (big["acc"] > .8)]

In [21]:
# Basic One Hot encoding, data splitting, model training and testing

X = pd.get_dummies(df.drop("y", axis=1)).drop("default_yes", axis=1)
y = df["y"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, random_state=42)

#parameters
md = 85
min_samp = 150
class_weight = np.where(y_train == "no", 0.15, 1.0)

t_model = DecisionTreeClassifier(random_state=42, max_depth=md, min_samples_split=min_samp)
f_model = RandomForestClassifier(random_state=42,
    max_depth=17, 
    min_samples_split=min_samp,
    n_estimators=100)

t_model.fit(X_train, y_train, sample_weight=class_weight)
f_model.fit(X_train, y_train)

tree_predict = t_model.predict(X_test)
for_predict = f_model.predict(X_test)

tree_acc = accuracy_score(y_test, tree_predict)
for_acc = accuracy_score(y_test, for_predict)
tree_recall = recall_score(y_test, tree_predict, pos_label='yes')
for_recall = recall_score(y_test, for_predict, pos_label='yes')
tree_pre = precision_score(y_test, tree_predict, pos_label='yes')
for_pre = precision_score(y_test, for_predict, pos_label='yes')


display(tree_acc)
display(for_acc)
display(tree_recall)
display(for_recall)
display(tree_pre)
display(for_pre)


0.5207446808510638

0.7845744680851063

0.8675282714054927

0.5476575121163166

0.39601769911504425

0.7306034482758621

# Pickling the model

In [None]:
with open("Decision Tree.pkl", "wb") as f:
    dump(t_model, f, protocol=5)

with open("Random Forest.pkl", "wb") as f:
    dump(f_model, f, protocol=5)

In [None]:
from pickle import load
with open("Decision Tree.pkl", "rb") as f:
    clf = load(f)

tree_predict = t_model.predict(X_test)
tree_acc = accuracy_score(y_test, tree_predict)

tree_recall = recall_score(y_test, tree_predict, pos_label='yes')
tree_pre = precision_score(y_test, tree_predict, pos_label='yes')

display(tree_acc)
display(tree_recall)
display(tree_pre)

0.8131631001618413

0.6511278195488722

0.34919354838709676