In [None]:
import pandas as pd

# Location of the raw CSV, kept in one named constant so it is easy to find and change.
DATA_URL = 'https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/bank.csv'
df = pd.read_csv(DATA_URL)

# Intro and Cleaning

This section goes over some basic statistics of the data: how many rows there are, and how many bad values may need to be edited or thrown away.

In the categorical columns there are missing values labeled as unknown. I want to find them so that I can start to figure out what to do with them.

In [None]:
# Dimensions of the loaded frame as (n_rows, n_columns).
df.shape

(37069, 20)

In [None]:
# Per-column summary: dtype and non-null count for every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37069 entries, 0 to 37068
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             37069 non-null  int64  
 1   job             37069 non-null  object 
 2   marital         37069 non-null  object 
 3   education       37069 non-null  object 
 4   default         37069 non-null  object 
 5   housing         37069 non-null  object 
 6   loan            37069 non-null  object 
 7   contact         37069 non-null  object 
 8   month           37069 non-null  object 
 9   day_of_week     37069 non-null  object 
 10  campaign        37069 non-null  int64  
 11  pdays           37069 non-null  int64  
 12  previous        37069 non-null  int64  
 13  poutcome        37069 non-null  object 
 14  emp.var.rate    37069 non-null  float64
 15  cons.price.idx  37069 non-null  float64
 16  cons.conf.idx   37069 non-null  float64
 17  euribor3m       37069 non-null 

In [None]:
# Cell-wise flag: True wherever a value is exactly the string 'unknown'.
unknown_mask = df.isin(['unknown'])

# Collapse the flags to one boolean per row, then keep the rows that have
# at least one 'unknown' value and report how many there are.
row_has_unknown = unknown_mask.any(axis=1)
unknown_rows = df.loc[row_has_unknown]
unknown_rows.shape

(9592, 20)

Going a little deeper, here are the counts of unknown values in each categorical column.

In [None]:
def count_unknown(series):
    """Return the number of entries in ``series`` equal to the string 'unknown'.

    Parameters
    ----------
    series : pandas.Series
        Column to inspect.

    Returns
    -------
    Integer count of matching entries (0 when none match).
    """
    is_unknown = series.eq('unknown')
    return is_unknown.sum()

# Run count_unknown over every column, order the totals from largest to
# smallest, and turn the result into a frame (column name -> 'index', count -> 0).
per_column_totals = df.agg(count_unknown)
unknown_counts = per_column_totals.sort_values(ascending=False).reset_index()

# Keep only the columns that actually contain 'unknown' values.
has_unknowns = unknown_counts[0] > 0
unknown_counts = unknown_counts.loc[has_unknowns]
unknown_counts

Unnamed: 0,index,0
0,default,7725
1,education,1535
2,housing,894
3,loan,894
4,job,294
5,marital,69


# Beginning Attempts

What is it that we are actually trying to predict? We already know that some people have a term deposit. Does that mean that what we are really looking to do is train models on the data from previous campaigns and find out which of those people would be good to reach out to?

In [None]:
# Distribution of `poutcome` restricted to rows whose `pdays` is below 999.
# NOTE(review): 999 looks like a sentinel value in `pdays` — confirm against the data dictionary.
pdays_below_999 = df["pdays"] < 999
df.loc[pdays_below_999, "poutcome"].value_counts()

Unnamed: 0_level_0,count
poutcome,Unnamed: 1_level_1
success,1238
failure,135


In [None]:
# Frequency of each `poutcome` label across all rows (no filtering).
df["poutcome"].value_counts()

Unnamed: 0_level_0,count
poutcome,Unnamed: 1_level_1
nonexistent,31988
failure,3843
success,1238


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# One seed shared by the split and the classifier so that a fresh
# Restart & Run All reproduces the same tree and the same accuracy.
SEED = 42

# Keep only the rows whose `poutcome` is something other than "nonexistent".
df2 = df[df["poutcome"]!="nonexistent"]
X = df2.drop("y", axis=1)
y = df2["y"]

# One-hot encode the categorical (object) columns so the tree gets numeric input.
X = pd.get_dummies(X)

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=SEED)

# The classifier must be seeded as well: scikit-learn permutes the candidate
# features at every split, so an unseeded tree can differ between runs even
# when the train/test split is fixed.
model = DecisionTreeClassifier(random_state=SEED)
model.fit(X_train, y_train)


# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.75
