In [7]:
# Dataset selection and exploration
# chose https://archive.ics.uci.edu/dataset/222/bank+marketing and using bank-full.csv
# pip install patsy

import pandas as pd

bank_df = pd.read_csv('bank-full.csv', delimiter=';')
bank_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [8]:
bank_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [12]:
# machine learning task = binary classification
# need to predict if the client will subscribe a term deposit (yes/no)

# it is noted by the dataset authors that duration should not be used in predictive models, as it is highly correlated with the target variable
# if duration is 0, it is known that y is 'no', as such, we will remove this column

bank_slim_df = bank_df.drop('duration', axis=1)

# we will choose SVM as the base model to use for this dataset

# split out the target column
target = bank_slim_df['y']
feature_matrix = bank_slim_df.drop('y', axis=1)

target.head()


0    no
1    no
2    no
3    no
4    no
Name: y, dtype: object

In [13]:
feature_matrix.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,1,-1,0,unknown
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,1,-1,0,unknown
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,1,-1,0,unknown
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,1,-1,0,unknown
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,1,-1,0,unknown


In [20]:
# drop any rows that aren't numeric
numerical_feature_matrix = feature_matrix[["age", "balance", "day", "campaign", "pdays", "previous"]]

# Implement a baseline model without any feature engineering
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(numerical_feature_matrix, target, test_size=0.2)

model = SVC()

# Train the baseline model
model.fit(X_train, y_train)

In [21]:
# Run the model
pred_y = model.predict(X_test)

In [46]:
# Evaluate the baseline model using appropriate metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(y_test, pred_y):
	accuracy = accuracy_score(y_test, pred_y) * 100
	precision = precision_score(y_test, pred_y, average='weighted') * 100
	recall = recall_score(y_test, pred_y, average='weighted') * 100
	f1 = f1_score(y_test, pred_y, average='weighted') * 100

	print(f'Scores:\nAccuracy Score:\t{accuracy:.2f}\nPrecision:\t{precision:.2f}\nRecall:\t\t{recall:.2f}\nF1 Score:\t{f1:.2f}')

evaluate_model(y_test, pred_y)

Scores:
Accuracy Score:	88.13
Precision:	89.54
Recall:		88.13
F1 Score:	82.59


In [44]:
# Feature engineering techniques

def test_model(feature_matrix):
	X_train, X_test, y_train, y_test = train_test_split(feature_matrix, target, test_size=0.2)

	model = SVC()

	model.fit(X_train, y_train)

	pred_y = model.predict(X_test)

	evaluate_model(y_test, pred_y)

# Encoding categorical variables
# categorical variables: job, marital, education, default, housing, loan, contact, month, poutcome
# we will use one-hot encoding to encode these variables

job_dummies = pd.get_dummies(feature_matrix['job'], prefix='job')
marital_dummies = pd.get_dummies(feature_matrix['marital'], prefix='marital')
education_dummies = pd.get_dummies(feature_matrix['education'], prefix='education')
default_dummies = pd.get_dummies(feature_matrix['default'], prefix='default')
housing_dummies = pd.get_dummies(feature_matrix['housing'], prefix='housing')
loan_dummies = pd.get_dummies(feature_matrix['loan'], prefix='loan')
contact_dummies = pd.get_dummies(feature_matrix['contact'], prefix='contact')
month_dummies = pd.get_dummies(feature_matrix['month'], prefix='month')
poutcome_dummies = pd.get_dummies(feature_matrix['poutcome'], prefix='poutcome')

# drop the original columns
one_hot_feature_matrix = feature_matrix.drop(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome'], axis=1)
one_hot_feature_matrix = pd.concat([one_hot_feature_matrix, job_dummies, marital_dummies, education_dummies, default_dummies, housing_dummies, loan_dummies, contact_dummies, month_dummies, poutcome_dummies], axis=1)

# view the new df
one_hot_feature_matrix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 50 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   age                  45211 non-null  int64
 1   balance              45211 non-null  int64
 2   day                  45211 non-null  int64
 3   campaign             45211 non-null  int64
 4   pdays                45211 non-null  int64
 5   previous             45211 non-null  int64
 6   job_admin.           45211 non-null  bool 
 7   job_blue-collar      45211 non-null  bool 
 8   job_entrepreneur     45211 non-null  bool 
 9   job_housemaid        45211 non-null  bool 
 10  job_management       45211 non-null  bool 
 11  job_retired          45211 non-null  bool 
 12  job_self-employed    45211 non-null  bool 
 13  job_services         45211 non-null  bool 
 14  job_student          45211 non-null  bool 
 15  job_technician       45211 non-null  bool 
 16  job_unemployed       4

In [48]:
# test the new feature matrix
test_model(one_hot_feature_matrix)

Scores:
Accuracy Score:	88.42
Precision:	89.76
Recall:		88.42
F1 Score:	83.00


In [49]:
# given the minimal model impact, we will go on without these additional features
numerical_feature_matrix = feature_matrix[["age", "balance", "day", "campaign", "pdays", "previous"]]

In [50]:
# Handling missing values
numerical_feature_matrix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   age       45211 non-null  int64
 1   balance   45211 non-null  int64
 2   day       45211 non-null  int64
 3   campaign  45211 non-null  int64
 4   pdays     45211 non-null  int64
 5   previous  45211 non-null  int64
dtypes: int64(6)
memory usage: 2.1 MB


In [52]:
# check for missing values
def print_min_max(df, column):
	print(f'{column}:\nMin: {df[column].min()}\nMax: {df[column].max()}')
	print()

print_min_max(numerical_feature_matrix, 'age')
print_min_max(numerical_feature_matrix, 'balance')
print_min_max(numerical_feature_matrix, 'day')
print_min_max(numerical_feature_matrix, 'campaign')
print_min_max(numerical_feature_matrix, 'pdays')
print_min_max(numerical_feature_matrix, 'previous')

# pdays has a value of -1 which is an indication that the client was not previously contacted, as indicated by the dataset authors
# previous has a value of 0 which is an indication that the client was contacted twice within 24 hours

# it appears that there are no missing values to handle

age:
Min: 18
Max: 95

balance:
Min: -8019
Max: 102127

day:
Min: 1
Max: 31

campaign:
Min: 1
Max: 63

pdays:
Min: -1
Max: 871

previous:
Min: 0
Max: 275



In [53]:
# Scaling and normalization
# we will now scale and normalize each column

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

def test_scaled_model(feature_matrix, scaler):
	X_train, X_test, y_train, y_test = train_test_split(feature_matrix, target, test_size=0.2)

	scaler.fit(X_train)
	X_train_scaled = scaler.transform(X_train)
	X_test_scaled = scaler.transform(X_test)

	model = SVC()
	model.fit(X_train_scaled, y_train)

	pred_y = model.predict(X_test_scaled)

	evaluate_model(y_test, pred_y)

test_scaled_model(numerical_feature_matrix, scaler)

Scores:
Accuracy Score:	88.75
Precision:	85.43
Recall:		88.75
F1 Score:	83.65


In [None]:
# scaling appears to have worsened the model performance
# with precision dropping from the original results
# and the other results staying around the same values

In [60]:
# Combining two or more existing features to generate new features
# I believe that individuals will account for age and available balance when deciding to subscribe to a term deposit
# as such, we will create a new feature that combines these two features

new_feature_matrix = numerical_feature_matrix.copy()
life_expectancy = 75
years_to_life_expectancy = life_expectancy - new_feature_matrix['age']

# eliminate negative values and division by 0
years_to_life_expectancy[years_to_life_expectancy <= 0] = 0.5

new_column_feature_matrix = new_feature_matrix.copy()
new_column_feature_matrix['age_balance'] = new_feature_matrix['balance'] / years_to_life_expectancy

test_model(new_column_feature_matrix)

Scores:
Accuracy Score:	88.23
Precision:	85.71
Recall:		88.23
F1 Score:	82.82


In [None]:
# it appears that this did not improve the model performance

In [61]:
# Creating new features by raising existing features to a power
# we will raise the balance column to the power of 3

new_column_feature_matrix = new_feature_matrix.copy()
new_column_feature_matrix['balance_cubed'] = new_feature_matrix['balance'] ** 3
new_column_feature_matrix = new_column_feature_matrix.drop('balance', axis=1)

test_model(new_column_feature_matrix)

Scores:
Accuracy Score:	88.42
Precision:	82.07
Recall:		88.42
F1 Score:	83.02


In [None]:
# it appears that this did not improve the model performance

In [62]:
# Dimensionality reduction (e.g., PCA)

from sklearn.decomposition import PCA

def test_pca_model(feature_matrix, n_components):
	X_train, X_test, y_train, y_test = train_test_split(feature_matrix, target, test_size=0.2)

	pca = PCA(n_components=n_components)
	pca.fit(X_train)
	X_train_pca = pca.transform(X_train)
	X_test_pca = pca.transform(X_test)

	model = SVC()
	model.fit(X_train_pca, y_train)

	pred_y = model.predict(X_test_pca)

	evaluate_model(y_test, pred_y)

test_pca_model(numerical_feature_matrix, 2)

Scores:
Accuracy Score:	88.27
Precision:	89.64
Recall:		88.27
F1 Score:	82.78


In [64]:
test_pca_model(numerical_feature_matrix, 3)

Scores:
Accuracy Score:	88.16
Precision:	89.56
Recall:		88.16
F1 Score:	82.62


In [None]:
# it appears that this did not improve the model performance
# but performance did not decrease

In [74]:
# Clustering (e.g., K-means)

from sklearn.cluster import KMeans

def test_kmeans_model(feature_matrix, n_clusters):
	X_train, X_test, y_train, y_test = train_test_split(feature_matrix, target, test_size=0.2)

	kmeans = KMeans(n_clusters=n_clusters, n_init=10)
	X_train_clusters = kmeans.fit_predict(X_train)
	X_test_clusters = kmeans.predict(X_test)
	X_train['cluster'] = X_train_clusters.reshape(-1, 1)
	X_test['cluster'] = X_test_clusters.reshape(-1, 1)

	model = SVC()
	model.fit(X_train, y_train)

	pred_y = model.predict(X_test)

	evaluate_model(y_test, pred_y)

test_kmeans_model(numerical_feature_matrix, 2)

Scores:
Accuracy Score:	88.07
Precision:	89.49
Recall:		88.07
F1 Score:	82.49


In [None]:
# it appears that this did not improve the model performance