In [57]:
# import relevant libraries
import numpy as np
import pandas as pd
import seaborn as sns
import time # to measure how long the models take
from sklearn import datasets
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, precision_recall_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

## Part One

In [58]:
train = pd.read_csv(r"/Users/frieda/Desktop/schulich/data/archive/train.csv",sep=";")

In [59]:
test = pd.read_csv(r"/Users/frieda/Desktop/schulich/data/archive/test.csv",sep=";")

Data Cleaning:

In [60]:
train.drop_duplicates(inplace=True)
test.drop_duplicates(inplace=True)


In [61]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 6.2+ MB


In [62]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 635.8+ KB


The datasets perform a train-test split with a 10% ratio, which is a small percentage

In [63]:
train.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


We drop repetitive columns from the dataset

In [64]:
# drop columns
df = train.drop(['day','month','previous','pdays'], axis=1)

In [65]:
# drop columns
df2 = test.drop(['day','month','previous','pdays'], axis=1)

In [66]:
df = pd.get_dummies(df, columns=['y'], drop_first=True)
df2 = pd.get_dummies(df2, columns=['y'], drop_first=True)


In [67]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,duration,campaign,poutcome,y_yes
0,58,management,married,tertiary,no,2143,yes,no,unknown,261,1,unknown,0
1,44,technician,single,secondary,no,29,yes,no,unknown,151,1,unknown,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,76,1,unknown,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,92,1,unknown,0
4,33,unknown,single,unknown,no,1,no,no,unknown,198,1,unknown,0


In [68]:
df2.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,duration,campaign,poutcome,y_yes
0,30,unemployed,married,primary,no,1787,no,no,cellular,79,1,unknown,0
1,33,services,married,secondary,no,4789,yes,yes,cellular,220,1,failure,0
2,35,management,single,tertiary,no,1350,yes,no,cellular,185,1,failure,0
3,30,management,married,tertiary,no,1476,yes,yes,unknown,199,4,unknown,0
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,226,1,unknown,0


In [69]:
df.describe()

Unnamed: 0,age,balance,duration,campaign,y_yes
count,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,258.16308,2.763841,0.116985
std,10.618762,3044.765829,257.527812,3.098021,0.321406
min,18.0,-8019.0,0.0,1.0,0.0
25%,33.0,72.0,103.0,1.0,0.0
50%,39.0,448.0,180.0,2.0,0.0
75%,48.0,1428.0,319.0,3.0,0.0
max,95.0,102127.0,4918.0,63.0,1.0


In [70]:
df2.describe()

Unnamed: 0,age,balance,duration,campaign,y_yes
count,4521.0,4521.0,4521.0,4521.0,4521.0
mean,41.170095,1422.657819,263.961292,2.79363,0.11524
std,10.576211,3009.638142,259.856633,3.109807,0.319347
min,19.0,-3313.0,4.0,1.0,0.0
25%,33.0,69.0,104.0,1.0,0.0
50%,39.0,444.0,185.0,2.0,0.0
75%,49.0,1480.0,329.0,3.0,0.0
max,87.0,71188.0,3025.0,50.0,1.0


Combining train and test dataset into one

In [71]:
combined_df = pd.concat([df, df2], ignore_index=True)

In [72]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49732 entries, 0 to 49731
Data columns (total 13 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        49732 non-null  int64 
 1   job        49732 non-null  object
 2   marital    49732 non-null  object
 3   education  49732 non-null  object
 4   default    49732 non-null  object
 5   balance    49732 non-null  int64 
 6   housing    49732 non-null  object
 7   loan       49732 non-null  object
 8   contact    49732 non-null  object
 9   duration   49732 non-null  int64 
 10  campaign   49732 non-null  int64 
 11  poutcome   49732 non-null  object
 12  y_yes      49732 non-null  uint8 
dtypes: int64(4), object(8), uint8(1)
memory usage: 4.6+ MB


In [73]:

(df['y_yes'] ==1).sum()/(df['y_yes']==0).sum()


0.1324833425179099

In [74]:
(df2['y_yes'] == 1).sum()/(df2['y_yes'] == 0).sum()

0.13025

We can observe that both df and df2 have nearly the same percentage of 'y' equal to 'yes'

In [75]:
knn = KNeighborsClassifier(n_neighbors=10)
log_reg = LogisticRegression()
dt = DecisionTreeClassifier(max_depth=20)
rf = RandomForestClassifier()
ada = AdaBoostClassifier()
bag = BaggingClassifier()
voting = VotingClassifier(estimators=[('lr', log_reg), ('knn', knn), ('dt', dt)])

In [76]:
classifiers = {
    'K-Nearest Neighbors': knn,
    'Logistic Regression': log_reg,
    'Decision Tree': dt,
    'Random Forest': rf,
    'AdaBoost': ada,
    'Bagging': bag,
    'Voting': voting
}

In [77]:
# Create dictionary to store the results of each model
results = {}


In [78]:
y = combined_df["y_yes"]
X = combined_df.drop("y_yes", axis=1)

In [79]:
# Identify numerical and categorical columns
num_cols = X.select_dtypes(include=['float64', 'int64']).columns
cat_cols = X.select_dtypes(include=['object']).columns


In [80]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [81]:
# Create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(), cat_cols)])

In [82]:
# Loop through list of models to compare performance
for name, clf in classifiers.items():
    start_time = time.time()
    
    # Create pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', clf)])
    
    # Fit the model
    pipeline.fit(X_train, y_train)
    
    # Make predictions
    y_pred = pipeline.predict(X_test)
    
    # Compute metrics
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    # Store results
    results[name] = {
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Accuracy': accuracy,
        'Time (s)': elapsed_time
    }

# Convert results to DataFrame for easier viewing
results_df = pd.DataFrame(results).T
print(results_df)

                     Precision    Recall  F1-Score  Accuracy  Time (s)
K-Nearest Neighbors   0.636872  0.285238  0.394009  0.894239  3.711995
Logistic Regression   0.668385  0.324437  0.436833  0.899166  0.613035
Decision Tree         0.532432  0.492911  0.511910  0.886700  0.902847
Random Forest         0.689441  0.462886  0.553892  0.910124  6.289239
AdaBoost              0.651494  0.381985  0.481598  0.900875  3.106491
Bagging               0.656619  0.417848  0.510703  0.903488  2.657977
Voting                0.673540  0.326939  0.440202  0.899769  3.085443


### Interpretation

* The KNN model has a moderate precision and F1-Score, but a relatively low recall. This means the model is not so good at predicting survivors. Accuracy is fairly high. However, it takes a longer time to train and make predictions compared to other models.

* The Logistic Regression performs well in terms of precision, F1-Score, and accuracy. It is the fastest model to train and make predictions, making it an ok choice, despite the low recall value.

* The Decision Tree has the highest recall but a lower precision. Its F1-Score is reasonable, and it achieves an accuracy of 88.7%. However, it takes a bit more time to train compared to Logistic Regression.

* The Random Forest model has the highest precision, Accuracy and F1-Score model of all. However, it is the slowest among the models.

* AdaBoost performs good precision, F1-Score, and accuracy. It is relatively efficient in terms of training and prediction time.

* Bagging outperforms both AdaBoost and RandomForest, and is efficient.

* Voting, which allowed us to combine models we are interested in is the most efficient and has decent metrics across the board, has moderate values. 

## Part 2 and 3: Build Models

In [83]:
import warnings

warnings.filterwarnings('ignore', category=UserWarning)

In [84]:
# Import additional libraries
from sklearn.model_selection import GridSearchCV

In [86]:
# Import additional libraries
from sklearn.model_selection import GridSearchCV

# Hyperparameter grids for tuning
knn_params = {'classifier__n_neighbors': [3, 5, 7, 20, 30, 50, 100]}
log_reg_params = {'classifier__C': [0.1, 1, 10]}
dt_params = {'classifier__max_depth': [10,20,30,40,50]}
rf_params = {'classifier__n_estimators': [50, 100, 150], 'classifier__max_depth': [None, 10, 20, 30, 50]}
ada_params = {'classifier__n_estimators': [25, 50, 75]}
bag_params = {'classifier__n_estimators': [5, 10, 20]}
voting_params = {'classifier__voting': ['hard', 'soft']}

params_dict = {
    'K-Nearest Neighbors': knn_params,
    'Logistic Regression': log_reg_params,
    'Decision Tree': dt_params,
    'Random Forest': rf_params,
    'AdaBoost': ada_params,
    'Bagging': bag_params,
    'Voting': voting_params
}

# Initialize results dictionary for tuned models
tuned_results = {}

# Loop through classifiers for tuning
for name, clf in classifiers.items():
    start_time = time.time()
    
    # Create pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', clf)])
    
    # Create GridSearchCV object
    grid = GridSearchCV(pipeline, params_dict[name], cv=5)
    
    # Fit the model
    grid.fit(X_train, y_train)
    
    # Get the best estimator and predict
    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test)
    
    # Compute metrics
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    # Store results
    tuned_results[name] = {
        'Best Params': grid.best_params_,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Time (s)': elapsed_time
    }

# Convert results to DataFrame for easier viewing
tuned_results_df = pd.DataFrame(tuned_results).T
print(tuned_results_df);


                                                           Best Params  \
K-Nearest Neighbors                    {'classifier__n_neighbors': 30}   
Logistic Regression                             {'classifier__C': 0.1}   
Decision Tree                            {'classifier__max_depth': 10}   
Random Forest        {'classifier__max_depth': 30, 'classifier__n_e...   
AdaBoost                              {'classifier__n_estimators': 50}   
Bagging                               {'classifier__n_estimators': 20}   
Voting                                  {'classifier__voting': 'hard'}   

                    Precision    Recall  F1-Score   Time (s)  
K-Nearest Neighbors  0.679458  0.251043  0.366626  69.086226  
Logistic Regression  0.671304  0.321935  0.435175   6.486513  
Decision Tree        0.597701  0.390325   0.47225  10.490081  
Random Forest        0.688645  0.470392  0.558969  323.44253  
AdaBoost             0.651494  0.381985  0.481598  40.580599  
Bagging              0.680851

## Part Four

1. Why did the ensemble models perform the way they did?

Beyond the hyperparameters, use your understanding of how the models work to explain why you think the models performed they way they did on the given data set. Was the result what you were expecting? Why or why not?

2. If you had to pick one model to implement in business process, which would it be and why?

Discuss the business implications.
Consider not only performance metrics but also computational cost and interpretability.

3. What decision criteria did you use to arrive at this conclusion?

Precision-Recall trade-off? Computational cost? Others?