In [1]:
# import relevant libraries
import numpy as np
import pandas as pd
import seaborn as sns
import time # to measure how long the models take
from sklearn import datasets
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, precision_recall_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
train = pd.read_csv(r"/Users/frieda/Desktop/schulich/data/archive/train.csv",sep=";")

In [3]:
test = pd.read_csv(r"/Users/frieda/Desktop/schulich/data/archive/test.csv",sep=";")

In [11]:
train.drop_duplicates(inplace=True)
test.drop_duplicates(inplace=True)


In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 6.2+ MB


In [13]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 635.8+ KB


In [14]:
train.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [15]:
# drop columns
df = train.drop(['day','month','previous','pdays'], axis=1)

In [16]:
# drop columns
df2 = test.drop(['day','month','previous','pdays'], axis=1)

In [17]:
df = pd.get_dummies(df, columns=['y'], drop_first=True)
df2 = pd.get_dummies(df2, columns=['y'], drop_first=True)


In [18]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,duration,campaign,poutcome,y_yes
0,58,management,married,tertiary,no,2143,yes,no,unknown,261,1,unknown,0
1,44,technician,single,secondary,no,29,yes,no,unknown,151,1,unknown,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,76,1,unknown,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,92,1,unknown,0
4,33,unknown,single,unknown,no,1,no,no,unknown,198,1,unknown,0


In [19]:
df2.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,duration,campaign,poutcome,y_yes
0,30,unemployed,married,primary,no,1787,no,no,cellular,79,1,unknown,0
1,33,services,married,secondary,no,4789,yes,yes,cellular,220,1,failure,0
2,35,management,single,tertiary,no,1350,yes,no,cellular,185,1,failure,0
3,30,management,married,tertiary,no,1476,yes,yes,unknown,199,4,unknown,0
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,226,1,unknown,0


In [32]:
df.describe()

Unnamed: 0,age,balance,duration,campaign,y_yes
count,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,258.16308,2.763841,0.116985
std,10.618762,3044.765829,257.527812,3.098021,0.321406
min,18.0,-8019.0,0.0,1.0,0.0
25%,33.0,72.0,103.0,1.0,0.0
50%,39.0,448.0,180.0,2.0,0.0
75%,48.0,1428.0,319.0,3.0,0.0
max,95.0,102127.0,4918.0,63.0,1.0


In [33]:
df2.describe()

Unnamed: 0,age,balance,duration,campaign,y_yes
count,4521.0,4521.0,4521.0,4521.0,4521.0
mean,41.170095,1422.657819,263.961292,2.79363,0.11524
std,10.576211,3009.638142,259.856633,3.109807,0.319347
min,19.0,-3313.0,4.0,1.0,0.0
25%,33.0,69.0,104.0,1.0,0.0
50%,39.0,444.0,185.0,2.0,0.0
75%,49.0,1480.0,329.0,3.0,0.0
max,87.0,71188.0,3025.0,50.0,1.0


In [31]:

(df['y_yes'] ==1).sum()/(df['y_yes']==0).sum()


0.1324833425179099

In [28]:
(df2['y_yes'] == 1).sum()/(df2['y_yes'] == 0).sum()

0.13025

In [34]:
y_train = df["y_yes"]
X_train = df.drop("y_yes", axis=1)
y_test = df2["y_yes"]
X_test = df2.drop("y_yes", axis=1)


In [35]:
knn = KNeighborsClassifier(n_neighbors=10)
log_reg = LogisticRegression()
dt = DecisionTreeClassifier(max_depth=20)
rf = RandomForestClassifier()
ada = AdaBoostClassifier()
bag = BaggingClassifier()
voting = VotingClassifier(estimators=[('lr', log_reg), ('knn', knn), ('dt', dt)])

In [36]:
classifiers = {
    'K-Nearest Neighbors': knn,
    'Logistic Regression': log_reg,
    'Decision Tree': dt,
    'Random Forest': rf,
    'AdaBoost': ada,
    'Bagging': bag,
    'Voting': voting
}

In [37]:
# Create dictionary to store the results of each model
results = {}


In [38]:
# Identify numerical and categorical columns
num_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
cat_cols = X_train.select_dtypes(include=['object']).columns


In [39]:
# Identify numerical and categorical columns
num_cols2 = X_test.select_dtypes(include=['float64', 'int64']).columns
cat_cols2 = X_test.select_dtypes(include=['object']).columns


In [41]:
# Create a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(), cat_cols)])

In [42]:
# Loop through list of models to compare performance
for name, clf in classifiers.items():
    start_time = time.time()
    
    # Create pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', clf)])
    
    # Fit the model
    pipeline.fit(X_train, y_train)
    
    # Make predictions
    y_pred = pipeline.predict(X_test)
    
    # Compute metrics
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    
    # Store results
    results[name] = {
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'Accuracy': accuracy,
        'Time (s)': elapsed_time
    }

# Convert results to DataFrame for easier viewing
results_df = pd.DataFrame(results).T
print(results_df)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


                     Precision    Recall  F1-Score  Accuracy  Time (s)
K-Nearest Neighbors   0.726141  0.335893  0.459318  0.908870  2.341458
Logistic Regression   0.658333  0.303263  0.415243  0.901570  1.069440
Decision Tree         0.940803  0.854127  0.895372  0.976996  0.699642
Random Forest         1.000000  1.000000  1.000000  1.000000  7.082864
AdaBoost              0.588040  0.339731  0.430657  0.896483  3.184320
Bagging               0.989691  0.921305  0.954274  0.989825  2.674613
Voting                0.827309  0.395393  0.535065  0.920814  2.299298


Part 2: Build Baseline Models