In [755]:
import pandas as pd
import numpy as np
import warnings

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

warnings.filterwarnings("ignore")
np.random.seed(42)

In [756]:
bdf = pd.read_csv("featured_bank_imputed_wo_duration.csv")

In [757]:
# Columns removed from the frame before modelling.
dropped_columns = [
    's.no',
    'age',
    'emp.var.rate',
    'cons.conf.idx',
    'euribor3m',
    'nr.employed',
    'f.pattern',
]

# Columns cast to category dtype and one-hot encoded further down.
categorical_columns = [
    'job', 'marital', 'education', 'default', 'housing', 'loan',
    'contact', 'month', 'day_of_week', 'poutcome',
    'f.euribor', 'f.age',
]

In [758]:
#columns before dropping
bdf.columns

Index(['s.no', 'age', 'job', 'marital', 'education', 'default', 'housing',
       'loan', 'contact', 'month', 'day_of_week', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y', 'f.euribor', 'f.age',
       'f.pattern'],
      dtype='object')

In [759]:
bdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 24 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   s.no            41188 non-null  int64  
 1   age             41188 non-null  int64  
 2   job             41188 non-null  object 
 3   marital         41188 non-null  object 
 4   education       41188 non-null  object 
 5   default         41188 non-null  object 
 6   housing         41188 non-null  object 
 7   loan            41188 non-null  object 
 8   contact         41188 non-null  object 
 9   month           41188 non-null  object 
 10  day_of_week     41188 non-null  object 
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [760]:
# before dropping the columns, keep a copy
bdf_copy = bdf.copy()

In [761]:
# drop the columns; bdf is rebuilt from the untouched bdf_copy rather than
# mutated in place, so this cell can be re-run without a KeyError
bdf = bdf_copy.drop(columns=dropped_columns)

In [762]:
bdf.shape

(41188, 17)

In [763]:
bdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   job             41188 non-null  object 
 1   marital         41188 non-null  object 
 2   education       41188 non-null  object 
 3   default         41188 non-null  object 
 4   housing         41188 non-null  object 
 5   loan            41188 non-null  object 
 6   contact         41188 non-null  object 
 7   month           41188 non-null  object 
 8   day_of_week     41188 non-null  object 
 9   campaign        41188 non-null  int64  
 10  pdays           41188 non-null  int64  
 11  previous        41188 non-null  int64  
 12  poutcome        41188 non-null  object 
 13  cons.price.idx  41188 non-null  float64
 14  y               41188 non-null  object 
 15  f.euribor       41188 non-null  object 
 16  f.age           41188 non-null  object 
dtypes: float64(1), int64(3), object

In [764]:
# convert the object-typed columns to category dtype; the result is a new
# frame (bdf_cat) and bdf itself is left unchanged
bdf_cat = bdf[categorical_columns].astype('category')

In [765]:
# bdf itself still has object dtypes: the conversion produced bdf_cat and did not modify bdf
bdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   job             41188 non-null  object 
 1   marital         41188 non-null  object 
 2   education       41188 non-null  object 
 3   default         41188 non-null  object 
 4   housing         41188 non-null  object 
 5   loan            41188 non-null  object 
 6   contact         41188 non-null  object 
 7   month           41188 non-null  object 
 8   day_of_week     41188 non-null  object 
 9   campaign        41188 non-null  int64  
 10  pdays           41188 non-null  int64  
 11  previous        41188 non-null  int64  
 12  poutcome        41188 non-null  object 
 13  cons.price.idx  41188 non-null  float64
 14  y               41188 non-null  object 
 15  f.euribor       41188 non-null  object 
 16  f.age           41188 non-null  object 
dtypes: float64(1), int64(3), object

In [766]:
bdf_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   job          41188 non-null  category
 1   marital      41188 non-null  category
 2   education    41188 non-null  category
 3   default      41188 non-null  category
 4   housing      41188 non-null  category
 5   loan         41188 non-null  category
 6   contact      41188 non-null  category
 7   month        41188 non-null  category
 8   day_of_week  41188 non-null  category
 9   poutcome     41188 non-null  category
 10  f.euribor    41188 non-null  category
 11  f.age        41188 non-null  category
dtypes: category(12)
memory usage: 485.3 KB


In [767]:
# drop bdf categorical columns from the dataframe
bdf_noncat = bdf.drop(categorical_columns,axis=1)

In [768]:
bdf_noncat.columns

Index(['campaign', 'pdays', 'previous', 'cons.price.idx', 'y'], dtype='object')

In [769]:
# one-hot encode the categorical columns: each category level becomes its own
# indicator column (the 12 categorical columns expand to 59, see the shape below)
bdf_cat_one_hot = pd.get_dummies(bdf_cat)

In [770]:
bdf_cat_one_hot.shape

(41188, 59)

In [771]:
#concat categorical df with non categorical df
bdf_master = pd.concat([bdf_noncat, bdf_cat_one_hot], axis=1)

In [772]:
bdf_master.head()

Unnamed: 0,campaign,pdays,previous,cons.price.idx,y,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,...,f.euribor_>4,f.age_11_to_20,f.age_21_to_30,f.age_31_to_40,f.age_41_to_50,f.age_51_to_60,f.age_61_to_70,f.age_71_to_80,f.age_81_to_90,f.age_91_to_100
0,1,999,0,93.994,no,0,0,0,1,0,...,1,0,0,0,0,1,0,0,0,0
1,1,999,0,93.994,no,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
2,1,999,0,93.994,no,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
3,1,999,0,93.994,no,1,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
4,1,999,0,93.994,no,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0


In [773]:
# pop the target column: returns it as y and removes it from bdf_master in place
y = bdf_master.pop('y')

In [774]:
bdf_master.shape

(41188, 63)

In [775]:
# the target column was already removed by pop(); the remaining columns are the features
X = bdf_master

In [776]:
# hold out 25% of the rows for testing; random_state fixes the shuffle
# NOTE(review): the split is not stratified on y, so the class balance of the
# two parts is only checked by eye in the cells below
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=101)

In [777]:
X_train.shape

(30891, 63)

In [778]:
y_test.shape

(10297,)

In [779]:
y_train.value_counts()

no     27416
yes     3475
Name: y, dtype: int64

In [780]:
y_test.value_counts()

no     9132
yes    1165
Name: y, dtype: int64

In [781]:
# the proportion of yes to no is about the same in train and test (roughly 11% yes in both)

In [782]:
# function that fits and predicts
def fit_predict(algo, X_train, X_test, y_train, y_test, metric=None):
    """Fit ``algo`` on the training split and print its score on the test split.

    Parameters
    ----------
    algo : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Features and labels handed to ``algo.fit``.
    X_test, y_test
        Features handed to ``algo.predict`` and the labels the predictions
        are scored against.
    metric : callable, optional
        ``metric(y_true, y_pred)`` returning the score to print. Defaults to
        ``accuracy_score``. Roughly 89% of the rows here are "no", so
        accuracy alone is flattering; pass another metric for a fairer
        comparison between models.

    Returns
    -------
    None
        The score is printed, not returned, so a call at the end of a cell
        produces exactly one line of output.
    """
    if metric is None:
        # resolved at call time so the default stays accuracy_score
        metric = accuracy_score
    algo.fit(X_train, y_train)
    y_pred = algo.predict(X_test)
    print(metric(y_test, y_pred))

## Logistic Regression

In [783]:
# NOTE(review): default settings; warnings are silenced globally in the first
# cell, so a ConvergenceWarning from this fit would go unnoticed — confirm it converges
logit = LogisticRegression()

In [784]:
print("Accuracy with Logistic Regression", end=': ')
fit_predict(logit, X_train, X_test, y_train, y_test)

Accuracy with Logistic Regression: 0.8988054773234923


## Decision Tree

In [785]:
# pin random_state so re-running this cell on its own rebuilds the same tree
# instead of depending on how much of the global NumPy stream was consumed
tree = DecisionTreeClassifier(random_state=42)

In [786]:
print("Accuracy with Decision Tree", end=': ')
fit_predict(tree, X_train, X_test, y_train, y_test)

Accuracy with Decision Tree: 0.854714965523939


## Random Forest

In [787]:
# pin random_state so re-running this cell on its own rebuilds the same forest
# instead of depending on how much of the global NumPy stream was consumed
forest = RandomForestClassifier(random_state=42)

In [788]:
print("Accuracy with Random Forest", end=': ')
fit_predict(forest, X_train, X_test, y_train, y_test)

Accuracy with Random Forest: 0.8890939108478197
