In [183]:
import pandas as pd
import numpy as np
import warnings

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

warnings.filterwarnings("ignore")
np.random.seed(42)

In [184]:
bdf = pd.read_csv("featured_bank_imputed_wo_duration.csv")

In [185]:
# Columns removed from bdf before modelling: the row id, several raw numeric
# fields and 'default'.
# NOTE(review): raw 'age' / 'euribor3m' look superseded by the engineered
# 'f.age' / 'f.euribor' columns kept below — confirm.
dropped_columns= ['s.no','age','emp.var.rate','cons.conf.idx','euribor3m','nr.employed','default']
# Columns that are cast to pandas 'category' dtype and then one-hot encoded
# with pd.get_dummies further down.
categorical_columns=['job','marital','education','housing','loan','contact','month','day_of_week','poutcome','f.euribor','f.age','f.pattern']

In [186]:
#columns before dropping
bdf.columns

Index(['s.no', 'age', 'job', 'marital', 'education', 'default', 'housing',
       'loan', 'contact', 'month', 'day_of_week', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y', 'f.euribor', 'f.age',
       'f.pattern'],
      dtype='object')

In [187]:
bdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 24 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   s.no            41188 non-null  int64  
 1   age             41188 non-null  int64  
 2   job             41188 non-null  object 
 3   marital         41188 non-null  object 
 4   education       41188 non-null  object 
 5   default         41188 non-null  object 
 6   housing         41188 non-null  object 
 7   loan            41188 non-null  object 
 8   contact         41188 non-null  object 
 9   month           41188 non-null  object 
 10  day_of_week     41188 non-null  object 
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [188]:
# before dropping the columns, keep a copy
bdf_copy = bdf.copy()

In [189]:
# drop the columns
# Built from the untouched copy instead of mutating bdf in place, so this cell
# can be re-run without raising KeyError on columns that are already gone.
bdf = bdf_copy.drop(columns=dropped_columns)

In [190]:
bdf.shape

(41188, 17)

In [191]:
bdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   job             41188 non-null  object 
 1   marital         41188 non-null  object 
 2   education       41188 non-null  object 
 3   housing         41188 non-null  object 
 4   loan            41188 non-null  object 
 5   contact         41188 non-null  object 
 6   month           41188 non-null  object 
 7   day_of_week     41188 non-null  object 
 8   campaign        41188 non-null  int64  
 9   pdays           41188 non-null  int64  
 10  previous        41188 non-null  int64  
 11  poutcome        41188 non-null  object 
 12  cons.price.idx  41188 non-null  float64
 13  y               41188 non-null  object 
 14  f.euribor       41188 non-null  object 
 15  f.age           41188 non-null  object 
 16  f.pattern       41188 non-null  object 
dtypes: float64(1), int64(3), object

In [192]:
#convert columns of object type to categorical columns
bdf_cat = bdf[categorical_columns].astype('category')

In [193]:
# info of bdf after the conversion: its dtypes are unchanged because astype returned a new frame (bdf_cat)
bdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   job             41188 non-null  object 
 1   marital         41188 non-null  object 
 2   education       41188 non-null  object 
 3   housing         41188 non-null  object 
 4   loan            41188 non-null  object 
 5   contact         41188 non-null  object 
 6   month           41188 non-null  object 
 7   day_of_week     41188 non-null  object 
 8   campaign        41188 non-null  int64  
 9   pdays           41188 non-null  int64  
 10  previous        41188 non-null  int64  
 11  poutcome        41188 non-null  object 
 12  cons.price.idx  41188 non-null  float64
 13  y               41188 non-null  object 
 14  f.euribor       41188 non-null  object 
 15  f.age           41188 non-null  object 
 16  f.pattern       41188 non-null  object 
dtypes: float64(1), int64(3), object

In [194]:
bdf_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   job          41188 non-null  category
 1   marital      41188 non-null  category
 2   education    41188 non-null  category
 3   housing      41188 non-null  category
 4   loan         41188 non-null  category
 5   contact      41188 non-null  category
 6   month        41188 non-null  category
 7   day_of_week  41188 non-null  category
 8   poutcome     41188 non-null  category
 9   f.euribor    41188 non-null  category
 10  f.age        41188 non-null  category
 11  f.pattern    41188 non-null  category
dtypes: category(12)
memory usage: 486.0 KB


In [195]:
# drop bdf categorical columns from the dataframe
bdf_noncat = bdf.drop(categorical_columns,axis=1)

In [196]:
bdf_noncat.columns

Index(['campaign', 'pdays', 'previous', 'cons.price.idx', 'y'], dtype='object')

In [197]:
# use one hot encoding for categorical columns
bdf_cat_one_hot = pd.get_dummies(bdf_cat)

In [198]:
bdf_cat_one_hot.shape

(41188, 75)

In [199]:
#concat categorical df with non categorical df
bdf_master = pd.concat([bdf_noncat, bdf_cat_one_hot], axis=1)

In [200]:
bdf_master.head()

Unnamed: 0,campaign,pdays,previous,cons.price.idx,y,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,...,f.pattern_PE1C,f.pattern_PE1D,f.pattern_PE1F,f.pattern_PE1G,f.pattern_PE1H,f.pattern_PE1K,f.pattern_PE1L,f.pattern_PE4A,f.pattern_PEIJ,f.pattern_no
0,1,999,0,93.994,no,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,1,999,0,93.994,no,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,999,0,93.994,no,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,999,0,93.994,no,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1,999,0,93.994,no,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [201]:
# get the target variable column
y = bdf_master.pop('y')

In [202]:
bdf_master.shape

(41188, 79)

In [203]:
# the target column was already removed by pop() above, so what remains in the master dataset is the feature matrix
X = bdf_master

In [204]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=101)

In [205]:
X_train.shape

(30891, 79)

In [206]:
y_test.shape

(10297,)

In [207]:
y_train.value_counts()

no     27416
yes     3475
Name: y, dtype: int64

In [208]:
y_test.value_counts()

no     9132
yes    1165
Name: y, dtype: int64

In [209]:
# the proportion of yes and no looks the same in the train and test splits

In [210]:
# function that fits and predicts
def fit_predict(algo, X_train, X_test, y_train, y_test):
    """Fit ``algo`` on the training split and return its test-set accuracy.

    ``algo`` is any estimator object exposing ``fit(X, y)`` and ``predict(X)``;
    it is fitted in place. The returned value is
    ``accuracy_score(y_test, predictions)``.
    """
    algo.fit(X_train, y_train)
    return accuracy_score(y_test, algo.predict(X_test))

## Logistic Regression

In [211]:
logit = LogisticRegression()

In [212]:
print("Accuracy with Logistic Regression", end=': ')
logit_accuracy = fit_predict(logit, X_train, X_test, y_train, y_test)
print(logit_accuracy)

Accuracy with Logistic Regression: 0.898222783334952


## Decision Tree

In [213]:
tree = DecisionTreeClassifier()

In [214]:
print("Accuracy with Decision Tree", end=': ')
dt_accuracy = fit_predict(tree, X_train, X_test, y_train, y_test)
print(dt_accuracy)

Accuracy with Decision Tree: 0.8566572788190735


## Parameterization in Decision Trees

In [215]:
def fit_predict_dt(train, test, y_train, y_test, scaler, max_depth,
               criterion = 'entropy', max_features=1, min_samples_split=4):
    """Scale the features, fit a decision tree and report its test accuracy.

    Parameters
    ----------
    train, test : feature matrices for the training and test splits.
    y_train, y_test : matching target values.
    scaler : transformer exposing ``fit_transform`` / ``transform``; it is
        fitted on ``train`` only and then applied to ``test``.
    max_depth, criterion, max_features, min_samples_split :
        forwarded unchanged to ``DecisionTreeClassifier``.

    Returns
    -------
    The accuracy on the test split. It is also printed, so the tuning loops
    that rely on the printed line keep working, while callers can now collect
    the scores as well.
    """
    # NOTE(review): the default max_features=1 is an int, which scikit-learn
    # reads as "one feature per split" (1.0 would mean all features) —
    # confirm this default is intended.
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.transform(test)
    dt = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth,
                                random_state=42, max_features=max_features,
                                min_samples_split=min_samples_split)
    dt.fit(train_scaled, y_train)
    accuracy = accuracy_score(y_test, dt.predict(test_scaled))
    print(accuracy)
    return accuracy

### Max depth parameter tuning

In [216]:
# for i in range(1,30):
#     print("Accuracy score using max_depth = ", i, end = ':')
#     fit_predict_dt(X_train, X_test, y_train, y_test, StandardScaler(), i)

### Max features tuning

In [217]:
# for i in np.arange(0.1,1.0,0.1):
#     print('Accuracy score using max features =', i, end = ":")
#     fit_predict_dt(X_train, X_test, y_train, y_test, StandardScaler(), max_depth=9, max_features=i)

### Min samples split tuning

In [218]:
# for i in range(2,10):
#     print('Accuracy score using min samples split=', i, end=":")
#     fit_predict_dt(X_train, X_test, y_train, y_test, StandardScaler(), 16, max_features=0.8, min_samples_split=i)

### Criterion tuning

In [219]:
# for i in ['gini','entropy']:
#     print("Accuracy score using criterion: ", i, end = ':')
#     fit_predict_dt(X_train, X_test, y_train, y_test, StandardScaler(), 16, max_features = 0.8, min_samples_split=6, criterion= i)

### Using polynomial features

In [220]:
def create_poly(train, test, degree):
    """Expand both splits with polynomial features of the given degree.

    The ``PolynomialFeatures`` transformer is fitted on ``train`` only and the
    fitted transformer is then applied to ``test``, so both splits go through
    the same fitted transformer and the test split is never used for fitting.

    Returns the pair ``(train_poly, test_poly)``.
    """
    poly = PolynomialFeatures(degree = degree)
    train_poly = poly.fit_transform(train)
    # transform only: the test split must not be used to (re)fit the transformer
    test_poly = poly.transform(test)
    return train_poly, test_poly

In [221]:
# for degree in [1,2,3,4]:
#     train_poly, test_poly = create_poly(X_train, X_test, degree)
#     print("polynomial degree", degree)
#     fit_predict_dt(train_poly, test_poly, y_train, y_test, StandardScaler(), 16, max_features = 0.2,min_samples_split=4, criterion='entropy')
#     print(10 *'-')

## Random Forest

In [222]:
forest = RandomForestClassifier(criterion='entropy', oob_score=True)

In [223]:
print("Accuracy with Random Forest", end=': ')
rf_accuracy = fit_predict(forest, X_train, X_test, y_train, y_test)
print(rf_accuracy)

Accuracy with Random Forest: 0.8897737205011168


### Parameter tuning using Grid Search

In [224]:
from sklearn.model_selection import GridSearchCV

In [225]:
params = {
    'n_estimators': [200,500,700],
    'max_depth': [10,15,18,20],
    'min_samples_leaf': [3,5,7]
}

In [226]:
#gs = GridSearchCV(forest, params, cv=3, verbose=3, n_jobs=3)

In [227]:
#gs.fit(X_train, y_train)

In [228]:
#gs.best_params_

In [229]:
#gs.best_estimator_

### Retraining with best estimators

In [230]:
# Random forest with the tuned settings (values taken from the grid above).
# Only the arguments that differ from scikit-learn's defaults are spelled out.
# The pasted estimator repr also passed `min_impurity_split` (removed in
# scikit-learn 1.0) and max_features='auto' (removed in 1.3), which fail on
# current releases. For classifiers 'auto' meant sqrt(n_features), which is
# also what the default gives, so the model is unchanged.
forest1 = RandomForestClassifier(
    criterion='entropy',
    max_depth=18,
    min_samples_leaf=7,
    n_estimators=200,
    oob_score=True,
)

In [231]:
forest1.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=18, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=7, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [232]:
pred_forest1 = forest1.predict(X_test)

In [233]:
print("Accuracy with Random Forest after Hyper Parameter Tuning", end=': ')
rf_hyper_accuracy = accuracy_score(y_test, pred_forest1)
print(rf_hyper_accuracy)

Accuracy with Random Forest after Hyper Parameter Tuning: 0.9005535592891133


In [234]:
forest1.feature_importances_

array([3.69638815e-02, 6.32020254e-02, 3.83314401e-02, 8.71727148e-02,
       9.00681535e-03, 1.02913602e-02, 2.51447748e-03, 2.15538069e-03,
       3.90320261e-03, 7.08468308e-03, 2.71337504e-03, 5.90683565e-03,
       4.71690798e-03, 6.96118469e-03, 1.86031960e-03, 5.87333827e-03,
       9.75708785e-03, 9.64867470e-03, 5.42267391e-03, 3.05591805e-03,
       7.83795676e-03, 8.49595615e-03, 7.71637253e-06, 5.91152592e-03,
       1.06858068e-02, 1.02193759e-02, 1.05775972e-02, 6.63154779e-03,
       6.23355160e-03, 2.34997418e-02, 2.45651971e-02, 9.89625008e-03,
       5.63951077e-03, 1.34739367e-03, 6.75140972e-03, 1.26862768e-02,
       1.13675601e-02, 2.65313108e-02, 5.49757457e-03, 1.33153686e-02,
       5.45482483e-03, 1.07354651e-02, 1.06210030e-02, 1.09186680e-02,
       9.29630843e-03, 9.24442070e-03, 3.32069490e-02, 2.55868657e-02,
       1.26152688e-01, 2.46173022e-02, 7.89923053e-06, 1.21840985e-01,
       9.19831108e-04, 8.96786330e-03, 1.16545785e-02, 9.06572285e-03,
      

In [235]:
# (importance, feature name) pairs, most important first
sorted(zip(forest1.feature_importances_, X_train.columns), reverse=True)

[(0.12615268809813016, 'f.euribor_<1'),
 (0.12184098531377129, 'f.euribor_>4'),
 (0.08717271478514656, 'cons.price.idx'),
 (0.06320202544164485, 'pdays'),
 (0.03833144006610462, 'previous'),
 (0.0369638815277234, 'campaign'),
 (0.03320694901182908, 'poutcome_failure'),
 (0.03216607179845515, 'f.pattern_no'),
 (0.02653131084128941, 'month_may'),
 (0.025586865683672334, 'poutcome_success'),
 (0.024617302193016822, 'f.euribor_>1 and <2'),
 (0.024565197056061226, 'contact_telephone'),
 (0.023499741767573988, 'contact_cellular'),
 (0.01331536857914013, 'month_oct'),
 (0.012686276750705766, 'month_jun'),
 (0.011654578546418621, 'f.age_31_to_40'),
 (0.01136756006130689, 'month_mar'),
 (0.010918667966593347, 'day_of_week_thu'),
 (0.010735465057689889, 'day_of_week_fri'),
 (0.010685806847720997, 'education_university.degree'),
 (0.010621002965887307, 'day_of_week_mon'),
 (0.010577597236752327, 'housing_yes'),
 (0.010291360224537293, 'job_blue-collar'),
 (0.010219375928888115, 'housing_no'),
 (0

In [236]:
### Confusion matrix
from sklearn.metrics import confusion_matrix, classification_report

In [237]:
print(confusion_matrix(y_test, pred_forest1))

[[9021  111]
 [ 913  252]]


## Accuracy Summary

In [238]:
# Recap: one "label: score" line per model, in the order the models were trained.
for label, score in [
    ("Accuracy with Logistic Regression", logit_accuracy),
    ("Accuracy with Decision Tree", dt_accuracy),
    ("Accuracy with Random Forest", rf_accuracy),
    ("Accuracy with Random Forest after Hyper Parameter Tuning", rf_hyper_accuracy),
]:
    print(label, score, sep=': ')

Accuracy with Logistic Regression: 0.898222783334952
Accuracy with Decision Tree: 0.8566572788190735
Accuracy with Random Forest: 0.8897737205011168
Accuracy with Random Forest after Hyper Parameter Tuning: 0.9005535592891133


In [239]:
# todo -- for rows where there are no patterns but td = yes, check if any patterns emerge and record it and re-run the models
# todo -- try adaboost algorithm