In [1]:
import pandas as pd
import numpy as np
import warnings

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

warnings.filterwarnings("ignore")
np.random.seed(42)

In [2]:
bdf = pd.read_csv("featured_bank_imputed_wo_du_common_pattern.csv")

In [3]:
dropped_columns= ['s.no','age','emp.var.rate','cons.conf.idx','euribor3m','nr.employed','default']
categorical_columns=['job','marital','education','housing','loan','contact','month','day_of_week','poutcome','f.euribor','f.age','f.pattern']

In [4]:
#columns before dropping
bdf.columns

Index(['f.pattern', 'no', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4',
       'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9',
       'Unnamed: 10', 'f.pattern.1', 'no.1'],
      dtype='object')

In [5]:
bdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   f.pattern    4 non-null      object 
 1   no           2 non-null      object 
 2   Unnamed: 2   0 non-null      float64
 3   Unnamed: 3   0 non-null      float64
 4   Unnamed: 4   0 non-null      float64
 5   Unnamed: 5   0 non-null      float64
 6   Unnamed: 6   0 non-null      float64
 7   Unnamed: 7   0 non-null      float64
 8   Unnamed: 8   0 non-null      float64
 9   Unnamed: 9   0 non-null      float64
 10  Unnamed: 10  0 non-null      float64
 11  f.pattern.1  4 non-null      object 
 12  no.1         2 non-null      object 
dtypes: float64(9), object(4)
memory usage: 648.0+ bytes


In [6]:
# before dropping the columns, keep a copy
bdf_copy = bdf.copy()

In [7]:
# drop the columns
#bdf.drop(dropped_columns, axis=1, inplace=True)

In [8]:
bdf.shape

(5, 13)

In [9]:
bdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   f.pattern    4 non-null      object 
 1   no           2 non-null      object 
 2   Unnamed: 2   0 non-null      float64
 3   Unnamed: 3   0 non-null      float64
 4   Unnamed: 4   0 non-null      float64
 5   Unnamed: 5   0 non-null      float64
 6   Unnamed: 6   0 non-null      float64
 7   Unnamed: 7   0 non-null      float64
 8   Unnamed: 8   0 non-null      float64
 9   Unnamed: 9   0 non-null      float64
 10  Unnamed: 10  0 non-null      float64
 11  f.pattern.1  4 non-null      object 
 12  no.1         2 non-null      object 
dtypes: float64(9), object(4)
memory usage: 648.0+ bytes


In [10]:
# Convert the listed columns to pandas' categorical dtype.
# NOTE(review): the recorded run fails on this line with a KeyError. The frame
# loaded earlier only exposes 'f.pattern', 'no', 'Unnamed: *', 'f.pattern.1'
# and 'no.1', so every name in categorical_columns except 'f.pattern' is
# missing. Looks like the wrong or a malformed CSV was read -- confirm the
# input file before re-running.
bdf_cat = bdf[categorical_columns].astype('category')

KeyError: "['month', 'day_of_week', 'contact', 'marital', 'education', 'housing', 'poutcome', 'job', 'loan', 'f.euribor', 'f.age'] not in index"

In [None]:
# info on bdf after the categorical conversion; note that the converted
# columns are stored in bdf_cat, so bdf itself still reports its original dtypes
bdf.info()

In [None]:
bdf_cat.info()

In [None]:
# drop bdf categorical columns from the dataframe
bdf_noncat = bdf.drop(categorical_columns,axis=1)

In [None]:
bdf_noncat.columns

In [None]:
# use one hot encoding for categorical columns
bdf_cat_one_hot = pd.get_dummies(bdf_cat)

In [None]:
bdf_cat_one_hot.shape

In [None]:
#concat categorical df with non categorical df
bdf_master = pd.concat([bdf_noncat, bdf_cat_one_hot], axis=1)

In [None]:
bdf_master.head()

In [None]:
# get the target variable column
y = bdf_master.pop('y')

In [None]:
bdf_master.shape

In [None]:
# the target column was already removed from bdf_master by pop('y'),
# so what remains is the feature matrix (X is the same object as bdf_master)
X = bdf_master

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=101)

In [None]:
X_train.shape

In [None]:
y_test.shape

In [None]:
y_train.value_counts()

In [None]:
y_test.value_counts()

In [None]:
# the proportion of yes and no looks similar in the train and test splits

In [None]:
# function that fits and predicts
def fit_predict(algo, X_train, X_test, y_train, y_test):
    """Train ``algo`` on the training split and score it on the test split.

    Parameters
    ----------
    algo : object exposing ``fit(X, y)`` and ``predict(X)``
        The model to train; it is fitted in place.
    X_train, y_train
        Features and labels handed to ``algo.fit``.
    X_test, y_test
        Features handed to ``algo.predict`` and the labels the resulting
        predictions are compared against.

    Returns
    -------
    The value of ``accuracy_score(y_test, predictions)``.
    """
    algo.fit(X_train, y_train)
    return accuracy_score(y_test, algo.predict(X_test))

## Logistic Regression

In [None]:
logit = LogisticRegression()

In [None]:
print("Accuracy with Logistic Regression", end=': ')
logit_accuracy = fit_predict(logit, X_train, X_test, y_train, y_test)
print(logit_accuracy)

## Decision Tree

In [None]:
tree = DecisionTreeClassifier()

In [None]:
print("Accuracy with Decision Tree", end=': ')
dt_accuracy = fit_predict(tree, X_train, X_test, y_train, y_test)
print(dt_accuracy)

## Parameterization in Decision Trees

In [None]:
def fit_predict_dt(train, test, y_train, y_test, scaler, max_depth,
                   criterion='entropy', max_features=1, min_samples_split=4):
    """Scale the data, fit a decision tree and print its test accuracy.

    Parameters
    ----------
    train, test
        Feature sets; ``scaler`` is fitted on ``train`` and then applied
        to ``test``.
    y_train, y_test
        Labels used for fitting and for scoring respectively.
    scaler : object exposing ``fit_transform`` and ``transform``
    max_depth, criterion, max_features, min_samples_split
        Forwarded unchanged to ``DecisionTreeClassifier``; the tree is
        always built with ``random_state=42``.

    Returns
    -------
    None. The accuracy is printed, not returned.
    """
    tree_options = {
        'criterion': criterion,
        'max_depth': max_depth,
        'random_state': 42,
        'max_features': max_features,
        'min_samples_split': min_samples_split,
    }
    scaled_train = scaler.fit_transform(train)
    scaled_test = scaler.transform(test)

    classifier = DecisionTreeClassifier(**tree_options)
    classifier.fit(scaled_train, y_train)
    predictions = classifier.predict(scaled_test)
    print(accuracy_score(y_test, predictions))

### Max depth parameter tuning

In [None]:
# for i in range(1,30):
#     print("Accuracy score using max_depth = ", i, end = ':')
#     fit_predict_dt(X_train, X_test, y_train, y_test, StandardScaler(), i)

### Max features tuning

In [None]:
# for i in np.arange(0.1,1.0,0.1):
#     print('Accuracy score using max features =', i, end = ":")
#     fit_predict_dt(X_train, X_test, y_train, y_test, StandardScaler(), max_depth=9, max_features=i)

### Min samples split tuning

In [None]:
# for i in range(2,10):
#     print('Accuracy score using min samples split=', i, end=":")
#     fit_predict_dt(X_train, X_test, y_train, y_test, StandardScaler(), 16, max_features=0.8, min_samples_split=i)

### Criterion tuning

In [None]:
# for i in ['gini','entropy']:
#     print("Accuracy score using criterion: ", i, end = ':')
#     fit_predict_dt(X_train, X_test, y_train, y_test, StandardScaler(), 16, max_features = 0.8, min_samples_split=6, criterion= i)

### Using polynomial features

In [None]:
def create_poly(train, test, degree):
    """Expand both splits with polynomial feature terms of the given degree.

    The transformer is fitted on ``train`` only and then applied to ``test``
    with ``transform``, so the test split never takes part in fitting (the
    previous version called ``fit_transform`` on the test split as well).

    Parameters
    ----------
    train, test
        Feature sets to expand.
    degree
        Passed to ``PolynomialFeatures(degree=...)``.

    Returns
    -------
    tuple
        ``(train_poly, test_poly)``, the expanded train and test features.
    """
    poly = PolynomialFeatures(degree=degree)
    train_poly = poly.fit_transform(train)
    # Re-use the transformer fitted on train; do not re-fit on test.
    test_poly = poly.transform(test)
    return train_poly, test_poly

In [None]:
# NOTE(review): this disabled loop originally called fit_predict, whose
# signature (algo, X_train, X_test, y_train, y_test) does not accept these
# arguments; fit_predict_dt is the helper that takes a scaler and tree settings.
# for degree in [1,2,3,4]:
#     train_poly, test_poly = create_poly(X_train, X_test, degree)
#     print("polynomial degree", degree)
#     fit_predict_dt(train_poly, test_poly, y_train, y_test, StandardScaler(), 16, max_features = 0.2,min_samples_split=4, criterion='entropy')
#     print(10 *'-')

## Random Forest

In [None]:
forest = RandomForestClassifier(criterion='entropy', oob_score=True)

In [None]:
print("Accuracy with Random Forest", end=': ')
rf_accuracy = fit_predict(forest, X_train, X_test, y_train, y_test)
print(rf_accuracy)

### Parameter tuning using Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
params = {
    'n_estimators': [200,500,700],
    'max_depth': [10,15,18,20],
    'min_samples_leaf': [3,5,7]
}

In [None]:
#gs = GridSearchCV(forest, params, cv=3, verbose=3, n_jobs=3)

In [None]:
#gs.fit(X_train, y_train)

In [None]:
#gs.best_params_

In [None]:
#gs.best_estimator_

### Retraining with best estimators

In [None]:
# Random forest with the tuned settings (n_estimators, max_depth and
# min_samples_leaf are values from the `params` grid -- presumably the
# grid-search winner; confirm against gs.best_params_ if the search is re-run).
# Only non-default arguments are spelled out:
#   * `min_impurity_split` is dropped: it was only ever passed as None here and
#     the parameter no longer exists in current scikit-learn releases.
#   * max_features='sqrt' replaces 'auto', which meant the same thing for
#     classifiers and has since been removed.
forest1 = RandomForestClassifier(
    n_estimators=200,
    criterion='entropy',
    max_depth=18,
    min_samples_leaf=7,
    max_features='sqrt',
    oob_score=True,
)

In [None]:
forest1.fit(X_train, y_train)

In [None]:
pred_forest1 = forest1.predict(X_test)

In [None]:
print("Accuracy with Random Forest after Hyper Parameter Tuning", end=': ')
rf_hyper_accuracy = accuracy_score(y_test, pred_forest1)
print(rf_hyper_accuracy)

In [None]:
forest1.feature_importances_

In [None]:
sorted(list(zip(forest1.feature_importances_, X_train.columns)), reverse=True)

In [None]:
### Confusion matrix
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
print(confusion_matrix(y_test, pred_forest1))

## Accuracy Summary

In [None]:
print("Accuracy with Logistic Regression", end=': ')
print(logit_accuracy)
print("Accuracy with Decision Tree", end=': ')
print(dt_accuracy)
print("Accuracy with Random Forest", end=': ')
print(rf_accuracy)
print("Accuracy with Random Forest after Hyper Parameter Tuning", end=': ')
print(rf_hyper_accuracy)

In [None]:
# todo -- for rows where there are no patterns but td = yes, check if any patterns emerge and record it and re-run the models
# todo -- try adaboost algorithm

## Ada Boost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases (and the old
# name later removed), while the first positional slot is the base learner
# under either name.
adabst_fit = AdaBoostClassifier(forest, random_state=42, n_estimators=300)

In [None]:
adabst_fit.fit(X_train, y_train)

In [None]:
print("Accuracy with Ada Boost", end=': ')
print(round(accuracy_score(y_test,adabst_fit.predict(X_test)),4))