In [None]:
import pandas as pd
import numpy as np
import warnings

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.preprocessing import StandardScaler, PolynomialFeatures

warnings.filterwarnings("ignore")
np.random.seed(42)

In [None]:
from yellowbrick.classifier import ClassificationReport
from yellowbrick.features import rank2d

In [None]:
grid_search_enable = True
cv = 2
verbose = 3
number_of_jobs = -1
rand_state = 101
test_size = 0.4

In [None]:
bdf = pd.read_csv("featured_bank_imputed_wo_duration_year.csv")

### Feature engineering

In [None]:
def festivals(month):
    """Return 1 when ``month`` is flagged as a festival month, otherwise 0.

    Flagged months: 'apr' (Easter), 'jun' (carnival) and 'dec' (Christmas).
    """
    festival_months = ('apr', 'jun', 'dec')
    return 1 if month in festival_months else 0

In [None]:
bdf['f.festivals'] = bdf['month'].apply(festivals)

In [None]:
def bonus_months(month):
    """Return 1 when ``month`` is 'jun' or 'dec' (treated as bonus months), else 0.

    NOTE(review): an earlier comment here said the bonus is paid in may and
    nov, but the code has always flagged jun and dec — confirm which is intended.
    """
    flagged_months = ('jun', 'dec')
    return 1 if month in flagged_months else 0

In [None]:
bdf['f.bonus_months']= bdf['month'].apply(bonus_months)

In [None]:
def commitment(housing, personal, marital):
    """Classify financial commitment as 'high', 'low' or 'medium'.

    'high'   -> married with both a housing and a personal loan.
    'low'    -> single with neither loan.
    'medium' -> every other combination.
    """
    loans = (housing, personal)
    if loans == ('yes', 'yes') and marital == 'married':
        return 'high'
    if loans == ('no', 'no') and marital == 'single':
        return 'low'
    return 'medium'

In [None]:
bdf['f.commitment'] = bdf.apply(lambda x: commitment(x.housing, x.loan, x.marital), axis=1)

In [None]:
def seasons(month):
    """Map a three-letter month abbreviation to its season name.

    Returns None when ``month`` is not one of the twelve known abbreviations.
    """
    calendar = (
        ('spring', ('mar', 'apr', 'may')),
        ('summer', ('jun', 'jul', 'aug')),
        ('autumn', ('sep', 'oct', 'nov')),
        ('winter', ('dec', 'jan', 'feb')),
    )
    for season_name, season_months in calendar:
        if month in season_months:
            return season_name
    return None

In [None]:
bdf['f.seasons'] = bdf['month'].apply(lambda mon: seasons(mon))

In [None]:
def seasons_weightage(season):
    """Return the numeric weight assigned to a season name (None if unknown)."""
    weight_by_season = {
        'autumn': 21,
        'spring': 36,
        'summer': 40,
        'winter': 2,
    }
    return weight_by_season.get(season)

In [None]:
#bdf ['f.season_weight'] = bdf['f.seasons'].apply(lambda season: seasons_weightage(season))

In [None]:
def retired(age):
    """Label an age as 'retired' (65 and above) or 'not-retired'."""
    return 'retired' if age >= 65 else 'not-retired'

In [None]:
bdf['f.retired_status'] = bdf['age'].apply(lambda age: retired(age))

In [None]:
def marital_weightage(marital):
    """Return the numeric weight assigned to a marital status (None if unknown)."""
    weight_by_status = {'divorced': 1, 'single': 35, 'married': 54}
    return weight_by_status.get(marital)

In [None]:
#bdf['f.marital_weightage'] = bdf['marital'].apply(lambda status: marital_weightage(status))

In [None]:
def age_weightage(age):
    """Return the weight assigned to an age-bucket label such as '31_to_40'.

    Returns None for a label that is not in the table.
    """
    weight_by_bucket = {
        '11_to_20': 0.01,
        '21_to_30': 0.23,
        '31_to_40': 0.34,
        '41_to_50': 0.18,
        '51_to_60': 0.14,
        '61_to_70': 0.04,
        '71_to_80': 0.03,
        '81_to_90': 0.01,
        '91_to_100': 0.00,
    }
    return weight_by_bucket.get(age)
        

In [None]:
#bdf['f.age_weightage'] = bdf['f.age'].apply(lambda age: age_weightage(age))

In [None]:
def previous_user(previous):
    """Return 'new user' when ``previous`` equals 0, otherwise 'existing user'."""
    return 'new user' if previous == 0 else 'existing user'

In [None]:
bdf['f.user_type'] = bdf['previous'].apply(lambda previous: previous_user(previous))

In [None]:
def previous_no_of_times(previous):
    """Bin the ``previous`` count into 'no previous', '1_to_3' or 'gt_3'.

    Values that match none of the bins (e.g. negatives) yield None.
    """
    if previous == 0:
        bucket = 'no previous'
    elif 1 <= previous <= 3:
        bucket = '1_to_3'
    elif previous > 3:
        bucket = 'gt_3'
    else:
        bucket = None
    return bucket
    

In [None]:
bdf['f.previous_campaigns'] = bdf['previous'].apply(lambda previous: previous_no_of_times(previous))

In [None]:
def current_campaign_calls(calls):
    """Bin the number of calls made to a user in the current campaign.

    Returns None when ``calls`` matches no bin (e.g. 0 or a negative value).
    """
    if calls == 1:
        bucket = 'once'
    elif 2 <= calls <= 3:
        # The trailing space is part of the existing category label.
        bucket = 'twice to thrice '
    elif 3 < calls <= 6:
        bucket = 'four to six times'
    elif 7 <= calls <= 10:
        bucket = 'seven to ten times'
    elif calls > 10:
        bucket = 'more than ten times'
    else:
        bucket = None
    return bucket
    

In [None]:
bdf['f.current_campaign_calls'] = bdf['campaign'].apply(lambda calls: current_campaign_calls(calls))

In [None]:
def first_time_user_calls(previous, campaigns):
    """Bin the call count for users who were not part of a previous campaign.

    Parameters
    ----------
    previous : number
        The ``previous`` value for the user; 0 marks a first-time user.
    campaigns : number
        Number of calls made to the user in the current campaign.

    Returns
    -------
    str or None
        'returning user' when ``previous`` is not 0. Otherwise
        'first time called' (1 call), 'called atleast twice' (2-3 calls) or
        'called atleast thrice' (more than 3 calls). None when ``campaigns``
        matches no bin (zero, negative, fractional or NaN).
    """
    if previous != 0:
        return 'returning user'
    if campaigns == 1:
        return 'first time called'
    if 2 <= campaigns <= 3:
        return 'called atleast twice'
    if campaigns > 3:
        # Label kept as-is so existing category / one-hot column names are stable.
        return 'called atleast thrice'
    # Previously a dangling ``else`` labelled these unmatched counts
    # 'more than thrice'; return None like the sibling binning helpers.
    return None

In [None]:
bdf['f.first_time_user_calls'] = bdf.apply(lambda x: first_time_user_calls(x.previous, x.campaign), axis=1)

In [None]:
def savings_intention(job):
    """Return a savings-intent score (100, 50 or 25) for a job category."""
    top_tier = ('admin.', 'blue-collar', 'technician')
    middle_tier = ('retired', 'management', 'services')
    if job in top_tier:
        return 100
    return 50 if job in middle_tier else 25

In [None]:
bdf['f.savings_intent_factor'] = bdf['job'].apply(lambda job: savings_intention(job))

In [None]:
def age_range_weightage(age):
    """Return the weight assigned to the age band that ``age`` falls in.

    Bands: <=24, 25-69, 70-80, >80. An age that falls between bands
    (e.g. 24.5) matches nothing and yields None.
    """
    if age <= 24:
        weight = 0.15
    elif 25 <= age <= 69:
        weight = 0.61
    elif 70 <= age <= 80:
        weight = 0.13
    elif age > 80:
        weight = 0.04
    else:
        weight = None
    return weight

In [None]:
bdf['f.age_range_weightage'] = bdf['age'].apply(lambda age: age_range_weightage(age))

In [None]:
def avg_income_distribution(salary, marital):
    """Split ``salary`` by a divisor that depends on marital status.

    'single' returns the salary unchanged, 'married' divides by 3 and
    'divorced' divides by 2 (both rounded to 2 decimals). Any other status
    yields None.
    """
    if marital == 'single':
        return salary
    if marital == 'married':
        divisor = 3
    elif marital == 'divorced':
        divisor = 2
    else:
        return None
    return round(salary / divisor, 2)
    

In [None]:
#bdf['f.avg_income_dist'] = bdf.apply(lambda x: avg_income_distribution(x['f.salary'], x['marital']), axis=1)

In [None]:
def p_days_bin(pdays):
    """Bin ``pdays`` into week-based buckets.

    Negative values match no bucket and yield None.
    """
    if 0 <= pdays <= 6:
        bucket = 'within a week'
    elif 7 <= pdays <= 13:
        bucket = 'within two weeks'
    elif 14 <= pdays <= 20:
        bucket = 'within three weeks'
    elif pdays > 20:
        bucket = 'more than three weeks'
    else:
        bucket = None
    return bucket

In [None]:
bdf['f.pdays'] = bdf['pdays'].apply(lambda days: p_days_bin(days))

In [None]:
def year_weightage(year):
    """Return the weight assigned to a year label ('Y2008'..'Y2010'), else None."""
    weight_by_year = {"Y2008": 5, "Y2009": 14, "Y2010": 52}
    return weight_by_year.get(year)

In [None]:
#bdf['f.year_weightage'] = bdf['f.year'].apply(lambda year: year_weightage(year))

In [None]:
def recession_strength(year, month):
    """Label a (year, month) pair as 'peak' or 'non-peak'.

    Parameters
    ----------
    year : str
        Year label such as "Y2008".
    month : str
        Three-letter month abbreviation.

    Returns
    -------
    str
        'peak' for Y2008 mar-dec and Y2009 mar-may; 'non-peak' otherwise.
        Y2008 jan/feb previously fell through and returned None; they now
        return 'non-peak' so the result is always a label.
    """
    if year == "Y2008":
        peak_months = ('mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec')
    elif year == "Y2009":
        peak_months = ('mar', 'apr', 'may')
    else:
        peak_months = ()
    return 'peak' if month in peak_months else 'non-peak'
    

In [None]:
bdf['f.recession_strength'] = bdf.apply(lambda x: recession_strength(x['f.year'],x['month']), axis=1)

In [None]:
# To be removed
def week_breakup(day):
    """Map a weekday abbreviation to 'week_start', 'week_mid' or 'week_end'.

    Returns None for any value outside mon-fri.
    """
    week_parts = (
        ('week_start', ('mon',)),
        ('week_mid', ('tue', 'wed', 'thu')),
        ('week_end', ('fri',)),
    )
    for part_name, part_days in week_parts:
        if day in part_days:
            return part_name
    return None

In [None]:
bdf['f.week_breakup'] = bdf['day_of_week'].apply(week_breakup)

In [None]:
# to be removed
def week_day_weightage(weekday):
    """Return the weight assigned to a week-part label (None if unknown)."""
    weight_by_part = {'week_start': 14, 'week_mid': 38, 'week_end': 14}
    return weight_by_part.get(weekday)

In [None]:
#bdf['f.week_day_weightage'] = bdf['f.week_breakup'].apply(week_day_weightage)

In [None]:
def emp_rate_change(rate):
    """Return 'positive' for a rate above zero, otherwise 'negative'.

    A rate of exactly 0 is labelled 'negative'.
    """
    return 'positive' if rate > 0 else 'negative'

In [None]:
bdf['f.emp_var_rate'] = bdf['emp.var.rate'].apply(emp_rate_change)

In [None]:
def cons_price_bin(index):
    """Bin a consumer price index value into half-point buckets from 92 to 95.

    Values below 92 map to 'lt_92', values above 95 to 'gt_95'. A value that
    compares false against every bound (NaN) yields None.
    """
    # The buckets are contiguous, so walking the upper bounds in ascending
    # order selects the same bucket as testing each closed range separately.
    if index < 92:
        bucket = 'lt_92'
    elif index <= 92.5:
        bucket = 'gt_92_lt_92.5'
    elif index <= 93:
        bucket = 'gt_925_lt_93'
    elif index <= 93.5:
        bucket = 'gt_93_lt_935'
    elif index <= 94:
        bucket = 'gt_935_lt_94'
    elif index <= 94.5:
        bucket = 'gt_94_lt_94.5'
    elif index <= 95:
        bucket = 'gt_945_lt_95'
    elif index > 95:
        bucket = 'gt_95'
    else:
        bucket = None
    return bucket

In [None]:
bdf['f.cons.price.bin'] = bdf['cons.price.idx'].apply(cons_price_bin)

In [None]:
def emp_var_bin(index):
    """Bin an employment variation rate into half-point buckets.

    Parameters
    ----------
    index : number
        The employment variation rate value.

    Returns
    -------
    str or None
        The bucket label; None when ``index`` compares false against every
        bound (NaN).
    """
    if index < -3.5:
        return 'lt_min_3.5'
    if -3.5 <= index <= -3:
        return 'gt_min_35_lt_3'
    if -3 < index <= -2.5:
        return 'gt_min_3_lt_25'
    # Lower bound was written as -25 (typo for -2.5); it only worked because
    # the earlier branches had already consumed everything <= -2.5.
    if -2.5 < index <= -2:
        return 'gt_min_25_lt_2'
    if -2 < index <= -1.5:
        return 'gt_min_2_lt_15'
    if -1.5 < index <= -1:
        return 'gt_min_15_lt_1'
    if -1 < index <= -0.5:
        return 'gt_min_1_lt_point5'
    if -0.5 < index <= 0:
        return 'gt_min_05_lt_0'
    if 0 < index <= 0.5:
        return 'gt_0_lt_05'
    if 0.5 < index <= 1:
        # NOTE(review): label reads "lt_05" although the bucket is (0.5, 1];
        # left unchanged so existing category / column names stay stable.
        return 'gt_05_lt_05'
    if index > 1:
        return 'gt_1'
    return None

In [None]:
bdf['f.emp.var.bin'] = bdf['emp.var.rate'].apply(emp_var_bin)

In [None]:
def cons_conf_bin(index):
    """Bin a consumer confidence index value into five-point buckets.

    Values below -51 map to 'lt_51' and values above -26 to 'gt_26'. A value
    that compares false against every bound (NaN) yields None.
    """
    # Contiguous buckets: the first upper bound that is not exceeded wins.
    if index < -51:
        bucket = 'lt_51'
    elif index <= -46:
        bucket = 'gt_51_lt_46'
    elif index <= -41:
        bucket = 'gt_46_lt_41'
    elif index <= -36:
        bucket = 'gt_41_lt_36'
    elif index <= -31:
        bucket = 'gt_36_lt_31'
    elif index <= -26:
        bucket = 'gt_31_lt_26'
    elif index > -26:
        bucket = 'gt_26'
    else:
        bucket = None
    return bucket

In [None]:
bdf['f.cons.conf.bin'] = bdf['cons.conf.idx'].apply(cons_conf_bin)

In [None]:
def buy_prob_user_job(user_type, job):
    """Return 'high' for existing users in selected job categories, else 'low'."""
    high_prob_jobs = ('admin.', 'housemaid', 'management', 'retired',
                      'student', 'technician', 'unemployed')
    if user_type == "existing user" and job in high_prob_jobs:
        return 'high'
    return 'low'

In [None]:
#bdf['f.userjob_buy.prob'] = bdf.apply(lambda x: buy_prob_user_job(x['f.user_type'], x['job']), axis=1)

In [None]:
def salaried_or_not(job):
    """Return 'salaried' for the listed job categories, else 'not salaried'."""
    salaried_jobs = ('admin.', 'blue-collar', 'housemaid', 'management',
                     'services', 'retired', 'technician')
    return 'salaried' if job in salaried_jobs else 'not salaried'

In [None]:
bdf['f.sal_or_not'] = bdf['job'].apply(salaried_or_not)

In [None]:
def quarter(month):
    """Map a month abbreviation to a quarter label, with Q1 starting in April.

    Returns None for an unrecognised month.
    """
    quarters = (
        ('Q1', ('apr', 'may', 'jun')),
        ('Q2', ('jul', 'aug', 'sep')),
        ('Q3', ('oct', 'nov', 'dec')),
        ('Q4', ('jan', 'feb', 'mar')),
    )
    for quarter_label, quarter_months in quarters:
        if month in quarter_months:
            return quarter_label
    return None

In [None]:
bdf['f.quarter'] = bdf['month'].apply(quarter)

In [None]:
def into_loans(housing, personal):
    """Return 'not_into_loans' only when both loan flags are 'no'."""
    loan_free = housing == 'no' and personal == "no"
    return 'not_into_loans' if loan_free else 'into_loans'

In [None]:
bdf['f.into_loans'] = bdf.apply(lambda x: into_loans(x.housing, x.loan), axis=1)

In [None]:
# Base salary figure per job category, looked up by salary_job_education().
# NOTE(review): these look like assumed monthly amounts (the consumer
# multiplies them by 12 or 14) — source and currency are not shown here; confirm.
# A job value missing from this table raises KeyError in the consumer.
base_sal = {
    'housemaid' : 500,
    'services' : 700,
    'admin.' : 600,
    'blue-collar' : 500,
    'technician' : 600,
    'retired' : 500,
    'management' : 1600,
    'unemployed' : 400,
    'self-employed' : 800,
    'entrepreneur' : 1200,
    'student' : 400
}

In [None]:
# Multiplier applied to the base salary per education level, looked up by
# salary_job_education(). 1 means "no uplift".
# An education value missing from this table raises KeyError in the consumer.
edu_sal = {
    'basic.4y' : 1,
    'high.school' : 1.6,
    'basic.6y' : 1.2,
    'basic.9y' : 1.4,
    'professional.course' : 1.6,
    'university.degree' : 2,
    'illiterate' : 1
}

In [None]:
def salary_job_education(job, education):
    """Estimate a yearly income from job category and education level.

    Looks up ``base_sal[job]`` and ``edu_sal[education]`` (module-level
    tables) and multiplies their product by 12 for retired, unemployed,
    self-employed and entrepreneur jobs, and by 14 for every other job.

    Raises KeyError when ``job`` or ``education`` is missing from its table.
    """
    twelve_month_jobs = ('retired', 'unemployed', 'self-employed', 'entrepreneur')
    months_paid = 12 if job in twelve_month_jobs else 14
    return base_sal[job] * edu_sal[education] * months_paid

In [None]:
bdf['f.income'] = bdf.apply(lambda x: salary_job_education(x.job, x.education), axis=1)

In [None]:
def income_tax(salary):
    """Return the tax-rate figure for the bracket that ``salary`` falls in.

    Parameters
    ----------
    salary : number
        Income value; may be a float.

    Returns
    -------
    float or int or None
        14.5, 23, 28.5, 35, 37 or 45 depending on the bracket. None for a
        negative or NaN salary.

    The brackets are now contiguous. The previous integer-only bounds
    (e.g. ``<= 7112`` followed by ``>= 7113``) returned None for fractional
    salaries that landed between two brackets.
    """
    if not salary >= 0:
        # Covers negatives and NaN (every comparison with NaN is False).
        return None
    # (inclusive upper limit, rate) in ascending order.
    brackets = (
        (7112, 14.5),
        (10732, 23),
        (20322, 28.5),
        (25075, 35),
        (36967, 37),
    )
    for upper_limit, rate in brackets:
        if salary <= upper_limit:
            return rate
    return 45

In [None]:
bdf['f.income_tax'] = bdf['f.income'].apply(lambda salary: income_tax(salary))

In [None]:
def econ_status(income):
    """Classify ``income`` as 'lower class', 'middle class' or 'upper class'.

    Cut-offs: up to 7800, up to 26400, above 26400. NaN yields None.
    """
    if income <= 7800:
        status = 'lower class'
    elif income <= 26400:
        status = 'middle class'
    elif income > 26400:
        status = 'upper class'
    else:
        status = None
    return status

In [None]:
bdf['f.econ_status'] = bdf['f.income'].apply(econ_status)

In [None]:
def buy_prob_status_marital(marital_status):
    """Return 'high' for married or single users, 'low' for anything else."""
    likely_buyers = ('married', 'single')
    return 'high' if marital_status in likely_buyers else 'low'

In [None]:
bdf['f.user.buy.prob'] = bdf['marital'].apply(buy_prob_status_marital)

In [None]:
def bailout_period(year):
    """Return "bailout" for the year label "Y2010", otherwise "non-bailout"."""
    return "bailout" if year == "Y2010" else "non-bailout"

In [None]:
#bdf['f.bailout_status'] = bdf['f.year'].apply(bailout_period)

In [None]:
def education_level(education):
    """Collapse detailed education values into low / medium / high tiers.

    Returns None for a value that belongs to no tier.
    """
    tiers = (
        ('low-education', ('illiterate', 'basic.4y', 'basic.6y')),
        ('medium-education', ('basic.9y', 'high.school')),
        ('high-education', ('professional.course', 'university.degree')),
    )
    for tier_label, tier_values in tiers:
        if education in tier_values:
            return tier_label
    return None

In [None]:
bdf['f.education'] = bdf['education'].apply(education_level)

In [None]:
def age_to_retirement(age):
    """Return the number of years left until age 65 (0 once 65 is reached)."""
    retirement_age = 65
    return retirement_age - age if age < retirement_age else 0

In [None]:
bdf['f.age_to_retire'] = bdf['age'].apply(age_to_retirement)

In [None]:
def age_range_to_retire(diff_age):
    """Return the weight assigned to a years-to-retirement value.

    Bands: 0, 1-5, 6-35, 36-40, 41-45, 46+. A value between bands
    (e.g. 0.5) or below 0 yields None.
    """
    if diff_age == 0:
        weight = 0
    elif 1 <= diff_age <= 5:
        weight = 0.3
    elif 6 <= diff_age <= 35:
        weight = 0.1
    elif 36 <= diff_age <= 40:
        weight = 0.14
    elif 41 <= diff_age <= 45:
        weight = 0.22
    elif diff_age >= 46:
        weight = 0.45
    else:
        weight = None
    return weight

In [None]:
bdf['f.age_to_retire_weight'] = bdf['f.age_to_retire'].apply(age_range_to_retire)

In [None]:
def age_to_eol(age):
    """Return the number of years left until age 80 (0 once 80 is reached)."""
    horizon_age = 80
    return horizon_age - age if age < horizon_age else 0

In [None]:
bdf['f.age_to_death'] = bdf['age'].apply(age_to_eol)

In [None]:
def age_range_to_death(diff_age):
    """Bin a remaining-years value into 20-year bucket labels.

    Negative values (and values between buckets) yield None.
    """
    if 0 <= diff_age <= 19:
        bucket = '0_to_19'
    elif 20 <= diff_age <= 39:
        bucket = '20_to_39'
    elif 40 <= diff_age <= 59:
        bucket = '40_to_59'
    elif diff_age >= 60:
        bucket = '60_to_'
    else:
        bucket = None
    return bucket

In [None]:
bdf['f.remaining_age'] = bdf['f.age_to_death'].apply(age_range_to_death)

In [None]:
def unemployment_rate(recession_strength, age_range, salaried):
    """Return an unemployment impact score from recession strength and age bucket.

    -10 outside a "peak" period; during a peak, -30 for the '21_to_30' and
    '31_to_40' age buckets and -8 for every other bucket.
    ``salaried`` is accepted but not used by the calculation.
    """
    if recession_strength != "peak":
        return -10
    hardest_hit = ('21_to_30', '31_to_40')
    return -30 if age_range in hardest_hit else -8

In [None]:
bdf['f.unemployment'] = bdf.apply(lambda x: unemployment_rate(x['f.recession_strength'],x['f.age'],x['f.sal_or_not']), axis=1)

In [None]:
bdf.to_csv('testv4.csv')

### Feature Analysis

In [None]:
dropped_columns= ['s.no', 'age','nr.employed','default','pdays', 'emp.var.rate',
                  'campaign','cons.price.idx','cons.conf.idx', 'f.quarter','f.income_tax','f.age_to_death',
                  'f.age_to_retire',
                 ]
categorical_columns=['job','marital','housing','loan','education','contact','month','day_of_week',
                     'poutcome','f.euribor1','f.age', 'f.commitment','f.week_breakup',
                    'f.seasons','f.retired_status', 'f.user_type', 'f.pattern',
                     'f.previous_campaigns','f.current_campaign_calls','f.first_time_user_calls', 
                     'f.pdays', 'f.emp_var_rate','f.year','f.cons.price.bin',
                     'f.emp.var.bin','f.cons.conf.bin','f.recession_strength',
                     'f.econ_status','f.sal_or_not','f.into_loans','f.remaining_age','f.education',
                     'f.user.buy.prob',
                    ]

In [None]:
# drop the columns
bdf.drop(dropped_columns, axis=1, inplace=True)

In [None]:
#convert columns of object type to categorical columns
bdf_cat = bdf[categorical_columns].astype('category')

In [None]:
bdf[categorical_columns] = bdf[categorical_columns].astype('category')

In [None]:
# drop bdf categorical columns from the dataframe
bdf_noncat = bdf.drop(categorical_columns,axis=1)

In [None]:
bdf_noncat.columns

In [None]:
# use one hot encoding for categorical columns
bdf_cat_one_hot = pd.get_dummies(bdf_cat)

In [None]:
bdf_cat_one_hot.shape

In [None]:
#concat categorical df with non categorical df
bdf_master = pd.concat([bdf_noncat, bdf_cat_one_hot], axis=1)

In [None]:
# store only the target variable column
y = bdf_master.pop('y')

In [None]:
# remove the target variable column from the master dataset
X = bdf_master

In [None]:
# Use the shared config constants so changing `rand_state` / `test_size` in the
# config cell affects the split as well (the seed used to be hard-coded as 101).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=rand_state)

In [None]:
y_train.value_counts()

In [None]:
y_test.value_counts()

In [None]:
# the proportion of yes and no looks the same in the train and test sets

### Visualizing classification report

In [None]:
def show_classification_rpt(model):
    """Render a yellowbrick classification report for ``model``.

    The visualizer is fitted on the notebook-level X_train / y_train, scored
    on X_test / y_test, and then displayed.
    """
    report_options = dict(classes=['no','yes'], cmap="YlGn", size=(600,300))
    report = ClassificationReport(model, **report_options)
    report.fit(X_train, y_train)
    report.score(X_test, y_test)
    report.show()

# Model Building

In [None]:
# function that fits and predicts
def fit_predict(algo,X_train, X_test, y_train, y_test):
    """Fit ``algo`` on the training data and return its predictions for X_test.

    ``y_test`` is accepted for call-site symmetry but is not used here.
    """
    algo.fit(X_train, y_train)
    return algo.predict(X_test)

## Logistic Regression

In [None]:
logit = LogisticRegression()

In [None]:
print("Accuracy with Logistic Regression", end=': ')
y_pred_logit = fit_predict(logit, X_train, X_test, y_train, y_test)
logit_accuracy = accuracy_score(y_test, y_pred_logit)
print(logit_accuracy)

In [None]:
show_classification_rpt(logit)

In [None]:
print(classification_report(y_test, y_pred_logit))

In [None]:
print("Confusion Matrix:")
print(confusion_matrix(y_test,y_pred_logit))

## Decision Tree

In [None]:
tree = DecisionTreeClassifier(random_state=rand_state)

In [None]:
print("Accuracy with Decision Tree", end=': ')
y_pred_tree = fit_predict(tree, X_train, X_test, y_train, y_test)
tree_accuracy = accuracy_score(y_test, y_pred_tree)
print(tree_accuracy)

In [None]:
show_classification_rpt(tree)

In [None]:
print("Confusion Matrix:")
print(confusion_matrix(y_test,y_pred_tree))

### Hyper Parameter Tuning

In [None]:
def fit_predict_dt(train, test, y_train, y_test, scaler, max_depth,
               criterion = 'entropy', max_features=1, min_samples_split=4,
               random_state=101):
    """Scale the data, fit a decision tree and return predictions for ``test``.

    Parameters
    ----------
    train, test : feature matrices
        ``scaler`` is fitted on ``train`` only and then applied to ``test``.
    y_train : training labels.
    y_test : accepted for call-site symmetry; not used.
    scaler : transformer exposing ``fit_transform`` / ``transform``.
    max_depth, criterion, max_features, min_samples_split :
        Passed straight to ``DecisionTreeClassifier``.
        NOTE(review): the default ``max_features=1`` is an int, which
        scikit-learn reads as "consider one feature per split" — confirm that
        1.0 (all features) was not intended.
    random_state : int, default 101
        Seed for the tree. It used to be hard-coded; the default keeps
        existing calls unchanged.

    Returns
    -------
    Predicted labels for ``test``.
    """
    train_scaled = scaler.fit_transform(train)
    test_scaled = scaler.transform(test)
    dt = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth,
                                random_state=random_state, max_features=max_features,
                                min_samples_split=min_samples_split)
    dt.fit(train_scaled, y_train)
    return dt.predict(test_scaled)

### Max depth parameter tuning

In [None]:
list_max_depth = []

In [None]:
# Start from an empty list every time this cell runs. The accumulator used to
# be created only in a separate cell, so re-running this loop appended to the
# previous results and broke the "list index + 1 == depth" mapping below.
list_max_depth = []
for i in range(1,30):
    print("Accuracy score using max_depth = ", i, end = ':')
    y_pred_tree_hpt = fit_predict_dt(X_train, X_test, y_train, y_test, StandardScaler(), i)
    dt_max_depth = accuracy_score(y_test, y_pred_tree_hpt)
    print(dt_max_depth)
    list_max_depth.append(dt_max_depth)

# Depths start at 1, so the best depth is the index of the best score plus one.
# NOTE(review): the depth is chosen by accuracy on the test set, so the final
# test accuracy reported for the tuned tree is optimistically biased; consider
# selecting on a validation split or with cross-validation instead.
max_depth_tuned = list_max_depth.index(max(list_max_depth))+1
print(max_depth_tuned)

### Max features tuning

In [None]:
dict_max_features = {}

In [None]:
for i in np.arange(0.1,1.0,0.1):
    print('Accuracy score using max features =', i, end = ":")
    y_pred_max_features = fit_predict_dt(X_train, X_test, y_train, y_test, StandardScaler(), max_depth=max_depth_tuned, max_features=i)
    dt_max_features = accuracy_score(y_test, y_pred_max_features)
    print(dt_max_features)
    dict_max_features[i]=dt_max_features

max_feature_tuned = max(dict_max_features, key=dict_max_features.get)
print(max_feature_tuned)

### Min samples split tuning

In [None]:
dict_min_samples_split = {}

In [None]:
for i in range(2,10):
    print('Accuracy score using min samples split=', i, end=":")
    y_pred_min_split = fit_predict_dt(X_train, X_test, y_train, y_test, StandardScaler(), max_depth=max_depth_tuned, max_features=max_feature_tuned, min_samples_split=i)
    dt_min_sample_split = accuracy_score(y_test, y_pred_min_split)
    print(dt_min_sample_split)
    dict_min_samples_split[i] = dt_min_sample_split
    
min_sample_split_tuned = max(dict_min_samples_split, key=dict_min_samples_split.get)
print(min_sample_split_tuned)

### Criterion tuning

In [None]:
dict_index_tuned = {}

In [None]:
for i in ['gini','entropy']:
    print("Accuracy score using criterion: ", i, end = ':')
    y_pred_index_score = fit_predict_dt(X_train, X_test, y_train, y_test, StandardScaler(), max_depth=max_depth_tuned, max_features = max_feature_tuned, min_samples_split=min_sample_split_tuned, criterion= i)
    dt_index_score = accuracy_score(y_test, y_pred_index_score)
    print(dt_index_score)
    dict_index_tuned[i] = dt_index_score

dt_index_tuned = max(dict_index_tuned, key=dict_index_tuned.get)
print(dt_index_tuned)

### Rerunning decision tree with best parameters

In [None]:
tree_best_pred  =  fit_predict_dt(X_train, X_test, y_train, y_test, StandardScaler(), max_depth=max_depth_tuned, max_features = max_feature_tuned, min_samples_split=min_sample_split_tuned, criterion= dt_index_tuned)
dt_best_param_score = accuracy_score(y_test, tree_best_pred)
print("Accuracy score for decision tree using best param: ", end = ':')
print(dt_best_param_score)

In [None]:
tree1 = DecisionTreeClassifier(criterion = dt_index_tuned, max_depth=max_depth_tuned,
                               random_state= rand_state, max_features=max_feature_tuned,
                               min_samples_split=min_sample_split_tuned)

In [None]:
show_classification_rpt(tree1)

In [None]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, tree_best_pred))

### Using polynomial features

In [None]:
def create_poly(train, test, degree):
    """Expand ``train`` and ``test`` with polynomial features of ``degree``.

    The transformer is fitted on ``train`` only and then applied to ``test``.
    The test set was previously passed through ``fit_transform``, which
    refits the transformer on test data.

    Returns
    -------
    (train_poly, test_poly) : the two transformed feature matrices.
    """
    poly = PolynomialFeatures(degree = degree)
    train_poly = poly.fit_transform(train)
    test_poly = poly.transform(test)
    return train_poly, test_poly

In [None]:
## Polynomial feature expansion takes a long time because of the large number of columns, hence the following code is commented out.

In [None]:
# for degree in [1,2,3,4]:
#     train_poly, test_poly = create_poly(X_train, X_test, degree)
#     print("polynomial degree", degree)
#     fit_predict_dt(train_poly, test_poly, y_train, y_test, StandardScaler(), 16, max_features = 0.2,min_samples_split=4, criterion='entropy')
#     print(10 *'-')

## Random Forest

In [None]:
# Seed the forest like every other estimator in this notebook; without it the
# baseline score (and the grid search that clones this estimator) depends on
# global RNG state and is not reproducible across runs or worker processes.
forest = RandomForestClassifier(criterion='entropy', oob_score=True, random_state=rand_state)

In [None]:
print("Accuracy with Random Forest", end=': ')
rf_pred = fit_predict(forest, X_train, X_test, y_train, y_test)
rf_accuracy = accuracy_score(y_test, rf_pred)
print(rf_accuracy)

In [None]:
show_classification_rpt(forest)

In [None]:
print("Confusion Matrix:")
print(confusion_matrix(y_test,rf_pred))

### Parameter tuning using Grid Search

In [None]:
params = {
    'n_estimators': [200,500,700],
    'max_depth': [10,15,20,25],
    'min_samples_leaf': [3,5,7]
}

In [None]:
gs = GridSearchCV(forest, params, cv=cv, verbose=verbose, n_jobs=number_of_jobs)

In [None]:
if grid_search_enable == True:
    gs.fit(X_train, y_train)

In [None]:
if grid_search_enable == True:
    print(gs.best_params_)

In [None]:
if grid_search_enable == True:
    forest_best_max_depth = gs.best_params_['max_depth']
    forest_best_min_samples = gs.best_params_['min_samples_leaf']
    forest_best_n_estimators = gs.best_params_['n_estimators']
    #forest_best_criterion = gs.best_params_['criterion']

In [None]:
if grid_search_enable == True:
    print(gs.best_estimator_)

### Retraining with best estimators

In [None]:
if grid_search_enable == True:
    # Pass only the non-default settings. The earlier call was a pasted
    # estimator repr that included `min_impurity_split` and
    # `max_features='auto'`, both removed in newer scikit-learn releases.
    # For a classifier 'auto' meant 'sqrt', so behaviour is unchanged.
    forest1 = RandomForestClassifier(criterion='entropy',
                       max_depth=forest_best_max_depth,
                       max_features='sqrt',
                       min_samples_leaf=forest_best_min_samples,
                       n_estimators=forest_best_n_estimators,
                       n_jobs=number_of_jobs, oob_score=True,
                       random_state=rand_state)

In [None]:
if grid_search_enable == True:
    forest1.fit(X_train, y_train)

In [None]:
if grid_search_enable == True:
    pred_forest1 = forest1.predict(X_test)

In [None]:
if grid_search_enable == True:
    print("Accuracy with Random Forest after Hyper Parameter Tuning", end=': ')
    rf_hyper_accuracy = accuracy_score(y_test, pred_forest1)
    print(rf_hyper_accuracy)

In [None]:
if grid_search_enable == True:
    for x in sorted(list(zip(forest1.feature_importances_, X_train.columns)), reverse=True):
        print(x)

In [None]:
if grid_search_enable == True:
    show_classification_rpt(forest1)

In [None]:
if grid_search_enable == True:
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, pred_forest1))

## Accuracy Summary Without Boosting

In [None]:
print("Accuracy with Logistic Regression", end=': ')
print(logit_accuracy)
print("Accuracy with Decision Tree", end=': ')
print(tree_accuracy)
print("Accuracy of Decision Tree after Hyper Parameter Tuning: ", end = ':')
print(dt_best_param_score)
print("Accuracy with Random Forest", end=': ')
print(rf_accuracy)
if grid_search_enable == True:
    print("Accuracy with Random Forest after Hyper Parameter Tuning", end=': ')
    print(rf_hyper_accuracy)

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
adabst_fit = AdaBoostClassifier(base_estimator=logit, random_state=rand_state, n_estimators=100)

In [None]:
adabst_fit.fit(X_train, y_train)

In [None]:
print("Accuracy with Ada Boost", end=': ')
ada_boost_predict = adabst_fit.predict(X_test)
ada_boost_accuracy = accuracy_score(y_test,ada_boost_predict)
print(ada_boost_accuracy)

In [None]:
show_classification_rpt(adabst_fit)

In [None]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, ada_boost_predict))

### Adaboost with Grid Search

In [None]:
params_adaboost = {
    'n_estimators': [100,200,300],
    'base_estimator': [logit, tree1],
    'learning_rate': [0.5,0.75,1.0]
}

In [None]:
ada_gs = GridSearchCV(adabst_fit, params_adaboost, cv=cv, n_jobs=number_of_jobs)

In [None]:
if grid_search_enable == True:
    ada_gs.fit(X_train, y_train)

In [None]:
if grid_search_enable == True:
    adabst_best_estimator = ada_gs.best_params_['base_estimator']
    adabst_best_learning_rate = ada_gs.best_params_['learning_rate']
    adabst_best_n_estimators = ada_gs.best_params_['n_estimators']

In [None]:
if grid_search_enable == True:
    ada_gs.best_params_

### Adaboost - Retraining with best estimators

In [None]:
if grid_search_enable == True:
    ada_best = AdaBoostClassifier(base_estimator=adabst_best_estimator, random_state=rand_state,
                                  n_estimators=adabst_best_n_estimators, learning_rate=adabst_best_learning_rate)

In [None]:
if grid_search_enable == True:
    ada_best.fit(X_train, y_train)
    ada_best_predict = ada_best.predict(X_test)

In [None]:
if grid_search_enable == True:
    print("Accuracy with Adaboost after Hyper Parameter Tuning", end=': ')
    ada_best_hyper_accuracy = accuracy_score(y_test, ada_best_predict)
    print(ada_best_hyper_accuracy)

In [None]:
if grid_search_enable == True:
    show_classification_rpt(ada_best)

In [None]:
if grid_search_enable == True:
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, ada_best_predict))

## Gradient Boost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gbc_fit = GradientBoostingClassifier(n_estimators=500,
                                     min_samples_split=2,min_samples_leaf=1,max_depth=1,random_state=rand_state)

In [None]:
gbc_fit.fit(X_train, y_train)

In [None]:
print("Accuracy with Gradient Boost", end=': ')
y_pred_gbc = gbc_fit.predict(X_test)
gbc_accuracy = accuracy_score(y_test, y_pred_gbc)
print(gbc_accuracy)

In [None]:
show_classification_rpt(gbc_fit)

In [None]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_gbc))

### Gradient boost with Grid Search

In [None]:
params_gbc = {
    'n_estimators': [100,300,500],
    'max_depth': [2,3,4],
    'learning_rate': [0.05,0.075,0.1]
}

In [None]:
gbc_gs = GridSearchCV(gbc_fit, params_gbc, cv=cv, n_jobs=number_of_jobs)

In [None]:
if grid_search_enable == True:
    gbc_gs.fit(X_train, y_train)

In [None]:
if grid_search_enable == True:
    gbc_best_estimator = gbc_gs.best_params_['n_estimators']
    gbc_best_max_depth = gbc_gs.best_params_['max_depth']
    gbc_best_learning_rate = gbc_gs.best_params_['learning_rate']

In [None]:
if grid_search_enable == True:
    print(gbc_gs.best_params_)

### Gradient Boost - With Best Parameters

In [None]:
if grid_search_enable == True:
    gbc_best = GradientBoostingClassifier(n_estimators=gbc_best_estimator,
                                     min_samples_split=2,min_samples_leaf=1,max_depth=gbc_best_max_depth,random_state=rand_state,
                                         learning_rate=gbc_best_learning_rate)

In [None]:
if grid_search_enable == True:
    gbc_best.fit(X_train, y_train)
    gbc_best_predict = gbc_best.predict(X_test)

In [None]:
if grid_search_enable == True:
    print("Accuracy with Gradient Boost after Hyper Parameter Tuning", end=': ')
    gbc_best_hyper_accuracy = accuracy_score(y_test, gbc_best_predict)
    print(gbc_best_hyper_accuracy)

In [None]:
if grid_search_enable == True:
    show_classification_rpt(gbc_best)

In [None]:
if grid_search_enable == True:
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, gbc_best_predict))

## LGBM

In [None]:
from lightgbm import LGBMClassifier

In [None]:
LGBM = LGBMClassifier(n_jobs=number_of_jobs, num_leaves=31)
LGBM.fit(X_train, y_train)

In [None]:
LGBM_predict = LGBM.predict(X_test)

In [None]:
print("Accuracy with LGBM", end=': ')
lgbm_accuracy = accuracy_score(y_test,LGBM_predict)
print(lgbm_accuracy)

In [None]:
show_classification_rpt(LGBM)

In [None]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, LGBM_predict))

### LGBM With Grid Search

In [None]:
lgbm_params = {
    'num_leaves': [31,35,41,45],
    'n_estimators': [50,75,100,150]
}

In [None]:
if grid_search_enable == True:
    lgbm_gs = GridSearchCV(LGBM, lgbm_params, cv=cv, n_jobs=number_of_jobs)
    lgbm_gs.fit(X_train, y_train)

In [None]:
if grid_search_enable == True:
    lgbm_best_estimator = lgbm_gs.best_params_['n_estimators']
    lgbm_best_num_leaves = lgbm_gs.best_params_['num_leaves']
    print(lgbm_gs.best_params_)

In [None]:
if grid_search_enable == True:
    show_classification_rpt(lgbm_gs)

### LGBM With Best Parameters

In [None]:
if grid_search_enable == True:
    LGBM_best = LGBMClassifier(n_jobs=number_of_jobs, num_leaves=lgbm_best_num_leaves, 
                               n_estimators=lgbm_best_estimator)
    LGBM_best.fit(X_train, y_train)

In [None]:
if grid_search_enable == True:
    LGBM_best_predict = LGBM_best.predict(X_test)
    LGBM_accuracy_score = accuracy_score(y_test,LGBM_best_predict)

In [None]:
if grid_search_enable == True:
    print("Accuracy with LGBM with Hyper Parameter Tuning", end=': ')
    print(LGBM_accuracy_score)

In [None]:
if grid_search_enable == True:
    show_classification_rpt(LGBM_best)

In [None]:
if grid_search_enable == True:
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, LGBM_best_predict))

## XG Boost

In [None]:
from xgboost import XGBClassifier

In [None]:
xgb_fit = XGBClassifier(max_depth=2, n_estimators=5000, random_state=rand_state, n_jobs=number_of_jobs)
xgb_fit.fit(X_train, y_train)

In [None]:
print("Accuracy with XG Boost", end=': ')
xgb_predict = xgb_fit.predict(X_test)
xgb_accuracy = accuracy_score(y_test,xgb_predict)
print(xgb_accuracy)

In [None]:
show_classification_rpt(xgb_fit)

In [None]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, xgb_predict))

### XG Boost with Grid Search

In [None]:
xgb_params = {
    'max_depth': [2,3,4],
    'n_estimators': [300,500,700],
    'learning_rate': [0.05,0.075,0.1]
}

In [None]:
xgb_gs = GridSearchCV(xgb_fit, xgb_params, cv=cv, n_jobs=number_of_jobs)

In [None]:
if grid_search_enable == True:
    xgb_gs.fit(X_train, y_train)

In [None]:
if grid_search_enable == True:
    print(xgb_gs.best_params_)

In [None]:
if grid_search_enable == True:
    xgb_best_max_depth = xgb_gs.best_params_['max_depth']
    xgb_best_n_estimators = xgb_gs.best_params_['n_estimators']
    xgb_best_learning_rate = xgb_gs.best_params_['learning_rate']

### Re-running XG Boost with Best Parameters

In [None]:
if grid_search_enable == True:
    xgb_best = XGBClassifier(max_depth=xgb_best_max_depth,
                             n_estimators=xgb_best_n_estimators, random_state=rand_state,
                             n_jobs=number_of_jobs, learning_rate = xgb_best_learning_rate)
    xgb_best.fit(X_train, y_train)
    xgb_best_predict = xgb_best.predict(X_test)

In [None]:
if grid_search_enable == True:
    print("Accuracy with XG Boost after Hyper Parameter Tuning", end=': ')
    xgb_best_hyper_accuracy = accuracy_score(y_test, xgb_best_predict)
    print(xgb_best_hyper_accuracy)

In [None]:
if grid_search_enable == True:
    show_classification_rpt(xgb_best)

In [None]:
if grid_search_enable == True:
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, xgb_best_predict))

## Accuracy Summary With Boosting

In [None]:
print("Accuracy with Logistic Regression", end=': ')
print(logit_accuracy)

print("Accuracy with Decision Tree", end=': ')
print(tree_accuracy)
print("Accuracy with Decision Tree after Hyper Parameter Tuning: ", end = ':')
print(dt_best_param_score)

print("Accuracy with Random Forest", end=': ')
print(rf_accuracy)
if grid_search_enable == True:
    print("Accuracy with Random Forest after Hyper Parameter Tuning", end=': ')
    print(rf_hyper_accuracy)

print("Accuracy with Ada Boost", end=': ')
print(ada_boost_accuracy)
if grid_search_enable == True:
    print("Accuracy with Ada Boost after Hyper Parameter Tuning", end=': ')
    print(ada_best_hyper_accuracy)

print("Accuracy with Gradient Boost", end=': ')
print(gbc_accuracy)
if grid_search_enable == True:
    print("Accuracy with Gradient Boost after Hyper Parameter Tuning", end=': ')
    print(gbc_best_hyper_accuracy)
    
print("Accuracy with LGBM", end=': ')
print(lgbm_accuracy)
if grid_search_enable == True:
    print("Accuracy with LGBM after Hyper Parameter Tuning", end=': ')
    print(LGBM_accuracy_score)
    
print("Accuracy with XG Boost", end=': ')
print(xgb_accuracy)
if grid_search_enable == True:
    print("Accuracy with XG Boost after Hyper Parameter Tuning", end=': ')
    print(xgb_best_hyper_accuracy)