In [2]:
#Importing libraries

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold


In [3]:
df = pd.read_csv("train_bank_marketing.csv")

In [4]:
df.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                   int64
dtype: object

In [5]:
#Data Pre-Processing

job = {"admin.":1,"blue-collar":2,"entrepreneur":3,
       "housemaid":4,"management":5,"retired":6,
       "self-employed":7,"services":8,"student":9,
       "technician":10,"unemployed":11,"unknown":0}

civil = {"divorced":1,"married":2,"single":3,"unknown":0}

educ = {"basic.4y":1,"basic.6y":2,"basic.9y":3,
        "high.school":4,"illiterate":5,"professional.course":6,
        "university.degree":7,"unknown":0}

binary = {"no":0,"yes":1,"unknown":1}

contact = {"cellular":0,"telephone":1}

month = {'jan': 1,
        'feb': 2,
        'mar': 3,
        'apr': 4,
        'may': 5,
        'jun': 6,
        'jul': 7,
        'aug': 8,
        'sep': 9,
        'oct': 10,
        'nov': 11,
        'dec': 12}

day = {"mon":1,"tue":2,"wed":3,"thu":4,"fri":5}
poutcome = {"failure":0,"nonexistent":1,"success":2}


In [6]:
df['job'] = df['job'].apply(lambda x: job[x])
df['marital'] = df['marital'].apply(lambda x: civil[x])
df['education'] = df['education'].apply(lambda x: educ[x])
df['default'] = df['default'].apply(lambda x: binary[x])
df['housing'] = df['housing'].apply(lambda x: binary[x])
df['loan'] = df['loan'].apply(lambda x: binary[x])
df['contact'] = df['contact'].apply(lambda x: contact[x])
df['month'] = df['month'].apply(lambda x: month[x])
df['day_of_week'] = df['day_of_week'].apply(lambda x: day[x])
df['poutcome'] = df['poutcome'].apply(lambda x: poutcome[x])

In [7]:
#Feature Engineering (Numeric Aggregation)

grouped_df = df.groupby('job')

df[['emp_mean', 'price_mean', 'conf_mean', 'euri_mean', 'nremp_mean']] = grouped_df[['emp.var.rate',  'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']].transform('mean')

df[['emp_std', 'price_std', 'conf_std', 'euri_std', 'nremp_std']] = grouped_df[['emp.var.rate',  'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']].transform('std')

df[['emp_count', 'price_count', 'conf_count', 'euri_count', 'nremp_count']] = grouped_df[['emp.var.rate',  'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']].transform('count')

In [8]:
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,emp_std,price_std,conf_std,euri_std,nremp_std,emp_count,price_count,conf_count,euri_count,nremp_count
0,43,5,2,0,1,0,0,0,4,4,...,1.530428,0.562251,4.552455,1.671139,68.939189,2228,2228,2228,2228,2228
1,30,1,2,7,0,0,0,0,7,5,...,1.619852,0.574460,4.750547,1.769459,75.893818,7840,7840,7840,7840,7840
2,55,1,2,1,0,0,0,1,6,4,...,1.619852,0.574460,4.750547,1.769459,75.893818,7840,7840,7840,7840,7840
3,34,10,3,7,0,0,0,0,10,5,...,1.515163,0.537007,4.560869,1.664106,71.333829,5067,5067,5067,5067,5067
4,32,2,2,2,1,0,0,1,5,4,...,1.444132,0.564063,4.141377,1.650958,59.721876,6914,6914,6914,6914,6914
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30886,34,2,2,4,0,1,0,1,5,5,...,1.444132,0.564063,4.141377,1.650958,59.721876,6914,6914,6914,6914,6914
30887,39,8,2,4,0,0,0,1,5,2,...,1.468619,0.561753,4.193546,1.671734,63.503402,2921,2921,2921,2921,2921
30888,23,9,3,0,0,0,0,0,7,2,...,1.484208,0.707133,6.156032,1.577079,79.439199,666,666,666,666,666
30889,43,2,2,1,1,0,0,1,6,4,...,1.444132,0.564063,4.141377,1.650958,59.721876,6914,6914,6914,6914,6914


In [9]:
X = df[df.columns.drop(['y'])]
y = df['y']

In [10]:
X1=X.to_numpy()
y1=y.to_numpy()

In [10]:
iterations = 30
max_features_params = [0.5, 0.6, 0.7, 0.8]
gbm_accuracy = {}

for mf in tqdm(max_features_params):
    train_acc_list = []
    test_acc_list = []
    
    for seed in range(iterations):
        X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                            test_size=0.2, 
                                                            random_state=seed,
                                                            stratify=y)

        gbm_model = GradientBoostingClassifier(max_features=mf)
        gbm_model.fit(X_train, y_train)
    
        train_acc_list.append(gbm_model.score(X_train, y_train))
        test_acc_list.append(gbm_model.score(X_test, y_test))
    
    gbm_accuracy[str(mf)] = {'train_acc': np.mean(train_acc_list),
                             'test_acc': np.mean(test_acc_list)}

  0%|          | 0/4 [00:00<?, ?it/s]

In [14]:
from sklearn.model_selection import StratifiedKFold
#Stratified KFold

max_features_params = [0.5, 0.6, 0.7, 0.8]
gbm_accuracy3 = {}
kf = StratifiedKFold(n_splits=5)

train_acc_list=[]
test_acc_list=[]
for mf in tqdm(max_features_params):
    for train_index, test_index in kf.split(X1, y1):
        print(train_index, test_index)
        X_train, X_test = X1[train_index], X1[test_index]
        y_train, y_test = y1[train_index], y1[test_index]
        gbm_model3 = GradientBoostingClassifier(max_features=mf,max_depth=4)
        gbm_model3.fit(X_train,y_train)
        train_acc_list.append(gbm_model3.score(X_train, y_train))
        test_acc_list.append( gbm_model3.score(X_test, y_test))
    gbm_accuracy3[str(mf)] ={'train_acc' : np.mean(train_acc_list),
                                'test_acc' : np.mean(test_acc_list)}

  0%|          | 0/4 [00:00<?, ?it/s]

[ 6167  6168  6170 ... 30888 30889 30890] [   0    1    2 ... 6232 6234 6259]
[    0     1     2 ... 30888 30889 30890] [ 6167  6168  6170 ... 12432 12437 12438]
[    0     1     2 ... 30888 30889 30890] [12351 12352 12353 ... 18844 18850 18861]
[    0     1     2 ... 30888 30889 30890] [18491 18492 18493 ... 24810 24825 24834]
[    0     1     2 ... 24810 24825 24834] [24696 24697 24698 ... 30888 30889 30890]
[ 6167  6168  6170 ... 30888 30889 30890] [   0    1    2 ... 6232 6234 6259]
[    0     1     2 ... 30888 30889 30890] [ 6167  6168  6170 ... 12432 12437 12438]
[    0     1     2 ... 30888 30889 30890] [12351 12352 12353 ... 18844 18850 18861]
[    0     1     2 ... 30888 30889 30890] [18491 18492 18493 ... 24810 24825 24834]
[    0     1     2 ... 24810 24825 24834] [24696 24697 24698 ... 30888 30889 30890]
[ 6167  6168  6170 ... 30888 30889 30890] [   0    1    2 ... 6232 6234 6259]
[    0     1     2 ... 30888 30889 30890] [ 6167  6168  6170 ... 12432 12437 12438]
[    0    

In [15]:
gbm_accuracy3

{'0.5': {'train_acc': 0.9307160740752203, 'test_acc': 0.9167072642532211},
 '0.6': {'train_acc': 0.9308779322065679, 'test_acc': 0.916950071753285},
 '0.7': {'train_acc': 0.9310370931390451, 'test_acc': 0.9168583327164189},
 '0.8': {'train_acc': 0.9312218820456378, 'test_acc': 0.9168124789155468}}

In [None]:
gbm_model3.feature_importances_
plt.figure(figsize=(15, 8))
plt.barh(y=X.columns,
         width=gbm_model3.feature_importances_)
plt.show()

In [14]:
gbm_model3 = GradientBoostingClassifier(max_features=0.5,max_depth=4)
gbm_model3.fit(X,y)

joblib.dump(gbm_model3,'gbm_model3.joblib')

['gbm_model3.joblib']

In [15]:
test = pd.read_csv("/content/drive/MyDrive/Colab/DECSC/test_bank_marketing.csv")

grouped_test = test.groupby('job')

test[['emp_mean', 'price_mean', 'conf_mean', 'euri_mean', 'nremp_mean']] = grouped_test[['emp.var.rate',  'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']].transform('mean')

test[['emp_std', 'price_std', 'conf_std', 'euri_std', 'nremp_std']] = grouped_test[['emp.var.rate',  'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']].transform('std')

test[['emp_count', 'price_count', 'conf_count', 'euri_count', 'nremp_count']] = grouped_test[['emp.var.rate',  'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed']].transform('count')

In [16]:
day = {"mon":1,"tue":2,"wed":3,"thu":4,"fri":5}
poutcome = {"failure":0,"nonexistent":1,"success":2}

test['job'] = test['job'].apply(lambda x: job[x])
test['marital'] = test['marital'].apply(lambda x: civil[x])
test['education'] = test['education'].apply(lambda x: educ[x])
test['default'] = test['default'].apply(lambda x: binary[x])
test['housing'] = test['housing'].apply(lambda x: binary[x])
test['loan'] = test['loan'].apply(lambda x: binary[x])
test['contact'] = test['contact'].apply(lambda x: contact[x])
test['month'] = test['month'].apply(lambda x: month[x])
test['day_of_week'] = test['day_of_week'].apply(lambda x: day[x])
test['poutcome'] = test['poutcome'].apply(lambda x: poutcome[x])

In [17]:
gbm = joblib.load('gbm_model3.joblib')
result6 = gbm.predict(test)
result6 = pd.DataFrame(result6)
result6['id'] = result6.index
result6.rename(columns = {0:'y'}, inplace = True)
result6.to_csv('submission6.csv',index=False)

titles = list(result6.columns)
titles[0], titles[1] = titles[1], titles[0]

result6 = result6[titles]

result6.to_csv('sub3.csv',index=False)
files.download('sub3.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>