<h1>Bank Institution Term Deposit
Predictive Model</h1>
<p>This notebook attempts the use of XGboost, MLP and Logisitc Regression algorithm to predict customers who would or would not subscribe to their term deposit in the
future </p>

In [1]:
#Importing necessary libraries
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import OneHotEncoder,StandardScaler,MinMaxScaler
from sklearn.decomposition import PCA
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE
import imblearn
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,KFold,StratifiedKFold,train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.base import BaseEstimator, TransformerMixin,ClassifierMixin
import pickle

In [2]:
data=pd.read_csv('./bank-additional-full.csv',sep=';')

In [4]:
#encoding y values
data.y=data.y.map({'no':0,'yes':1})


In [3]:
#Check for missing data
data.isnull().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
duration          0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [5]:
#Creating new features
data['nr.employed_marital']=data.groupby(['marital'])['nr.employed'].transform('sum')
data['customer index']=data['cons.price.idx'] * data['cons.conf.idx']

In [6]:
# creating instance of one-hot-encoder
enc = OneHotEncoder(handle_unknown='ignore')

In [7]:
#Identifying the categorical and non categorical columns
cat=[]
non_cat=[]
for i in data:
    if data[i].dtype==object:
        cat.append(i)
    else:
        non_cat.append(i)

In [8]:
enc_df = pd.DataFrame(enc.fit_transform(data[cat]).toarray())
# merge with main data on key values
newdata=data[non_cat]
new_data= newdata.join(enc_df)
new_data


Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,43,44,45,46,47,48,49,50,51,52
0,56,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,57,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,37,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,40,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,56,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,334,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
41184,46,383,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
41185,56,189,2,999,0,-1.1,94.767,-50.8,1.028,4963.6,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
41186,44,442,1,999,0,-1.1,94.767,-50.8,1.028,4963.6,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


<p>Handling Outliers</p>

In [12]:
def remove_outlier(df):
    low = .05
    high = .95
    quant_df = df.quantile([low, high])
    for name in list(df.columns):
        if is_numeric_dtype(df[name]):
            df = df[(df[name] >= quant_df.loc[low, name]) 
                & (df[name] <= quant_df.loc[high, name])]
    return df
new_data_0=remove_outlier(new_data)

Scaling using MinMaxScaler

In [13]:
scaler=MinMaxScaler()

In [14]:
new_data_scaled=scaler.fit_transform(new_data_0.drop(['y'],axis=1))

In [15]:
new_data_scaled=pd.DataFrame(new_data_scaled,columns=new_data_0.drop(['y'],axis=1).columns)
new_data_scaled.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,43,44,45,46,47,48,49,50,51,52
0,0.96875,0.157821,0.0,0.0,0.0,0.930233,0.700382,0.972727,0.971813,0.75576,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.34375,0.265363,0.0,0.0,0.0,0.930233,0.700382,0.972727,0.971813,0.75576,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.4375,0.160615,0.0,0.0,0.0,0.930233,0.700382,0.972727,0.971813,0.75576,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.9375,0.378492,0.0,0.0,0.0,0.930233,0.700382,0.972727,0.971813,0.75576,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.59375,0.226257,0.0,0.0,0.0,0.930233,0.700382,0.972727,0.971813,0.75576,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


PCA ANALYSIS

Typically, we want the explained variance to be between 95–99%


In [4]:
%%writefile Util.py
pca = PCA().fit(new_data_scaled)
plt.rcParams["figure.figsize"] = (12,6)

fig, ax = plt.subplots()
xi = np.arange(1, len(pca.explained_variance_ratio_)+1, step=1)
y = np.cumsum(pca.explained_variance_ratio_)

plt.ylim(0.0,1.1)
plt.plot(xi, y, marker='o', linestyle='--', color='b')

plt.xlabel('Number of Components')
plt.xticks(np.arange(0, len(pca.explained_variance_ratio_), step=1),rotation=90) #change from 0-based array index to 1-based human-readable label
plt.ylabel('Cumulative variance (%)')
plt.title('The number of components needed to explain variance')

plt.axhline(y=0.95, color='r', linestyle='-')
plt.axhline(y=0.99, color='g', linestyle='-')
plt.text(0.5, 1, '99% cut-off threshold', color = 'g', fontsize=16)
plt.text(0.5, 0.85, '95% cut-off threshold', color = 'red', fontsize=16)

ax.grid(axis='x')
plt.savefig('PCA.jpg',bbox_inches='tight')
plt.show()

Writing Util.py


The plot shows that if we choose the number of component to be 23, we can retain 95% of the useful data

In [17]:
pca = PCA(n_components = 23)
pca.fit(new_data_scaled)
reduced = pca.transform(new_data_scaled)

In [40]:
#xi = np.arange(1, len(pca.explained_variance_ratio_)+1, step=1)
y = np.cumsum(pca.explained_variance_ratio_)
y


array([0.13820214, 0.23316893, 0.31207808, 0.38524077, 0.44411103,
       0.49119352, 0.53757345, 0.58054005, 0.6207757 , 0.65686338,
       0.68889751, 0.72018289, 0.75066337, 0.78023173, 0.80835008,
       0.83532741, 0.85891909, 0.88207007, 0.90178741, 0.91855478,
       0.9314657 , 0.9430439 , 0.95422244])

We have an imbalanced dataset, we are going to employ the imblearn libary to resample the dataset using the following 
<p>SMOTE</p>
<p>TOMEKLINKS</p>

In [82]:
X=reduced
y=new_data_0['y']

In [84]:
len(X)

22341

In [None]:
#Under sampling
tl = imblearn.under_sampling.TomekLinks(sampling_strategy='majority')
X_tl, y_tl = tl.fit_resample(X, y)

In [20]:
#Over sampling
smote = SMOTE(sampling_strategy='minority')
X_sm, y_sm = smote.fit_resample(X, y)

In [21]:
#Under sampling didn't provide a good fit
X_train,X_test,y_train,y_test=train_test_split(X_sm,y_sm,test_size=0.1,random_state=1234,shuffle=True)
X_train,X_val,y_train,y_val=train_test_split(X_train,y_train,test_size=0.1,random_state=1234,shuffle=True)


In [22]:
Xf_train,Xf_test,yf_train,yf_test=train_test_split(X_sm,y_sm,test_size=0.1,random_state=1234,shuffle=True)

In [125]:
%%writefile Data.py
class Preprocessing(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, data, y=None):
        return self
    def remove_outlier(self, df):
        low = .05
        high = .95
        quant_df = df.quantile([low, high])
        for name in list(df.columns):
            if is_numeric_dtype(df[name]):
                df = df[(df[name] >= quant_df.loc[low, name]) 
                    & (df[name] <= quant_df.loc[high, name])]
        return df

    def transform(self,data):
        scaler=MinMaxScaler()
        enc = OneHotEncoder(handle_unknown='ignore')
        cat=[]
        non_cat=[]
        
        if 'y' in data.columns:
            data.y=data.y.map({'no':0,'yes':1})
            for i in data:
                if data[i].dtype==object:
                    cat.append(i)
                else:
                    non_cat.append(i)
            enc_df = pd.DataFrame(enc.fit_transform(data[cat]).toarray())
            # merge with main data on key values
            newdata=data[non_cat]
            newdata= newdata.join(enc_df)
            newdata=self.remove_outlier(newdata)
            new_data_scaled=scaler.fit_transform(newdata.drop(['y'],axis=1))
            new_data_scaled=pd.DataFrame(new_data_scaled,columns=newdata.drop(['y'],axis=1).columns)
        else:
            for i in data:
                if data[i].dtype==object:
                    cat.append(i)
                else:
                    non_cat.append(i)
            enc_df = pd.DataFrame(enc.fit_transform(data[cat]).toarray())
            # merge with main data on key values
            newdata=data[non_cat]
            newdata= newdata.join(enc_df)
            newdata=self.remove_outlier(newdata)
            new_data_scaled=scaler.fit_transform(newdata)
            new_data_scaled=pd.DataFrame(new_data_scaled,columns=newdata.columns)
        pca = PCA(n_components = 23)
        pca.fit(new_data_scaled)
        reduced = pca.transform(new_data_scaled)
        reduced=pd.DataFrame(reduced,columns=range(reduced.shape[1]))
        reduced['y']=newdata['y'].values
        return reduced
        
        
        
        



Writing Data.py


In [110]:
#Both kfold and Stratifiedfold strategy gave similar performance
kfold = KFold(n_splits=5)
stratfold=StratifiedKFold(n_splits=5)

XGB ALGORITHM

In [23]:

# define eval metrics

def xgb_f1(y, t, threshold=0.5):
    try:
        t = t.get_label()
    except AttributeError:
        pass
    y_bin = (y > threshold).astype(int) # works for both type(y) == <class 'numpy.ndarray'> and type(y) == <class 'pandas.core.series.Series'>
    return 'f1',f1_score(t,y_bin)

In [35]:
params = {"objective": "binary:logistic", # for classification
          "booster" : "gbtree",   # use tree based models 
          "eta": 0.01,   # learning rate
          "max_depth": 10,    # maximum depth of a tree
          "subsample": 1.0,    # Subsample ratio of the training instances
          "colsample_bytree": 0.7,   # Subsample ratio of columns when constructing each tree
          "silent": 1,   # silent mode
          "seed": 10 ,  # Random number seed
           "n_estimators":300,
          "nfold":kfold
          }
num_boost_round = 4000

dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_val, y_val)
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
# train the xgboost model
model = xgb.train(params, dtrain, num_boost_round, evals=watchlist, \
  early_stopping_rounds= 100,feval=xgb_f1,verbose_eval=True)

Parameters: { n_estimators, nfold, silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-error:0.21784	eval-error:0.23982	train-f1:0.80338	eval-f1:0.78332
Multiple eval metrics have been passed: 'eval-f1' will be used for early stopping.

Will train until eval-f1 hasn't improved in 100 rounds.
[1]	train-error:0.19903	eval-error:0.22349	train-f1:0.81614	eval-f1:0.79329
[2]	train-error:0.15531	eval-error:0.17604	train-f1:0.84784	eval-f1:0.82558
[3]	train-error:0.13114	eval-error:0.15763	train-f1:0.87204	eval-f1:0.84482
[4]	train-error:0.11313	eval-error:0.14234	train-f1:0.88959	eval-f1:0.86062
[5]	train-error:0.10284	eval-error:0.13741	train-f1:0.90017	eval-f1:0.86657
[6]	train-error:0.09399	eval-error:0.12601	train-f1:0.90915	eval-f1:0.87758
[7]	train-error:0.09083	e

[100]	train-error:0.05916	eval-error:0.08608	train-f1:0.94267	eval-f1:0.91625
Stopping. Best iteration:
[0]	train-error:0.21784	eval-error:0.23982	train-f1:0.80338	eval-f1:0.78332



In [36]:
y_pred = model.predict(xgb.DMatrix(X_val))
xgb_f1(y_pred,y_val)

('f1', 0.9162462159434914)

In [46]:

#yf_train=pf.DataFRame(yf_train,coumns=new_data_scaled.columns[-1])
dtrain = xgb.DMatrix(Xf_train,yf_train)
dtest = xgb.DMatrix(Xf_test)
params = {"objective": "binary:logistic", # for classification
          "booster" : "gbtree",   # use tree based models 
          "eta": 0.01,   # learning rate
          "max_depth": 10,    # maximum depth of a tree
          "subsample": 1.0,    # Subsample ratio of the training instances
          "colsample_bytree": 0.7,   # Subsample ratio of columns when constructing each tree
          "silent": 1,   # silent mode
          "seed": 10 ,  # Random number seed
           "n_estimators":300,
          "nfold":kfold
          }
num_round = 1000



# train the xgboost model
model = xgb.train(params, dtrain, num_round)
# make prediction
preds = model.predict(dtest)

Parameters: { n_estimators, nfold, silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [26]:
class XGBClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y):
        x_train_total, y_train_total = X,y
        dtrain = xgb.DMatrix(x_train_total, y_train_total)
        # specify parameters via map
        params = {"objective": "binary:logistic", # for classification
          "booster" : "gbtree",   # use tree based models 
          "eta": 0.01,   # learning rate
          "max_depth": 10,    # maximum depth of a tree
          "subsample": 1.0,    # Subsample ratio of the training instances
          "colsample_bytree": 0.7,   # Subsample ratio of columns when constructing each tree
          "silent": 1,   # silent mode
          "seed": 10 ,  # Random number seed
           "n_estimators":300,
          }
        num_round = 2000
        self.model = xgb.train(params, dtrain, num_round)
        return self
    
    def predict(self, X):
        dtest = xgb.DMatrix(X)
        predict=np.expm1(0.995*(self.model.predict(dtest)))
        return predict
    def metric(self,y, t, threshold=0.5):
        try:
            t = t.get_label()
        except AttributeError:
            pass
        y_bin = (y > threshold).astype(int) # works for both type(y) == <class 'numpy.ndarray'> and type(y) == <class 'pandas.core.series.Series'>
        return 'f1',f1_score(t,y_bin)
    def score(self,X,y):
        y_pred=self.predict(X)
        score=self.metric(y_pred,y)
        return score
        

In [27]:
model=XGBClassifier()
model.fit(Xf_train,yf_train)

Parameters: { n_estimators, silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBClassifier()

In [28]:
y_pred = model.predict(Xf_test)
print(xgb_f1(y_pred,yf_test))
model.score(Xf_test,yf_test)


('f1', 0.9653890824622532)


('f1', 0.9653890824622532)

In [168]:
#saving the model to pickle
pickle.dump(model, open('xgb.pkl','wb'))

LOGISTIC REGRESSION

In [33]:
class LogRegClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y):
        stratfold=StratifiedKFold(n_splits=5)
        # Create logistic regression
        logistic = LogisticRegression()
        # Create regularization penalty space
        penalty = ['l1', 'l2']
        # Create regularization hyperparameter space
        C = np.logspace(0, 4, 100)

        # Create hyperparameter options
        hyperparameters = dict(C=C, penalty=penalty)
        clf = GridSearchCV(logistic, hyperparameters, cv=stratfold, verbose=0,scoring='f1')
        self.model = clf.fit(X_train, y_train)
        return self
    
    def predict(self, X):
        predict=self.model.predict(X)
        return predict
    def score(self,X,y):
        score=self.model.score(X,y)
        return score
        

In [None]:
model=LogRegClassifier()
model.fit(Xf_train,yf_train)

In [35]:
model.score(Xf_test,yf_test)

0.6490299823633157

In [147]:
# Create logistic regression
logistic = LogisticRegression()
# Create regularization penalty space
penalty = ['l1', 'l2']
# Create regularization hyperparameter space
C = np.logspace(0, 4, 100)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

In [148]:
clf = GridSearchCV(logistic, hyperparameters, cv=stratfold, verbose=0,scoring='f1')

In [None]:
best_model = clf.fit(X_train, y_train)

In [None]:
# View best hyperparameters
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])

In [151]:
best_model.score(X_val,y_val)

0.670632774472688

MLPCLASSIFIER

In [162]:
parameters = {'solver': ['lbfgs'], 'max_iter': [1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000 ], 'alpha': 10.0 ** -np.arange(1, 10), 'hidden_layer_sizes':np.arange(10, 15), 'random_state':[0,1,2,3,4,5,6,7,8,9]}
clf = RandomizedSearchCV(MLPClassifier(), parameters, cv=kfold,n_jobs=-1,scoring='f1')

clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.best_params_)

0.9224704336399474
{'solver': 'lbfgs', 'random_state': 3, 'max_iter': 2000, 'hidden_layer_sizes': 13, 'alpha': 1e-07}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [None]:
class MultiLayerClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y):
        stratfold=StratifiedKFold(n_splits=5)
        parameters = {'solver': ['lbfgs'], 'max_iter': [1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000 ], 
                      'alpha': 10.0 ** -np.arange(1, 10), 'hidden_layer_sizes':np.arange(10, 15), 
                      'random_state':[0,1,2,3,4,5,6,7,8,9]}
        clf = RandomizedSearchCV(MLPClassifier(), parameters, cv=stratfold,n_jobs=-1,scoring='f1')
        self.model = clf.fit(X_train, y_train)
        return self
    
    def predict(self, X):
        predict=self.model.predict(X)
        return predict
    def score(self,X,y):
        score=self.model.score(X,y)
        return score

In [None]:
model_mlp=MLPClassifier()
model_mlp.fit(Xf_train,yf_train)

In [37]:
#best_params_=[{'solver': 'lbfgs', 'random_state': 3, 'max_iter': 2000, 'hidden_layer_sizes': 13, 'alpha': 1e-07}]
model_mlp.score(Xf_test, y_test)

0.9507584597432905

Classification models Combining models into Model.py

In [1]:
%%writefile Model.py
class XGBClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y):
        x_train_total, y_train_total = X,y
        dtrain = xgb.DMatrix(x_train_total, y_train_total)
        # specify parameters via map
        params = {"objective": "binary:logistic", # for classification
          "booster" : "gbtree",   # use tree based models 
          "eta": 0.01,   # learning rate
          "max_depth": 10,    # maximum depth of a tree
          "subsample": 1.0,    # Subsample ratio of the training instances
          "colsample_bytree": 0.7,   # Subsample ratio of columns when constructing each tree
          "silent": 1,   # silent mode
          "seed": 10 ,  # Random number seed
           "n_estimators":300,
          }
        num_round = 2000
        self.model = xgb.train(params, dtrain, num_round)
        return self
    
    def predict(self, X):
        dtest = xgb.DMatrix(X)
        predict=np.expm1(0.995*(self.model.predict(dtest)))
        return predict
    def metric(self,y, t, threshold=0.5):
        try:
            t = t.get_label()
        except AttributeError:
            pass
        y_bin = (y > threshold).astype(int) # works for both type(y) == <class 'numpy.ndarray'> and type(y) == <class 'pandas.core.series.Series'>
        return 'f1',f1_score(t,y_bin)
    def score(self,X,y):
        y_pred=self.predict(X)
        score=self.metric(y_pred,y)
        return score
        
class LogRegClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y):
        stratfold=StratifiedKFold(n_splits=5)
        # Create logistic regression
        logistic = LogisticRegression()
        # Create regularization penalty space
        penalty = ['l1', 'l2']
        # Create regularization hyperparameter space
        C = np.logspace(0, 4, 100)

        # Create hyperparameter options
        hyperparameters = dict(C=C, penalty=penalty)
        clf = GridSearchCV(logistic, hyperparameters, cv=stratfold, verbose=0,scoring='f1')
        self.model = clf.fit(X, y)
        return self
    
    def predict(self, X):
        predict=self.model.predict(X)
        return predict
    def score(self,X,y):
        score=self.model.score(X,y)
        return score
        
class MultiLayerClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        pass
    
    def fit(self, X, y):
        stratfold=StratifiedKFold(n_splits=5)
        parameters = {'solver': ['lbfgs'], 'max_iter': [1000,1100,1200,1300,1400,1500,1600,1700,1800,1900,2000 ], 
                      'alpha': 10.0 ** -np.arange(1, 10), 'hidden_layer_sizes':np.arange(10, 15), 
                      'random_state':[0,1,2,3,4,5,6,7,8,9]}
        clf = RandomizedSearchCV(MLPClassifier(), parameters, cv=stratfold,n_jobs=-1,scoring='f1')
        self.model = clf.fit(X, y)
        return self
    
    def predict(self, X):
        predict=self.model.predict(X)
        return predict
    def score(self,X,y):
        score=self.model.score(X,y)
        return score

Writing Model.py


In [2]:
%%writefile Main.py
import pandas as pd
from pandas.api.types import is_numeric_dtype
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.preprocessing import OneHotEncoder,StandardScaler,MinMaxScaler
from sklearn.decomposition import PCA
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE

import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,KFold,StratifiedKFold,train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.neural_network import MLPClassifier
from sklearn.base import BaseEstimator, TransformerMixin,ClassifierMixin
from Model import MLPClassifier,XGBClassifier,LogRegClassifier
from Data import Preprocessing
import pickle
def model():
    data=pd.read_csv('./bank-additional-full.csv',sep=';')
    reduced_data=Preprocessing().fit_transform(data)
    smote = SMOTE(sampling_strategy='minority')
    X_sm, y_sm = smote.fit_resample(reduced_data.drop(['y'],axis=1), reduced_data.y)
    Xf_train,Xf_test,yf_train,yf_test=train_test_split(X_sm,y_sm,test_size=0.1,random_state=1234,shuffle=True)
    model_mlp=MultiLayerClassifier()
    model_xgb=XGBClassifier()
    model_logreg=LogRegClassifier()
    model_mlp.fit(Xf_train,yf_train)
    model_xgb.fit(Xf_train,yf_train)
    model_logreg.fit(Xf_train,yf_train)
    print ("The accuracy(f1 score) for {0} mode is {1}".format('Multilayer perceptron',model_mlp.score(Xf_test,yf_test)))
    print ("The accuracy(f1 score) for {0} mode is {1}".format('XGBClassifier',model_xgb.score(Xf_test,yf_test)))
    print ("The accuracy(f1 score) for {0} mode is {1}".format('Logistic Regression',model_logreg.score(Xf_test,yf_test)))
model()

Writing Main.py
