In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the semicolon-delimited CSV and preview the first rows.
data = pd.read_csv("bank-additional-full.csv", sep=';')
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [7]:
#%%writefile data.py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, QuantileTransformer
from imblearn.over_sampling import SMOTE

def preprocess(df):

    '''
    Engineer features, split, scale, project with PCA and oversample the train set.

    The input frame is not modified: engineered columns are added to an
    internal copy, so the function can be called repeatedly on the same frame.

    parameters
    ----------
    df : A pandas dataframe
          Must contain the columns ``y``, ``pdays``, ``previous``,
          ``emp.var.rate``, ``nr.employed``, ``euribor3m`` and
          ``cons.conf.idx``. Remaining non-numeric columns are one-hot encoded.

    Return
    ------
    X_train, X_test, y_train, y_test
      X_train, y_train : train split after standard scaling, PCA (10 components)
          and SMOTE oversampling of the minority class.
      X_test : test split after the same (train-fitted) scaling and PCA; it is
          never resampled.
      y_test : labels of the test split.

    '''
    # Work on a copy so the caller's dataframe does not silently gain columns.
    df = df.copy()

    #perform feature engineering and data preprocessing
    d = {'yes':1, 'no':0}
    # Explicit mapping instead of Series.replace: same result (labels missing
    # from `d` are kept as they are) without relying on replace() downcasting
    # an object column to integers, which pandas has deprecated.
    y = df['y'].map(lambda label: d.get(label, label))

    # Bucket pdays: 1 when below 16, 0 when above 30, 2 otherwise;
    # then weight the bucket by the `previous` column.
    df['new_pdays'] = df['pdays'].apply(lambda x: 1 if x < 16 else 0 if x > 30 else 2)
    df['new_pdays'] = df['previous'] * df['new_pdays']
    # Flag rows whose emp.var.rate is zero or negative.
    df['new_emp_rate'] = df['emp.var.rate'].apply(lambda x: 0 if x > 0 else 1)
    df['empxemp'] = df['nr.employed'] / df['euribor3m']

    #Turn categorical columns to dummies
    X = pd.get_dummies(df.drop(['y','cons.conf.idx','previous'], axis = 1))

    #Split data into train and test set
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = .1, random_state = 12)
    print('The shape of the train set is {}, the shape of the test set is {}'.format(X_train.shape, X_test.shape))

    #Scale the data: fit on the train split only, then reuse on the test split
    S = StandardScaler()
    X_train = S.fit_transform(X_train)
    X_test = S.transform(X_test)

    #Apply PCA on the train and test set
    # Seeded so that a randomized SVD solver gives the same projection on every run.
    pca = PCA(10, random_state = 12)
    x_train_pca = pca.fit_transform(X_train)
    x_test_pca = pca.transform(X_test)

    #Oversample the minority class (train split only)
    sm = SMOTE(k_neighbors=10, random_state= 10)

    # fit_resample is the supported name; fit_sample was removed from imbalanced-learn.
    X_m, y_m = sm.fit_resample(x_train_pca, y_train)

    return X_m, x_test_pca, y_m, y_test

Writing data.py


In [None]:
# Run the preprocessing pipeline on the loaded frame; it prints the split shapes
# and returns the resampled train split plus the hold-out split.
X_train, X_test, y_train, y_test = preprocess(data)

The shape of the train set is (37069, 64), the shape of the test set is (4119, 64)




In [12]:
%%writefile model.py
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, StratifiedKFold, KFold
from sklearn.metrics import f1_score, classification_report
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
import numpy as np

class model():
  '''
  This class implements Logistic Regression, Multi layer Perceptron and extreme gradient boosting algorithm

  Every public fitting method trains a baseline model, tunes its
  hyper-parameters and prints F1 scores; nothing is returned.

  Parameters
  ----------
  X_train : Dataframe, numpy 2D array
             The train set to be used for training
  X_test :  Dataframe, Numpy 2D array
            The hold out set, to be used for validating the model performance
  y_train : pandas series, numpy 1D array
             Labels for the train set
  y_test : pandas series, numpy 1D array
            Label for the test set

  Methods
  -------
  logit : To fit the data using logistic regression
  MLP : To fit the data using Multi layered perceptron
  XGB : To fit the data using extreme gradient boosting

  Example
  ------
  M = model(X_train, X_test, y_train, y_test)
  M.logit() #To fit logistic regression
  '''

  # Boosting rounds used while tuning XGBoost and when cross-validating the
  # tuned configuration, so both are measured on the same kind of model.
  _XGB_TUNING_ESTIMATORS = 300

  def __init__(self, X_train, X_test, y_train, y_test):
    '''Store the train / hold-out split that all fitting methods work on.'''
    self.X_train = X_train
    self.X_test = X_test
    self.y_train = y_train
    self.y_test = y_test


  def evaluate(self, X_train, X_test, y_train, y_test, model):

    '''
    Evaluate the performance of the model

    Parameters
    ---------
    X_train : Dataframe, numpy 2D array
               The train set the model was trained on
    X_test :  Dataframe, Numpy 2D array
              The hold out set, to be used for validating the model performance
    y_train : pandas series, numpy 1D array
               Labels for the train set
    y_test : pandas series, numpy 1D array
              Label for the test set
    model : instance
            A fitted instance of the model

    Return
    ------
    None. Prints the f1_score on the train set and test set, followed by the
    classification report of the test set.

    Example
    -------
    M.evaluate(X_train, X_test, y_train, y_test, fitted_model)

    '''

    #Obtain train and test f1 score (predict on the test set only once)
    train_score = f1_score(y_train, model.predict(X_train))
    test_pred = model.predict(X_test)
    test_score = f1_score(y_test, test_pred)

    #Print scores and classification report
    print (f'train score is {train_score}, test_score is {test_score}')
    print('----------------------------------------------------------')
    print(classification_report(y_test, test_pred))


  def _cv_scores(self, estimator):
    '''Return the mean 5-fold F1 of `estimator` on the train set as (stratified, plain KFold).'''
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    skf_score = cross_val_score(estimator, self.X_train, self.y_train, scoring='f1', cv=skf).mean()
    kf_score = cross_val_score(estimator, self.X_train, self.y_train, scoring='f1', cv=kf).mean()
    return skf_score, kf_score


  def _tune_and_report(self, estimator, params, **search_kwargs):
    '''Randomized-search `params` for `estimator`, then print hold-out and 5-fold CV F1 of the best model.'''
    print('Searching for best hyperparameter... ')

    #Init and fit the randomized search (seeded so the sampled candidates are reproducible)
    search = RandomizedSearchCV(estimator, params, scoring='f1', cv=10, n_iter=20,
                                random_state=10, **search_kwargs)
    search.fit(self.X_train, self.y_train)
    print('--------------------------------------\nDONE')
    print(f'Best Score {search.best_score_} Best Param {search.best_params_}')
    best = search.best_estimator_
    self.evaluate(self.X_train, self.X_test, self.y_train, self.y_test, best)

    print('Running Cross Val \nReturning 5fold CV Scores')
    print('--------------------------------------')

    skf_score, kf_score = self._cv_scores(best)

    print('--------------------------------------\nDONE')
    print('StratifiedKfold Score: {}, KFold Score: {}'.format(skf_score, kf_score))


  def objective_xgb(self, params):

    '''
    Define optimization objective for XGBOOST

    Parameters
    ---------
    params : dict
            Model parameters to be optimized (max_depth, gamma,
            learning_rate, colsample_bytree)

    Return:
    ------
    The negated stratified 5-fold F1 score. hyperopt's fmin minimises the
    objective, so the score has to be negated for the search to maximise F1.

    '''

    #set parameters to tune; keep them numeric (rounded to 3 decimals)
    params = {
        'max_depth': int(params['max_depth']),
        'gamma': round(float(params['gamma']), 3),
        'learning_rate': round(float(params['learning_rate']), 3),
        'colsample_bytree': round(float(params['colsample_bytree']), 3),
    }

    #build model with parameters
    xgb = XGBClassifier(random_state=23, n_estimators=self._XGB_TUNING_ESTIMATORS, **params)

    #Obtain cross validation score
    score_skf, score_kf = self._cv_scores(xgb)

    #print scores
    print("stratifiedKFold score {}, Kfold_score {}, params {}".format(score_skf,score_kf, params))
    return -score_skf


  def logit(self):
    '''
    Fit Logistic regression model

    Fits a baseline, tunes C with a randomized search and prints the F1
    scores (train / hold-out / 5-fold cross validation).

    Return
    ------
    None

    '''

    #Call the model
    print('fitting Logistic regression...')
    lr = LogisticRegression(random_state= 10, max_iter = 10000)
    #fit model
    lr.fit(self.X_train, self.y_train)
    #Obtain and print F1 score for test and train
    self.evaluate(self.X_train, self.X_test, self.y_train, self.y_test, lr)

    #Hyper parameter search and cross validation for logistic regression
    params = {'C': np.linspace(0.0001,0.001,20)}
    self._tune_and_report(LogisticRegression(random_state = 10, max_iter = 10000), params)


  def MLP(self):
    '''
    Fit MLP model

    Fits a baseline, tunes the hidden layer size and initial learning rate
    with a randomized search and prints the F1 scores
    (train / hold-out / 5-fold cross validation).

    Return
    ------
    None

    '''

    #Call the model
    print('fitting MLP...')
    mlp = MLPClassifier(random_state= 10, early_stopping= True, learning_rate= 'adaptive')
    #fit model
    mlp.fit(self.X_train, self.y_train)
    #Obtain and print F1 score for test and train
    self.evaluate(self.X_train, self.X_test, self.y_train, self.y_test, mlp)

    #Hyper parameter search and cross validation for the MLP
    params = {'hidden_layer_sizes': np.arange(100,600,100),
             'learning_rate_init': np.linspace(0.001,0.003,5)}
    self._tune_and_report(
        MLPClassifier(random_state= 10, early_stopping= True, learning_rate= 'adaptive'),
        params, n_jobs = 12)


  def XGB(self):
    '''
    Fit XGBOOST

    Fits a baseline with early stopping, tunes the hyper-parameters with
    hyperopt (TPE) and prints the F1 scores.

    Return
    ------
    None
    '''

    #Call the model
    print('fitting xgboost...')
    xgb = XGBClassifier(random_state= 10, n_estimators = 1000, verbosity=0)
    #fit model
    # NOTE(review): early stopping monitors the hold-out set, so the hold-out
    # score printed below is optimistic; a split of the train set would be a
    # cleaner validation set. Newer xgboost releases also expect eval_metric /
    # early_stopping_rounds on the constructor instead of fit() - confirm
    # against the pinned xgboost version before changing.
    xgb.fit(self.X_train, self.y_train, eval_set = [(self.X_test, self.y_test)], eval_metric = 'auc', early_stopping_rounds = 100, verbose = 0 )
    #Obtain and print F1 score for test and train
    self.evaluate(self.X_train, self.X_test, self.y_train, self.y_test, xgb)

    #Hyper parameter search and cross validation for XGBoost
    print('Searching for best hyperparameter... ')
    #Set search space
    space_xgb = {
        'max_depth': hp.quniform('max_depth', 2, 5, 1),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
        'gamma': hp.uniform('gamma', 0.0, 0.5),
        'learning_rate': hp.uniform('learning_rate', 0.01, 0.5)
    }

    print('Running Cross Val \nReturning 5fold CV Scores')
    print('--------------------------------------')

    best_xgb = fmin(fn=self.objective_xgb,
            space=space_xgb,
            algo=tpe.suggest,
            max_evals=10)

    # hp.quniform yields floats; max_depth must be an integer
    best_xgb['max_depth'] = int(best_xgb['max_depth'])

    print('--------------------------------------\nDONE')
    print(f'Best Params {best_xgb}')

    # Cross-validate with the same number of boosting rounds the search used
    xgb = XGBClassifier(n_estimators=self._XGB_TUNING_ESTIMATORS, random_state = 42, **best_xgb)
    skf_score, kf_score = self._cv_scores(xgb)

    print('--------------------------------------\nDONE')
    print('StratifiedKfold Score: {}, KFold Score: {}'.format(skf_score, kf_score))

Overwriting model.py


In [5]:
# Re-run preprocessing and hand its four outputs (train/test features and labels) to the model wrapper.
m = model(*preprocess(data))

The shape of the train set is (37069, 64), the shape of the test set is (4119, 64)


In [None]:
# Logistic regression: baseline fit, randomized search over C, then 5-fold CV; scores are printed.
m.logit()

fitting Logistic regression...
train score is 0.7204054431046774, test_score is 0.36596736596736595
----------------------------------------------------------
              precision    recall  f1-score   support

           0       0.95      0.74      0.83      3666
           1       0.25      0.69      0.37       453

    accuracy                           0.74      4119
   macro avg       0.60      0.72      0.60      4119
weighted avg       0.87      0.74      0.78      4119

Searching for best hyperparameter... 
--------------------------------------
DONE
Best Score 0.7207422962145781 Best Param {'C': 0.0001}
train score is 0.7205527831094051, test_score is 0.3648725212464589
----------------------------------------------------------
              precision    recall  f1-score   support

           0       0.95      0.73      0.83      3666
           1       0.25      0.71      0.36       453

    accuracy                           0.73      4119
   macro avg       0.60      0.7

In [None]:
# Multi-layer perceptron: baseline fit, randomized hyper-parameter search, then 5-fold CV; scores are printed.
m.MLP()

fitting MLP...
train score is 0.8332098074508795, test_score is 0.44731182795698926
----------------------------------------------------------
              precision    recall  f1-score   support

           0       0.96      0.83      0.89      3666
           1       0.33      0.69      0.45       453

    accuracy                           0.81      4119
   macro avg       0.64      0.76      0.67      4119
weighted avg       0.89      0.81      0.84      4119

Searching for best hyperparameter... 


In [None]:
# XGBoost: baseline fit with early stopping, hyperopt search, then 5-fold CV; scores are printed.
m.XGB()

fitting xgboost...
[0]	validation_0-auc:0.774015
Will train until validation_0-auc hasn't improved in 100 rounds.
[1]	validation_0-auc:0.7773
[2]	validation_0-auc:0.779476
[3]	validation_0-auc:0.782204
[4]	validation_0-auc:0.785559
[5]	validation_0-auc:0.785522
[6]	validation_0-auc:0.786294
[7]	validation_0-auc:0.788089
[8]	validation_0-auc:0.787723
[9]	validation_0-auc:0.788459
[10]	validation_0-auc:0.787645
[11]	validation_0-auc:0.789069
[12]	validation_0-auc:0.789923
[13]	validation_0-auc:0.789804
[14]	validation_0-auc:0.789022
[15]	validation_0-auc:0.788505
[16]	validation_0-auc:0.786867
[17]	validation_0-auc:0.786994
[18]	validation_0-auc:0.787021
[19]	validation_0-auc:0.787275
[20]	validation_0-auc:0.78716
[21]	validation_0-auc:0.786897
[22]	validation_0-auc:0.786754
[23]	validation_0-auc:0.786899
[24]	validation_0-auc:0.786756
[25]	validation_0-auc:0.786995
[26]	validation_0-auc:0.786016
[27]	validation_0-auc:0.786167
[28]	validation_0-auc:0.78578
[29]	validation_0-auc:0.785639


In [13]:
%%writefile main.py
from data import preprocess
from model import model
def main(data):

  '''
  The Main function to automate the processes

  Preprocesses `data`, then fits logistic regression, XGBoost and the MLP
  in that order. Each step prints its own scores; nothing is returned.

  Parameters
  ----------
  data : pandas dataframe
         The frame to pass to `preprocess`.
  '''

  M = model(*preprocess(data))
  M.logit()
  M.XGB()
  M.MLP()


# The entry-point guard must sit at module level and compare against
# '__main__'; nested inside main() and compared with 'main' it never ran.
if __name__ == '__main__':
  import sys
  import pandas as pd

  # Default to the CSV (and separator) the notebook loads; a different path
  # may be given as the first command-line argument.
  csv_path = sys.argv[1] if len(sys.argv) > 1 else 'bank-additional-full.csv'
  main(pd.read_csv(csv_path, sep=';'))

Writing main.py


In [14]:
# Write the packages installed in the current environment to requirements.txt.
!pip freeze > requirements.txt