In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from scipy.stats import uniform
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
# Notebook-wide matplotlib defaults: 12x4 figures, line width 3,
# and extra-large tick labels on both axes.
rcParams['figure.figsize'] = 12, 4
rcParams['lines.linewidth'] = 3
rcParams['xtick.labelsize'] = 'x-large'
rcParams['ytick.labelsize'] = 'x-large'


from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.base import BaseEstimator, RegressorMixin
from imblearn import under_sampling, over_sampling
import statsmodels.api as sm

## Load Data

In [2]:
# Read the prepared modeling table (comma is read_csv's default delimiter)
# and preview its first two rows.
df = pd.read_csv('../data/data_modeling.csv')
df.head(2)

Unnamed: 0,age,default,balance,housing,loan,duration,campaign,previous,y,contact_group,...,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,0,2143,1,0,4.35,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1
1,44,0,29,1,0,2.516667,1,0,0,0,...,0,0,1,0,1,0,0,0,0,1


In [3]:
# Separate the binary target from the feature matrix.
# 'Unnamed: 0' appears to be the row index that was written into the CSV
# (it counts 0, 1, ... in the preview), so it is not a real feature and is
# dropped as well; errors='ignore' keeps this cell working if the file is
# re-exported without that column.
y = df['y'].values
X = df.drop(columns=['y', 'Unnamed: 0'], errors='ignore')
print("Shape of X is {} and that of y is {}".format(X.shape, y.shape))

Shape of X is (45206, 25) and that of y is (45206,)


## Class Imbalance (SMOTE)

In [4]:
# Oversample the minority class until it is half the size of the majority class.
# sampling_strategy is passed by keyword because recent imbalanced-learn
# releases reject positional constructor arguments, and the sampler is seeded
# so that re-running the notebook produces the same synthetic rows.
# NOTE(review): this resamples the full dataset; any hold-out split taken from
# X_sm / y_sm will contain synthetic rows and rows that influenced training.
X_sm, y_sm = over_sampling.SMOTE(sampling_strategy=0.5, random_state=42).fit_resample(X, y)
print("Shape of X is {} and that of y is {}".format(X_sm.shape, y_sm.shape))

Shape of X is (59878, 25) and that of y is (59878,)


## Split Train-Test

In [6]:
# Hold out the test set from the ORIGINAL data first (stratified, so both parts
# keep the real class ratio), then oversample only the training portion.
# Splitting already-oversampled data would put SMOTE points interpolated from
# test rows into the training set and would score the model on synthetic rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_train, y_train = over_sampling.SMOTE(
    sampling_strategy=0.5, random_state=42
).fit_resample(X_train, y_train)

## Modeling

In [5]:
def eval_classification(model, xtrain, ytrain, xtest, ytest):
    """Print test-set classification metrics and the train-set F1 of a fitted model.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``. ``predict_proba`` (column 1 is taken as the
        positive class) or ``decision_function`` is used for the AUC when
        the estimator provides one.
    xtrain, ytrain : training features and true labels (used for train F1 only).
    xtest, ytest : held-out features and true labels.

    Returns
    -------
    None
        All results are printed.
    """
    # predict() may return hard labels or continuous scores; threshold at 0.5
    # either way so the label-based metrics always receive 0/1 values.
    raw_test = model.predict(xtest)
    test_labels = (raw_test > 0.5).astype(int)
    train_labels = (model.predict(xtrain) > 0.5).astype(int)

    # ROC AUC is a ranking metric: it needs continuous scores, not thresholded
    # labels, otherwise the curve collapses to a single operating point.
    if hasattr(model, "predict_proba"):
        test_scores = np.asarray(model.predict_proba(xtest))[:, 1]
    elif hasattr(model, "decision_function"):
        test_scores = model.decision_function(xtest)
    else:
        # No score API: fall back to the un-thresholded predict() output.
        test_scores = raw_test

    print("Accuracy (Test Set): %.2f" % accuracy_score(ytest, test_labels))
    print("Precision (Test Set): %.2f" % precision_score(ytest, test_labels))
    print("Recall (Test Set): %.2f" % recall_score(ytest, test_labels))
    print("AUC (Test Set - Proba) : %.2f" % roc_auc_score(ytest, test_scores))
    print("F1-Score (Test Set): %.2f" % f1_score(ytest, test_labels))
    print("F1-Score (Train Set): %.2f" % f1_score(ytrain, train_labels))
    
def show_feature_importance(model, feature_names=None, top_n=25):
    """Draw the largest feature importances of a fitted model as horizontal bars.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    feature_names : sequence of str, optional
        Labels for the importances. When omitted, ``model.feature_names_in_``
        is used if the estimator has it, otherwise the columns of the
        notebook-level ``X``.
    top_n : int, default 25
        How many of the highest-scoring features to plot.

    Returns
    -------
    None
        The chart is drawn on a new 10x8 figure.
    """
    if feature_names is None:
        # Prefer the names stored on the estimator itself so the labels
        # cannot drift from whatever the global X holds at call time.
        feature_names = getattr(model, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = X.columns

    feat_importances = pd.Series(model.feature_importances_, index=feature_names)
    ax = feat_importances.nlargest(top_n).plot(kind='barh', figsize=(10, 8))
    # barh draws the first row at the bottom; flip so the top feature is on top.
    ax.invert_yaxis()

    ax.set_xlabel('score')
    ax.set_ylabel('feature')
    ax.set_title('feature importance score')

def show_best_hyperparameter(model, hyperparameters):
    """Print the model's current value for every hyperparameter in the search space.

    ``hyperparameters`` is a mapping keyed by parameter name; only its keys are
    used, and each one is looked up in ``model.get_params()``.
    """
    for param_name, _candidates in hyperparameters.items():
        label = 'Best ' + param_name + ':'
        chosen = model.get_params()[param_name]
        print(label, chosen)