In [None]:
# import required packages for data manipulation

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import os
import random
plt.show()

In [None]:
# INPUT: set path for data placement
# The location can be overridden without editing the notebook by setting the
# CHURN_DATA_DIR environment variable; the previous hard-coded path is the default.

data_dir = os.environ.get('CHURN_DATA_DIR', r'/home/mohnkhan/xavient_binary_balanced')
os.chdir(data_dir)
random.seed(42)
os.getcwd()

In [None]:
# INPUT: read the training CSV into a pandas dataframe and record its dimensions

df = pd.read_csv('telecom_churn_training.csv')
df.info()
# df_row = number of rows, df_length = number of columns (used later as a figure width)
df_row, df_length = df.shape

In [None]:
# display the number of rows in the loaded training data
df_row

In [None]:
# Count the missing values in each column (isnull() marks them True, sum() counts
# the Trues) and show the counts as a bar chart, one bar per column.

print("Columnwise missing value count")
df.isnull().sum().plot.bar(figsize=(df_length, 4))
#df["target"].value_counts().plot.pie(figsize=(4, 4))
#print(df.target.value_counts())

In [None]:
# INPUT: names of the customer identifier column and of the raw target column in the CSV

customer_identity_code = 'customerID'
target_code = 'Churn'

In [None]:
# Use the customer identifier as the dataframe index.
# NOTE: df is rebound here, so this cell cannot be re-run without reloading the data
# (the identifier is no longer a regular column after the first run).

df = df.set_index(customer_identity_code)

In [None]:
# Copy the raw target column into a working column named 'target'

df['target'] = df[target_code]

In [None]:
# INPUT: raw labels of the target column; label_target1 is recoded to 1 and
# label_target0 to 0 by dependent_col

label_target1 = 'Yes'
label_target0 = 'No'

In [None]:
# function to recode target levels

def dependent_col(row):
    """Recode the raw 'target' label of one dataframe row to an integer class.

    Returns 1 when the label equals the module-level ``label_target1``, 0 when it
    equals ``label_target0``, and 2 for anything else (including missing values).
    """
    label = row['target']
    if label == label_target1:
        return 1
    if label == label_target0:
        return 0
    # any label that is neither of the two configured values
    return 2

In [None]:
# Recode the target row by row and show the first rows next to the raw labels.
# The raw column is addressed through target_code (set in the INPUT cell) rather
# than a hard-coded name, so changing the INPUT cell is enough for other datasets.

df['target'] = df.apply(dependent_col, axis=1)
print(df.loc[:,['target',target_code]].head(5)) #check changes in target recoding

In [None]:
# Keep only rows whose target was recoded to a known class (0 or 1).
# dependent_col codes unrecognised or missing labels as 2, which a finiteness
# check would let through; isin([0, 1]) drops those rows as well as NaN.
# Rows are counted with shape[0]; shape[1] is the number of columns.

start = df.shape[0]
df = df[df['target'].isin([0, 1])]
finish = df.shape[0]
print("The number of row/rows dropped because of missing target variable is " + str(start-finish))

In [None]:
# Drop the raw target column now that the recoded 'target' column exists.
# The column name comes from target_code (set in the INPUT cell), so no edit is
# needed here when the dataset changes.

df = df.drop([target_code],axis=1)

In [None]:
# Target variable distribution: pie chart plus the raw class counts

df["target"].value_counts().plot.pie(figsize=(4, 4))
print(df.target.value_counts())

In [None]:
# INPUT: columns to be treated as discrete (non-continuous) variables.
# Columns holding strings are read by pandas as 'object' dtype by default; the
# list may also contain numerically coded categories.

object_columns = ['gender','SeniorCitizen','Partner','Dependents','PhoneService','MultipleLines','InternetService',
                  'OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies',
                  'Contract','PaperlessBilling','PaymentMethod'] 

In [None]:
# Cast the listed columns to 'object' dtype so that variables stored as numbers
# but categorical in nature are handled as categories further down

for column in object_columns:
    df[column] = df[column].astype('object')

In [None]:
# Bar chart of the level counts for each discrete column, one figure per column

print("Value count of each level for datatype 'object'")

for column in object_columns:
    print(column)
    df[column].value_counts().plot.bar(figsize=(df_length/4, 2))
    # show inside the loop so every column gets its own figure
    plt.show()

In [None]:
# Re-derive object_columns as every column of df that currently has 'object' dtype.
# NOTE(review): this replaces the hand-written INPUT list and also picks up any
# other column pandas read as object (for example a numeric column containing
# blank strings) - confirm that is intended for this dataset.

object_columns = df.select_dtypes(['object']).columns

In [None]:
# Missing value imputation for discrete columns: missing entries become their own
# level. The literal written to the data is "UKNOWN" (sic).

print("The count of columns were successfully imputed with missing value indicator 'UKNOWN'")

for column in object_columns:
    # object_columns was selected by dtype, so this check only guards against
    # the cell being run out of order
    if df[column].dtypes=="object":
        df[column] = df[column].fillna("UKNOWN").astype('object')    

In [None]:
# Convert every object column to pandas 'category' dtype so that integer codes
# can be taken from it in the next cell

for column in object_columns:
    if df[column].dtypes=="object":
        df[column] = df[column].astype('category')

In [None]:
# Replace each categorical column by its integer category codes (pandas orders
# the categories, so codes follow numeric / alphabetical order of the levels)

df[object_columns] = df[object_columns].apply(lambda x: x.cat.codes)

In [None]:
# chisquare test function

import scipy.stats as scs

def categories(series):
    """Return the inclusive integer range from the series minimum to its maximum."""
    lowest, highest = int(series.min()), int(series.max())
    return range(lowest, highest + 1)

def chi_square_of_df_cols(df, col1, col2):
    """Return the chi-square independence test p-value for two integer-coded columns.

    A contingency table is built with one row per value in ``categories(df[col1])``
    and one column per value in ``categories(df[col2])``; the table is passed to
    ``scipy.stats.chi2_contingency`` and element [1] of its result (the p-value)
    is returned.
    """
    df_col1, df_col2 = df[col1], df[col2]
    col2_levels = categories(df_col2)

    # Series.sum() counts the matching rows in vectorised code; the built-in sum()
    # would iterate the boolean Series element by element in Python.
    result = [[int(((df_col1 == cat1) & (df_col2 == cat2)).sum())
               for cat2 in col2_levels]
              for cat1 in categories(df_col1)]

    return scs.chi2_contingency(result)[1]

In [None]:
# Compute the chi-square p-value (rounded to 3 decimals) of every categorical
# column against the target. No filtering happens here: every column is kept so
# the p-values can be displayed in the next cell.

object_columns_final = []
chi_square_pvalue_final = []
for column in object_columns:
    chisquare_pvalue = round(chi_square_of_df_cols(df, column, 'target').astype('float64'),3)
    #if chi_square_of_df_cols(df, column, 'target') <= 0.05:
    object_columns_final.append(column)
    chi_square_pvalue_final.append(chisquare_pvalue)

In [None]:
# Build a two-column table (feature name, p-value) to view the chi-square
# p-value of each categorical column with respect to the target

df_catname = pd.DataFrame({'feature_name':object_columns_final})
df_pvalue = pd.DataFrame({'p-value':chi_square_pvalue_final})

# the two single-column frames share the default integer index, so they align row by row
frames = [df_catname,df_pvalue]
print(pd.concat(frames,axis=1))

In [None]:
# final list of categorical variables that survive the p-value check in the chi square test

object_columns_final = []

for column in object_columns:
    # run the test once per column; the unrounded p-value drives the decision
    chisquare_pvalue = chi_square_of_df_cols(df, column, 'target')
    if chisquare_pvalue <= 0.05:
        object_columns_final.append(column)
print("Categorical variables selected for modeling post chi-square test")
print(object_columns_final)

In [None]:
# Collect the sample variance of every chi-square-selected categorical column.
# Nothing is filtered here; the values are displayed in the next cell.

object_columns_final_nz = []
object_columns_final_variance = []

for column in object_columns_final:
    # Only ddof is passed: the former axis/skipna/level/numeric_only arguments were
    # the defaults, and `level` is no longer accepted by pandas 2.x.
    object_columns_variance = df[column].var(ddof=1)
    object_columns_final_nz.append(column)
    object_columns_final_variance.append(object_columns_variance)

In [None]:
# Show the variance of each selected categorical column as a two-column table
df_object_columns_final_nz = pd.DataFrame({'feature_name':object_columns_final_nz})
df_variance = pd.DataFrame({'variance':object_columns_final_variance})
frames = [df_object_columns_final_nz,df_variance]

print(pd.concat(frames,axis=1))

In [None]:
# Near-zero-variance filter: keep the categorical columns whose sample variance
# is above 0.01

object_columns_final_nz = []
object_columns_final_variance = []

for column in object_columns_final:
    # Only ddof is passed: the former axis/skipna/level/numeric_only arguments were
    # the defaults, and `level` is no longer accepted by pandas 2.x.
    object_columns_variance = df[column].var(ddof=1)
    if  object_columns_variance > 0.01:
        object_columns_final_nz.append(column)
        object_columns_final_variance.append(object_columns_variance)
print("Categorical variables with variance above 0.01")
print(object_columns_final_nz)
print("number of categorical columns: "+ str(len(object_columns_final_nz)))

In [None]:
# saving categorical columns for scoring

import pickle
object_columns_final_nz_index = object_columns_final_nz

# the context manager closes the file even if pickling fails
with open("object_columns_final_nz.pickle","wb") as pickle_out_cat:
    pickle.dump(object_columns_final_nz_index, pickle_out_cat)

In [None]:
# load the saved categorical columns back (round-trip check for scoring)
import pickle
# the context manager closes the read handle, which was previously left open
with open("object_columns_final_nz.pickle","rb") as pickle_in_cat:
    object_columns_final_nz_index = pickle.load(pickle_in_cat)
len(object_columns_final_nz_index)
#df[object_columns_final_nz_index]

In [None]:
# INPUT: numeric columns, grouped by missing-value treatment

# columns whose missing values are imputed with the column mean
numeric_columns_mean = ['MonthlyCharges','TotalCharges'] # impute missing numeric columns with mean

# columns whose missing values are imputed with zero

numeric_columns_zero = ['tenure'] # populate with numeric columns

# every numeric column is scaled later on
scale_columns = numeric_columns_mean + numeric_columns_zero
length = len(scale_columns) # for figure width

In [None]:
# Box plot of the numeric columns before scaling
df[scale_columns].plot.box(figsize=(length*2,4))
print("Check for continous variable scaling")

In [None]:
# Mean imputation. Columns that are not int64/float64 are left untouched.

for column in numeric_columns_mean:
    if df[column].dtypes not in ["int64","float64"]:
        continue
    df[column] = df[column].fillna(df[column].mean())

# Zero imputation, with the same dtype restriction.

for column in numeric_columns_zero:
    if df[column].dtypes not in ["int64","float64"]:
        continue
    df[column] = df[column].fillna(0)

# all numerical columns, mean-imputed ones first

scale_columns = numeric_columns_mean + numeric_columns_zero

In [None]:
# Standardise the numeric columns with sklearn.preprocessing.scale.
# Scaling puts the columns on a comparable scale; it does not remove outliers.
# NOTE(review): the scaling is applied to the full dataframe before the
# train/test split, so test rows influence the scaling statistics.
from sklearn.preprocessing import scale
for column in scale_columns:
    # non int64/float64 columns are skipped silently
    if df[column].dtypes in ["int64","float64"] :
        df[column] = scale(df[column].astype('float64'))

In [None]:
# Box plot of the numeric columns after scaling
print("scaled continious variables")
df[scale_columns].plot.box(figsize=(length*2,4))

In [None]:
# Collect the sample variance of every numeric column.
# Nothing is filtered here; the values are displayed in the next cell.

scale_columns_final_nz = []
scale_columns_variance_final = []

for column in scale_columns:
    # Only ddof is passed: the former axis/skipna/level/numeric_only arguments were
    # the defaults, and `level` is no longer accepted by pandas 2.x.
    scale_columns_variance = df[column].var(ddof=1)
    scale_columns_final_nz.append(column)
    scale_columns_variance_final.append(scale_columns_variance)

In [None]:
# Show the variance of each numeric column as a two-column table
df_scale_columns_final = pd.DataFrame({'feature_name':scale_columns_final_nz})
df_variance = pd.DataFrame({'variance':scale_columns_variance_final})
frames = [df_scale_columns_final,df_variance]

print(pd.concat(frames,axis=1))

In [None]:
# Near-zero-variance filter: keep the numeric columns whose sample variance is
# above 0.01

scale_columns_final_nz = []
scale_columns_variance_final = []

for column in scale_columns:
    # Only ddof is passed: the former axis/skipna/level/numeric_only arguments were
    # the defaults, and `level` is no longer accepted by pandas 2.x.
    scale_columns_variance = df[column].var(ddof=1)
    if  scale_columns_variance > 0.01:
        scale_columns_final_nz.append(column)
        scale_columns_variance_final.append(scale_columns_variance)
print("numeric variables with variance above 0.01")
print(scale_columns_final_nz)
print("number of numeric columns: "+ str(len(scale_columns_final_nz)))

In [None]:
# saving numeric columns for scoring
import pickle

scale_columns_final_nz_index = scale_columns_final_nz
# the context manager closes the file even if pickling fails
with open("scale_columns_final_nz.pickle","wb") as pickle_out_num:
    pickle.dump(scale_columns_final_nz_index, pickle_out_num)

In [None]:
# load the saved numeric columns back (round-trip check for scoring)
import pickle

# the context manager closes the read handle, which was previously left open
with open("scale_columns_final_nz.pickle","rb") as pickle_in_num:
    scale_columns_final_nz_index = pickle.load(pickle_in_num)
len(scale_columns_final_nz_index)

In [None]:
# Assemble the modelling inputs: the target column and the feature columns.
# Feature order is numeric columns first, then categorical columns; the models
# below are trained on df[feature_columns] in exactly this order.

target_column = ['target']
feature_columns = scale_columns_final_nz + object_columns_final_nz
print("target_column: " + str(target_column))
print("feature_columns: " + str(feature_columns))
print("dataframe shape: " + str(df[feature_columns].shape))
# number of features, used to size the feature-importance figures
fig_length = df[feature_columns].shape[1]

In [None]:
from sklearn.model_selection import train_test_split

# 90/10 hold-out split of the feature matrix and the flattened target vector
(training_features, test_features,
 training_target, test_target) = train_test_split(
    df[feature_columns].values,
    df[target_column].values.ravel(),
    test_size = .1,
    random_state=12,
)
training_features.shape, test_features.shape, training_target.shape, test_target.shape

In [None]:
from imblearn.over_sampling import SMOTE

# Split the training portion once more into train / validation parts
X_train, X_val, y_train, y_val \
= train_test_split(training_features, training_target, test_size = .1, random_state=12)

# Using smote to increase the number of under-represented class.
# `sampling_strategy` and `fit_resample` are the current imbalanced-learn names
# for the removed `ratio` argument and `fit_sample` method.
sm = SMOTE(random_state = 12, sampling_strategy = 'minority')

X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
X_train_res.shape, y_train_res.shape, X_val.shape, y_val.shape

In [None]:
# import metrics for model evaluation

from sklearn.metrics import recall_score,accuracy_score,confusion_matrix,classification_report,precision_score
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import cross_val_score
from keras.wrappers.scikit_learn import KerasClassifier
# create function to evaluate model performance

def evaluate(model, X_test, y_test):
    """Print a set of classification metrics for ``model`` and return its accuracy.

    The model's predictions for ``X_test`` are compared with ``y_test``; accuracy,
    Cohen's kappa, recall, the confusion-matrix cells (tn, fp, fn, tp), the
    confusion matrix itself and the classification report are printed.
    The four-way unpacking of the confusion matrix requires a 2x2 matrix.
    """
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    cohen_kappa = cohen_kappa_score(y_test, predictions, sample_weight=None)
    recall = recall_score(y_test, predictions)
    matrix = confusion_matrix(y_test, predictions)
    # the matrix is deterministic, so its cells are taken from the one already computed
    tn, fp, fn, tp = matrix.ravel()
    report = classification_report(y_test, predictions)

    print(f"accuracy :{accuracy}")
    print(f"cohen_kappa :{cohen_kappa}")
    print(f"recall :{recall}")
    print(" tn, fp, fn, tp :")
    print(tn, fp, fn, tp)
    print("matrix :")
    print(matrix)
    print("report :")
    print(report)
    return accuracy

In [None]:
from keras.models import Sequential
from keras.layers import Dense,Dropout, Activation

# input dimension = number of feature columns in the training matrix
input_dimenation = int(training_features.shape[1])

# output dimension: a single unit (used with a sigmoid activation below)
output = 1

# batch size: one tenth of the number of rows in df, rounded
batch = int(round(df.shape[0]/10,0))

# number of training epochs
epoch = 100



# Function to create model, required for KerasClassifier
def baseline_model():
    """Build and compile the baseline network used with KerasClassifier.

    One hidden ReLU layer as wide as the input (``input_dimenation``) followed by
    an ``output``-unit sigmoid layer; compiled with binary cross-entropy, the Adam
    optimizer and accuracy as metric.
    """
    network = Sequential([
        Dense(input_dimenation, input_dim=input_dimenation, activation='relu'),
        Dense(output, activation='sigmoid'),
    ])
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import KFold
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn import model_selection
import matplotlib.pyplot as plt

# generate list of models  
models = []
models.append(('XGB', XGBClassifier(n_estimators = 500,learning_rate=.1, random_state=21)))
models.append(('GBM', GradientBoostingClassifier(n_estimators = 500,learning_rate=.1, random_state=21)))
models.append(('RF', RandomForestClassifier(n_estimators = 500 ,random_state=21)))
models.append(('DT', DecisionTreeClassifier(splitter='random', random_state=21)))
models.append(('ADA', AdaBoostClassifier(n_estimators = 500,learning_rate=.1, random_state=21)))
models.append(('LM', LogisticRegression(multi_class = 'ovr', solver='saga', random_state=21)))
#models.append(('TF', KerasClassifier(build_fn=baseline_model, epochs=epoch, batch_size=batch, verbose=0))) 

# check model performance
results = []
names = []
seed = 7
msgall = []
scoring='accuracy'
# random_state only has an effect when shuffle=True, and current scikit-learn
# rejects a seed without shuffling. One seeded splitter is shared by all models
# so they are compared on identical folds.
kfold = model_selection.KFold(n_splits = 10, shuffle=True, random_state=seed)
for name, model in models:
    #cv_results = cross_val_score(model, X_train_res, y_train_res, cv=kfold, scoring = scoring)
    cv_results = cross_val_score(model, training_features, training_target, cv=kfold, scoring = scoring)
    results.append(cv_results)
    names.append(name)
    msg = (name, cv_results.mean(), cv_results.std())
    print (msg)
    msgall.append(msg)

# compare algorithms: one box per model over its 10 fold accuracies
fig = plt.figure()
fig.suptitle("algorithm comparision")
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()


In [None]:
# import ensemble models

from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier,AdaBoostClassifier
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Parameter lists for the GBM randomized search
GBM = GradientBoostingClassifier()

# Number of boosting iterations: 10 evenly spaced values from 200 to 2000
n_estimators = [int(x) for x in np.linspace(200, 2000, num = 10)]
learning_rate = [0.1, 0.05, 0.02, 0.01]
# NOTE(review): max_features is not part of random_grid_gbm below, so it is never
# searched. 'None', '1' and '0.1' are strings here, not None / 1 / 0.1 - fix the
# literals before adding this list to the grid.
max_features = ['sqrt','auto','log2','None','1','0.1']
# NOTE(review): 'deviance' is the older scikit-learn name of this loss; confirm it
# is accepted by the installed version.
loss = ['deviance', 'exponential']
max_depth = [4, 6, 8]
criterion = ['friedman_mse']
min_samples_split = [2, 5, 10]
min_samples_leaf = [20,50,100,150]
random_state = [21]

# Create the random grid

random_grid_gbm = {'n_estimators': n_estimators,
               'learning_rate': learning_rate,
               'loss':loss,
               'max_depth': max_depth,
               'criterion': criterion,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'random_state': random_state}
pprint(random_grid_gbm)

In [None]:
GBM = GradientBoostingClassifier()
# Random search over random_grid_gbm: 5 sampled parameter combinations (n_iter),
# 10-fold cross validation (cv), all available cores (n_jobs = -1)
#gbm_smote = RandomizedSearchCV(estimator = GBM, param_distributions = random_grid_gbm, n_iter = 5, cv = 3, verbose=1, random_state=42, n_jobs = -1)
gbm = RandomizedSearchCV(estimator = GBM, param_distributions = random_grid_gbm, n_iter = 5, cv = 10, verbose=1, random_state=42, n_jobs = -1)

In [None]:
# Fit the gbm SMOTE model
#gbm_smote.fit(X_train_res, y_train_res)

In [None]:
# Check accuracy on test set

#gbm_smote_accuracy = evaluate(gbm_smote,test_features,test_target)

In [None]:
# Fit the GBM randomized search on the training split (no SMOTE resampling)

gbm.fit(training_features, training_target)

In [None]:
# Print the GBM metrics on the held-out test set and keep the accuracy

gbm_accuracy = evaluate(gbm,test_features,test_target)

In [None]:
# feature importance plot for the tuned GBM (trained without SMOTE)

print("gbm without SMOTE feature importance")
# Label the importances with the columns the model was trained on, in training
# order. The models are fit on df[feature_columns], so taking names from df's own
# column order mislabels the bars.
feature_names = pd.Index(feature_columns)
feature_importance = gbm.best_estimator_.feature_importances_
# ascending order, so the most important feature ends up at the top of the chart
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(10,fig_length/3))
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, feature_names[sorted_idx])
plt.show()

In [None]:
# Base random forest and the parameter lists for its randomized search

rf = RandomForestClassifier()

# Number of trees in random forest: 10 evenly spaced values from 200 to 2000
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
# NOTE(review): confirm 'auto' is still accepted by the installed scikit-learn
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree: 10, 20, ..., 110 plus None
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
random_state = [21]

# Create the random grid

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'random_state':random_state}
pprint(random_grid)

In [None]:
# Random search over random_grid: 5 sampled parameter combinations (n_iter),
# 10-fold cross validation (cv), all available cores (n_jobs = -1).
# A fresh RandomForestClassifier is passed as estimator: `rf` is rebound to the
# search object below, so reusing `rf` as the estimator would wrap a search inside
# a search whenever this cell is run a second time.

#rf_smote = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 5, cv = 10, verbose=1, random_state=42, n_jobs = -1)
rf = RandomizedSearchCV(estimator = RandomForestClassifier(), param_distributions = random_grid, n_iter = 5, cv = 10, verbose=0, random_state=42, n_jobs = -1)

In [None]:
# Fit the random search model

#rf_smote.fit(X_train_res, y_train_res)

In [None]:
# Check accuracy on test set

#rf_smote_accuracy = evaluate(rf_smote,test_features,test_target)

In [None]:
# Fit the random forest randomized search on the training split (no SMOTE resampling)

rf.fit(training_features, training_target)

In [None]:
# Print the random forest metrics on the held-out test set and keep the accuracy

rf_accuracy = evaluate(rf,test_features,test_target)

In [None]:
# feature importance plot for the tuned random forest (trained without SMOTE)

print("rf without SMOTE feature importance")
# Label the importances with the columns the model was trained on, in training
# order. The models are fit on df[feature_columns], so taking names from df's own
# column order mislabels the bars.
feature_names = pd.Index(feature_columns)
feature_importance = rf.best_estimator_.feature_importances_
# ascending order, so the most important feature ends up at the top of the chart
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(10,fig_length/3))
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, feature_names[sorted_idx])
plt.show()

In [None]:
# Parameter lists for the AdaBoost randomized search

ADA = AdaBoostClassifier()

# Number of boosting iterations: 10 evenly spaced values from 200 to 2000
n_estimators = [int(x) for x in np.linspace(200, 2000, num = 10)]
learning_rate = [0.1, 0.05, 0.02, 0.01]
# NOTE(review): confirm both algorithm names are accepted by the installed scikit-learn
algorithm  = ['SAMME', 'SAMME.R']
# Maximum number of levels in tree (defined but not part of random_grid_ada)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required at each leaf node (defined but not part of random_grid_ada)
min_samples_leaf = [1, 2, 4]
random_state = [21]

# Create the random grid
random_grid_ada = {'n_estimators': n_estimators,
                   'learning_rate': learning_rate,
                   'algorithm':algorithm,
 #                 'max_depth': max_depth,
 #                 'min_samples_leaf': min_samples_leaf,
                   'random_state': random_state}
pprint(random_grid_ada)

In [None]:
# Create the base AdaBoost model to tune
ada = AdaBoostClassifier()
# Random search over random_grid_ada: 5 sampled parameter combinations (n_iter),
# 10-fold cross validation (cv), all available cores (n_jobs = -1)
#ada_smote = RandomizedSearchCV(estimator = ada, param_distributions = random_grid_ada, n_iter = 5, cv = 10, verbose=1, random_state=42, n_jobs = -1)
ada = RandomizedSearchCV(estimator = ada, param_distributions = random_grid_ada, n_iter = 5, cv = 10, verbose=0, random_state=42, n_jobs = -1)


In [None]:
# Fit the random search model

#ada_smote.fit(X_train_res, y_train_res)

In [None]:
# Check accuracy on test set

#ada_smote_accuracy = evaluate(ada_smote,test_features,test_target)

In [None]:
# Fit the AdaBoost randomized search on the training split (no SMOTE resampling)

ada.fit(training_features, training_target)

In [None]:
# Print the AdaBoost metrics on the held-out test set and keep the accuracy

ada_accuracy = evaluate(ada,test_features,test_target)

In [None]:
# feature importance plot for the tuned AdaBoost model (trained without SMOTE)

print("ada without SMOTE feature importance")
# Label the importances with the columns the model was trained on, in training
# order. The models are fit on df[feature_columns], so taking names from df's own
# column order mislabels the bars.
feature_names = pd.Index(feature_columns)
feature_importance = ada.best_estimator_.feature_importances_
# ascending order, so the most important feature ends up at the top of the chart
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(10,fig_length/3))
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, feature_names[sorted_idx])
plt.show()

In [None]:
# Keras neural-network model on the churn dataset: imports
import numpy as np
from pandas import read_csv
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import SGD
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# seed numpy's global random generator for the Keras section
seed=7
np.random.seed(seed)

In [None]:
# Fix the input dimension to the number of feature columns.
# It is taken from training_features, the matrix the network is fit on, instead
# of the SMOTE output, so this section does not depend on the resampling cell.
input_dimenation = training_features.shape[1]

In [None]:
# INPUT: training parameters passed to KerasClassifier (epochs and batch size)
epochs_input = 100
batch_size_input = 50

In [None]:
def create_model():
    """Build and compile the two-hidden-layer network used with KerasClassifier.

    Two ReLU layers as wide as the input (``input_dimenation``) followed by an
    ``output``-unit sigmoid layer, all with the 'normal' kernel initializer;
    compiled with binary cross-entropy, SGD with momentum and accuracy as metric.
    """
    network = Sequential([
        Dense(input_dimenation, input_dim=input_dimenation, kernel_initializer='normal', activation='relu'),
        Dense(input_dimenation, kernel_initializer='normal', activation='relu'),
        Dense(output, kernel_initializer='normal', activation='sigmoid'),
    ])
    optimizer = SGD(lr=0.1, momentum=0.9, decay=0.0, nesterov=False)
    network.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return network

In [None]:
# create model

#model_smote = KerasClassifier(build_fn = create_model, epochs = epochs_input, batch_size = batch_size_input, verbose = 1)
# Fit the model
#model_smote.fit(X_train_res, y_train_res)


In [None]:
# Check accuracy on test set

#tf_accuracy = evaluate(model_smote,test_features,test_target)

In [None]:
# Wrap create_model in the scikit-learn style KerasClassifier

model = KerasClassifier(build_fn = create_model, epochs = epochs_input, batch_size = batch_size_input, verbose = 1)
# Fit the network on the training split (no SMOTE resampling)
model.fit(training_features, training_target)


In [None]:
# Check accuracy on test set

#tf_accuracy = evaluate(model,test_features,test_target)

In [None]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
import scipy.stats as st
# sklearn.grid_search was removed from scikit-learn; model_selection is the module
# the rest of this notebook already imports RandomizedSearchCV from
from sklearn.model_selection import RandomizedSearchCV

# Search space for the XGBoost randomized search.
# Lists are sampled uniformly; scipy distributions are sampled through their rvs().
n_estimators = [int(x) for x in np.linspace(3, 40, num = 10)]
max_depth = [int(x) for x in np.linspace(1, 10, num = 10)]
one_to_left = st.beta(10, 1)
from_zero_positive = st.expon(0, 50)
random_state=[21]
params = {  
    'n_estimators': n_estimators,
    "max_depth": max_depth,
    # 'learning_rate' used to appear twice in this dict; only this later entry
    # ever took effect, so it is the one kept
    "learning_rate": st.uniform(0.05, 0.4),
    "colsample_bytree": one_to_left,
    "subsample": one_to_left,
    "gamma": st.uniform(0, 10),
    'reg_alpha': from_zero_positive,
    "min_child_weight": from_zero_positive,
    'random_state':random_state
}

# n_jobs is the scikit-learn wrapper's parallelism parameter; the earlier
# `nthreads` spelling is not a parameter of XGBClassifier
xgbclass = XGBClassifier(n_jobs=-1)  

In [None]:
# Randomized search for XGBoost with the default number of sampled combinations.
# random_state=42 matches the other searches in this notebook, so the sampled
# parameter sets are the same on every run.
gs = RandomizedSearchCV(xgbclass, params, n_jobs=1, random_state=42)  
gs.fit(training_features, training_target) 

In [None]:
# Print the XGBoost metrics on the held-out test set and keep the accuracy

xgb_accuracy = evaluate(gs,test_features,test_target)

In [None]:
# feature importance plot for the tuned XGBoost model (trained without SMOTE)

print("XGBOOST without SMOTE feature importance")
# Label the importances with the columns the model was trained on, in training
# order. The models are fit on df[feature_columns], so taking names from df's own
# column order mislabels the bars.
feature_names = pd.Index(feature_columns)
feature_importance = gs.best_estimator_.feature_importances_
# ascending order, so the most important feature ends up at the top of the chart
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(10,fig_length/3))
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, feature_names[sorted_idx])
plt.show()

In [None]:
def evaluate_accuracy(model, X_test, y_test):
    """Return the accuracy of ``model``'s predictions for ``X_test`` against ``y_test``."""
    return accuracy_score(y_test, model.predict(X_test))

def evaluate_kappa(model, X_test, y_test):
    """Return Cohen's kappa of ``model``'s predictions for ``X_test`` against ``y_test``."""
    predictions = model.predict(X_test)
    return cohen_kappa_score(y_test, predictions, sample_weight=None)

def evaluate_recall(model, X_test, y_test):
    """Return the recall of ``model``'s predictions for ``X_test`` against ``y_test``."""
    return recall_score(y_test, model.predict(X_test))

def evaluate_precision(model, X_test, y_test):
    """Return the precision of ``model``'s predictions for ``X_test`` against ``y_test``."""
    return precision_score(y_test, model.predict(X_test))

def evaluate_con_mat_row(model, X_test, y_test):
    """Return the confusion-matrix cells as the tuple (tn, fp, fn, tp).

    The four-way unpacking requires a 2x2 confusion matrix.
    """
    predictions = model.predict(X_test)
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    return tn, fp, fn, tp

In [None]:
# Test-set metrics for every tuned model, grouped by model.
# Only the models trained without SMOTE are evaluated.

# XGBoost randomized search
xgbm_accuracy = evaluate_accuracy(gs, test_features, test_target)
xgbm_kappa = evaluate_kappa(gs, test_features, test_target)
xgbm_recall = evaluate_recall(gs, test_features, test_target)
xgbm_precision = evaluate_precision(gs, test_features, test_target)
xgbm_tfft = evaluate_con_mat_row(gs, test_features, test_target)

# gradient boosting randomized search
gbm_accuracy = evaluate_accuracy(gbm, test_features, test_target)
gbm_kappa = evaluate_kappa(gbm, test_features, test_target)
gbm_recall = evaluate_recall(gbm, test_features, test_target)
gbm_precision = evaluate_precision(gbm, test_features, test_target)
gbm_tfft = evaluate_con_mat_row(gbm, test_features, test_target)

# random forest randomized search
rf_accuracy = evaluate_accuracy(rf, test_features, test_target)
rf_kappa = evaluate_kappa(rf, test_features, test_target)
rf_recall = evaluate_recall(rf, test_features, test_target)
rf_precision = evaluate_precision(rf, test_features, test_target)
rf_tfft = evaluate_con_mat_row(rf, test_features, test_target)

# AdaBoost randomized search
ada_accuracy = evaluate_accuracy(ada, test_features, test_target)
ada_kappa = evaluate_kappa(ada, test_features, test_target)
ada_recall = evaluate_recall(ada, test_features, test_target)
ada_precision = evaluate_precision(ada, test_features, test_target)
ada_tfft = evaluate_con_mat_row(ada, test_features, test_target)

# Keras network
tf_accuracy = evaluate_accuracy(model, test_features, test_target)
tf_kappa = evaluate_kappa(model, test_features, test_target)
tf_recall = evaluate_recall(model, test_features, test_target)
tf_precision = evaluate_precision(model, test_features, test_target)
tf_tfft = evaluate_con_mat_row(model, test_features, test_target)

In [None]:
# One summary row per model evaluated on the test set.
# The Keras row uses tf_tfft; tf_smote_tfft is never assigned because the SMOTE
# variants are disabled, so referencing it raised a NameError.
report = [{'model': 'XGBM', 'accuracy': xgbm_accuracy, 'kappa': xgbm_kappa, 'recall': xgbm_recall, 'precision': xgbm_precision,'tn, fp, fn, tp': xgbm_tfft},
          {'model': 'GBM', 'accuracy': gbm_accuracy, 'kappa': gbm_kappa, 'recall': gbm_recall, 'precision': gbm_precision, 'tn, fp, fn, tp': gbm_tfft},
          {'model': 'RF',  'accuracy': rf_accuracy, 'kappa': rf_kappa, 'recall': rf_recall, 'precision': rf_precision, 'tn, fp, fn, tp': rf_tfft},
          {'model': 'ADA', 'accuracy': ada_accuracy, 'kappa': ada_kappa, 'recall': ada_recall, 'precision': ada_precision,  'tn, fp, fn, tp': ada_tfft },
          {'model': 'tf', 'accuracy': tf_accuracy,  'kappa': tf_kappa, 'recall': tf_recall, 'precision': tf_precision ,'tn, fp, fn, tp': tf_tfft }]
df1 = pd.DataFrame(report)
# fix the column order for display
df1 = df1[['model', 'accuracy', 'kappa', 'recall', 'precision','tn, fp, fn, tp']]

In [None]:
# display the model comparison table
df1

In [None]:
# Index the table by model name so the bar chart is labelled per model.
# NOTE: df1 is rebound, so this cell cannot be re-run without rebuilding df1.
df1 = df1.set_index('model')

In [None]:
# bar chart of the metrics per model
df1.plot.bar(figsize=(df_length, 4))

In [None]:
# import pickle
import pickle

# Persist every fitted search object. Each file is written inside a context
# manager so the handle is flushed and closed (the bare open() calls never closed
# their files). Only the models trained without SMOTE are saved.

# save GBM model to disk
filename4 = 'finalized_gbm.sav'
with open(filename4, 'wb') as model_file:
    pickle.dump(gbm, model_file)

# save RF model to disk
filename5 = 'finalized_rf.sav'
with open(filename5, 'wb') as model_file:
    pickle.dump(rf, model_file)

# save Adaboost model to disk
filename6 = 'finalized_ada.sav'
with open(filename6, 'wb') as model_file:
    pickle.dump(ada, model_file)

# save XGB model to disk
filename1x = 'finalized_xgb.sav'
with open(filename1x, 'wb') as model_file:
    pickle.dump(gs, model_file)

In [None]:
# serialize model to JSON
#model_smote_json = model_smote.model.to_json()
#with open("model_smote_json.json", "w") as json_file:
#    json_file.write(model_smote_json)
# serialize weights to HDF5
#model_smote.model.save_weights("model_smote_json.h5")
#print("Saved model to disk")

In [None]:
# Serialize the architecture of the fitted Keras network to JSON
model_json = model.model.to_json()
with open("model_json.json", "w") as json_file:
    json_file.write(model_json)
# Serialize the weights to HDF5 (kept in a separate file from the architecture)
model.model.save_weights("model_json.h5")
print("Saved model to disk")