In [37]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
import xgboost
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import roc_auc_score, roc_curve, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Load the raw churn dataset from the working directory.
data = pd.read_csv('Churn_Modelling.csv')
# Drop the columns considered irrelevant for modelling.
data = data.drop(['RowNumber', 'CustomerId', 'Surname'], axis = 1)

# Columns that will be one-hot encoded.
col = ['Geography', 'Gender']
catg_data = data[col]

# Keep only the object-dtype columns of the categorical subset.
categorical_feature_mask = catg_data.dtypes == object
categorical_cols = catg_data.columns[categorical_feature_mask].tolist()

# Every non-object column of the full frame; this list still contains the
# target column 'Exited'.
Numeric_feature_mask = data.dtypes != object
Numeric_cols = data.columns[Numeric_feature_mask].tolist()

# handle_unknown='ignore' encodes categories not seen during fit as all zeros
# instead of raising; sparse=False makes transform() return a dense ndarray.
# NOTE(review): newer scikit-learn releases spell this option `sparse_output`
# and replace get_feature_names with get_feature_names_out -- confirm against
# the installed version before upgrading.
ohe = OneHotEncoder(handle_unknown='ignore', sparse = False)

# Fit the encoder on the categorical columns and encode the whole dataset.
ohe.fit(catg_data[categorical_cols])
cat_ohe = ohe.transform(data[categorical_cols])

# The single-row sample `echant_ohe_cat` is deliberately NOT encoded here: it
# is defined (and encoded) in a later cell, so referencing it in this cell
# raises NameError on a fresh kernel.

# One-hot encoded categorical columns as a DataFrame with generated names.
ohe_df_categ = pd.DataFrame(cat_ohe, columns = ohe.get_feature_names(input_features = categorical_cols))

# Everything except the raw categorical columns.
df_numer = data.drop(col, axis = 1)

# Numeric columns followed by the encoded categorical columns.
df_categ_numer = pd.concat([df_numer, ohe_df_categ], axis=1)

# Separate the target ('Exited') from the features.
X = df_categ_numer.drop('Exited', axis=1)
y = df_categ_numer.Exited

# 80/20 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state=42)

In [20]:
# Import different models 
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
import xgboost
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import roc_auc_score, roc_curve, recall_score


# The three estimators below are only configured here; VotingClassifier.fit
# is what actually trains them.
#
# `base_estimator=None` (AdaBoost) and `loss='deviance'` (GradientBoosting)
# merely restated the defaults and are rejected by newer scikit-learn
# releases, so they are omitted. random_state is pinned (same seed as the
# train/test split) so that re-running the cell reproduces the score.

# AdaBoost with the tuned parameters
bestAdaModFitted = AdaBoostClassifier(n_estimators = 21, learning_rate = 1.0, random_state = 42)

# Gradient boosting with the tuned parameters
bestGbModFitted = GradientBoostingClassifier(n_estimators = 67, random_state = 42)

# XGBoost with the tuned parameters
bestXgbModFitted = xgboost.XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0.1, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=3, missing= None, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

# Soft-voting ensemble: averages the predicted class probabilities of the
# three models using the weights gb=1.5, xgb=2, ada=1.
votingMod = VotingClassifier(estimators=[('gb', bestGbModFitted),
                                         ('xgb', bestXgbModFitted),
                                         ('ada', bestAdaModFitted)],
                                         voting = 'soft', weights=[1.5,2,1])
# fit() returns the estimator itself, so re-binding the name is unnecessary.
votingMod.fit(X_train, y_train)

# ROC AUC on the held-out split. Despite its name, `test_labels` holds the
# predicted probability of the positive class (column 1), not hard labels.
# X_test is passed as the same DataFrame type used for fitting so that the
# feature names stay consistent between fit and predict.
test_labels = votingMod.predict_proba(X_test)[:, 1]
print(round(roc_auc_score(y_test, test_labels), 4))

0.871


In [21]:
# Hand-built single-row sample (1 x 13) used to smoke-test the fitted ensemble.
# NOTE(review): the values are presumably in the column order of X -- confirm.
echo_df_test = np.array([[596, 32, 3, 96709.07, 2, 0, 0, 41788.37, 0, 1, 0, 0, 1]])

# Predicted class for the sample row.
prediction = votingMod.predict(echo_df_test)
prediction

array([0])

In [107]:
# Serialize the best model (the votingMod ensemble) to disk with pickle.
import pickle

# The context manager guarantees the file is flushed and closed before any
# later cell tries to read it back.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(votingMod, model_file)

In [22]:
# Round-trip test: reload the pickled voting model and predict with it.
import os
import pickle

# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# that this notebook (or another trusted source) produced.
if os.path.isfile("./model.pkl"):
    # Close the handle deterministically instead of leaking it.
    with open("./model.pkl", "rb") as model_file:
        model = pickle.load(model_file)
else:
    raise FileNotFoundError("./model.pkl not found; run the cell that saves the model first")

# The reloaded model should give the same prediction as votingMod did.
prediction = model.predict(echo_df_test)
prediction

array([0])

### Pickle OneHotEncoder

In [23]:
# Serialize the fitted OneHotEncoder (`ohe`) to disk with pickle.
import pickle

# The context manager guarantees the file is flushed and closed before any
# later cell tries to read it back.
with open('OneHotEncoder.pkl', 'wb') as encoder_file:
    pickle.dump(ohe, encoder_file)

In [31]:
# Single test customer, split into its categorical and numeric parts.
echant_ohe_cat = [['France', 'Female']]
echo_numer =  [[596, 32, 3, 96709.07, 2, 0, 0, 41788.37]]

# Encode the categorical part with the already-fitted encoder.
cat_ohe_new = ohe.transform(echant_ohe_cat)

# Encoded categorical columns as a DataFrame with generated names.
echo_ohe_df_categ = pd.DataFrame(cat_ohe_new, columns = ohe.get_feature_names(input_features = categorical_cols))

# Numeric feature names are every numeric column except the target. Excluding
# 'Exited' by name (rather than slicing off the last element) keeps this
# correct even if the target is not the last numeric column.
numeric_feature_cols = [name for name in Numeric_cols if name != 'Exited']
echo_df_numer = pd.DataFrame(echo_numer, columns=numeric_feature_cols)

# Numeric columns followed by the encoded categorical columns.
echo_df_test = pd.concat([echo_df_numer, echo_ohe_df_categ], axis=1)
echo_df_test

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,596,32,3,96709.07,2,0,0,41788.37,1.0,0.0,0.0,1.0,0.0


In [35]:
# Round-trip test: reload the pickled OneHotEncoder and encode the sample.
import os
import pickle

# The reloaded encoder gets its own name (`ohe_loaded`). Binding it to
# `OneHotEncoder` would shadow the imported scikit-learn class, so any later
# `OneHotEncoder(...)` call would fail until the class is imported again.
# NOTE: pickle.load executes arbitrary code from the file; only load pickles
# that this notebook (or another trusted source) produced.
if os.path.isfile("./OneHotEncoder.pkl"):
    # Close the handle deterministically instead of leaking it.
    with open("./OneHotEncoder.pkl", "rb") as encoder_file:
        ohe_loaded = pickle.load(encoder_file)
else:
    raise FileNotFoundError("./OneHotEncoder.pkl not found; run the cell that saves the encoder first")

# Encode the categorical part of the test sample with the reloaded encoder.
transform = ohe_loaded.transform(echant_ohe_cat)
transform

array([[1., 0., 0., 1., 0.]])

In [36]:
# Append the encoded categorical columns to the right of the numeric values,
# giving one combined feature row.
echo_final = np.hstack((echo_numer, transform))
echo_final

array([[5.960000e+02, 3.200000e+01, 3.000000e+00, 9.670907e+04,
        2.000000e+00, 0.000000e+00, 0.000000e+00, 4.178837e+04,
        1.000000e+00, 0.000000e+00, 0.000000e+00, 1.000000e+00,
        0.000000e+00]])