In [None]:
# Importing the libraries
import numpy as np
import pandas as pd

import sklearn
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score,recall_score,precision_score,f1_score,r2_score,explained_variance_score

from xgboost import XGBClassifier, XGBRegressor

%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sn
import pandas as pd

# Record the installed scikit-learn version in the notebook output so a reader
# can tell which release produced the results below.
print('The scikit-learn version is {}.'.format(sklearn.__version__))

In [None]:
#Importing the dataset + Examining the dataset for intuition
# Reads 'Sample.csv' (no directory given, so it is resolved against the
# kernel's working directory) into a DataFrame.
dataset = pd.read_csv('Sample.csv')
# Last expression of the cell: the first rows are shown as the cell output.
dataset.head()

In [None]:
# Examine the dataset - identify missing, incorrect, redundant data in order to proceed with the pre-processing phase
# info() prints a per-column summary (non-null counts and dtypes) of the frame.
dataset.info()

In [None]:
# Find Missing Data
# Prints, for each column, whether it contains at least one missing value.
print(dataset.isnull().any())

# 'header' looks like a placeholder column name - replace it with a real
# column before running (TODO confirm).
# The filled Series is assigned back to the frame: calling fillna() without
# keeping its result does nothing, and inplace=True on a column selection is
# chained assignment that may not write through to `dataset`.

# For numerical data
dataset['header'] = dataset['header'].fillna(dataset['header'].mean()) #replace missing data with mean/median etc.
# For categorical data
dataset['header'] = dataset['header'].fillna('unknown')

In [None]:
# Remove the redundant columns; the result is rebound to `dataset`.
dataset = dataset.drop(columns = ['header1', 'header2', 'header3'])

In [None]:
# View dataset after pre-processing
# Shows the first rows of the frame as the cell output.
dataset.head()

In [None]:
# Split into X and y
# X: positional columns 3..12 (the slice end 13 is exclusive) as a NumPy array.
# y: positional column 13 as a NumPy array.
# The positions refer to the frame as it is at this point in the notebook.
X = dataset.iloc[:, 3:13].values #to take all rows, and required columns
y = dataset.iloc[:, 13].values #to take all rows, and required columns

# Wrapped in DataFrames only for printing; X and y themselves stay arrays.
print(pd.DataFrame(X))
print(pd.DataFrame(y))

In [None]:
# Encoding categorical data (i.e. Creating dummy variables)
# Columns 1 and 2 of X are replaced in place by integer codes. Both fitted
# encoders are kept so the same mapping can be applied to other data later.
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])

labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])

# One-hot encode column 1. The dummy columns come first in the output and the
# untouched columns follow (remainder = 'passthrough').
# sparse_threshold = 0 forces a dense result, which np.array() can convert;
# a SciPy sparse matrix cannot be converted to a float ndarray this way.
ct_1 = ColumnTransformer(
        [('one_hot_encoder', OneHotEncoder(), [1])],
        remainder = 'passthrough',
        sparse_threshold = 0)
# The built-in float replaces the np.float alias, which was removed from NumPy
# and raises AttributeError on current releases.
X = np.array(ct_1.fit_transform(X), dtype = float)

print(pd.DataFrame(X))

In [None]:
# Avoid the dummy variable trap (if needed) by removing one column of X.
# The column at index 1 is deleted; the reduced array is rebound to X.
X = np.delete(X, 1, axis = 1)

print(pd.DataFrame(X))

In [None]:
# Check if dataset is balanced
# Counts how often each label occurs in the full target vector `y`.
# `y_train` is only created by the train/test split in the following cell, so
# referring to it here raises NameError on a fresh kernel run from top to bottom.
unique, counts = np.unique(y, return_counts=True)
# Last expression of the cell: a {label: count} mapping shown as the output.
dict(zip(unique, counts))

In [None]:
# Hold out 30% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 0,
)


In [None]:
# FOR CLASSIFYING QUESTION

# Conduct GridSearch, to find the optimal hyper-parameters
# early_stopping_rounds is deliberately not set: early stopping needs a
# validation set passed to fit(), and GridSearchCV calls fit(X, y) without one,
# so every candidate fit would fail (or the setting would be ignored).
xgb_model = XGBClassifier(objective='binary:logistic', #'binary:logistic' for 2 classes, 'multi:softprob' for > 2 classes
                          tree_method='exact')

# 3 * 4 * 2 * 2 * 3 = 144 parameter combinations; with cv=10 that is 1440 fits
# plus the final refit on the whole training set.
parameters = {'max_depth': [2, 4, 6],
              'learning_rate': [0.01, 0.03, 0.05, 0.07], #so called `eta` value
              'n_estimators': [100, 500], #higher number of trees if insufficient data and vice versa
              'gamma': [0, 0.2],
              'min_child_weight': [1, 4, 6],
              'subsample': [0.8], #randomly sample before growing tree, prevents over-fitting
              'colsample_bytree': [0.8], #randomly sample before growing tree, prevents over-fitting
              #'seed': [10]
             }

# refit=True retrains the best combination on all of X_train, so `clf` can be
# used directly for predict() afterwards.
clf = GridSearchCV(xgb_model, parameters, n_jobs=-1,
                   cv=10,
                   scoring='accuracy',
                   verbose=3,
                   refit=True)


clf.fit(X_train, y_train)

# FOR REGRESSION QUESTION
# (template - uncomment and use INSTEAD of the classification block above)
#
# # Conduct GridSearch, to find the optimal hyper-parameters
# xgb_model = XGBRegressor(objective='reg:squarederror',
#                          tree_method='exact')
#
# parameters = {'max_depth': [2, 4, 6],
#               'learning_rate': [0.01, 0.02, 0.3, 0.4], #so called `eta` value
#               'n_estimators': [100, 500], #higher number of trees if insufficient data and vice versa
#               'gamma': [0, 0.2],
#               'min_child_weight': [1, 4, 6],
#               'subsample': [0.8], #randomly sample before growing tree, prevents over-fitting
#               'colsample_bytree': [0.8], #randomly sample before growing tree, prevents over-fitting
#               #'seed': [10]
#              }
#
# clf = GridSearchCV(xgb_model, parameters, n_jobs=-1,
#                    cv=10,
#                    verbose=3, refit=True)
#
# clf.fit(X_train, y_train)

In [None]:
# Print model report:
# best_score_ is the mean cross-validated score of the best parameter
# combination found by the grid search; best_params_ is that combination.
print ("Model Report:")
print("Best: Accuracy of %f using %s" % (clf.best_score_, clf.best_params_))

In [None]:
# Predicting the Test set results
# Uses the fitted grid-search object `clf` on the held-out X_test.
y_pred = clf.predict(X_test)

In [None]:
# Model Evaluation metrics - For Classification
# Precision, recall and F1 are called with their default arguments.
# NOTE(review): for a target with more than 2 classes these three need an
# explicit `average=` argument - confirm before using on multi-class data.
print('Accuracy Score : ' + str(accuracy_score(y_test,y_pred)))
print('Precision Score : ' + str(precision_score(y_test,y_pred)))
print('Recall Score : ' + str(recall_score(y_test,y_pred)))
print('F1 Score : ' + str(f1_score(y_test,y_pred)))

# confusion_matrix(y_true, y_pred): rows are the true labels, columns the
# predicted labels.
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, dtype = 'int64')

sn.set(font_scale=1)#for label size
ax = sn.heatmap(df_cm, annot=True, annot_kws={"size": 10}, fmt='g')# font size
# Label the axes so the figure can be read without the surrounding code.
ax.set(xlabel='Predicted label', ylabel='True label', title='Confusion matrix')
plt.show()

# Model Evaluation metrics - For Regression
# (template - uncomment when `clf` wraps a regressor)
# r2_score is the plain coefficient of determination (not the adjusted one),
# and explained_variance_score is the explained-variance score, so the labels
# below say exactly that.
#
# print('R-squared : ' + str(r2_score(y_test,y_pred)))
# print('Explained variance : ' + str(explained_variance_score(y_test,y_pred)))

In [None]:
# Importing the final set
# Reads 'Churn_Modelling.csv' (resolved against the kernel's working
# directory) - a different file from the one loaded at the top of the notebook.
z_test = pd.read_csv('Churn_Modelling.csv')
# Last expression of the cell: shows the first rows as the cell output.
z_test.head()

In [None]:
# Convert into numpy array
# Takes every row and every column of the final set and rebinds z_test from a
# DataFrame to a NumPy array.
# NOTE(review): the training features were sliced to columns 3:13, label- and
# one-hot-encoded, and had one dummy column removed before `clf` was fitted;
# none of that is applied here. Confirm this file already has the exact
# numeric column layout the model was trained on.
z_test = z_test.iloc[:, :].values
print (pd.DataFrame(z_test))

In [None]:
# Predicting the final set
# Uses the fitted grid-search object `clf` on the z_test array.
z_pred = clf.predict(z_test)

# Review predictions of final set
# Wrapped in a DataFrame only for printing; z_pred itself stays an array.
print(pd.DataFrame(z_pred))

In [None]:
# If required to add to z_test file
# Append the predictions as one extra column to the right of the z_test columns.
# np.column_stack accepts the 2-D feature array together with the 1-D prediction
# vector; np.concatenate on a list holding a tuple of differently shaped arrays
# does not.
# NOTE: `columns` needs one name per z_test column plus one for the prediction
# column - adjust the placeholder names to match the real data.
predictions = pd.DataFrame(np.column_stack((z_test, z_pred)), columns=["header1", "header2"])

# to_csv() returns None when given a path, so it is called as its own statement
# and `predictions` keeps referring to the DataFrame.
predictions.to_csv('prediction.csv', index=False)

In [None]:
# Export all as predictions
# Writes the test-set predictions (y_pred) as a single-column CSV.
# NOTE: this writes to the same path as the cell above and overwrites that file.

predictions = pd.DataFrame(y_pred, columns=["header1"])
# to_csv() returns None when given a path, so it is called as its own statement
# and `predictions` keeps referring to the DataFrame.
predictions.to_csv('prediction.csv', index=False)