In [16]:
import pandas as pd
import numpy as np
#from scipy import stats

import pyarrow

import matplotlib.pyplot as plt
#import seaborn as sns

#from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

#!conda install -c conda-forge xgboost
from xgboost import XGBClassifier, plot_importance

%matplotlib inline

## Load the data

In [2]:
# Read the preprocessed client table from disk, explicitly using the pyarrow parquet engine.
df = pd.read_parquet(
    'client_database_preprocessed.parquet',
    engine='pyarrow',
)

In [3]:
# Remember the (rows, columns) shape of the frame as loaded, then display it.
oryg_shape = df.shape
oryg_shape

(1284, 13)

In [4]:
# Print column names, non-null counts and dtypes to confirm the frame has no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1284 entries, 0 to 1283
Data columns (total 13 columns):
accepted               1284 non-null float64
gender                 1284 non-null float64
age                    1284 non-null float64
phone_calls            1284 non-null float64
emails                 1284 non-null float64
salary                 1284 non-null float64
estimated_expenses     1284 non-null float64
customer_type_C        1284 non-null float64
customer_type_Q        1284 non-null float64
customer_type_S        1284 non-null float64
offer_class_High       1284 non-null float64
offer_class_Medium     1284 non-null float64
offer_class_Premium    1284 non-null float64
dtypes: float64(13)
memory usage: 140.4 KB


In [5]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,accepted,gender,age,phone_calls,emails,salary,estimated_expenses,customer_type_C,customer_type_Q,customer_type_S,offer_class_High,offer_class_Medium,offer_class_Premium
0,1.0,0.0,0.177419,0.0,0.0,0.412503,0.517787,0.0,0.0,1.0,0.0,1.0,0.0
1,1.0,1.0,0.193548,0.125,0.2,0.295806,0.287459,0.0,0.0,1.0,0.0,1.0,0.0
2,0.0,0.0,0.193548,0.125,0.3,0.295806,0.089689,0.0,0.0,1.0,0.0,1.0,0.0
3,0.0,1.0,0.193548,0.125,0.2,0.295806,0.376244,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.0,0.112903,0.125,0.2,0.295806,0.173802,0.0,0.0,1.0,0.0,1.0,0.0


## Training and Validation Sets

In [6]:
# Randomize the row order of the dataset.
# frac=1 returns every row (in random order). random_state pins the permutation so that
# Restart & Run All reproduces the same ordering; 123 matches the seed used for the
# train/test split further down.
df = df.sample(frac=1, random_state=123)

In [7]:
# Take the target variable out of the dataset.
# pop() returns the 'accepted' column and removes it from df in place, so from here on
# df holds only the feature columns. Re-running this cell without reloading df fails,
# because the column is already gone.
y = df.pop('accepted')

In [8]:
# Hold out 20% of the rows for testing and keep 80% for training.
# The fixed seed makes the split repeatable.
seed = 123
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=seed,
)
# Report (features shape, target shape) for the training split, then the test split.
for features, target in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, target.shape)

(1027, 12) (1027,)
(257, 12) (257,)


In [9]:
# Gradient-boosted tree classifier with a logistic objective, limited to 50 boosting rounds.
classifier = XGBClassifier(
    n_estimators=50,
    objective='binary:logistic',
)

In [10]:
# Display the estimator so the full set of hyperparameters (including defaults) is visible.
classifier

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=50,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [11]:
# Train the booster and record logloss after every round on two evaluation sets:
# validation_0 is the training split and validation_1 is the held-out split.
classifier.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_metric=['logloss'],
)

[0]	validation_0-logloss:0.652839	validation_1-logloss:0.651478
[1]	validation_0-logloss:0.619772	validation_1-logloss:0.617868
[2]	validation_0-logloss:0.592321	validation_1-logloss:0.590064
[3]	validation_0-logloss:0.56941	validation_1-logloss:0.566607
[4]	validation_0-logloss:0.550113	validation_1-logloss:0.548045
[5]	validation_0-logloss:0.533763	validation_1-logloss:0.531389
[6]	validation_0-logloss:0.519909	validation_1-logloss:0.517753
[7]	validation_0-logloss:0.507633	validation_1-logloss:0.505846
[8]	validation_0-logloss:0.497101	validation_1-logloss:0.49656
[9]	validation_0-logloss:0.488292	validation_1-logloss:0.487255
[10]	validation_0-logloss:0.480358	validation_1-logloss:0.48065
[11]	validation_0-logloss:0.473628	validation_1-logloss:0.473399
[12]	validation_0-logloss:0.467112	validation_1-logloss:0.468023
[13]	validation_0-logloss:0.461938	validation_1-logloss:0.462359
[14]	validation_0-logloss:0.457431	validation_1-logloss:0.457437
[15]	validation_0-logloss:0.452838	val

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=50,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [12]:
# Fetch the per-round metric history recorded during fit(); it is indexed below as
# eval_result['validation_N']['logloss'].
eval_result = classifier.evals_result()

In [13]:
# One x-axis position per recorded boosting round, derived from the training logloss history.
train_logloss = eval_result['validation_0']['logloss']
training_rounds = range(len(train_logloss))
training_rounds

range(0, 50)

In [None]:
# Compare per-round logloss on the training split (validation_0) and the held-out
# split (validation_1) to see how the two errors evolve during boosting.
fig, ax = plt.subplots()
ax.scatter(x=training_rounds, y=eval_result['validation_0']['logloss'], label="Training Error")
ax.scatter(x=training_rounds, y=eval_result['validation_1']['logloss'], label="Validation Error")
ax.grid(True)
# The x axis is the boosting-round index (training_rounds), not an input feature.
ax.set_xlabel('Boosting Round')
ax.set_ylabel('Logloss')
ax.set_title('Training vs. Validation Error')
ax.legend()
plt.show();

In [None]:
# Plot the feature importances of the fitted classifier.
plot_importance(booster=classifier)

In [None]:
# Make predictions for the test data.
# The fitted estimator in this notebook is `classifier`; no `model` variable is ever
# defined, so the previous call raised NameError on a fresh kernel.
y_pred = classifier.predict(X_test)
# Round each predicted value to the nearest whole number before scoring.
predictions = [round(value) for value in y_pred]

In [None]:
# Evaluate predictions: fraction of test rows whose predicted label equals the true label.
# accuracy_score is provided by sklearn.metrics and must be imported with the other
# dependencies, otherwise this cell raises NameError.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))