In [1]:
import numpy as np
import pandas as pd

from sklearn import metrics
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score,KFold

from sklearn.model_selection import GridSearchCV



In [2]:
df=pd.read_csv(r'C:\Users\lfakh\Downloads\coding_round_data.csv')

In [3]:
100*df['Revenue'].value_counts().values/df['Revenue'].value_counts().sum()


array([84.52554745, 15.47445255])

In [4]:
df["Revenue"] = df["Revenue"].astype(int)
df["Weekend"] = df["Weekend"].astype(int)


In [5]:
df.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,0,0
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,0,0
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,0,0
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,0,0
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,1,0


In [6]:
df=pd.get_dummies(df, columns=['VisitorType','Month'])

In [7]:
X=df.drop('Revenue',axis=1)
y=df['Revenue']

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
     

In [9]:
y_train.value_counts()/y_train.value_counts().sum() #change this. this is not the same distribution

0    0.848236
1    0.151764
Name: Revenue, dtype: float64

## Base Line Model

In [10]:

logreg=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
M_LG=logreg.fit(X_train,y_train)

In [11]:
y_pred_train=M_LG.predict(X_train)
y_pred_test=M_LG.predict(X_test)

auc_train=metrics.roc_auc_score(y_train, y_pred_train)
auc_test=metrics.roc_auc_score(y_test, y_pred_test)



print('AUC Training ',auc_train)
print('AUC Testing',auc_test)

AUC Training  0.6781441453481841
AUC Testing 0.6644768856447689


## Train XGboost with default hyperprameters

In [33]:
model=XGBClassifier(booster='gbtree',objective='binary:logistic',eta=0.3,gamma=0,max_depth=6,min_child_weight=0,scale_pos_weight=0,n_jobs=4,n_estimators=100,random_state=7)

M=model.fit(X_train, y_train,verbose=True,eval_metric="auc")

In [34]:
y_pred_train=M.predict(X_train)
y_pred_test=M.predict(X_test)

fpr_train,tpr_train,threshols_train=metrics.roc_curve(y_train,y_pred_train)
fpr_test,tpr_test,threshols_test=metrics.roc_curve(y_test,y_pred_test)
print('AUC Training ',metrics.auc(fpr_train,tpr_train))
print('AUC Testing',metrics.auc(fpr_test,tpr_test))

AUC Training  0.5
AUC Testing 0.5


In [45]:
model=XGBClassifier(booster='gbtree',objective='binary:logistic',eta=0.3,gamma=0,max_depth=6,min_child_weight=0,scale_pos_weight=2,n_jobs=4,n_estimators=100,random_state=7)
M=model.fit(X_train, y_train,verbose=True,eval_metric="auc")

In [44]:
y_pred_train=M.predict(X_train)
y_pred_test=M.predict(X_test)
fpr_train,tpr_train,threshols_train=metrics.roc_curve(y_train,y_pred_train)
fpr_test,tpr_test,threshols_test=metrics.roc_curve(y_test,y_pred_test)

print('AUC Training ',metrics.auc(fpr_train,tpr_train))

print('AUC Testing',metrics.auc(fpr_test,tpr_test))

AUC Training  0.9935279906053293
AUC Testing 0.7892944038929441


In [46]:
model=XGBClassifier(booster='gbtree',objective='binary:logistic',eta=0.1,gamma=5,max_depth=1,min_child_weight=6,scale_pos_weight=2,n_jobs=4,n_estimators=100,random_state=7)
M=model.fit(X_train, y_train,verbose=True,eval_metric="auc")

In [47]:
y_pred_train=M.predict(X_train)
y_pred_test=M.predict(X_test)
fpr_train,tpr_train,threshols_train=metrics.roc_curve(y_train,y_pred_train)
fpr_test,tpr_test,threshols_test=metrics.roc_curve(y_test,y_pred_test)

print('AUC Training ',metrics.auc(fpr_train,tpr_train))
print('AUC Testing',metrics.auc(fpr_test,tpr_test))

AUC Training  0.831887750641716
AUC Testing 0.8245742092457421


## Grid Search

In [51]:
estimator = XGBClassifier(
    objective= 'binary:logistic',
    nthread=4,
    seed=42
)

In [52]:
parameters = {
    'scale_pos_weight': range (0, 7, 1),
    'eta':[0.1],
    'gamma':[5],
    'max_depth':[1],
    'min_child_weight':[6],
    'n_estimators': [100]
    
    

}

In [53]:
grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring = 'roc_auc',
    n_jobs = 10,
    cv = 10,
    verbose=True
)

In [54]:
grid_search.fit(X_train, y_train)


Fitting 10 folds for each of 7 candidates, totalling 70 fits


GridSearchCV(cv=10,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     enable_categorical=False, gamma=None,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None...
                                     num_parallel_tree=None, predictor=None,
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     seed=42, subsample=No

In [55]:
grid_search.best_estimator_


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eta=0.1, gamma=5, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.100000001,
              max_delta_step=0, max_depth=1, min_child_weight=6, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=4, nthread=4,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=6, seed=42,
              subsample=1, tree_method='exact', validate_parameters=1, ...)

In [56]:


model=XGBClassifier(**grid_search.best_params_)
M=model.fit(X_train, y_train,verbose=True,eval_metric="auc")

In [57]:
y_pred_train=M.predict(X_train)
y_pred_test=M.predict(X_test)

auc_train=metrics.roc_auc_score(y_train, y_pred_train)
auc_test=metrics.roc_auc_score(y_test, y_pred_test)



print('AUC Training ',auc_train)
print('AUC Testing',auc_test)

AUC Training  0.8470508604157042
AUC Testing 0.8447688564476886
