# Imports

In [16]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from plotnine import *

import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelBinarizer, OneHotEncoder #Z-score variables

from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut # simple TT split cv, k-fold cv, LOO cv

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score, roc_auc_score
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.calibration import calibration_curve

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from skopt import BayesSearchCV
from skopt.space import Integer, Real

import xgboost as xgb

In [2]:
data = pd.read_csv("https://raw.githubusercontent.com/cmparlettpelleriti/CPSC392ParlettPelleriti/master/Data/streaming.csv")
data.dropna(inplace = True)
data.reset_index(inplace = True)
data.head(20)
data.info()

<class 'pandas.DataFrame'>
RangeIndex: 95844 entries, 0 to 95843
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index             95844 non-null  int64  
 1   gender            95844 non-null  str    
 2   age               95844 non-null  float64
 3   income            95844 non-null  float64
 4   monthssubbed      95844 non-null  int64  
 5   plan              95844 non-null  str    
 6   meanhourswatched  95844 non-null  float64
 7   competitorsub     95844 non-null  int64  
 8   numprofiles       95844 non-null  int64  
 9   cancelled         95844 non-null  float64
 10  downgraded        95844 non-null  int64  
 11  bundle            95844 non-null  int64  
 12  kids              95844 non-null  int64  
 13  longestsession    95844 non-null  float64
 14  topgenre          95844 non-null  str    
 15  secondgenre       95844 non-null  str    
 16  churn             95844 non-null  int64  
dtypes: f

In [3]:
print(data['plan'].value_counts())
print('\n')
print(data['competitorsub'].value_counts())
print('\n')
print(data['cancelled'].value_counts())
print('\n')
print(data['downgraded'].value_counts())
print('\n')
print(data['kids'].value_counts())
print('\n')
print(data['topgenre'].value_counts())
print('\n')
print(data['secondgenre'].value_counts())
print('\n')

plan
B    57742
A    19121
P    18981
Name: count, dtype: int64


competitorsub
1    57563
0    38281
Name: count, dtype: int64


cancelled
0.0    57533
1.0    38311
Name: count, dtype: int64


downgraded
0    71876
1    23968
Name: count, dtype: int64


kids
0    47937
1    47907
Name: count, dtype: int64


topgenre
Comedy            28606
Thriller          19268
ScienceFiction     9665
RomanticComedy     9661
Drama              9589
Horror             9582
Action             4750
Documentary        4723
Name: count, dtype: int64


secondgenre
Comedy            23001
Thriller          18868
Drama             10742
ScienceFiction    10724
RomanticComedy    10663
Horror            10545
Action             5745
Documentary        5556
Name: count, dtype: int64




# Pipeline definition + grid search

In [17]:
predictors = ["gender", "age", "income", "monthssubbed", "plan", "meanhourswatched", "competitorsub", "numprofiles", "cancelled", "downgraded", "bundle", "kids", "longestsession", "topgenre", "secondgenre"]
contin = ["age", "income", "monthssubbed", "meanhourswatched", "numprofiles", "longestsession"]

X = data[predictors]
y = data["churn"]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state = 3619)
#random_state makes sure we have the same split each time

# Create Empty Model
pre = make_column_transformer((StandardScaler(), contin),
                              (OneHotEncoder(), ["gender", "plan", "competitorsub", "cancelled", "downgraded", "kids", "topgenre", "secondgenre"]),
                              remainder = "passthrough")
boost = xgb.XGBClassifier(tree_method="hist", scale_pos_weight=len(y_train[y_train==0])/len(y_train[y_train==1]))

pipe = Pipeline([("pre", pre), ("model", boost)])

param_space = {
    "model__n_estimators": Integer(50, 500),
    "model__learning_rate": Real(0.01, 0.3, prior='log-uniform'),
    "model__max_depth": Integer(2, 10),
    "model__subsample": Real(0.5, 1.0),
    "model__colsample_bytree": Real(0.5, 1.0),
    "model__gamma": Real(0, 0.5),
    "model__reg_alpha": Real(0, 1),
    "model__reg_lambda": Real(0, 1),
}

grid = BayesSearchCV(pipe,param_space,cv=5, n_iter=100,n_jobs=-1, scoring="roc_auc", random_state=3619)
# fit
grid.fit(X_train, y_train)

xgb_best = grid.best_estimator_
print(f"Best n_estimators: {grid.best_params_['model__n_estimators']}")
print(f"Best learning_rate: {grid.best_params_['model__learning_rate']}")
print(f"Best max_depth: {grid.best_params_['model__max_depth']}")
print(f"Best subsample: {grid.best_params_['model__subsample']}")
print(f"Best colsample_bytree: {grid.best_params_['model__colsample_bytree']}")
print(f"Best gamma: {grid.best_params_['model__gamma']}")
print(f"Best reg_alpha: {grid.best_params_['model__reg_alpha']}")

Best n_estimators: 443
Best learning_rate: 0.032123639706696155
Best max_depth: 2
Best subsample: 0.5
Best colsample_bytree: 0.5
Best gamma: 0.5
Best reg_alpha: 1.0


## n_iter=10 (default RandomSearchCV:)
- Best n_estimators: 300
- Best learning_rate: 0.05
- Best max_depth: 3
- Best subsample: 1.0
- Best colsample_bytree: 1.0
- Best gamma: 0.1
- Best reg_alpha: 0

## n_iter=100 (RandomizedSearchCV):
- Best n_estimators: 200
- Best learning_rate: 0.05
- Best max_depth: 3
- Best subsample: 0.8
- Best colsample_bytree: 0.8
- Best gamma: 0
- Best reg_alpha: 0.2

## n_iter=100 (BayesSearchCV):


# Prediction

In [12]:
# predict

y_pred_train = grid.best_estimator_.predict(X_train)
y_pred_test = grid.best_estimator_.predict(X_test)

y_pred_train_prob = grid.best_estimator_.predict_proba(X_train)[:,1]
y_pred_test_prob = grid.best_estimator_.predict_proba(X_test)[:,1]

# assess
print("Train Acc       : ", accuracy_score(y_train, y_pred_train))
print("Train Prescision: ", precision_score(y_train, y_pred_train))
print("Train Recall    : ", recall_score(y_train, y_pred_train))
print("Train F1        : ", f1_score(y_train, y_pred_train))
print("Train ROC AUC   : ", roc_auc_score(y_train, y_pred_train_prob)) # notice this actually takes in predicted probability

print("\n") # just for aesthetics :)

print("Test Acc        : ", accuracy_score(y_test, y_pred_test))
print("Test Prescision : ", precision_score(y_test, y_pred_test))
print("Test Recall     : ", recall_score(y_test, y_pred_test))
print("Test F1         : ", f1_score(y_test, y_pred_test))
print("Test ROC AUC    : ", roc_auc_score(y_test, y_pred_test_prob))

Train Acc       :  0.6683112237293188
Train Prescision:  0.44816156369600874
Train Recall    :  0.682397627347937
Train F1        :  0.5410143761730916
Train ROC AUC   :  0.7390330053349002


Test Acc        :  0.6691590735202059
Test Prescision :  0.4498407643312102
Test Recall     :  0.6845995395613717
Test F1         :  0.542929899581992
Test ROC AUC    :  0.7382341578878632
