# Extreme Gradient Boosting Classifier - Content-based Filtering
---

- Importing the relevant libraries first...

In [2]:
#import requests
import random
#import time
import numpy as np
import pandas as pd
#from bs4 import BeautifulSoup as bs
import re
from sklearn.preprocessing import LabelEncoder
#from os import path   #uncomment these if you have downloaded and installed wordcloud based on the instructions above
#from PIL import Image
#from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.feature_extraction import stop_words
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
from sklearn.metrics import pairwise_distances
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
import joblib

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## Importing X_legit and y, which contain the shops that userid 2043 rated and the corresponding ratings, respectively
---

In [3]:
# Read the feature table from disk; the trailing expression displays its (rows, columns) shape.
X_legit = pd.read_csv(filepath_or_buffer='yelp_data/xlegit.csv')
X_legit.shape

(2935, 19498)

In [4]:
# Load the ratings file and collapse it to a Series.
# The `squeeze=True` keyword of read_csv was deprecated in pandas 1.4 and removed in 2.0;
# calling .squeeze('columns') on the resulting frame works on both old and new pandas versions.
y = pd.read_csv('yelp_data/y.csv').squeeze('columns')
y.head()

0    4.0
1    4.0
2    4.0
3    4.0
4    4.0
Name: user_ratings, dtype: float64

In [5]:
# Hold out 20% of the rows as the test set (fixed seed for repeatability).
# Stratifying on the 'userids' column keeps each user's proportion of rows
# approximately equal in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_legit,
    y,
    test_size=0.2,
    random_state=42,
    stratify=X_legit['userids'],
)

In [6]:
# Peek at the user ids of the first three test rows
X_test['userids'].head(3)

290     488
1330    789
1567    209
Name: userids, dtype: int64

In [7]:
# Every row in the full dataset belonging to the three users seen at the head of the test set
X_legit.loc[X_legit['userids'].isin([488, 789, 209]), 'userids']

36      488
38      209
245     789
273     488
290     488
       ... 
2580    488
2783    209
2839    209
2844    488
2862    209
Name: userids, Length: 64, dtype: int64

In [8]:
# Relative frequency of each rating class in the training labels
y_train.value_counts(normalize=True)

4.0    0.506814
5.0    0.409284
3.0    0.075383
2.0    0.006388
1.0    0.002129
Name: user_ratings, dtype: float64

In [9]:
# Instantiate a StandardScaler because the features are not all on the same scale
# (e.g. review_count vs. avg_store_rating).
ss = StandardScaler()

In [10]:
# Remove the 'userids' identifier column so it is not passed to the scaler as a feature
X_train_1 = X_train.drop(columns=['userids'])
X_test_1 = X_test.drop(columns=['userids'])

In [11]:
# Spot-check the training rows of user 75 for the 'shops_the-coconut-club-singapore' column
X_train.loc[X_train['userids'] == 75, ['userids', 'shops_the-coconut-club-singapore']]

Unnamed: 0,userids,shops_the-coconut-club-singapore
1657,75,0
2575,75,0
915,75,0
2549,75,0
2573,75,0
2004,75,0
653,75,0
2915,75,0
2393,75,0
1069,75,0


In [12]:
# Learn the scaling statistics from the training features only, then apply
# that same fitted scaler to both the training and the test features.
X_train_sc = ss.fit(X_train_1).transform(X_train_1)
X_test_sc = ss.transform(X_test_1)

In [13]:
# (rows, columns) of the scaled training features
X_train_sc.shape

(2348, 19497)

In [14]:
# (rows, columns) of the scaled test features
X_test_sc.shape

(587, 19497)

## The following cells have been commented out because the wrong xgb_model.sav was saved by mistake. Fortunately, the best hyperparameters from the tuned model have been saved in the Flask app's Python script and in the "Simple Hybrid Recommender trial with extn with XGB" notebook.
---

In [14]:
# Instantiate the model
#xgb_model = XGBClassifier(tree_method='hist')

In [16]:
#just in case it takes too long to run this even...
#cross_val_score(xgb_model,X_train_sc,y_train,cv=5).mean()

<img src="yelp_data/xgb_1.png"/>

In [None]:
#this is for initial decision trees tuned
#params = {
 #   'learning_rate' : [0.1, 0.5, 0.9],
  #  'n_estimators' : [100,200,300],
   # 'max_depth':[6,9,12],
    #'gamma':[0,1,2,3],
    #'max_delta_step':[1,2,3],

    #'scale_pos_weight':[0.5, 1, 2, 3]
  #  } 

In [None]:
#took too long to run...to be run as part of future plans for this project
#xgb_gridsearch = GridSearchCV(xgb_model, params, cv = 5, verbose = 1, n_jobs = -1)
#xgb_gridsearch.fit(X_train_sc, y_train)

<img src="yelp_data/xgb_2.png"/>

In [None]:
#print('Gridsearch best score: ', xgb_gridsearch.best_score_)
#print('Gridsearch best estimator: ', xgb_gridsearch.best_estimator_)
#print('Gridsearch best score on test set: ', xgb_gridsearch.best_estimator_.score(X_test_sc,y_test))

<img src="yelp_data/xgb_3.png"/>

In [None]:
#creating a filename for tuned model
#filename = 'xgb_model.sav'

In [None]:
#saving the tuned model just in case of need to retrieve it in future
#joblib.dump(xgb_gridsearch,filename)

In [15]:
#to demonstrate loading of saved model just in case...
#loaded_model = joblib.load('yelp_data/xgb_model.sav')

In [17]:
#loaded_model.best_estimator_.score(X_test_sc,y_test)

<img src="yelp_data/xgb_4.png"/>

Accuracy on the test set was 0.97!

In [None]:
#check out the tuned params for usage in deployment on Heroku
#NOTE: kept commented out like the neighbouring cells -- the joblib.load(...) line that would create
#`loaded_model` is itself commented out in this notebook, so running this line raises NameError.
#loaded_model.best_params_

<img src="yelp_data/xgb_5.png"/>

## Formulating recommendations for userid 2043 based on XGB model
---

In [None]:
#stacking X_test as first step in regenerating the shops column for predictions
#trial = X_test.stack()

In [None]:
# #creating loop to re-generate original X_test order of shops
# index_lst = []
# outlets_lst = []
# for n in range(len(trial.index)):
#     if trial.index[n][1].startswith('shops_') and trial[n]!=0:
#         index_lst.append(str(trial.index[n][0]))
#         outlets_lst.append(trial.index[n][1])
# index_lst = [int(x) for x in index_lst]
# reconstructed_X_test = pd.DataFrame({'shops':outlets_lst}, index=index_lst)

In [None]:
#creating column of rating predictions
#rating_predictions = loaded_model.best_estimator_.predict(X_test_sc)

In [None]:
#adding column of rating predictions into reconstructed df
#reconstructed_X_test['predicted_ratings']=rating_predictions

In [None]:
#adding the actual ratings into the reconstructed df for comparison later on...
#reconstructed_X_test['actual_ratings']=y_test

In [None]:
#reconstructed_X_test['userids'] = X_test.userids

In [None]:
#reconstructed_X_test.head(3)

In [None]:
#showing top 5 rows based on content-based filtering (using xgb classifier) predictions to get a sense...
#reconstructed_X_test.sort_values('predicted_ratings', ascending=False).head(5)

<img src="yelp_data/xgb_11.png"/>

In [None]:
#saving this reconstructed_X_test for fusion with collab X_test to evaluate hybrid system later on. 
#reconstructed_X_test.to_csv('yelp_data/xgb_hundten_cb_pred_actual.csv',index=False)

In [None]:
#reconstructed_X_test.shape

In [None]:
#reconstructed_X_test[(reconstructed_X_test['userids']==75)&(reconstructed_X_test['shops']=='shops_the-coconut-club-singapore')]

In [None]:
#reconstructed_X_test.head(3)

## Defining functions for evaluation of model
---

In [None]:
# #defining function for obtaining tn, fp, fn, tp for each rating class for feeding into micro-avg precision and recall functions defined below
# def cm_spec(y_true,y_pred,rating,state):
#     if state=='tn':
#         return multilabel_confusion_matrix(y_true,y_pred)[rating-1][0][0]
#     elif state=='fp':
#         return multilabel_confusion_matrix(y_true,y_pred)[rating-1][0][1]
#     elif state=='fn':
#         return multilabel_confusion_matrix(y_true,y_pred)[rating-1][1][0]
#     else:
#         return multilabel_confusion_matrix(y_true,y_pred)[rating-1][1][1]
    

In [None]:
# #defining function for obtaining micro-avg precision
# def micro_avg_precision(y_true,y_pred):
#     return ((cm_spec(y_true,y_pred,1,'tp')+
#                                                  cm_spec(y_true,y_pred,2,'tp')+
#                                                  cm_spec(y_true,y_pred,3,'tp')+
#                                                  cm_spec(y_true,y_pred,4,'tp')+
#                                                  cm_spec(y_true,y_pred,5,'tp'))/(
#                                                 cm_spec(y_true,y_pred,1,'tp')+
#                                                  cm_spec(y_true,y_pred,2,'tp')+
#                                                  cm_spec(y_true,y_pred,3,'tp')+
#                                                  cm_spec(y_true,y_pred,4,'tp')+
#                                                  cm_spec(y_true,y_pred,5,'tp')+
#                                                 cm_spec(y_true,y_pred,1,'fp')+
#                                                  cm_spec(y_true,y_pred,2,'fp')+
#                                                  cm_spec(y_true,y_pred,3,'fp')+
#                                                  cm_spec(y_true,y_pred,4,'fp')+
#                                                  cm_spec(y_true,y_pred,5,'fp')))

In [None]:
# #defining function for obtaining micro-avg recall
# def micro_avg_recall(y_true,y_pred):
#     return ((cm_spec(y_true,y_pred,1,'tp')+
#                                                  cm_spec(y_true,y_pred,2,'tp')+
#                                                  cm_spec(y_true,y_pred,3,'tp')+
#                                                  cm_spec(y_true,y_pred,4,'tp')+
#                                                  cm_spec(y_true,y_pred,5,'tp'))/(
#                                                 cm_spec(y_true,y_pred,1,'tp')+
#                                                  cm_spec(y_true,y_pred,2,'tp')+
#                                                  cm_spec(y_true,y_pred,3,'tp')+
#                                                  cm_spec(y_true,y_pred,4,'tp')+
#                                                  cm_spec(y_true,y_pred,5,'tp')+
#                                                 cm_spec(y_true,y_pred,1,'fn')+
#                                                  cm_spec(y_true,y_pred,2,'fn')+
#                                                  cm_spec(y_true,y_pred,3,'fn')+
#                                                  cm_spec(y_true,y_pred,4,'fn')+
#                                                  cm_spec(y_true,y_pred,5,'fn')))

In [None]:
# #defining function for obtaining micro_avg_f1
# def micro_avg_f1(y_true,y_pred):
#     return 2 * ((micro_avg_precision(y_true,y_pred) * micro_avg_recall(y_true,y_pred))/(micro_avg_precision(y_true,y_pred) + micro_avg_recall(y_true,y_pred)))

In [None]:
# #function to print out confusion matrix breakdown for each rating class
# def confusion_breakdown(y_true,y_pred,rating):
#     print("True negatives for rating {}: {}".format(
#         rating,multilabel_confusion_matrix(y_true,y_pred)[rating-1][0][0]))
#     print("False positives for rating {}: {}".format(
#         rating,multilabel_confusion_matrix(y_true,y_pred)[rating-1][0][1]))
#     print("False negatives for rating {}: {}".format(
#         rating,multilabel_confusion_matrix(y_true,y_pred)[rating-1][1][0]))
#     print("True positives for rating {}: {}".format(
#         rating,multilabel_confusion_matrix(y_true,y_pred)[rating-1][1][1]))
#     return "******************************************"

In [None]:
# print(confusion_breakdown(y_test,rating_predictions,1))
# print(confusion_breakdown(y_test,rating_predictions,2))
# print(confusion_breakdown(y_test,rating_predictions,3))
# print(confusion_breakdown(y_test,rating_predictions,4))
# print(confusion_breakdown(y_test,rating_predictions,5))

<img src="yelp_data/xgb_6.png"/>

In [None]:
#can tell it is able to predict correctly most of the rating classes!
#y_test.value_counts()

In [None]:
#print("Gridsearched XGB Classifier with balanced class_weight yielded micro_avg_precision of ", micro_avg_precision(y_test,rating_predictions))

In [None]:
#print("Gridsearched XGB Classifier with balanced class_weight yielded micro_avg_recall of ", micro_avg_recall(y_test,rating_predictions))

In [None]:
#print("Gridsearched XGB Classifier with balanced class_weight yielded micro_avg_f1 of ", micro_avg_f1(y_test,rating_predictions))

<img src="yelp_data/xgb_10.png"/>

In [None]:
#precision, recall, f1 of all rating classes show good performance!
#print(classification_report(y_test,rating_predictions))

<img src="yelp_data/xgb_7.png"/>

## Calculating and plotting the multiclass ROC AUC as part of model evaluation
---


In [None]:
# #making a copy of X_legit and y for computation and plotting of ROC AUC for the respective rating classes and micro-average
# X_copy = X_legit.copy()

# y_copy = y.copy()

# # Binarize the output
# y_copy = label_binarize(y_copy, classes=[1.0, 2.0, 3.0, 4.0, 5.0])
# n_classes = y_copy.shape[1]

# # shuffle and split training and test sets
# X_copy_train, X_copy_test, y_copy_train, y_copy_test = train_test_split(X_copy, y_copy, test_size=0.2,
#                                                     random_state=42, stratify=y_copy)

# #instantiate scaler since not all of the features are of the same scale, eg. review_count and avg_store_rating
# ss1 = StandardScaler()

# #fitting the train and transforming both the train and test sets
# X_copy_train_sc = ss1.fit_transform(X_copy_train)
# X_copy_test_sc = ss1.transform(X_copy_test)

# # Learn to predict each class against the other using the params of tuned model with random_state set to 42 so that the values and curves do not waver
# classifier = OneVsRestClassifier(XGBClassifier(learning_rate=0.5, max_depth=9, n_estimators=200, random_state=42))
# y_score = classifier.fit(X_copy_train_sc, y_copy_train).predict_proba(X_copy_test_sc)

# # Compute ROC curve and ROC area for each class
# fpr = dict()
# tpr = dict()
# roc_auc = dict()
# for i in range(n_classes):
#     fpr[i], tpr[i], _ = roc_curve(y_copy_test[:, i], y_score[:, i])
#     roc_auc[i] = auc(fpr[i], tpr[i])

# # Compute micro-average ROC curve and ROC area
# fpr["micro"], tpr["micro"], _ = roc_curve(y_copy_test.ravel(), y_score.ravel())
# roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# # Plot all ROC curves and micro-averaged one
# plt.figure(figsize=(12,9))
# lw=2
# plt.plot(fpr["micro"], tpr["micro"],
#          label='micro-average ROC curve (area = {0:0.2f})'
#                ''.format(roc_auc["micro"]), linestyle=':', linewidth=4)
# for i in range(n_classes):
#     plt.plot(fpr[i], tpr[i], lw=lw,
#              label='ROC curve of class {0} (area = {1:0.2f})'
#              ''.format(i+1, roc_auc[i]))

# plt.plot([0, 1], [0, 1], 'k--', lw=lw)
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel('1-Specificity',fontsize=20)
# plt.ylabel('Sensitivity',fontsize=20)
# plt.title('ROC AUC for XGB w Tfidf',fontsize=20)
# plt.legend(loc="lower right",fontsize=15)
# plt.show()

In [None]:
# #calculating one-vs-one and one-vs-rest micro-averaged ROC AUC to check if there is a difference between both
# y_prob = classifier.predict_proba(X_copy_test_sc)
# micro_roc_auc_ovo = roc_auc_score(y_copy_test, y_prob, multi_class="ovo",
#                                      average="micro")
# micro_roc_auc_ovr = roc_auc_score(y_copy_test, y_prob, multi_class="ovr",
#                                      average="micro")
# print("One-vs-One ROC AUC score:\n{:.6f} "
#       "(micro-averaged)"
#       .format(micro_roc_auc_ovo))
# print("One-vs-Rest ROC AUC score:\n{:.6f} "
#       "(micro-averaged)"
#       .format(micro_roc_auc_ovr))

<img src="yelp_data/xgb_8.png"/>

In [None]:
# #calculating one-vs-one and one-vs-rest ROC AUC weighted by prevalence to check if there is a difference between both
# y_prob = classifier.predict_proba(X_copy_test_sc)
# weighted_roc_auc_ovo = roc_auc_score(y_copy_test, y_prob, multi_class="ovo",
#                                      average="weighted")
# weighted_roc_auc_ovr = roc_auc_score(y_copy_test, y_prob, multi_class="ovr",
#                                      average="weighted")
# print("One-vs-One ROC AUC score:\n{:.6f} "
#       "(weighted by prevalence)"
#       .format(weighted_roc_auc_ovo))
# print("One-vs-Rest ROC AUC score:\n{:.6f} "
#       "(weighted by prevalence)"
#       .format(weighted_roc_auc_ovr))

<img src="yelp_data/xgb_9.png"/>