# Part 8: Hybrid Recommender Evaluation on the test set for the 110 userids
---

- Importing the relevant libraries first...


In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report, multilabel_confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import joblib

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## Importing X_legit and y, which contain the rated shops (with their userids) and the corresponding ratings respectively
---

In [2]:
X_legit = pd.read_csv('yelp_data/xlegit.csv')
X_legit.shape

(2935, 19498)

In [3]:
#read the single-column ratings file and squeeze it into a Series
#(the `squeeze=True` keyword of read_csv was deprecated in pandas 1.4 and removed in 2.0;
# calling .squeeze("columns") on the result is the documented replacement and also works on older pandas)
y = pd.read_csv('yelp_data/y.csv').squeeze("columns")
y.head()

0    4.0
1    4.0
2    4.0
3    4.0
4    4.0
Name: user_ratings, dtype: float64

In [4]:
X_legit['userids'].dtypes

dtype('int64')

In [5]:
#split the dataset into train and test sets first
X_train, X_test, y_train, y_test = train_test_split(X_legit, y, test_size=0.2, random_state=42, stratify=X_legit['userids'])

In [6]:
X_test.shape

(587, 19498)

In [7]:
mbcf_test = pd.read_csv('yelp_data/test_mbcf_hundten.csv')

In [8]:
mbcf_test.shape

(587, 5)

In [9]:
mbcf_test[['shops','userids']].head(10)

Unnamed: 0,shops,userids
0,breko-cafe-singapore,488
1,the-tea-party-cafe-singapore,789
2,nylon-coffee-roasters-singapore,209
3,da-paolo-gastronomia-holland-village-singapore-2,306
4,twenty-grammes-singapore,2043
5,pine-gardens-cake-singapore,2043
6,brawn-and-brains-coffee-singapore,57
7,the-tiny-roaster-120106,27
8,ps-cafe-singapore-8,2237
9,living-cafe-and-deli-singapore,1123


In [12]:
y_train.value_counts(normalize=True)

4.0    0.506814
5.0    0.409284
3.0    0.075383
2.0    0.006388
1.0    0.002129
Name: user_ratings, dtype: float64

<ul>
    
- The baseline accuracy will be 0.51 since that is the highest proportion among the training dataset's target classes

In [13]:
#instantiate scaler since not all of the features are of the same scale, eg. review_count and avg_store_rating
ss = StandardScaler()

In [14]:
X_train=X_train.drop(['userids'],axis=1)
X_test=X_test.drop(['userids'],axis=1)

In [15]:
#fitting the train and transforming both the train and test sets
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [16]:
#reading in reconstructed_X_test for 110 userids from content-based filtering for comparison later on...
hundten_cb_pred_actual_X_test = pd.read_csv('yelp_data/xgb_hundten_cb_pred_actual.csv')

In [17]:
#checking out the dimensions of the read-in content-based filtering dataset....
hundten_cb_pred_actual_X_test.shape

(587, 4)

In [18]:
#checking out the first few rows of the content-based filtering dataset....
hundten_cb_pred_actual_X_test.head(3)

Unnamed: 0,shops,predicted_ratings,actual_ratings,userids
0,shops_brunetti-singapore,5.0,5.0,488
1,shops_long-black-cafe-singapore,4.0,4.0,789
2,shops_nylon-coffee-roasters-singapore,5.0,5.0,209


In [19]:
#cleaning the shops column to remove the "shops_" prefix for easier merging later on...
#the pattern is anchored to the start of the string, so names without the prefix are left untouched
#and re-running this cell does not chop another six characters off every shop name
hundten_cb_pred_actual_X_test['shops'] = hundten_cb_pred_actual_X_test['shops'].str.replace(r'^shops_', '', regex=True)

In [20]:
#confirming that the change has been made
hundten_cb_pred_actual_X_test.head(3)

Unnamed: 0,shops,predicted_ratings,actual_ratings,userids
0,brunetti-singapore,5.0,5.0,488
1,long-black-cafe-singapore,4.0,4.0,789
2,nylon-coffee-roasters-singapore,5.0,5.0,209


In [21]:
hundten_cb_pred_actual_X_test[(hundten_cb_pred_actual_X_test['shops']=='long-black-cafe-singapore') & (hundten_cb_pred_actual_X_test['userids']==789)]

Unnamed: 0,shops,predicted_ratings,actual_ratings,userids
1,long-black-cafe-singapore,4.0,4.0,789


In [22]:
hundten_cb_pred_actual_X_test[hundten_cb_pred_actual_X_test['userids']==75]

Unnamed: 0,shops,predicted_ratings,actual_ratings,userids
67,penny-university-singapore,4.0,4.0,75
156,old-hen-coffee-bar-singapore-2,4.0,4.0,75
189,brunches-cafe-singapore-3,5.0,5.0,75
355,kith-cafe-singapore-7,2.0,2.0,75
470,chye-seng-huat-hardware-singapore,5.0,5.0,75
561,jewel-coffee-singapore-3,5.0,5.0,75


In [23]:
hundten_cb_pred_actual_X_test.shape

(587, 4)

In [24]:
#reading in the collaborative filtering dataset for the 110 userids...
hundten_mbcf_pred_actual_test = pd.read_csv('yelp_data/mbcf_hundten_pred_actual_testeval.csv')

In [25]:
#checking out the dimensions of the model-based collaborative filtering test dataset...
hundten_mbcf_pred_actual_test.shape

(497, 5)

In [26]:
#checking out the first few rows of the model-based collaborative filtering test dataset...
hundten_mbcf_pred_actual_test.head(3)

Unnamed: 0,shops,ratings,prediction_rounded,prediction,userids
0,the-coconut-club-singapore,5.0,5.0,4.90392,75
1,wilder-singapore,5.0,5.0,4.907964,2445
2,the-daily-cut-singapore-3,5.0,5.0,4.913113,225


In [27]:
#let's merge both content-based filtering and model-based collaborative filtering predictions for the 110 userids together (matching on shop and userid)!
con_collab_110_tst = pd.merge(hundten_cb_pred_actual_X_test,hundten_mbcf_pred_actual_test,how="right",on=['shops','userids'])

In [28]:
#checking out the dimensions of the merged dataset...
con_collab_110_tst.shape

(497, 7)

In [29]:
con_collab_110_tst.isnull().sum()

shops                   0
predicted_ratings     397
actual_ratings        397
userids                 0
ratings                 0
prediction_rounded      0
prediction              0
dtype: int64

In [30]:
con_collab_110_tst[con_collab_110_tst['predicted_ratings'].isnull()].head(3)

Unnamed: 0,shops,predicted_ratings,actual_ratings,userids,ratings,prediction_rounded,prediction
100,the-coconut-club-singapore,,,75,5.0,5.0,4.90392
101,wilder-singapore,,,2445,5.0,5.0,4.907964
102,the-daily-cut-singapore-3,,,225,5.0,5.0,4.913113


In [31]:
#keeping only common shops present in both content-based and collaborative filtering...
con_collab_110_tst.dropna(inplace=True)

In [32]:
#looks like we got 100 (shop, userid) pairs in common between content-based and model-based collaborative filtering to work with...
con_collab_110_tst.shape

(100, 7)

In [33]:
con_collab_110_tst.isnull().sum()

shops                 0
predicted_ratings     0
actual_ratings        0
userids               0
ratings               0
prediction_rounded    0
prediction            0
dtype: int64

In [34]:
#checking out the first few rows of the merged dataset that has been trimmed of NaNs...
con_collab_110_tst.head(3)

Unnamed: 0,shops,predicted_ratings,actual_ratings,userids,ratings,prediction_rounded,prediction
0,long-black-cafe-singapore,4.0,4.0,789,4.0,4.0,3.921498
1,nylon-coffee-roasters-singapore,5.0,5.0,209,5.0,5.0,4.900086
2,da-paolo-gastronomia-holland-village-singapore-2,4.0,4.0,306,4.0,4.0,3.924546


In [35]:
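#loading the saved content-based model (the file name indicates the tuned XGB classifier, not a decision tree); left commented out because it is not used below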
#loaded_model = joblib.load('yelp_data/xgb_model.sav')

In [1]:
#decisiontreeclassifier for content-based filtering had a test accuracy score of 0.85
#loaded_model.best_estimator_.score(X_test_sc,y_test)

<ul>
    
- Suggest baseline score for hybrid recommender is the average of the content-based and collaborative filtering baseline accuracies, i.e. 
    
    $Hybrid\ recommender\ baseline\ accuracy = \frac{0.48 + 0.47}{2} = 0.48$

<img src="yelp_data/extended_xgb_1.png"/>

<ul>
    
- The above shows tuned XGB Classifier's micro-averaged $F_1$ is 0.97! This shall be the weight for content-based filtering!


<img src="yelp_data/extended_als.png"/>

<ul>
    
- The above shows re-tuned ALS' micro-averaged $F_1$ is 1.0! This shall be the weight for collaborative filtering!

In [2]:
#since we are not tuning the models further, each model's micro-averaged F1 score is used to weight its rating predictions;
#the two weights are normalised so that they sum to 1
content_f1 = 0.97   #tuned XGB classifier (content-based filtering)
collab_f1 = 1.0     #re-tuned ALS (collaborative filtering)
f1_total = content_f1 + collab_f1
con_wt = content_f1 / f1_total
collab_wt = collab_f1 / f1_total

In [38]:
#creating a new column containing the weighted sum of rating predictions from content-based and collaborative filtering
con_collab_110_tst['final_rating_predictions'] = (con_collab_110_tst['predicted_ratings']*con_wt) + (con_collab_110_tst['prediction']*collab_wt)

In [39]:
#checking out the new df with added column...
con_collab_110_tst.head(3)

Unnamed: 0,shops,predicted_ratings,actual_ratings,userids,ratings,prediction_rounded,prediction,final_rating_predictions
0,long-black-cafe-singapore,4.0,4.0,789,4.0,4.0,3.921498,3.960151
1,nylon-coffee-roasters-singapore,5.0,5.0,209,5.0,5.0,4.900086,4.949282
2,da-paolo-gastronomia-holland-village-singapore-2,4.0,4.0,306,4.0,4.0,3.924546,3.961698


In [40]:
#rounding the computed final rating predictions to 0 decimal place so that it can be compared to the actual ratings (which are also discrete whole numbers) via the f1 score...
con_collab_110_tst['final_rating_predictions_rd'] = round(con_collab_110_tst['final_rating_predictions'],0)

In [41]:
#checking out the first few rows of the df containing the rounded prediction column...
con_collab_110_tst.head(3)

Unnamed: 0,shops,predicted_ratings,actual_ratings,userids,ratings,prediction_rounded,prediction,final_rating_predictions,final_rating_predictions_rd
0,long-black-cafe-singapore,4.0,4.0,789,4.0,4.0,3.921498,3.960151,4.0
1,nylon-coffee-roasters-singapore,5.0,5.0,209,5.0,5.0,4.900086,4.949282,5.0
2,da-paolo-gastronomia-holland-village-singapore-2,4.0,4.0,306,4.0,4.0,3.924546,3.961698,4.0


In [57]:
#getting a sense of the top 5 recommendations from this hybrid system; seems like the hybrid system's predictions are identical to the actual ratings for the top 5 recommendations!
con_collab_110_tst[['shops','actual_ratings','userids','final_rating_predictions_rd','final_rating_predictions']].sort_values('final_rating_predictions',ascending=False).head()

Unnamed: 0,shops,actual_ratings,userids,final_rating_predictions_rd,final_rating_predictions
13,clementi-881-coffee-station-singapore,5.0,322,5.0,4.961556
33,maxwell-food-centre-singapore-3,5.0,1868,5.0,4.960391
92,the-flying-squirrel-singapore,5.0,2073,5.0,4.959476
85,praelum-wine-bistro-singapore,5.0,1894,5.0,4.957876
66,yahava-koffeeworks-singapore,5.0,57,5.0,4.95669


In [43]:
#the overlapping samples only contain ratings 3, 4 and 5, and the hybrid system predicts all three classes perfectly; there are no rating 1 or 2 samples here, so performance on those classes cannot be assessed
print(classification_report(con_collab_110_tst['actual_ratings'],con_collab_110_tst['final_rating_predictions_rd']))

              precision    recall  f1-score   support

         3.0       1.00      1.00      1.00        10
         4.0       1.00      1.00      1.00        50
         5.0       1.00      1.00      1.00        40

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



In [44]:
#confirming that there is no rating 1 among the 100 rows common to content-based filtering and collaborative filtering..
con_collab_110_tst[con_collab_110_tst['actual_ratings']==1.0]

Unnamed: 0,shops,predicted_ratings,actual_ratings,userids,ratings,prediction_rounded,prediction,final_rating_predictions,final_rating_predictions_rd


## Defining functions for evaluation of model
---

In [45]:
#defining function for obtaining tn, fp, fn, tp for each rating class for feeding into micro-avg precision and recall functions defined below
def cm_spec(y_true,y_pred,rating,state):
    """Return one one-vs-rest confusion-matrix count for a single rating class.

    The requested rating is treated as the positive class and every other
    rating as the negative class. Samples are compared by position.

    Parameters
    ----------
    y_true, y_pred : array-like of equal length
        Actual and predicted ratings.
    rating : number
        The rating class to inspect. It is matched by value, so the result
        does not depend on which ratings happen to be present in the data.
        A rating that occurs in neither input gives 0 for 'tp', 'fp', 'fn'
        and the number of samples for 'tn'.
    state : str
        'tn', 'fp' or 'fn'; any other value returns the true-positive count.

    Returns
    -------
    numpy.int64
        The requested count.

    Raises
    ------
    ValueError
        If y_true and y_pred do not have the same shape.
    """
    is_actual = np.asarray(y_true) == rating
    is_predicted = np.asarray(y_pred) == rating
    if is_actual.shape != is_predicted.shape:
        raise ValueError("y_true and y_pred must have the same shape")

    if state=='tn':
        selected = ~is_actual & ~is_predicted
    elif state=='fp':
        selected = ~is_actual & is_predicted
    elif state=='fn':
        selected = is_actual & ~is_predicted
    else:
        selected = is_actual & is_predicted
    # np.int64 keeps numpy arithmetic semantics for callers that sum and divide the counts
    return np.int64(np.count_nonzero(selected))
    

In [46]:
#defining function for obtaining micro-avg precision
def micro_avg_precision(y_true,y_pred):
    """Return the micro-averaged precision over all rating classes present.

    True positives and false positives are counted one-vs-rest for every
    distinct rating found in y_true or y_pred, summed across classes, and
    combined as sum(tp) / (sum(tp) + sum(fp)). Each class is counted exactly
    once, whichever ratings occur in the data.

    Parameters
    ----------
    y_true, y_pred : array-like of equal length
        Actual and predicted ratings, compared by position.

    Returns
    -------
    numpy.float64
        The micro-averaged precision (nan for empty input).

    Raises
    ------
    ValueError
        If y_true and y_pred do not have the same shape.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    if actual.shape != predicted.shape:
        raise ValueError("y_true and y_pred must have the same shape")

    tp_total = 0
    fp_total = 0
    for label in np.union1d(actual, predicted):
        predicted_as_label = predicted == label
        actually_label = actual == label
        tp_total += np.count_nonzero(predicted_as_label & actually_label)
        fp_total += np.count_nonzero(predicted_as_label & ~actually_label)
    # numpy division so that an empty input yields nan instead of raising ZeroDivisionError
    return np.float64(tp_total) / np.float64(tp_total + fp_total)

In [47]:
#defining function for obtaining micro-avg recall
def micro_avg_recall(y_true,y_pred):
    """Return the micro-averaged recall over all rating classes present.

    True positives and false negatives are counted one-vs-rest for every
    distinct rating found in y_true or y_pred, summed across classes, and
    combined as sum(tp) / (sum(tp) + sum(fn)). Each class is counted exactly
    once, whichever ratings occur in the data.

    Parameters
    ----------
    y_true, y_pred : array-like of equal length
        Actual and predicted ratings, compared by position.

    Returns
    -------
    numpy.float64
        The micro-averaged recall (nan for empty input).

    Raises
    ------
    ValueError
        If y_true and y_pred do not have the same shape.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    if actual.shape != predicted.shape:
        raise ValueError("y_true and y_pred must have the same shape")

    tp_total = 0
    fn_total = 0
    for label in np.union1d(actual, predicted):
        actually_label = actual == label
        predicted_as_label = predicted == label
        tp_total += np.count_nonzero(actually_label & predicted_as_label)
        fn_total += np.count_nonzero(actually_label & ~predicted_as_label)
    # numpy division so that an empty input yields nan instead of raising ZeroDivisionError
    return np.float64(tp_total) / np.float64(tp_total + fn_total)

In [48]:
#defining function for obtaining micro_avg_f1
def micro_avg_f1(y_true,y_pred):
    """Return the micro-averaged F1 score: the harmonic mean of
    micro_avg_precision and micro_avg_recall for the given ratings.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted ratings, passed unchanged to
        micro_avg_precision and micro_avg_recall.

    Returns
    -------
    float
        2 * precision * recall / (precision + recall), or 0.0 when both
        precision and recall are zero.
    """
    # compute each metric once instead of twice per call
    precision = micro_avg_precision(y_true,y_pred)
    recall = micro_avg_recall(y_true,y_pred)
    if precision + recall == 0:
        # no correct prediction at all: report 0.0 rather than dividing by zero
        return 0.0
    return 2 * ((precision * recall)/(precision + recall))

In [49]:
#function to print out confusion matrix breakdown for each rating class
def confusion_breakdown(y_true,y_pred,rating):
    """Print the one-vs-rest confusion counts for one rating class.

    The requested rating is treated as the positive class and every other
    rating as the negative class. The rating is matched by value, so the
    counts are correct whichever ratings are present in the data; a rating
    that occurs in neither input prints zero positives.

    Parameters
    ----------
    y_true, y_pred : array-like of equal length
        Actual and predicted ratings, compared by position.
    rating : number
        The rating class to report on.

    Returns
    -------
    str
        A separator line of asterisks, meant to be printed by the caller
        after the four count lines.

    Raises
    ------
    ValueError
        If y_true and y_pred do not have the same shape.
    """
    is_actual = np.asarray(y_true) == rating
    is_predicted = np.asarray(y_pred) == rating
    if is_actual.shape != is_predicted.shape:
        raise ValueError("y_true and y_pred must have the same shape")

    breakdown = (
        ("True negatives", ~is_actual & ~is_predicted),
        ("False positives", ~is_actual & is_predicted),
        ("False negatives", is_actual & ~is_predicted),
        ("True positives", is_actual & is_predicted),
    )
    for description, selected in breakdown:
        print("{} for rating {}: {}".format(
            description,rating,np.count_nonzero(selected)))
    return "******************************************"

In [50]:
#print the confusion breakdown (followed by its separator line) for each rating class present in the overlapping samples
for rating_class in (3, 4, 5):
    print(confusion_breakdown(con_collab_110_tst['actual_ratings'],con_collab_110_tst['final_rating_predictions_rd'],rating_class))

True negatives for rating 3: 90
False positives for rating 3: 0
False negatives for rating 3: 0
True positives for rating 3: 10
******************************************
True negatives for rating 4: 50
False positives for rating 4: 0
False negatives for rating 4: 0
True positives for rating 4: 50
******************************************
True negatives for rating 5: 60
False positives for rating 5: 0
False negatives for rating 5: 0
True positives for rating 5: 40
******************************************


In [51]:
#accuracy = share of rows whose rounded hybrid prediction equals the actual rating;
#computed from the dataframe rather than from hand-copied confusion counts, so it stays correct if the data changes
hybrid_accuracy = (con_collab_110_tst['actual_ratings'] == con_collab_110_tst['final_rating_predictions_rd']).mean()
print("Hybrid recommender yielded accuracy: ", hybrid_accuracy)

Hybrid recommender yielded accuracy:  1.0


In [52]:
print("Hybrid recommender yielded micro-averaged precision: ", micro_avg_precision(con_collab_110_tst['actual_ratings'],con_collab_110_tst['final_rating_predictions_rd']))


Hybrid recommender yielded micro-averaged precision:  1.0


In [53]:
print("Hybrid recommender yielded micro-averaged recall: ", micro_avg_recall(con_collab_110_tst['actual_ratings'],con_collab_110_tst['final_rating_predictions_rd']))

Hybrid recommender yielded micro-averaged recall:  1.0


In [54]:
print("Hybrid recommender yielded micro_avg_f1 of ", micro_avg_f1(con_collab_110_tst['actual_ratings'],con_collab_110_tst['final_rating_predictions_rd']))

Hybrid recommender yielded micro_avg_f1 of  1.0


In [55]:
#precision, recall and f1 are perfect for all rating classes present (3, 4 and 5); ratings 1 and 2 do not occur in the overlapping samples, so they are not evaluated here...
print(classification_report(con_collab_110_tst['actual_ratings'],con_collab_110_tst['final_rating_predictions_rd']))

              precision    recall  f1-score   support

         3.0       1.00      1.00      1.00        10
         4.0       1.00      1.00      1.00        50
         5.0       1.00      1.00      1.00        40

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



## Hybrid Model Evaluation Result Interpretation
---

<ul>
    
- The updated hybrid recommender (XGB trained on 110 userids, with ALS trained and tested on data with stratified splitting based on userid) performed astoundingly well in terms of predicting the ratings, with perfect scores of 1.0 for precision, recall, and $F_1$. However, it could only be evaluated on ratings 3.0, 4.0 and 5.0: the 100 overlapping samples contained no ratings of 1.0 or 2.0, so its ability to predict those ratings remains untested...

</ul>


<ul>
    
- Accuracy of 1.0, Micro-Averaged precision of 1.0, Micro-Averaged recall of 1.0, Micro-Averaged $F_1$ of 1.0
    
</ul>
