<font size = '6' color = 'blue'> Collaborative Filtering </font>

In [1]:
%pylab inline
import warnings
warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


In [1]:
# citation: https://cambridgespark.com/content/tutorials/implementing-your-own-recommender-systems-in-Python/index.html
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from sklearn import cross_validation as cv
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import mean_absolute_error



In [2]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

def mae(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return mean_absolute_error(prediction, ground_truth)


def collaborativeFiltering(reviews_source):
    reviews = pd.read_csv(reviews_source)
    reviews['text'] = reviews['text'].str[2:-2]

    
    print("Undersampling of the dataset started--------")
    
    #Undersampling of the dataset to get a balanced dataset
    review1 = reviews[reviews['stars'] == 1][0:12000]
    review2 = reviews[reviews['stars'] == 2][0:7000]
    review3 = reviews[reviews['stars'] == 3][0:12000]
    review4 = reviews[reviews['stars'] == 4][0:12000]
    review5 = reviews[reviews['stars'] == 5][0:12000]
    frames = [review1, review2, review3,review4,review5]
    reviews = pd.concat(frames)
    
    print("Undersampling of the dataset completed--------")
    
    # converting user_id and business_id to integers for the matrix
    reviews['user_id'] = pd.factorize(reviews.user_id)[0]
    reviews['business_id'] = pd.factorize(reviews.business_id)[0]
    
    # getting the number unique users and restaurants
    unique_users = reviews.user_id.unique().shape[0]
    unique_restaurants = reviews.business_id.unique().shape[0]
    
    #splitting the dataset
    train_data, test_data = cv.train_test_split(reviews, test_size=0.20)

    #Create two user-item matrices, one for training and another for testing
    train_data_matrix = np.zeros((unique_users, unique_restaurants))
    
    print("Creation of user-item matrix started--------")
    
    # train_data_matrix
    for line in train_data.itertuples():
         train_data_matrix[line[3], line[2]] = line[5]
            
    # test_data_matrix
    test_data_matrix = np.zeros((unique_users, unique_restaurants))
    for line in test_data.itertuples():
        test_data_matrix[line[3], line[2]] = line[5]
    
    print("Creation of user-item matrix completed--------")
    
    print("Creation of similarity matrix started--------")
    
    # calculating similarity between users
    user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
    # calculating similarity between items
    item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')
    
    print("Creation of similarity matrix completed--------")
    
    
    print("Creation of prediction matrix started--------")
    
    item_prediction = predict(train_data_matrix, item_similarity, type='item')
    user_prediction = predict(train_data_matrix, user_similarity, type='user')
    
    print("Creation of prediction matrix completed--------")
    
    print('Printing the RMSE and MAE------------' + '\n')
    
    if reviews_source == 'reviews_restaurants_text.csv':
        rating_type = 'biased rating'
    elif reviews_source == 'reviews_restaurants_text_LinearSVM.csv':
        rating_type = 'unbiased rating from Linear SVM'
    else:
        rating_type = 'unbiased rating from Naive Bayes'
    print ('Root Mean Square Error while testing the model using ' + rating_type)
    print ('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
    print ('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)) + '\n')

    print ('Root Mean Square Error while training the model using ' + rating_type)
    print ('User-based CF RMSE: ' + str(rmse(user_prediction, train_data_matrix)))
    print ('Item-based CF RMSE: ' + str(rmse(item_prediction, train_data_matrix)) + '\n')
    
    print ('Mean Absolute Error while testing the model using ' + rating_type)
    print ('User-based CF MAE: ' + str(mae(user_prediction, test_data_matrix)))
    print ('Item-based CF MAE: ' + str(mae(item_prediction, test_data_matrix)) + '\n')

    print ('Mean Absolute Error while training the model using ' + rating_type)
    print ('User-based CF MAE: ' + str(mae(user_prediction, train_data_matrix)))
    print ('Item-based CF MAE: ' + str(mae(item_prediction, train_data_matrix)) + '\n')   

<font size = '5' color = 'blue'>Collaborative filtering using biased rating</font>

In [4]:
collaborativeFiltering('reviews_restaurants_text.csv')

Undersampling of the dataset started--------
Undersampling of the dataset completed--------
Creation of user-item matrix started--------
Creation of user-item matrix completed--------
Creation of similarity matrix started--------
Creation of similarity matrix completed--------
Creation of prediction matrix started--------
Creation of prediction matrix completed--------
Printing the RMSE and MAE------------
Root Mean Square Error while testing the model using biased rating

User-based CF RMSE: 3.4224251658150537
Item-based CF RMSE: 3.4244977815398223
Root Mean Square Error while testing the model using biased rating

User-based CF RMSE: 3.4059797325229546
Item-based CF RMSE: 3.4068094068976698
Mean Absolute Error while testing the model using biased rating

User-based CF MAE: 3.10118322299
Item-based CF MAE: 3.10332547733
Mean Absolute Error while training the model using biased rating

User-based CF MAE: 3.08609887518
Item-based CF MAE: 3.086804974


<font size = '5' color = 'blue'>Collaborative filtering using unbiased rating from Linear SVM</font>

In [6]:
collaborativeFiltering('reviews_restaurants_text_LinearSVM.csv')

Undersampling of the dataset started--------
Undersampling of the dataset completed--------
Creation of user-item matrix started--------
Creation of user-item matrix completed--------
Creation of similarity matrix started--------
Creation of similarity matrix completed--------
Creation of prediction matrix started--------
Creation of prediction matrix completed--------
Printing the RMSE and MAE------------
Root Mean Square Error while testing the model using unbiased rating from Linear SVM
User-based CF RMSE: 3.3930751924760343
Item-based CF RMSE: 3.395080653479287

Root Mean Square Error while training the model using unbiased rating from Linear SVM
User-based CF RMSE: 3.4133323262548423
Item-based CF RMSE: 3.4141643344383974

Mean Absolute Error while testing the model using unbiased rating from Linear SVM
User-based CF MAE: 3.07344432866
Item-based CF MAE: 3.07549327049

Mean Absolute Error while training the model using unbiased rating from Linear SVM
User-based CF MAE: 3.093057992

<font size = '5' color = 'blue'>Collaborative filtering using unbiased rating from Naive Bayes</font>

In [11]:
collaborativeFiltering('reviews_restaurants_text_NaiveBayes.csv')

Undersampling of the dataset started--------
Undersampling of the dataset completed--------
Creation of user-item matrix started--------
Creation of user-item matrix completed--------
Creation of similarity matrix started--------
Creation of similarity matrix completed--------
Creation of prediction matrix started--------
Creation of prediction matrix completed--------
Printing the RMSE and MAE------------

Root Mean Square Error while testing the model using unbiased rating from Naive Bayes
User-based CF RMSE: 3.572904578536179
Item-based CF RMSE: 3.5750603264816747

Root Mean Square Error while training the model using unbiased rating from Naive Bayes
User-based CF RMSE: 3.567457513507413
Item-based CF RMSE: 3.5683351784692414

Mean Absolute Error while testing the model using unbiased rating from Naive Bayes
User-based CF MAE: 3.22686737475
Item-based CF MAE: 3.22908543246

Mean Absolute Error while training the model using unbiased rating from Naive Bayes
User-based CF MAE: 3.21909