In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, BaselineOnly, accuracy
from surprise import AlgoBase, SVD
from surprise.model_selection import cross_validate, GridSearchCV
from surprise.model_selection.search import RandomizedSearchCV
import copy
import difflib
import random
import src.util as utils

In [2]:
# Load config file
config = utils.load_config()

# Data Preparation

In [3]:
df = pd.read_csv(config['ratings_dataset_path'])
df.columns=['user_id','item_id','rating']

In [4]:
df

Unnamed: 0,user_id,item_id,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [5]:
def load_books_data(book_path):
    """
    Load movie data from the given path

    Parameters
    ----------
    movie_path : str
        The movie data path

    Returns
    -------
    book_data : pandas DataFrame
        The books metadata
    """
    # Load data
    book_data = pd.read_csv(book_path,
                             index_col='ISBN',
                             delimiter=',')

    print('Books data shape :', book_data.shape)
    return book_data

In [6]:
book_data = load_books_data('dataset/Books.csv')

Books data shape : (271360, 7)


  book_data = pd.read_csv(book_path,


In [7]:
col_rename_dict = {
    'Book-Title': 'title',
    'Book-Author': 'author',
    'Year-Of-Publication': 'year',
    'Publisher': 'publisher'
}

book_data.rename(columns=col_rename_dict, inplace=True)

book_data.head()

Unnamed: 0_level_0,title,author,year,publisher,Image-URL-S,Image-URL-M,Image-URL-L
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [8]:
reader = Reader(rating_scale=(1,10))
data = Dataset.load_from_df(df[['user_id','item_id','rating']], reader)

print(type(data))

<class 'surprise.dataset.DatasetAutoFolds'>


In [9]:
data.df.head()

Unnamed: 0,user_id,item_id,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


## Data Splitting

In [10]:
# Create a function
def train_test_split(utility_data, test_size, random_state=42):
    """
    Train test split the data
    ref: https://surprise.readthedocs.io/en/stable/FAQ.html#split-data-for-unbiased-estimation-py

    Parameters
    ----------
    utility_data : Surprise utility data
        The sample of whole data set

    test_size : float, default=0.2
        The test size

    random_state : int, default=42
        For reproducibility

    Returns
    -------
    full_data : Surprise utility data
        The new utility data

    train_data : Surprise format
        The train data

    test_data : Surprise format
        The test data
    """
    
    # Deep copy the utility_data
    full_data = copy.deepcopy(utility_data)

    # Generate random seed
    np.random.seed(random_state)

    # Shuffle the raw_ratings for reproducibility
    raw_ratings = full_data.raw_ratings
    np.random.shuffle(raw_ratings)

    # Define the threshold
    threshold = int((1-test_size) * len(raw_ratings))

    # Split the data
    train_raw_ratings = raw_ratings[:threshold]
    test_raw_ratings = raw_ratings[threshold:]

    # Get the data
    full_data.raw_ratings = train_raw_ratings
    train_data = full_data.build_full_trainset()
    test_data = full_data.construct_testset(test_raw_ratings)

    return full_data, train_data, test_data


In [11]:
# Split the data
full_data, train_data, test_data = train_test_split(data,
                                                    test_size = 0.2,
                                                    random_state = 42)

In [12]:
type(full_data), type(train_data), type(test_data)

(surprise.dataset.DatasetAutoFolds, surprise.trainset.Trainset, list)

In [13]:
# Validate the splitting
len(full_data.df), train_data.n_ratings, len(test_data)

(1149780, 919824, 229956)

# Creating Model

## Experimenting

In [14]:
class MeanPrediction(AlgoBase):
    '''Baseline prediction. Return global mean as prediction'''
    def __init__(self):
        AlgoBase.__init__(self)

    def fit(self, trainset):
        '''Fit the train data'''
        AlgoBase.fit(self, trainset)

    def estimate(self, u, i):
        '''Perform the estimation/prediction.'''
        est = self.trainset.global_mean
        return est

In [15]:
# Creating baseline model instance
baseline_model = MeanPrediction()
baseline_model

<__main__.MeanPrediction at 0x2b7202914d0>

In [16]:
# Use full_data for cross validation
# Your results could be different because
# there is no random seed stated within this functions
cv_baseline = cross_validate(algo = baseline_model,
                             data = full_data,
                             cv = 5,
                             measures = ['rmse'])

In [17]:
# Extract CV results
cv_baseline_rmse = cv_baseline['test_rmse'].mean()
cv_baseline_rmse

3.8531953449278733

## Hyperparameter Tuning for SVD

PS: you can skip this if you just want to create the model

In [18]:
params_SVD = {
    'lr_all': [0.005, 0.002, 0.001, 0.0005],
    'n_factors': [20, 50, 75, 125, 150, 250],
    'reg_all': [0.005, 0.01, 0.015, 0.02, 0.03, 0.05],
    'n_epochs': [10, 20, 30, 50, 70, 100]
}

In [19]:
tuning_svd = RandomizedSearchCV(algo_class=SVD, param_distributions = params_SVD,
                   cv=5
                   )

In [20]:
tuning_svd.fit(data=full_data)

In [21]:
best_params_svd = tuning_svd.best_params['rmse']
tuning_svd.best_params['rmse']

{'lr_all': 0.0005, 'n_factors': 50, 'reg_all': 0.01, 'n_epochs': 50}

In [30]:
summary_df = pd.DataFrame({'Model': ['Baseline', 'Funk SVD'],
                           'CV Performance - RMSE': [cv_baseline_rmse,tuning_svd.best_score['rmse'] ],
                           'Model Condiguration':['N/A',f'{tuning_svd.best_params["rmse"]}']})

summary_df

Unnamed: 0,Model,CV Performance - RMSE,Model Condiguration
0,Baseline,3.853195,
1,Funk SVD,3.455943,"{'lr_all': 0.0005, 'n_factors': 50, 'reg_all':..."


## Train the model with best hyperparams

In [None]:
# only run this if you didn't run the hyperparameter tuning process
best_params_svd = {'lr_all': 0.0005, 'n_factors': 50, 'reg_all': 0.01, 'n_epochs': 50}

In [31]:
# Create object
svd_model = SVD(**best_params_svd)

# Retrain on whole train dataset
svd_model.fit(train_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2b77370ed50>

## Save model as Pickle file

In [32]:
utils.pickle_dump(svd_model, config['svd_model_path'])

## Evaluating the Best Model

In [33]:
test_pred = svd_model.test(test_data)
test_rmse = accuracy.rmse(test_pred)
test_rmse

RMSE: 3.4498


3.449829836167209

In [34]:
summary_test_df = pd.DataFrame({'Model' : ['User to User CF'],
                                'RMSE-Tuning': [tuning_svd.best_score['rmse']],
                                'RMSE-Test': [test_rmse]})

summary_test_df

Unnamed: 0,Model,RMSE-Tuning,RMSE-Test
0,User to User CF,3.455943,3.44983


# Prediction

## Predict with Best Model

In [35]:
sample_prediction = svd_model.predict(uid = 9,
                                      iid = 10)

In [36]:
sample_prediction

Prediction(uid=9, iid=10, r_ui=None, est=2.8507843906667447, details={'was_impossible': False})

In [49]:
# Let's create a function
def get_unrated_item(userid, rating_data):
    """
    Get unrated item id from a user id

    Parameters
    ----------
    userid : int
        The user id

    rating_data : pandas DataFrame
        The rating data

    Returns
    -------
    unrated_item_id : set
        The unrated item id
    """
    # Find the whole item id
    unique_item_id = set(rating_data['item_id'])

    # Find the item id that was rated by user id
    rated_item_id = set(rating_data.loc[rating_data['user_id']==userid, 'item_id'])

    # Find the unrated item id
    unrated_item_id = unique_item_id.difference(rated_item_id)

    return unrated_item_id


### Predict book randomly

In [38]:
def get_book_id(book_title, metadata):
    existing_titles = list(metadata['title'])
    closest_titles = difflib.get_close_matches(book_title, existing_titles)
    book_id = metadata[metadata['title'] == closest_titles[0]].index[0]
    
    return book_id

In [39]:
def get_book_info(book_id, metadata):
    book_info = metadata.loc[book_id, :]
    return book_info.to_dict()

In [40]:
def predict_review_by_title(user_id, book_title, model, metadata):
    book_id = get_book_id(book_title, metadata)
    review_prediction = model.predict(uid = user_id, iid = book_id)
    
    return review_prediction.est

In [41]:
def predict_review_by_id(user_id, item_id, model, metadata):
    review_prediction = model.predict(uid = user_id, iid = item_id)
    
    return review_prediction.est

In [73]:
def generate_recommendation(user_id, model, metadata, thresh = 3.5):
    book_titles = list(metadata['title'].values)
    random.shuffle(book_titles)
    
    for book_title in book_titles:
        rating = predict_review_by_title(user_id, book_title, model, metadata)
        print(book_title, rating) 
        if rating > thresh:
            book_id = get_book_id(book_title, metadata)
            return get_book_info(book_id, metadata)

In [60]:
def generate_recommendation(user_id, model, rating_data, metadata, thresh = 3.5):
    unrated_book_id = list(get_unrated_item(user_id, rating_data))
    # print(unrated_book_id[:5])
    random.shuffle(unrated_book_id)
    
    for book_id in unrated_book_id:
        rating = predict_review_by_id(user_id, book_id, model, metadata)
        print(book_id, rating) 
        if rating > thresh:
            return get_book_info(book_id, metadata)

In [62]:
generate_recommendation(10, svd_model, df, book_data, thresh = 2)

0060392517 2.949912053160439


{'title': 'Judy Garland, Ginger Love',
 'author': 'Nicole Cooley',
 'year': 1998,
 'publisher': 'ReganBooks',
 'Image-URL-S': 'http://images.amazon.com/images/P/0060392517.01.THUMBZZZ.jpg',
 'Image-URL-M': 'http://images.amazon.com/images/P/0060392517.01.MZZZZZZZ.jpg',
 'Image-URL-L': 'http://images.amazon.com/images/P/0060392517.01.LZZZZZZZ.jpg'}

### Generate top 5 rated books for certain user

In [63]:
# Let's create this into a function
def get_pred_unrated_item(userid, estimator, unrated_item_id):
    """
    Get the predicted unrated item id from user id

    Parameters
    ----------
    userid : int
        The user id

    estimator : Surprise object
        The estimator

    unrated_item_id : set
        The unrated item id

    Returns
    -------
    pred_data : pandas Dataframe
        The predicted rating of unrated item of user id
    """
    # Initialize dict
    pred_dict = {
        'user_id': userid,
        'item_id': [],
        'predicted_rating': []
    }

    # Loop for over all unrated movie Id
    for id in unrated_item_id:
        # Create a prediction
        pred_id = estimator.predict(uid = pred_dict['user_id'],
                                    iid = id)

        # Append
        pred_dict['item_id'].append(id)
        pred_dict['predicted_rating'].append(pred_id.est)

    # Create a dataframe
    pred_data = pd.DataFrame(pred_dict).sort_values('predicted_rating',
                                                     ascending = False)

    return pred_data

In [71]:
def get_top_highest_unrated(estimator, k, userid, rating_data, metadata):
    """
    Get top k highest of unrated movie from a Surprise estimator RecSys

    Parameters
    ----------
    estimator : Surprise model
        The RecSys model

    k : int
        The number of Recommendations

    userid : int
        The user Id to recommend

    rating_data : pandas Data Frame
        The rating data

    movie_data : pandas DataFrame
        The movie meta data

    Returns
    -------
    top_item_pred : pandas DataFrame
        The top items recommendations
    """
    # 1. Get the unrated item id of a user id
    unrated_item_id = get_unrated_item(userid=userid, rating_data=rating_data)

    # 2. Create prediction from estimator to all unrated item id
    predicted_unrated_item = get_pred_unrated_item(userid = userid,
                                                   estimator = estimator,
                                                   unrated_item_id = unrated_item_id)

    # 3. Sort & add meta data
    top_item_pred = predicted_unrated_item.head(k).copy()
    print(top_item_pred)
    top_item_pred_detail = metadata.loc[top_item_pred['item_id'], :]
    # top_item_pred['genres'] = metadata.loc[top_item_pred['item_id'], 'genres'].values

    return top_item_pred_detail


In [80]:
# Generate 10 recommendation for user 500
get_top_highest_unrated(estimator=svd_model,
                        k=5,
                        userid=100,
                        rating_data=data.df,
                        metadata=book_data)

        user_id     item_id  predicted_rating
245268      100  1844262553          6.277151
284438      100  0590353403          6.120859
47839       100  0439139597          6.080721
22430       100  0439136350          6.016197
171894      100  0439064864          5.833320


Unnamed: 0_level_0,title,author,year,publisher,Image-URL-S,Image-URL-M,Image-URL-L
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1844262553,Free,Paul Vincent,2003,Upfront Publishing,http://images.amazon.com/images/P/1844262553.0...,http://images.amazon.com/images/P/1844262553.0...,http://images.amazon.com/images/P/1844262553.0...
590353403,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,1998,Scholastic,http://images.amazon.com/images/P/0590353403.0...,http://images.amazon.com/images/P/0590353403.0...,http://images.amazon.com/images/P/0590353403.0...
439139597,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439139597.0...,http://images.amazon.com/images/P/0439139597.0...,http://images.amazon.com/images/P/0439139597.0...
439136350,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439136350.0...,http://images.amazon.com/images/P/0439136350.0...,http://images.amazon.com/images/P/0439136350.0...
439064864,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439064864.0...,http://images.amazon.com/images/P/0439064864.0...,http://images.amazon.com/images/P/0439064864.0...
