# Content-Based Recommender System Using the KNNBaseline Algorithm

In [2]:
"""
Created on Sun July  5 11:46:44 2020

@author: Muhammad Imran Shaikh
"""

'\nCreated on Sun July  5 11:46:44 2020\n\n@author: Muhammad Imran Shaikh\n'

## Importing libraries

In [27]:
import io # Module for Python interfaces to stream handling
import pandas as pd # pandas is a data manipulation library
import numpy as np #provides numerical arrays and functions to manipulate the arrays
from surprise.model_selection import LeaveOneOut # LeaveOneOut Cross Validator
from surprise.model_selection import cross_validate #cross validation module in surprise lib
from surprise import KNNBaseline # KNN baseline algorithm
from surprise import Dataset
from surprise import get_dataset_dir# Extracting dataset from folder directory
from surprise.model_selection import train_test_split # Train and test split module in surprise library

#### Read the u.item file from MovieLens dataset and return two mappings to convert raw ids into movie names and movie names into raw ids.
    

In [28]:
# A function created to read item names

def read_item_names(file_name=None):
    """Build lookup tables between raw item ids and movie titles.

    Parameters
    ----------
    file_name : str, optional
        Path to a pipe-delimited item file whose first field is the raw item
        id and whose second field is the movie title. When omitted, the
        ``ml-100k/ml-100k/u.item`` file below the directory returned by
        ``get_dataset_dir()`` is read.

    Returns
    -------
    tuple of (dict, dict)
        ``rid_to_name`` maps raw id -> title and ``name_to_rid`` maps
        title -> raw id. Keys and values are strings. When a title appears
        on several lines, ``name_to_rid`` keeps the id of the last one.
    """
    import os  # local import: the notebook's import cell does not provide os

    if file_name is None:
        file_name = os.path.join(get_dataset_dir(), 'ml-100k', 'ml-100k', 'u.item')

    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            # Drop the line terminator so a title in the last field stays clean.
            fields = line.rstrip('\n').split('|')
            if len(fields) < 2:
                # Blank or malformed line: there is no (id, title) pair to record.
                continue
            raw_id, title = fields[0], fields[1]
            rid_to_name[raw_id] = title
            name_to_rid[title] = raw_id

    return rid_to_name, name_to_rid

## Loading Data

In [29]:
# Load the built-in MovieLens 100k ratings dataset.
data = Dataset.load_builtin('ml-100k')
# Hold out 25% of the ratings for testing. The seed is fixed so the split, and
# everything fitted on it, is the same after every Restart & Run All.
trainset, testset = train_test_split(data, test_size=.25, random_state=42)

## Using pearson_baseline as the similarity measure in the KNNBaseline algorithm

In [30]:
# Similarity settings: pearson_baseline measure with user_based switched off.
sim_options = dict(name='pearson_baseline', user_based=False)
algo = KNNBaseline(
    k=30,
    sim_options=sim_options,
)
# fit() is the cell's last expression, so the fitted algorithm's repr is displayed.
algo.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x1b7fba06b70>

In [31]:
# Build the two lookup tables: raw id -> movie name and movie name -> raw id
rid_to_name, name_to_rid = read_item_names()

In [41]:
# Fetching inner id of the movie
# Other titles to try for the top 10 nearest movies: 'Forrest Gump (1994)',
# 'Toy Story (1995)', 'Clockwork Orange, A (1971)'

movie_raw_id = name_to_rid['Braveheart (1995)'] # Change movies names to get content based results at the output
movie_inner_id = algo.trainset.to_inner_iid(movie_raw_id)
neighbor_inner_ids = algo.get_neighbors(movie_inner_id, k=10)

# Convert inner ids of the neighbors into raw ids and then into names.
# A list (not a generator) is built so the names can be iterated more than
# once, e.g. when the output cell is re-run without re-running this one.
movie_neighbors = [rid_to_name[algo.trainset.to_raw_iid(inner_id)]
                   for inner_id in neighbor_inner_ids]

# Output

In [42]:
# Blank separator line, then the heading, then one neighbor title per line.
print()
heading = 'The 10 nearest neighbors of {} are:'.format(rid_to_name[movie_raw_id])
print(heading)
for neighbor_title in movie_neighbors:
    print(neighbor_title)


The 10 nearest neighbors of Braveheart (1995) are:
Apollo 13 (1995)
Raiders of the Lost Ark (1981)
Shawshank Redemption, The (1994)
Return of the Jedi (1983)
Titanic (1997)
Forrest Gump (1994)
E.T. the Extra-Terrestrial (1982)
While You Were Sleeping (1995)
Hunt for Red October, The (1990)
Miracle on 34th Street (1994)


# Run 5-fold cross-validation and print results

In [24]:
# Evaluate the algorithm with 5-fold cross-validation. The returned dict holds
# the per-fold 'test_rmse', 'test_mae', 'fit_time' and 'test_time' values;
# verbose=True additionally prints a summary table.
vali = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBaseline on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9155  0.9126  0.9190  0.9215  0.9173  0.9172  0.0030  
MAE (testset)     0.7183  0.7187  0.7181  0.7221  0.7174  0.7189  0.0017  
Fit time          3.58    3.51    3.69    3.70    3.80    3.66    0.10    
Test time         5.70    5.66    5.44    5.35    5.1

# Converting our cross-validation results into a DataFrame

In [25]:
# One row per fold, one column per measure.
df = pd.DataFrame(vali)
# Label the five rows 'Fold 1' .. 'Fold 5', then flip so measures become rows.
df_new = df.rename(
    index={position: 'Fold {}'.format(position + 1) for position in range(5)}
).T

In [26]:
# Restrict both statistics to the per-fold columns. Otherwise 'std' would also
# take the freshly added 'mean' column as an input, and re-running this cell
# would fold the previous 'mean'/'std' columns into the new values.
fold_columns = [col for col in df_new.columns if col not in ('mean', 'std')]
df_new['mean'] = df_new[fold_columns].mean(axis=1)
# ddof=0 gives the population standard deviation of the fold values
# (pandas defaults to ddof=1).
df_new['std'] = df_new[fold_columns].std(axis=1, ddof=0)
df_new

Unnamed: 0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,mean,std
test_rmse,0.915529,0.912617,0.919035,0.921451,0.917292,0.917185,0.003008
test_mae,0.718273,0.71873,0.71814,0.722125,0.717393,0.718932,0.001653
fit_time,3.576786,3.507827,3.690716,3.699712,3.803644,3.655737,0.103096
test_time,5.697473,5.656499,5.440631,5.349687,5.164804,5.461819,0.197318
