## Surprise

In [1]:
from surprise import Dataset
from surprise import Reader
from surprise import BaselineOnly, KNNBasic, NormalPredictor
from surprise import accuracy
from surprise.model_selection import KFold

import os
data_path = os.environ.get('DATA_PATH') + 'AI_Cheats/'

In [2]:
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file(data_path + 'movies/ratings.csv', reader=reader)
train_set = data.build_full_trainset()

In [3]:
# Baseline algorithm using ALS for optimization
bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}

# SGD optimization (uncomment to use)
# bsl_options = {'method': 'sgd', 'n_epochs': 5}

algo = BaselineOnly(bsl_options=bsl_options)

uid = str(196)
iid = str(302)
pred = 0

# Define K-Fold cross-validation iterator, k=3
kf = KFold(n_splits=3)
for trainset, testset in kf.split(data):
    # Train and predict
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Calculate RMSE
    accuracy.rmse(predictions, verbose=True)
    # Each model provides 1/3 of the results
    pred = algo.predict(uid, iid, r_ui=4, verbose=False)
    print(pred)


Estimating biases using als...
RMSE: 0.8644
user: 196        item: 302        r_ui = 4.00   est = 4.21   {'was_impossible': False}
Estimating biases using als...
RMSE: 0.8629
user: 196        item: 302        r_ui = 4.00   est = 4.01   {'was_impossible': False}
Estimating biases using als...
RMSE: 0.8645
user: 196        item: 302        r_ui = 4.00   est = 4.18   {'was_impossible': False}


## NormalPredictor 

In [4]:
algo = NormalPredictor()

# Define K-Fold cross-validation iterator with k=3
kf = KFold(n_splits=3)
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)
    pred = algo.predict(uid, iid, r_ui=4, verbose=False)
    print(pred)


RMSE: 1.4313
user: 196        item: 302        r_ui = 4.00   est = 4.10   {'was_impossible': False}
RMSE: 1.4339
user: 196        item: 302        r_ui = 4.00   est = 1.26   {'was_impossible': False}
RMSE: 1.4321
user: 196        item: 302        r_ui = 4.00   est = 3.67   {'was_impossible': False}


## SlopeOne

In [5]:
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise import KNNBaseline, SlopeOne
import io
import pandas as pd

In [6]:
# load data
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file(data_path + 'movies/ratings.csv', reader=reader)
train_set = data.build_full_trainset()

In [7]:
# Use SlopeOne 
algo = SlopeOne()
algo.fit(train_set)

# Score prediction for developing users and products
uid = str(196)
iid = str(302)

pred = algo.predict(uid, iid, r_ui=4, verbose=True)

user: 196        item: 302        r_ui = 4.00   est = 4.32   {'was_impossible': False}


## KNNBaseline

In [8]:
df = pd.read_csv(data_path + 'movies/movies.csv')
df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
def read_item_names():
    data = pd.read_csv(data_path + 'movies/movies.csv')
    rid_to_name = {}
    name_to_rid = {}
    for i in range(len(data['movieId'])):
        rid_to_name[data['movieId'][i]] = data['title'][i]
        name_to_rid[data['title'][i]] = data['movieId'][i]

    return rid_to_name, name_to_rid

In [10]:
# Data loading
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file(data_path + 'movies/ratings.csv', reader=reader)
train_set = data.build_full_trainset()

# Similarity computation using Pearson similarity for ItemCF
sim_options = {'name': 'pearson_baseline', 'user_based': False}

# Using the KNNBaseline algorithm, a type of collaborative filtering (CF) algorithm
algo = KNNBaseline(sim_options=sim_options)
algo.fit(train_set)

# Retrieve movie title information
rid_to_name, name_to_rid = read_item_names()

# Get the raw ID for the movie "Toy Story (1995)"
toy_story_raw_id = name_to_rid['Toy Story (1995)']
print(toy_story_raw_id)


Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
1


In [11]:
# Retrieve the internal ID of the movie "Toy Story" using its raw movie ID
toy_story_inner_id = algo.trainset.to_inner_iid(str(toy_story_raw_id))
print('Movie title: {}, Movie ID: {}'.format(df.loc[df['movieId'] == toy_story_inner_id]['title'].values, toy_story_inner_id))

Movie title: ['Drop Zone (1994)'], Movie ID: 227
