## Imports

References:

- Tutorial this notebook follows: https://towardsdatascience.com/building-and-testing-recommender-systems-with-surprise-step-by-step-d4ba702ef80b
- Surprise library: http://surpriselib.com/

In [1]:
import pandas as pd
import numpy as np
import os
import time
from IPython.display import display
from collections import defaultdict

from surprise import Reader, Dataset
from surprise import SVD, BaselineOnly
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate

## Load data

In [2]:
# Ratings: read with compact dtypes for the id and rating columns.
ratings_df_raw = pd.read_csv(
    "movies-data/ml-1m/ratings.csv",
    dtype={"userId": "int32", "movieId": "int32", "rating": "float32"},
)

# Movies: keep only the three columns used when displaying results.
items_df = pd.read_csv(
    "movies-data/ml-1m/movies.csv",
)[["movieId", "title", "genres"]]

## Filter out users and items with few ratings

In [3]:
def filter_out_few_ratings(min_item_ratings=50, min_user_ratings=50, ratings=None):
    """Drop ratings that belong to rarely rated movies or low-activity users.

    Both thresholds are exclusive: a movie (or user) is kept only when it has
    strictly more ratings than the given number. Counts are taken on the
    unfiltered input, so after filtering a kept movie or user may be left with
    fewer ratings than the threshold.

    Args:
        min_item_ratings: A movie must have more than this many ratings.
        min_user_ratings: A user must have more than this many ratings.
        ratings: DataFrame with ``userId`` and ``movieId`` columns. When
            omitted, the notebook-level ``ratings_df_raw`` is used.

    Returns:
        The rows of ``ratings`` whose movie and user both pass their
        threshold, in their original order and with their original index.
        A before/after summary is printed as a side effect.
    """
    if ratings is None:
        # Backward-compatible default: fall back to the notebook global.
        ratings = ratings_df_raw

    item_counts = ratings["movieId"].value_counts()
    user_counts = ratings["userId"].value_counts()
    kept_items = item_counts[item_counts > min_item_ratings].index
    kept_users = user_counts[user_counts > min_user_ratings].index

    keep_mask = ratings["movieId"].isin(kept_items) & ratings["userId"].isin(kept_users)
    filtered = ratings[keep_mask]

    # Same summary for both frames, so print it from one loop.
    for heading, frame in (("Original data:", ratings), ("Filtered data:", filtered)):
        print(heading)
        print("\tRatings:", len(frame))
        print("\tUnique users:", frame["userId"].nunique())
        print("\tUnique movies:", frame["movieId"].nunique())

    return filtered
    
# Keep movies with more than 50 ratings and users with more than 25 ratings
# (the function compares both thresholds with a strict ">").
ratings_df = filter_out_few_ratings(min_item_ratings=50, min_user_ratings=25)

# Preview the first rows of the filtered ratings.
ratings_df.head()

Original data:
	Ratings: 1000209
	Unique users: 6040
	Unique movies: 3706
Filtered data:
	Ratings: 966321
	Unique users: 5549
	Unique movies: 2499


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5.0,978300760
1,1,661,3.0,978302109
2,1,914,3.0,978301968
3,1,3408,4.0,978300275
4,1,2355,5.0,978824291


## Load data into Surprise

In [4]:
# Surprise clips every prediction to the Reader's rating scale, so the scale
# is taken from the ratings themselves rather than hard-coded. The previous
# (0, 5) allowed predictions below the lowest rating seen in the previews (1.0).
reader = Reader(
    rating_scale=(
        float(ratings_df["rating"].min()),
        float(ratings_df["rating"].max()),
    )
)

# Surprise expects exactly three columns in the order user, item, rating.
data = Dataset.load_from_df(ratings_df[["userId", "movieId", "rating"]], reader)

## Evaluate algorithm with Cross-Validation

In [5]:
def evaluate_algorithm(algo, label):
    """Cross-validate ``algo`` on the notebook-level ``data`` and print a summary.

    Runs 3-fold cross-validation measuring RMSE, then prints ``label``, the
    wall-clock time of the whole run and the mean test RMSE over the folds.
    Nothing is returned.

    Args:
        algo: Algorithm instance passed to ``cross_validate``.
        label: Heading printed above the results.
    """
    t0 = time.time()
    cv_scores = cross_validate(algo, data, measures=["RMSE"], cv=3, verbose=False)
    elapsed = time.time() - t0

    # "test_rmse" holds one score per fold; report their average.
    mean_rmse = np.mean(cv_scores.get("test_rmse"))

    print(label + ":")
    print("\tTime elapsed: {0:0.2f} sec".format(elapsed))
    print("\tMean RMSE: {0:.4f}".format(mean_rmse))

# BaselineOnly, with the baselines fitted by Alternating Least Squares (ALS).
evaluate_algorithm(
    BaselineOnly(bsl_options={"method": "als", "n_epochs": 8, "reg_u": 12, "reg_i": 5}),
    "BaselineOnly",
)

# SVD with a fixed random_state so repeated runs are comparable.
evaluate_algorithm(
    SVD(random_state=42),
    "SVD",
)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
BaselineOnly:
	Time elapsed: 22.50 sec
	Mean RMSE: 0.9054
SVD:
	Time elapsed: 127.51 sec
	Mean RMSE: 0.8808


## Split data into train and test set

In [6]:
# Hold out 15% of the ratings for testing; the seed fixes the split.
train, test = train_test_split(
    data,
    test_size=0.15,
    random_state=42,
)

# `train` is a Surprise trainset, `test` a plain list of ratings.
print("Training data:", train.n_ratings)
print("Test data:", len(test))

Training data: 821372
Test data: 144949


## Train and evaluate model

In [7]:
# Time fitting on the training split plus predicting the held-out test split.
start = time.time()

# SVD scored a lower mean RMSE than BaselineOnly in the cross-validation
# above, so it is the model trained here.
algo = SVD(random_state=42)

# Fit on the training split, then predict every rating in the test split.
preds = algo.fit(train).test(test)

end = time.time()

# Show results
print("Time elapsed: {0:0.2f} sec".format(end - start))

# accuracy.rmse prints the score itself. The value is stored under a name that
# does not overwrite the `rmse` function imported from surprise.accuracy.
test_rmse = accuracy.rmse(preds)

Time elapsed: 47.38 sec
RMSE: 0.8629


## Predictions for a user

In [8]:
def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's ``n`` best estimates.

    Args:
        predictions: Iterable of 5-item records laid out as
            ``(user id, item id, true rating, estimated rating, details)``.
        n: Maximum number of items kept per user.

    Returns:
        A ``defaultdict(list)`` mapping each user id to a list of
        ``(item id, estimated rating)`` pairs ordered from highest to lowest
        estimate. Pairs with equal estimates keep their input order.
    """
    per_user = defaultdict(list)

    # Collect (item, estimate) pairs under the user they were predicted for.
    for user_id, item_id, _true_rating, estimate, _details in predictions:
        per_user[user_id].append((item_id, estimate))

    # Rank each user's pairs by estimate and truncate to the best n.
    for user_id in per_user:
        ranked = sorted(per_user[user_id], key=lambda pair: pair[1], reverse=True)
        per_user[user_id] = ranked[:n]

    return per_user

# Get top 10 recommendations per user. `preds` holds the predictions made for
# the held-out test split, so each user's list is drawn from the movies that
# appear for that user in the test split.
top_n = get_top_n(preds, n=10)

In [9]:
def already_seen(test_user_id):
    """Display every movie the given user has rated, together with the rating.

    Reads the notebook-level ``ratings_df`` and ``items_df``. The table has
    the columns ``MovieId``, ``Title``, ``Genres`` and ``Score`` and follows
    the row order of the user's ratings in ``ratings_df``. A user without
    ratings produces an empty table. Nothing is returned.

    Args:
        test_user_id: Value matched against the ``userId`` column.
    """
    user_ratings = ratings_df.loc[
        ratings_df["userId"] == test_user_id, ["movieId", "rating"]
    ]

    # One catalogue row per movie, so the join cannot duplicate a rating.
    catalogue = items_df[["movieId", "title", "genres"]].drop_duplicates(subset="movieId")

    # A single left join replaces the per-row lookup. It keeps the order of
    # the ratings and leaves title/genres empty for a movie id missing from
    # the catalogue instead of raising IndexError.
    seen = user_ratings.merge(catalogue, on="movieId", how="left")

    df = seen[["movieId", "title", "genres", "rating"]].rename(
        columns={
            "movieId": "MovieId",
            "title": "Title",
            "genres": "Genres",
            "rating": "Score",
        }
    )
    df = df.round(2)
    display(df)

def predict(test_user_id, verbose=0):
    """Display the top predicted movies stored for a user in ``top_n``.

    Reads the notebook-level ``top_n`` and ``items_df``. In this notebook
    ``top_n`` is built from predictions on the held-out test split, so the
    listed movies are ones the user rated in that split, not unseen titles.
    Nothing is returned.

    Args:
        test_user_id: Key looked up in ``top_n``.
        verbose: When 1, also print the predicted movie ids.
    """
    # `.get` returns None for a user with no stored predictions (and does not
    # insert a key into the defaultdict); treat that as an empty list.
    top_user_rec = top_n.get(test_user_id) or []
    if not top_user_rec:
        print("No predictions available for user", test_user_id)

    if verbose == 1:
        # Built from the ids directly so an empty list does not fail on [:, 0].
        pred_ids = np.asarray([movie_id for movie_id, _ in top_user_rec], dtype=int)
        print("Prediction ids:", pred_ids)

    # Generate result dataframe
    tab = []
    for movie_id, score in top_user_rec:
        match = items_df[items_df["movieId"] == movie_id]
        if match.empty:
            # Movie missing from the catalogue: keep the row, leave metadata blank.
            tab.append([movie_id, None, None, score])
        else:
            tab.append(list(match.values[0]) + [score])

    df = pd.DataFrame(tab, columns=["MovieId", "Title", "Genres", "Score"])
    df = df.round(2)
    display(df)

In [10]:
# Show what user 608 has rated, then the top predictions stored for that user.
already_seen(608)
predict(608)

Unnamed: 0,MovieId,Title,Genres,Score
0,2995,House on Haunted Hill (1999),Horror|Thriller,3.0
1,2058,"Negotiator, The (1998)",Action|Crime|Drama|Mystery|Thriller,4.0
2,724,"Craft, The (1996)",Drama|Fantasy|Horror|Thriller,3.0
3,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,3.0
4,2060,BASEketball (1998),Comedy,2.0
...,...,...,...,...
164,3919,Hellraiser III: Hell on Earth (1992),Horror,3.0
165,1090,Platoon (1986),Drama|War,5.0
166,1093,"Doors, The (1991)",Drama,5.0
167,1242,Glory (1989),Drama|War,3.0


Unnamed: 0,MovieId,Title,Genres,Score
0,527,Schindler's List (1993),Drama|War,4.11
1,2193,Willow (1988),Action|Adventure|Fantasy,3.75
2,1263,"Deer Hunter, The (1978)",Drama|War,3.61
3,1242,Glory (1989),Drama|War,3.58
4,1704,Good Will Hunting (1997),Drama|Romance,3.55
5,2028,Saving Private Ryan (1998),Action|Drama|War,3.55
6,357,Four Weddings and a Funeral (1994),Comedy|Romance,3.47
7,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,3.45
8,69,Friday (1995),Comedy,3.41
9,2060,BASEketball (1998),Comedy,3.41


In [11]:
# Show what user 131 has rated, then the top predictions stored for that user.
already_seen(131)
predict(131)

Unnamed: 0,MovieId,Title,Genres,Score
0,2987,Who Framed Roger Rabbit? (1988),Adventure|Animation|Children|Comedy|Crime|Fant...,5.0
1,1248,Touch of Evil (1958),Crime|Film-Noir|Thriller,4.0
2,1175,Delicatessen (1991),Comedy|Drama|Romance,3.0
3,1249,"Femme Nikita, La (Nikita) (1990)",Action|Crime|Romance|Thriller,4.0
4,574,Spanking the Monkey (1994),Comedy|Drama,4.0
...,...,...,...,...
270,1171,Bob Roberts (1992),Comedy,4.0
271,1245,Miller's Crossing (1990),Crime|Drama|Film-Noir|Thriller,4.0
272,1172,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama,4.0
273,1247,"Graduate, The (1967)",Comedy|Drama|Romance,3.0


Unnamed: 0,MovieId,Title,Genres,Score
0,858,"Godfather, The (1972)",Crime|Drama,4.47
1,111,Taxi Driver (1976),Crime|Drama|Thriller,4.36
2,750,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War,4.25
3,1250,"Bridge on the River Kwai, The (1957)",Adventure|Drama|War,4.1
4,2575,"Dreamlife of Angels, The (Vie rêvée des anges,...",Drama,4.1
5,2997,Being John Malkovich (1999),Comedy|Drama|Fantasy,4.08
6,2871,Deliverance (1972),Adventure|Drama|Thriller,4.01
7,800,Lone Star (1996),Drama|Mystery|Western,3.98
8,930,Notorious (1946),Film-Noir|Romance|Thriller,3.97
9,2858,American Beauty (1999),Drama|Romance,3.97


In [12]:
# Show what user 293 has rated, then the top predictions stored for that user.
already_seen(293)
predict(293)

Unnamed: 0,MovieId,Title,Genres,Score
0,1177,Enchanted April (1992),Drama|Romance,4.0
1,648,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller,2.0
2,3863,"Cell, The (2000)",Drama|Horror|Thriller,1.0
3,3864,Godzilla 2000 (Gojira ni-sen mireniamu) (1999),Action|Adventure|Sci-Fi,1.0
4,1250,"Bridge on the River Kwai, The (1957)",Adventure|Drama|War,5.0
...,...,...,...,...
282,1244,Manhattan (1979),Comedy|Drama|Romance,4.0
283,1172,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama,5.0
284,1246,Dead Poets Society (1989),Drama,5.0
285,1247,"Graduate, The (1967)",Comedy|Drama|Romance,5.0


Unnamed: 0,MovieId,Title,Genres,Score
0,908,North by Northwest (1959),Action|Adventure|Mystery|Romance|Thriller,4.76
1,58,"Postman, The (Postino, Il) (1994)",Comedy|Drama|Romance,4.42
2,1633,Ulee's Gold (1997),Drama,4.34
3,412,"Age of Innocence, The (1993)",Drama,4.23
4,1296,"Room with a View, A (1986)",Drama|Romance,4.21
5,1357,Shine (1996),Drama|Romance,4.2
6,2395,Rushmore (1998),Comedy|Drama,4.19
7,590,Dances with Wolves (1990),Adventure|Drama|Western,4.16
8,2171,Next Stop Wonderland (1998),Comedy|Drama|Romance,4.12
9,902,Breakfast at Tiffany's (1961),Drama|Romance,4.09
