# Preparing Data & Preprocessing

In [116]:
import pandas as pd
import numpy as np

## Dataset Acquire

MovieLens Latest Datasets

These datasets will change over time, and are not appropriate for reporting research results. We will keep the download links stable for automated downloads. We will not archive or make available previously released versions.

Small: 100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users. Last updated 9/2018.

https://grouplens.org/datasets/movielens/

In [291]:
df_movies = pd.read_csv("res/movies.csv")
df_ratings = pd.read_csv("res/ratings.csv")

In [292]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [293]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [294]:
# Excluding irrelevant data from ratings dataframe
df_newRatings = df_ratings.loc[:, df_ratings.columns != "timestamp"]
df_newRatings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [295]:
df_newRatings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [296]:
df_newRatings.describe()

Unnamed: 0,userId,movieId,rating
count,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557
std,182.618491,35530.987199,1.042529
min,1.0,1.0,0.5
25%,177.0,1199.0,3.0
50%,325.0,2991.0,3.5
75%,477.0,8122.0,4.0
max,610.0,193609.0,5.0


## Splitting The Data and Training

In [315]:
from surprise import SVD, accuracy
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate, train_test_split

In [316]:
reader = Reader(rating_scale=(0.5, 5.0))

In [317]:
data = Dataset.load_from_df(df_newRatings[['userId', 'movieId', 'rating']], reader)

In [318]:
train_data, test_data = train_test_split(data, test_size=.2)

In [319]:
sg_val_decomp = SVD()

In [320]:
sg_val_decomp.fit(train_data)
predictions = sg_val_decomp.test(test_data)

In [321]:
accuracy.rmse(predictions)

RMSE: 0.8624


0.8624085726851033

In [322]:
comparison_df = pd.DataFrame([(pred.uid, pred.iid, pred.r_ui, round(pred.est, 2)) for pred in predictions], 
                             columns=["User ID", "Movie ID", "Actual Rating", "Predicted Rating"])
print(comparison_df.head()) 

   User ID  Movie ID  Actual Rating  Predicted Rating
0      479       260            4.0              4.18
1      414      1387            4.0              4.28
2      428         9            2.0              2.35
3      288      3034            3.0              3.43
4        1      2012            4.0              4.20


In [341]:
def recommend_movies(user_id, num_recommendations):
    movie_ids = df_newRatings['movieId'].unique()
    rated_movies = df_newRatings.loc[df_newRatings['userId'] == user_id, 'movieId']

    unrated_movies = []
    for id in movie_ids:
        if id not in rated_movies.values:
            unrated_movies.append(id)

    predict_ratings = []
    for id in unrated_movies:
        predict_ratings.append(algo.predict(user_id, id))

    predict_ratings.sort(key=lambda x: x.est, reverse=True)

    recommended_movie_ids = []
    for pred in predict_ratings[:num_recommendations]:
        recommended_movie_ids.append(pred.iid)

    return recommended_movie_ids


In [342]:
def get_movie_name(recon_list):
    movies_name = []
    for id in recon_list:
        movies_name.append(df_movies[df_movies["movieId"] == id]["title"])

    for name in movies_name:
        print(name)

In [348]:
#Param user_id, number of recommendation
recon_list = recommend_movies(11, 3)
print(recon_list)
get_movie_name(recon_list)

[1213, 858, 1208]
914    Goodfellas (1990)
Name: title, dtype: object
659    Godfather, The (1972)
Name: title, dtype: object
909    Apocalypse Now (1979)
Name: title, dtype: object
