# Recommendation system for Movies : Collabrative filtering

Link to download the data : https://grouplens.org/datasets/movielens/

In [1]:
import pandas as pd 
from surprise import Dataset, accuracy, Reader
from surprise.model_selection import cross_validate
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', None)


In [27]:
def get_top_n_recommendations(model,df,user_id, n=3):
  user_movies = df[df['userId'] == user_id]['movieId'].unique()
  all_movies = df['movieId'].unique()
  movies_to_predict = list(set(all_movies) - set(user_movies))
  user_movie_pairs = [(user_id, movie_id, 0) for movie_id in movies_to_predict]
  predictions_cf = model.test(user_movie_pairs)
  top_n_recommendations = sorted(predictions_cf, key = lambda x: x.est)[:n]
  for pred in top_n_recommendations:
    predicted_rating = pred.est
    print(predicted_rating)
  top_n_movie_ids = [int(pred.iid) for pred in top_n_recommendations]
  top_n_movies = movie_encoder.inverse_transform(top_n_movie_ids)
  return top_n_movies

## Read Data : 

In [2]:
data_size = "ml-latest-small"# "ml-32m"
ratings = pd.read_csv(f"./data/{data_size}/ratings.csv")
movies = pd.read_csv(f"./data/{data_size}/movies.csv")



In [28]:
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [29]:
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [30]:
movies_ratings = pd.merge(left=ratings, right=movies, how='left' , on="movieId")
movies_ratings.drop(columns=["title"], inplace=True)
movies_ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp,genres
0,1,1,4.0,964982703,Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Comedy|Romance


## Preprocess data : 

In [6]:
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()
mlb = MultiLabelBinarizer()

movies_ratings['userId'] = user_encoder.fit_transform(movies_ratings['userId'])
movies_ratings['movieId'] = movie_encoder.fit_transform(movies_ratings['movieId'])

In [7]:
genres_list_df= movies_ratings.pop('genres').str.split('|')
genres_list_df

0         [Adventure, Animation, Children, Comedy, Fantasy]
1                                         [Comedy, Romance]
2                                 [Action, Crime, Thriller]
3                                       [Mystery, Thriller]
4                                [Crime, Mystery, Thriller]
                                ...                        
100831                            [Drama, Horror, Thriller]
100832                            [Action, Crime, Thriller]
100833                                             [Horror]
100834                                     [Action, Sci-Fi]
100835                     [Action, Crime, Drama, Thriller]
Name: genres, Length: 100836, dtype: object

In [8]:
# For each film find the genres that represent it. put 0 for each genres.
movies_ratings = movies_ratings.join(pd.DataFrame(mlb.fit_transform(genres_list_df), columns = mlb.classes_, index = movies_ratings.index ))


In [9]:
movies_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,4.0,964982703,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,2,4.0,964981247,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,5,4.0,964982224,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,43,5.0,964983815,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
4,0,46,5.0,964982931,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0


In [10]:
movies_ratings.drop(columns=["(no genres listed)"], inplace=True)

##### Split Data : 

In [11]:
train, test = train_test_split(movies_ratings, test_size=0.25)
print(f"Train size = {train.size}")
print(f"Test  size = {test.size}")

Train size = 1739421
Test  size = 579807


In [12]:
# As I am loading a custom dataset, we need to define a reader.
reader = Reader(rating_scale = (0.5, 5))

In [13]:
train_data = Dataset.load_from_df(train[["userId", "movieId", "rating"]], reader).build_full_trainset()
testset = train_data.build_anti_testset()

## train model using collab filtering: 

In [14]:
from surprise.prediction_algorithms.matrix_factorization import SVD

In [15]:
svd = SVD(random_state=0, n_factors=200, n_epochs=30, verbose=True)
svd.fit(train_data)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a4a585e320>

#### Root mean squared error

In [32]:
predictions_svd = svd.test(testset)
rsme_value = accuracy.rmse(predictions_svd)
print(f"Root mean squared error = {rsme_value}")

RMSE: 0.4898
Root mean squared error = 0.48983484359459833


## Recommendation : 

In [26]:
user_id = 58
n_recommendation = 3
recommendations = get_top_n_recommendations(svd, movies_ratings,user_id, n_recommendation)
top_n_movies_titles = movies[movies['movieId'].isin(recommendations)]['title'].tolist()
print(f"Top {n_recommendation} Recommendations for User {user_id}:")
for i, title in enumerate(top_n_movies_titles, 1):
  print(f"{i}.{title}")

2.3715523935887095
2.5656907653845176
2.8114653310951345
Top 3 Recommendations for User 58:
1.Stuart Saves His Family (1995)
2.I Know What You Did Last Summer (1997)
3.Inspector Gadget (1999)
