In [28]:
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load and Preprocess Data

In [19]:
ratings = pd.read_csv("ml-latest-small/ratings.csv")
movies = pd.read_csv("ml-latest-small/movies.csv")

In [18]:
print('Ratings')
display(ratings.head())

print('Movies')
display(movies.head())

Ratings


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


Movies


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [22]:
# merge datasets 
data = pd.merge(ratings,movies,on="movieId")
data = data.drop(columns=["timestamp"])
data.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


## why do we encode? 
the reason that we need to encode userId and movieId is that, while they are numerical, they do not have a meaningful ordinal relationship. For example, ``` movieId = 500``` isn't "closer" to ``` movieId = 501 ```. 

encoding also reduces memory usage. For example, if ```userId``` ranges from 1 to 10000, but there are only 100 users in the dataset, the encodinig will map them from 0 to 99, reducing memory usage and reducing training time. 

In [24]:
# encode categorical features 
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

In [26]:
data['userId'] = user_encoder.fit_transform(data['userId'])
data['movieId'] = movie_encoder.fit_transform(data['movieId'])

In [27]:
data.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,0,0,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,0,2,4.0,Grumpier Old Men (1995),Comedy|Romance
2,0,5,4.0,Heat (1995),Action|Crime|Thriller
3,0,43,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,0,46,5.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [29]:
# create train-test split 
train_data, test_data = train_test_split(data, test_size= 0.2, random_state=42 )

In [34]:
print(len(train_data))
print(len(test_data))

80668
20168


# Train Random Forest Model for Rating Prediction

In [46]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

In [37]:
# define features and target variables 
x_train, x_test = train_data[["userId","movieId"]], test_data[["userId","movieId"]]
y_train, y_test = train_data["rating"], test_data["rating"]

In [62]:
# train random forest regressor model 

model = RandomForestRegressor(n_estimators=50,random_state=42)
model.fit(x_train,y_train)


# model = RandomForestRegressor(max_depth=20,min_samples_leaf=20, n_estimators=200,
#                       random_state=42)
# model.fit(x_train,y_train)

In [63]:
# evaluate model 
import numpy as np 

y_pred = model.predict(x_test)
rmse = root_mean_squared_error(y_test,y_pred)
print('RMSE:', rmse)

RMSE: 1.0604925210240557


In [64]:
# create a baseline model 
# baseline model: predict average rating for all movies 

baseline_pred = ratings['rating'].mean()
baseline_rmse = root_mean_squared_error(test_data['rating'],[baseline_pred]*len(test_data))
print('baseline rmse:', baseline_rmse)



baseline rmse: 1.0488361768130714


# Hyperparamter tuning

In [65]:
from sklearn.model_selection import GridSearchCV
import time

In [59]:
%%time
# set the parameters that we want to search 
params = {
    'max_depth': [2,3,5,10,20],
    'min_samples_leaf': [5,10,20,50,100,200],
    'n_estimators': [10,25,30,50,100,200]
}

grid_search = GridSearchCV(estimator=model,
                           param_grid=params,
                           cv = 4,
                           n_jobs=-1, verbose=1, scoring="neg_mean_squared_error")
grid_search.fit(x_train, y_train)

Fitting 4 folds for each of 180 candidates, totalling 720 fits
CPU times: user 21.4 s, sys: 4.71 s, total: 26.1 s
Wall time: 5min 33s
