# Basic Recommendation on Movielens Dataset

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil
from enum import auto, Enum
from datetime import datetime
from collections import defaultdict
%matplotlib inline
sns.set_style('white')

## Load Dataset

In [2]:
col_names = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv(r'C:\Users\Yukawa\datasets\ml-latest-small\ratings.csv', sep=',', header=1, names=col_names)
ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,3,4.0,964981247
1,1,6,4.0,964982224
2,1,47,5.0,964983815
3,1,50,5.0,964982931
4,1,70,3.0,964982400


In [3]:
col_names = ['item_id', 'title', 'genres']
movies = pd.read_csv(r'C:\Users\Yukawa\datasets\ml-latest-small\movies.csv', sep=',', header=0, names=col_names)
movies.head()

Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies['year'] = movies.title.str.extract("\((\d{4})\)", expand=True)
movies.year = pd.to_datetime(movies.year, format='%Y')
movies.year = movies.year.dt.year # As there are some NaN years, resulting type will be float (decimals)
movies.title = movies.title.str[:-7]

In [5]:
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s', origin='unix')

In [6]:
movie_ratings = pd.merge(ratings, movies, on='item_id')
movie_ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title,genres,year
0,1,3,4.0,2000-07-30 18:20:47,Grumpier Old Men,Comedy|Romance,1995.0
1,6,3,5.0,1996-10-17 12:11:36,Grumpier Old Men,Comedy|Romance,1995.0
2,19,3,3.0,2000-08-08 04:07:16,Grumpier Old Men,Comedy|Romance,1995.0
3,32,3,3.0,1997-02-23 22:16:12,Grumpier Old Men,Comedy|Romance,1995.0
4,42,3,4.0,2001-07-27 08:04:05,Grumpier Old Men,Comedy|Romance,1995.0


## Dataset Analysis

### Current Dataset Info

In [7]:
print("Raw data size: ", ratings.shape,
    "\nNumber of Unique users: ", len(ratings['user_id'].unique()),
    "\nNumber of Unique movies: ", len(ratings['item_id'].unique()),
    "\nNumber of Unique ratings: ", len(ratings['rating'].unique()),     # 0, 0.5, 1, 1.5, ... 5.0
    "\nUnique ratings: ", ratings['rating'].sort_values().unique(),
     )

Raw data size:  (100835, 4) 
Number of Unique users:  610 
Number of Unique movies:  9724 
Number of Unique ratings:  10 
Unique ratings:  [0.5 1.  1.5 2.  2.5 3.  3.5 4.  4.5 5. ]


### Most Active Users

In [8]:
active_user_ratings = pd.DataFrame(movie_ratings.groupby('user_id')['rating'].mean())
active_user_ratings['No_of_ratings'] = pd.DataFrame(movie_ratings.groupby('user_id')['rating'].count())
active_user_ratings.sort_values(by=['No_of_ratings'], ascending=False).head(20)

Unnamed: 0_level_0,rating,No_of_ratings
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
414,3.391957,2698
599,2.64205,2478
474,3.398956,2108
448,2.847371,1864
274,3.235884,1346
610,3.688556,1302
68,3.23373,1260
380,3.673235,1218
606,3.657399,1115
288,3.145972,1055


## Most Rated Movies

In [9]:
mean_ratings = pd.DataFrame(movie_ratings.groupby('title')['rating'].mean())
mean_ratings['No_of_ratings'] = pd.DataFrame(movie_ratings.groupby('title')['rating'].count())
movies_with_mean_ratings = pd.merge(movies[['item_id','title']], mean_ratings, on='title').set_index('item_id')
movies_with_mean_ratings.columns = ['title', 'mean_rating', 'No_of_ratings']
movies_with_mean_ratings.sort_values(by=['No_of_ratings'], ascending=False).head(20)

Unnamed: 0_level_0,title,mean_rating,No_of_ratings
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
356,Forrest Gump,4.164134,329
318,"Shawshank Redemption, The",4.429022,317
296,Pulp Fiction,4.197068,307
593,"Silence of the Lambs, The",4.16129,279
2571,"Matrix, The",4.192446,278
260,Star Wars: Episode IV - A New Hope,4.231076,251
480,Jurassic Park,3.75,238
110,Braveheart,4.031646,237
589,Terminator 2: Judgment Day,3.970982,224
527,Schindler's List,4.225,220


## Helper Functions

In [10]:
def get_user_rating(user_id):
    return movie_ratings.loc[movie_ratings['user_id'] == user_id]

def get_user_rating_for_movie(user_id, movie_id):
    return movie_ratings.loc[ (movie_ratings['user_id'] == user_id) & (movie_ratings['item_id'] == movie_id) ].values[0,2]

def get_rating_timestamp(user_id, movie_id):
    return movie_ratings.loc[ (movie_ratings['user_id'] == user_id) & (movie_ratings['item_id'] == movie_id) ].values[0,3]

def get_movies_watched(movie_ratings_data, user_id, year = 2015, month = 7, day = 15):
    return movie_ratings.loc[((movie_ratings_data['user_id'] == user_id)) & (movie_ratings_data.timestamp < datetime(year, month, day))][['item_id','title','rating']].set_index('item_id')

def get_user_avg_rating(movie_ratings_data, user_id):
    return movie_ratings.loc[((movie_ratings_data['user_id'] == user_id))].rating.mean()

def get_user_avg_rating_at(movie_ratings_data, user_id, year = 2015, month = 7, day = 15):
    return movie_ratings.loc[((movie_ratings_data['user_id'] == user_id)) & (movie_ratings_data.timestamp < datetime(year, month, day))].rating.mean()

In [11]:
get_user_rating(449).head()

Unnamed: 0,user_id,item_id,rating,timestamp,title,genres,year
515,449,50,4.5,2003-05-17 19:32:39,"Usual Suspects, The",Crime|Mystery|Thriller,1995.0
3515,449,527,4.0,2003-05-17 19:32:43,Schindler's List,Drama|War,1993.0
6177,449,1090,3.5,2003-05-17 19:37:12,Platoon,Drama|War,1986.0
8053,449,1222,3.0,2003-05-17 19:36:19,Full Metal Jacket,Drama|War,1987.0
10496,449,2012,2.5,2003-05-17 19:15:54,Back to the Future Part III,Adventure|Comedy|Sci-Fi|Western,1990.0


In [12]:
get_rating_timestamp(user_id=449,movie_id=32)

Timestamp('2003-05-17 19:35:19')

In [13]:
get_user_rating_for_movie(user_id=449,movie_id=32)

4.0

In [14]:
get_user_avg_rating_at(movie_ratings, 449, year=2015)

3.289473684210526

In [15]:
get_user_avg_rating(movie_ratings, 449)

3.289473684210526

## Get User Correlations

In [16]:
def get_user_correlations(movie_ratings_data, user_id, min_common_elements=20):

    """
        Get the correlations of the 'user_id' to all other users.
        Correlation is calculated if and only if both of them rated 'min_common_elements' number of movies in common.
    """
    
    # User Movie Matrix, Rows-> Movie_Titles :: Columns -> User_IDs :: Values -> User Ratings
    user_movie_matrix = movie_ratings_data.pivot_table(index='title', columns='user_id', values='rating') 
    
    # Create a matrix of correlations :: Each row is a user, the columns are the all other users, 
    #                                 :: Values are the pearson correlations in between users
    corrs = user_movie_matrix.corr(method='pearson', min_periods=min_common_elements)
    
    # Get the correlations of 'user_id'
    try:
        user_corrs = corrs[user_id]
    except(KeyError):
        # No Correlations found
        return None

    # Drop any null, if found
    user_corrs.dropna(inplace=True)
    
    # Create A dataframe from not-null correlations of the 'user_id'
    users_alike = pd.DataFrame(user_corrs)
    
    # Rename the only column to 'correlation'
    users_alike.columns= ['correlation']
    
    # Sort the user correlations in descending order 
    #     so that first one is the most similar, last one least similar
    users_alike.sort_values(by='correlation', ascending=False, inplace=True)
    
    # Eliminate Correlation to itself by deleting first row, 
    #     since biggest corr is with itself it is in first row
    users_alike = users_alike.iloc[1:] 
    
    return users_alike

# Predict Movie Rating

In [17]:
def predict_movie(movie_ratings_data, user_id, movie_id, 
                     min_common_elements = 20,
                     default_value=-2):
    
    # If a movie with movie_id not exists, return -1
    if movie_ratings_data.loc[movie_ratings_data['item_id'] == movie_id] is None:
        return -1
    
    # Get Nearest Neighbours of the 'user_id'
    k_nearest_neighbours = get_user_correlations(movie_ratings_data, user_id, min_common_elements)

    # If no neighbour is found, return 'default_value'
    if k_nearest_neighbours is None:
        return default_value
    
    # Calculate Mean Centered Prediciton
    user_avg_rating = get_user_avg_rating(movie_ratings_data,user_id)
    weighted_sum = 0.0
    sum_of_weights = 0.0
    for neighbour_id, data in k_nearest_neighbours.iterrows():
        # Get each neighbour's correlation 'user_id' and her rating to 'movie_id' 
        neighbour_corr = data['correlation']
        neighbour_data = movie_ratings_data.loc[(movie_ratings_data['user_id'] == neighbour_id) & (movie_ratings['item_id'] == movie_id)]
        if neighbour_data.empty:
            continue
        neighbour_avg_rating = get_user_avg_rating(movie_ratings_data,neighbour_id)
        neighbour_rating = float(neighbour_data.rating)
        neighbour_mean_centered_rating =  neighbour_rating - neighbour_avg_rating
        # Calculate Weighted sum and sum of weights
        weighted_sum += neighbour_mean_centered_rating * neighbour_corr
        sum_of_weights += neighbour_corr
    
    # Predict
    if sum_of_weights != 0.0:
        prediction_rating = user_avg_rating + (weighted_sum / sum_of_weights)
    else:
        prediction_rating = default_value
    return prediction_rating

In [18]:
predict_movie(movie_ratings,user_id=449,movie_id=3)

3.0755261802637825

## Predict Movie In Different Temporal Bins

In [19]:
class PredictionTimeConstraint(Enum):
    AT = auto()
    IN = auto()
    NO = auto()

In [20]:
def get_user_correlations_to_neighbours(movie_ratings_data, user_id, min_common_elements=20,
                                        time_constraint=PredictionTimeConstraint.NO,
                                        start_dt=None,
                                        end_dt=None,
                                        dt=None):    
    """
        Get the correlations of the 'user_id' to all other users.
        Correlation is calculated if and only if both of them rated 'min_common_elements' number of movies in common.
    """
    
    
    # User Movie Matrix, Rows-> Movie_Titles :: Columns -> User_IDs :: Values -> User Ratings
    if time_constraint == None or time_constraint == PredictionTimeConstraint.NO:
        user_movie_matrix = movie_ratings_data.pivot_table(index='title', columns='user_id', values='rating')
    elif time_constraint == PredictionTimeConstraint.AT and dt != None:
        user_movie_matrix = movie_ratings_data[(movie_ratings_data.timestamp < dt)].pivot_table(index='title', columns='user_id', values='rating') 
    elif time_constraint == PredictionTimeConstraint.IN and start_dt != None and end_dt != None:
         user_movie_matrix = movie_ratings_data[(movie_ratings_data.timestamp >= start_dt) & (movie_ratings_data.timestamp < end_dt)].pivot_table(index='title', columns='user_id', values='rating') 
    else:
        return -1                                      # Unknown time constraint, exit the function
    
    # Create a matrix of correlations :: Each row is a user, the columns are the all other users, 
    #                                 :: Values are the pearson correlations in between users
    corrs = user_movie_matrix.corr(method='pearson', min_periods=min_common_elements)
    
    # Get the correlations of 'user_id'
    try:
        user_corrs = corrs[user_id]
    except(KeyError):
        # No Correlations found
        return None

    # Drop any null, if found
    user_corrs.dropna(inplace=True)
    
    # Create A dataframe from not-null correlations of the 'user_id'
    users_alike = pd.DataFrame(user_corrs)
    
    # Rename the only column to 'correlation'
    users_alike.columns= ['correlation']
    
    # Sort the user correlations in descending order 
    #     so that first one is the most similar, last one least similar
    users_alike.sort_values(by='correlation', ascending=False, inplace=True)
    
    # Eliminate Correlation to itself by deleting first row, 
    #     since biggest corr is with itself it is in first row
    users_alike = users_alike.iloc[1:] 
    
    return users_alike

In [21]:
def predict_movie_rating(movie_ratings_data, user_id, movie_id, 
                         min_common_elements = 20,
                         time_constraint=PredictionTimeConstraint.NO,
                         start_dt=None,end_dt=None, dt=None,
                         default_value=-2):
    
    # If a movie with movie_id not exists, return -1
    if movie_ratings_data.loc[movie_ratings_data['item_id'] == movie_id] is None:
        return -1

    # Get Nearest Neighbours of the 'user_id' according to time_constraint
    if time_constraint == None or time_constraint == PredictionTimeConstraint.NO:
        k_nearest_neighbours = get_user_correlations_to_neighbours(movie_ratings_data, user_id, min_common_elements)
    elif time_constraint == PredictionTimeConstraint.AT:
        k_nearest_neighbours = get_user_correlations_to_neighbours(movie_ratings_data, user_id, min_common_elements, 
                                                           time_constraint=PredictionTimeConstraint.AT,
                                                           dt=dt)
    elif time_constraint == PredictionTimeConstraint.IN:
        k_nearest_neighbours = get_user_correlations_to_neighbours(movie_ratings_data, user_id, min_common_elements, 
                                                                   time_constraint=PredictionTimeConstraint.IN,
                                                                   start_dt=start_dt, end_dt=end_dt)
    else:
        return -1                                      # Unknown time constraint, exit the function
    
    # If no neighbour is found, return 'default_value'
    if k_nearest_neighbours is None:
        return default_value
    
    # Calculate Mean Centered Prediciton
    user_avg_rating = get_user_avg_rating(movie_ratings_data,user_id)
    weighted_sum = 0.0
    sum_of_weights = 0.0
    for neighbour_id, data in k_nearest_neighbours.iterrows():
        # Get each neighbour's correlation 'user_id' and her rating to 'movie_id' 
        neighbour_corr = data['correlation']
        neighbour_data = movie_ratings_data.loc[(movie_ratings_data['user_id'] == neighbour_id) & (movie_ratings['item_id'] == movie_id)]
        if neighbour_data.empty:
            continue
        neighbour_avg_rating = get_user_avg_rating(movie_ratings_data,neighbour_id)
        neighbour_rating = float(neighbour_data.rating)
        neighbour_mean_centered_rating =  neighbour_rating - neighbour_avg_rating
        # Calculate Weighted sum and sum of weights
        weighted_sum += neighbour_mean_centered_rating * neighbour_corr
        sum_of_weights += neighbour_corr
    
    # Predict
    if sum_of_weights != 0.0:
        prediction_rating = user_avg_rating + (weighted_sum / sum_of_weights)
    else:
        prediction_rating = default_value
    return prediction_rating
    

In [22]:
predict_movie_rating(movie_ratings,449,3)

3.0755261802637825

In [23]:
predict_movie_rating(movie_ratings,449,3, time_constraint=PredictionTimeConstraint.AT, dt=datetime(2005,7,15))

3.3322558664251902

In [24]:
predict_movie_rating(movie_ratings,449,3, time_constraint=PredictionTimeConstraint.IN, 
                                          start_dt=datetime(2000,7,15), end_dt=datetime(2018,7,15))

2.9626269281111557

## Prediction of All Movie Ratings Given by a User

In [25]:
def get_movies_watched(movie_ratings_data,user_id, 
                       time_constraint=PredictionTimeConstraint.NO, 
                       dt=None, start_dt=None, end_dt=None):
    if time_constraint == None or time_constraint == PredictionTimeConstraint.NO:
        return movie_ratings_data.loc[(movie_ratings_data['user_id'] == user_id)][['item_id','rating']]
    elif time_constraint == PredictionTimeConstraint.AT and dt != None:
        return movie_ratings_data.loc[(movie_ratings_data['user_id'] == user_id) & (movie_ratings_data.timestamp < dt)][['item_id','rating']]
    elif time_constraint == PredictionTimeConstraint.IN and start_dt != None and end_dt != None:
        return movie_ratings_data.loc[(movie_ratings_data['user_id'] == user_id) & (movie_ratings_data.timestamp >= start_dt) & (movie_ratings_data.timestamp < end_dt)][['item_id','rating']]
    else:
        return -1                                      # Unknown time constraint, exit the function

In [26]:
def predict_movies_watched(movie_ratings_data, user_id,
                           k = 10,
                           min_common_elements = 5,
                           time_constraint=PredictionTimeConstraint.NO,
                           dt = None,
                           start_dt = None,
                           end_dt = None,
                           default_value=0):
    
    movies_watched = get_movies_watched(movie_ratings_data,user_id)
    
    if movies_watched.empty:
        print("No movies watched in this interval.")
        return
    
    number_of_predictions = 0.0
    sum_of_square_differences = 0.0
    for _,row in movies_watched.iterrows():
        prediction = predict_movie_rating(movie_ratings_data, user_id=user_id, movie_id=row['item_id'],
                                          min_common_elements=min_common_elements, 
                                          time_constraint=time_constraint,
                                          default_value=default_value,
                                          start_dt=start_dt, end_dt=end_dt,dt=dt)
        if number_of_predictions == k:
            break
        sum_of_square_differences += (row['rating'] - prediction) ** 2
        number_of_predictions += 1
        print(f"Rating={row['rating']}, Predicted={prediction}, Movie_id={row['item_id']}")
    rmse = sum_of_square_differences/number_of_predictions
    print(f"RMSE = {rmse}")

In [27]:
predict_movies_watched(movie_ratings, user_id=182, k=10, 
                       time_constraint=PredictionTimeConstraint.IN,
                       min_common_elements=3,
                       start_dt=datetime(2000,5,12), end_dt=datetime(2005,1,15))

Rating=4.5, Predicted=3.833787439912831, Movie_id=6.0
Rating=4.0, Predicted=3.9534812705512707, Movie_id=47.0
Rating=4.5, Predicted=4.369170393840825, Movie_id=50.0
Rating=4.5, Predicted=2.763200595625623, Movie_id=70.0
Rating=3.5, Predicted=4.060523352060786, Movie_id=110.0
Rating=3.5, Predicted=3.717918104320959, Movie_id=163.0
Rating=4.5, Predicted=3.8472527641287257, Movie_id=223.0
Rating=0.5, Predicted=3.397562751360292, Movie_id=231.0
Rating=4.5, Predicted=4.4514754802817595, Movie_id=235.0
Rating=3.5, Predicted=4.260753981891985, Movie_id=260.0
RMSE = 1.3244316554673292


In [28]:
predict_movies_watched(movie_ratings, user_id=182, k=10, 
                       time_constraint=PredictionTimeConstraint.AT,
                       min_common_elements=3,
                       dt=datetime(2005,5,12))

Rating=4.5, Predicted=4.02600984570496, Movie_id=6.0
Rating=4.0, Predicted=4.1118672971640615, Movie_id=47.0
Rating=4.5, Predicted=4.463908498164707, Movie_id=50.0
Rating=4.5, Predicted=2.8875642500445178, Movie_id=70.0
Rating=3.5, Predicted=4.080838094393174, Movie_id=110.0
Rating=3.5, Predicted=3.3787742223903323, Movie_id=163.0
Rating=4.5, Predicted=4.165226263552398, Movie_id=223.0
Rating=0.5, Predicted=2.4558861476884735, Movie_id=231.0
Rating=4.5, Predicted=3.9622568239828126, Movie_id=235.0
Rating=3.5, Predicted=4.429403878761258, Movie_id=260.0
RMSE = 0.8281024554382285


In [29]:
predict_movies_watched(movie_ratings, user_id=182, min_common_elements=3, k=10)

Rating=4.5, Predicted=4.030467363197655, Movie_id=6.0
Rating=4.0, Predicted=4.020870132285964, Movie_id=47.0
Rating=4.5, Predicted=4.2897370059382745, Movie_id=50.0
Rating=4.5, Predicted=3.5163370042288493, Movie_id=70.0
Rating=3.5, Predicted=3.9081027703966007, Movie_id=110.0
Rating=3.5, Predicted=3.4399423602692023, Movie_id=163.0
Rating=4.5, Predicted=4.013594502419993, Movie_id=223.0
Rating=0.5, Predicted=2.7063952005892773, Movie_id=231.0
Rating=4.5, Predicted=3.809624569664496, Movie_id=235.0
Rating=3.5, Predicted=4.043373905015375, Movie_id=260.0
RMSE = 0.727949819138293
