# Basic Recommendation on Movielens Dataset

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil
from enum import auto, Enum
from datetime import datetime
from collections import defaultdict
from timeit import default_timer
from collections import defaultdict
import random
%matplotlib inline
sns.set_style('white')

## Load Dataset

In [2]:
class PredictionTimeConstraint(Enum):
    AT = auto()
    IN = auto()
    NO = auto()
    
class RecSys:
    def __init__(self, movie_ratings_data, min_common_elements):
        self.movie_ratings = movie_ratings_data
        self.min_common_elements = min_common_elements
        # User Movie Matrix, Rows-> Movie_Titles :: Columns -> User_IDs :: Values -> User Ratings
        self.user_movie_matrix = self.movie_ratings.pivot_table(index='title', columns='user_id', values='rating')
        self.user_corr_matrix = self.user_movie_matrix.corr(method='pearson', min_periods=min_common_elements)
        
        # Temporal User Correlations Matrix CACHE details
        # Since analysis requires us do more than one predictions with same dates save the user_corr_matrix for performance
        self.temporal_user_corr_matrix = None
        self.temporal_time_constraint = None
        self.temporal_dt = None
        self.temporal_start_dt = None
        self.temporal_end_dt = None
    
    def get_k_neighbours(self, user_id, k = 20, 
                         time_constraint=None,
                         start_dt=None,
                         end_dt=None,
                         dt=None):
        
        # Apply Time Constraint and get user_correlations to each other as matrix
        if time_constraint is None:
            user_corr_matrix = self.user_corr_matrix
        else:
            user_corr_matrix = self.create_temporal_corr_matrix(time_constraint=time_constraint,
                                                           dt=dt, start_dt=start_dt, end_dt=end_dt)
        
        # Exit if matrix is None
        if user_corr_matrix is None:
            return None
        
        # Get the 'user_id's sorrelations
        user_corrs = user_corr_matrix.get(user_id)
        if user_corrs is None:
            return None

        # Drop any null, if found
        user_corrs.dropna(inplace=True)

        # Create A dataframe from not-null correlations of the 'user_id'
        users_alike = pd.DataFrame(user_corrs)

        # Rename the only column to 'correlation'
        users_alike.columns= ['correlation']

        # Sort the user correlations in descending order 
        #     so that first one is the most similar, last one least similar
        users_alike.sort_values(by='correlation', ascending=False, inplace=True)

        # Eliminate Correlation to itself by deleting first row, 
        #     since biggest corr is with itself it is in first row
        return users_alike.iloc[1:k]
    
    def predict_movie(self, user_id, movie_id, k = 10, 
                      time_constraint=None,
                      start_dt=None,
                      end_dt=None,
                      dt=None):
        
        # If a movie with movie_id not exists, predict 0
        if self.movie_ratings.loc[self.movie_ratings['item_id'] == movie_id] is None:
            print("Movie Not Found!!")
            return 0

        # Get Nearest Neighbours of the 'user_id'
        k_nearest_neighbours = self.get_k_neighbours(user_id, k=k, 
                                                     time_constraint=time_constraint,
                                                     start_dt=start_dt, end_dt=end_dt,
                                                     dt=dt)

        # Calculate Mean Centered Prediciton
        user_avg_rating = self.get_user_avg_rating(user_id)
        
        #########################################################################
        ##### IN CASE WE COULDNT FIND ENOUGH NEIGHBOURS, PREDICT AS HIS AVERAGE
        if k_nearest_neighbours is None or k_nearest_neighbours.empty:
            return 0 #user_avg_rating
        #########################################################################
        weighted_sum = 0.0
        sum_of_weights = 0.0
        for neighbour_id, data in k_nearest_neighbours.iterrows():
            # Get each neighbour's correlation 'user_id' and her rating to 'movie_id' 
            neighbour_corr = data['correlation']
            neighbour_data = self.movie_ratings.loc[(self.movie_ratings['user_id'] == neighbour_id) & (self.movie_ratings['item_id'] == movie_id)]
            # If the neighbour doesnt give rating to the movie_id, pass this around of the loop
            if neighbour_data.empty:
                continue
            neighbour_avg_rating = self.get_user_avg_rating(neighbour_id)
            neighbour_rating = float(neighbour_data.rating)
            neighbour_mean_centered_rating =  neighbour_rating - neighbour_avg_rating
            # Calculate Weighted sum and sum of weights
            weighted_sum += neighbour_mean_centered_rating * neighbour_corr
            sum_of_weights += neighbour_corr

        # Predict
        if sum_of_weights != 0:
            prediction_rating = user_avg_rating + (weighted_sum / sum_of_weights)
        else:
            #########################################################################
            ##### IN CASE WE COULDNT FIND ENOUGH NEIGHBOURS THAT GIVEN RATING TO THE movie_id, PREDICT AS HIS AVERAGE
            prediction_rating = 0 #user_avg_rating
            #########################################################################
            
        return prediction_rating
    
    def is_temporal_cache_valid(self,
                                time_constraint=None,
                                start_dt=None,
                                end_dt=None,
                                dt=None):
        
        if self.temporal_time_constraint != time_constraint or time_constraint is None:
            return False
        
        # If 'AT' constraint found and same as last calculated matrix constraints then cache valid
        if time_constraint == PredictionTimeConstraint.AT and self.temporal_dt == dt:
            return True
        
        # If 'IN' constraint found and same as last calculated matrix constraints then cache valid
        if time_constraint == PredictionTimeConstraint.IN and self.temporal_start_dt == start_dt and self.temporal_end_dt == end_dt:
            return True

        # Otherwise cache invalid
        return False
    
    def create_temporal_corr_matrix(self, time_constraint=PredictionTimeConstraint.NO,
                                          start_dt=None,
                                          end_dt=None,
                                          dt=None):
        
        # Check cache, if valid return the cache
        temporal_cache_status = self.is_temporal_cache_valid(time_constraint=time_constraint,start_dt=start_dt,end_dt=end_dt,dt=dt)
        if temporal_cache_status == True:
            return self.temporal_user_corr_matrix
        
        # No valid cache, recreate the correlations matrix 
        if time_constraint == PredictionTimeConstraint.AT and dt != None:
            user_movie_matrix = self.movie_ratings[(self.movie_ratings.timestamp < dt)].pivot_table(index='title', columns='user_id', values='rating') 
        elif time_constraint == PredictionTimeConstraint.IN and start_dt != None and end_dt != None:
            user_movie_matrix =  self.movie_ratings[(self.movie_ratings.timestamp >= start_dt) & (self.movie_ratings.timestamp < end_dt)].pivot_table(index='title', columns='user_id', values='rating') 
        else:
            return None
        
        # Create CACHE
        self.temporal_time_constraint = time_constraint
        self.temporal_start_dt = start_dt
        self.temporal_end_dt = end_dt
        self.temporal_dt = dt
        self.temporal_user_corr_matrix = user_movie_matrix.corr(method='pearson', min_periods=self.min_common_elements) 
        
        return self.temporal_user_corr_matrix
        
    def get_user_rating(self, user_id):
        return self.movie_ratings.loc[self.movie_ratings['user_id'] == user_id]

    def get_user_rating_for_movie(self, user_id, movie_id):
        try:
            rating = self.movie_ratings.loc[ (self.movie_ratings['user_id'] == user_id) & (self.movie_ratings['item_id'] == movie_id) ].values[0,2]
        except IndexError:
            rating = 0
        return rating 

    def get_rating_timestamp(self, user_id, movie_id):
        return self.movie_ratings.loc[ (self.movie_ratings['user_id'] == user_id) & (self.movie_ratings['item_id'] == movie_id) ].values[0,3]

    def get_movies_watched(self, user_id, year = 2015, month = 7, day = 15):
        return self.movie_ratings.loc[((self.movie_ratings['user_id'] == user_id)) & (self.movie_ratings.timestamp < datetime(year, month, day))][['item_id','title','rating']].set_index('item_id')

    def get_user_avg_rating(self, user_id):
        return self.movie_ratings.loc[((self.movie_ratings['user_id'] == user_id))].rating.mean()

    def get_user_avg_rating_at(self, user_id, year = 2015, month = 7, day = 15):
        return self.movie_ratings.loc[((self.movie_ratings['user_id'] == user_id)) & (self.movie_ratings.timestamp < datetime(year, month, day))].rating.mean()
    
    
    def get_movies_watched(self, user_id,
                           time_constraint=None, 
                           dt=None, start_dt=None, end_dt=None):
        if time_constraint == None or time_constraint == PredictionTimeConstraint.NO:
            return self.movie_ratings.loc[(self.movie_ratings['user_id'] == user_id)][['item_id','rating']]
        elif time_constraint == PredictionTimeConstraint.AT and dt != None:
            return self.movie_ratings.loc[(self.movie_ratings['user_id'] == user_id) & (self.movie_ratings.timestamp < dt)][['item_id','rating']]
        elif time_constraint == PredictionTimeConstraint.IN and start_dt != None and end_dt != None:
            return self.movie_ratings.loc[(self.movie_ratings['user_id'] == user_id) & (self.movie_ratings.timestamp >= start_dt) & (self.movie_ratings.timestamp < end_dt)][['item_id','rating']]
        else:
            return None                                      # Unknown time constraint, exit the function
    
    def get_random_movie_watched(self, user_id):
        return self.movie_ratings.loc[(self.movie_ratings['user_id'] == user_id)][['item_id']].iloc[0][0]
    
    def predict_movies_watched(self, user_id,
                               k = 10,
                               time_constraint=None,
                               dt = None,
                               start_dt = None,
                               end_dt = None):
        
        # Get all movies watched by a user
        movies_watched = self.get_movies_watched(user_id)

        if movies_watched is None or movies_watched.empty:
            return None

        predictions = list()
        
        number_of_predictions = 0
        for _,row in movies_watched.iterrows():
            prediction = self.predict_movie(user_id=user_id, movie_id=row['item_id'], 
                                              time_constraint=time_constraint,
                                              start_dt=start_dt, end_dt=end_dt,dt=dt)
            if number_of_predictions == k:
                break
            predictions.append([prediction,row['rating'],row['item_id']])
            number_of_predictions += 1

        predictions_df = pd.DataFrame(predictions, columns = ['prediction','rating', 'movie_id'])
        predictions_df.movie_id = predictions_df.movie_id.astype(int)
        return predictions_df.set_index('movie_id')
    
    ## Load Datasets
    @staticmethod
    def load_movielens_small():
        col_names = ['user_id', 'item_id', 'rating', 'timestamp']
        ratings = pd.read_csv(r'C:\Users\Yukawa\datasets\ml-latest-small\ratings.csv', sep=',', header=1, names=col_names)

        col_names = ['item_id', 'title', 'genres']
        movies = pd.read_csv(r'C:\Users\Yukawa\datasets\ml-latest-small\movies.csv', sep=',', header=0, names=col_names)
        
        movies['year'] = movies.title.str.extract("\((\d{4})\)", expand=True)
        movies.year = pd.to_datetime(movies.year, format='%Y')
        movies.year = movies.year.dt.year # As there are some NaN years, resulting type will be float (decimals)
        movies.title = movies.title.str[:-7]

        ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s', origin='unix')

        movie_ratings = pd.merge(ratings, movies, on='item_id')

        return movie_ratings
    
    ## Accuracy Metrics
    
    @staticmethod
    def rmse(predictions_df):
        number_of_predictions = predictions_df.count()
        sum_of_square_differences = 0.0
        for _,row in predictions_df.iterrows():
            prediction_rating = row['prediction'] 
            if prediction_rating != 0:
                sum_of_square_differences += (row['rating'] - prediction_rating) ** 2
        result = sum_of_square_differences/number_of_predictions
        return result[0]

In [3]:
movie_ratings = RecSys.load_movielens_small()

In [4]:
movie_ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title,genres,year
0,1,3,4.0,2000-07-30 18:20:47,Grumpier Old Men,Comedy|Romance,1995.0
1,6,3,5.0,1996-10-17 12:11:36,Grumpier Old Men,Comedy|Romance,1995.0
2,19,3,3.0,2000-08-08 04:07:16,Grumpier Old Men,Comedy|Romance,1995.0
3,32,3,3.0,1997-02-23 22:16:12,Grumpier Old Men,Comedy|Romance,1995.0
4,42,3,4.0,2001-07-27 08:04:05,Grumpier Old Men,Comedy|Romance,1995.0


## Most Active Users

In [5]:
active_user_ratings = pd.DataFrame(movie_ratings.groupby('user_id')['rating'].mean())
active_user_ratings['No_of_ratings'] = pd.DataFrame(movie_ratings.groupby('user_id')['rating'].count())
active_user_ratings.sort_values(by=['No_of_ratings'], ascending=False).head(10)

Unnamed: 0_level_0,rating,No_of_ratings
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
414,3.391957,2698
599,2.64205,2478
474,3.398956,2108
448,2.847371,1864
274,3.235884,1346
610,3.688556,1302
68,3.23373,1260
380,3.673235,1218
606,3.657399,1115
288,3.145972,1055


In [6]:
%reset_selective -f (^active_user_ratings$)

## Evaluator

In [7]:
evaluator = RecSys(movie_ratings_data=movie_ratings, min_common_elements=5)

In [8]:
evaluator.predict_movies_watched(user_id=414, k=10)

Unnamed: 0_level_0,prediction,rating
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,0.0,4.0
6,0.0,3.0
47,4.166467,4.0
50,4.20621,5.0
101,0.0,4.0
110,3.687791,5.0
151,0.0,5.0
157,0.0,4.0
163,3.688742,4.0
216,0.0,3.0


In [9]:
RecSys.rmse(evaluator.predict_movies_watched(user_id=414, k=10))

0.2476589017082173

In [10]:
evaluator.predict_movie(449,3, time_constraint=PredictionTimeConstraint.AT, dt=datetime(2015,7,15))

2.3876879699248117

In [11]:
evaluator.predict_movie(413,3, time_constraint=PredictionTimeConstraint.AT, dt=datetime(2016,7,15))

0

In [12]:
evaluator.predict_movie(449,3, time_constraint=PredictionTimeConstraint.IN, start_dt=datetime(2000,7,15), end_dt=datetime(2018,7,15))

0

In [13]:
evaluator.predict_movie(413,3, time_constraint=PredictionTimeConstraint.IN, start_dt=datetime(2000,7,15), end_dt=datetime(2018,7,15))

0

In [14]:
evaluator.predict_movies_watched(user_id=182, k=10, 
                       time_constraint=PredictionTimeConstraint.IN,
                       start_dt=datetime(2000,5,12), end_dt=datetime(2005,1,15))

Unnamed: 0_level_0,prediction,rating
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6,0.0,4.5
47,3.328332,4.0
50,4.333318,4.5
70,0.0,4.5
110,3.015295,3.5
163,3.794592,3.5
223,2.249234,4.5
231,0.0,0.5
235,4.669989,4.5
260,3.920001,3.5


In [15]:
RecSys.rmse(evaluator.predict_movies_watched(user_id=182, k=10, 
                       time_constraint=PredictionTimeConstraint.IN,
                       start_dt=datetime(2000,5,12), end_dt=datetime(2005,1,15)))

0.6071889106519102

In [16]:
RecSys.rmse(evaluator.predict_movies_watched(user_id=182, k=100,                       
                       time_constraint=PredictionTimeConstraint.IN,
                       start_dt=datetime(2000,5,12), end_dt=datetime(2005,1,15)))

0.5768815748798852

In [17]:
RecSys.rmse(evaluator.predict_movies_watched(user_id=182, k=1000,                       
                       time_constraint=PredictionTimeConstraint.IN,
                       start_dt=datetime(2000,5,12), end_dt=datetime(2005,1,15)))

0.3320122419372031

In [None]:
def find_best_at_constraint(number_of_users):
    results = list()    
    for i in range(number_of_users):
        user_id = random.randint(1,610)
        
        st = default_timer()
        rmse_no_constraint = RecSys.rmse(evaluator.predict_movies_watched(user_id=user_id, k=10))
        runtime_no_constraint = default_timer() - st
        
        st = default_timer()
        rmse_with_constraint = RecSys.rmse(evaluator.predict_movies_watched(user_id=user_id, k=5,
                                                                       time_constraint=PredictionTimeConstraint.AT,
                                                                       dt=datetime(2010,7,15)))
        runtime_with_constraint = default_timer() - st
        
        results.append([user_id,rmse_no_constraint,runtime_no_constraint,rmse_with_constraint,runtime_with_constraint])
    results = pd.DataFrame(results)
    results.columns= ['user_id','rmse','runtime1','temporal_rmse','runtime2']
    results.set_index('user_id', inplace=True)
    return results

In [18]:
def compare_no_time_constraint_and_at_constraint(number_of_users):
    """
    For Random 'number_of_users' Users
    Calculate rmse without any time constraint
    Calculate rmse with time constraint
    return as dataframe and also include runtime's of the calculations
    """
    results = list()    
    for i in range(number_of_users):
        user_id = random.randint(1,610)
        
        st = default_timer()
        rmse_no_constraint = RecSys.rmse(evaluator.predict_movies_watched(user_id=user_id, k=10))
        runtime_no_constraint = default_timer() - st
        
        st = default_timer()
        rmse_with_constraint = RecSys.rmse(evaluator.predict_movies_watched(user_id=user_id, k=5,
                                                                       time_constraint=PredictionTimeConstraint.AT,
                                                                       dt=datetime(2004,7,15)))
        runtime_with_constraint = default_timer() - st
        
        results.append([user_id,rmse_no_constraint,runtime_no_constraint,rmse_with_constraint,runtime_with_constraint])
    results = pd.DataFrame(results)
    results.columns= ['user_id','rmse','runtime1','temporal_rmse','runtime2']
    results.set_index('user_id', inplace=True)
    return results

In [None]:
#compare_no_time_constraint_and_at_constraint(50)

In [None]:
def find_best_year_constraint(n, rmse_diff_limit):
    possible_years = defaultdict(int)
    min_year = movie_ratings['timestamp'].min().year + 5 ## No need to scan first 5 years, so less ratings
    max_year = datetime.now().year - 5                   ## No need to scan last 5 years, almost all, no meaning of filter
    
    # For random 'n' users
    for i in range(n):
        # Choose user
        user_id = random.randint(1,610)
        # Find Best Year
        for year in range(min_year, max_year):
            rmse_no_constraint = RecSys.rmse(evaluator.predict_movies_watched(user_id=user_id, k=10))
            rmse_with_constraint = RecSys.rmse(evaluator.predict_movies_watched(user_id=user_id, k=5, dt=datetime(year,1,15),
                                                                   time_constraint=PredictionTimeConstraint.AT))
            if abs(rmse_no_constraint - rmse_with_constraint) <= rmse_diff_limit:
                possible_years[year] += 1
    return possible_years

In [None]:
best_years = find_best_year_constraint(n = 5, rmse_diff_limit = 0.1)
print(best_years)

In [20]:
def get_random_n_users(n):
    users = list()
    for i in range(n):
        users.append(random.randint(1,610))
    return users

In [21]:
get_random_n_users(10)

[241, 404, 333, 561, 489, 532, 164, 346, 510, 213]

In [22]:
def get_random_movie_per_user(user_list):
    user_movie_list = list()
    for user_id in user_list:
        movie_id = evaluator.get_random_movie_watched(user_id)
        user_movie_list.append((user_id,movie_id))
    return user_movie_list

In [23]:
get_random_movie_per_user(get_random_n_users(10))

[(173, 110),
 (191, 6),
 (255, 223),
 (327, 50),
 (420, 296),
 (22, 216),
 (126, 47),
 (552, 3),
 (15, 47),
 (368, 3)]

In [24]:
x = get_random_movie_per_user(get_random_n_users(10))
for user,movie in x:
    print(int(user),int(movie))

488 457
237 50
554 527
324 608
41 47
282 6
364 260
71 260
477 3
581 356


In [32]:
def RMSE(predictions):
    number_of_predictions = len(predictions)
    sum_of_square_differences = 0.0
    for prediction,actual in predictions:
        sum_of_square_differences += (actual - prediction) ** 2
    result = sum_of_square_differences/number_of_predictions
    return result

In [52]:
def compare_decay_periods(n,k=10):
    min_year = movie_ratings['timestamp'].min().year
    max_year = datetime.now().year
    
    user_list = get_random_n_users(n)
    user_movie_pair_list = get_random_movie_per_user(user_list)
    
    # Calculate RMSE for each period in between [2,10)
    for period in range(2,10):
        curr_year = min_year
        predictions = list()
        while (curr_year+period) < max_year:
            for user_id, movie_id in user_movie_pair_list:
                prediction = evaluator.predict_movie(user_id=user_id, movie_id=movie_id, k=k,
                                                     start_dt=datetime(curr_year,1,15),
                                                     end_dt = datetime(curr_year+period,1,15),
                                                     time_constraint=PredictionTimeConstraint.IN)
                if prediction != 0:
                    rating = evaluator.get_user_rating_for_movie(user_id=user_id,movie_id=movie_id)
                    predictions.append((prediction, rating))
                    #print(f"Period={period} interval=({curr_year},{curr_year+period}) prediction={prediction} actual={rating} user_id={user_id} movie_id={movie_id}")
            curr_year += period
        period_rmse = RMSE(predictions)
        print(f"\nPeriod={period} RMSE = {period_rmse}\n")

In [53]:
compare_decay_periods(n=100, k=1000)


Period=2 RMSE = 6.879660690570211


Period=3 RMSE = 65.29964250023697


Period=4 RMSE = 188.32192431293794


Period=5 RMSE = 3.109579094678005


Period=6 RMSE = 0.4910420277683879


Period=7 RMSE = 2.9919218808263044


Period=8 RMSE = 0.5047111350016473


Period=9 RMSE = 0.4425586406090878



In [54]:
compare_decay_periods(n=100, k=100)


Period=2 RMSE = 17.643301496068712


Period=3 RMSE = 25.668154452989597


Period=4 RMSE = 21.403135237876427


Period=5 RMSE = 1.2269694263224946


Period=6 RMSE = 23.593933002945732


Period=7 RMSE = 0.7342296379937046


Period=8 RMSE = 0.6814236667748258


Period=9 RMSE = 0.7219425393306387



In [55]:
compare_decay_periods(n=600, k=1000)


Period=2 RMSE = 4494.5415073248305


Period=3 RMSE = 13.603330595761703


Period=4 RMSE = 20003.267133295038


Period=5 RMSE = 2.511113141159035


Period=6 RMSE = 16.863983287582364


Period=7 RMSE = 2.757625671600112


Period=8 RMSE = 3.431179013258927


Period=9 RMSE = 1.5747452990033544

