# Basic Recommendation on Movielens Dataset

## Import Libraries

In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil
from enum import auto, Enum
from datetime import datetime
from collections import defaultdict
from timeit import default_timer
from collections import defaultdict
import random
%matplotlib inline
sns.set_style('white')

## Load Dataset

In [80]:
class PredictionTimeConstraint(Enum):
    AT = auto()
    IN = auto()
    NO = auto()
    
class RecSys:
    def __init__(self, movie_ratings_data, min_common_elements):
        self.movie_ratings = movie_ratings_data
        self.min_common_elements = min_common_elements
        # User Movie Matrix, Rows-> Movie_Titles :: Columns -> User_IDs :: Values -> User Ratings
        self.user_movie_matrix = self.movie_ratings.pivot_table(index='title', columns='user_id', values='rating')
        self.user_corr_matrix = self.user_movie_matrix.corr(method='pearson', min_periods=min_common_elements)
        
        # Temporal User Correlations Matrix CACHE details
        # Since analysis requires us do more than one predictions with same dates save the user_corr_matrix for performance
        self.temporal_user_corr_matrix = None
        self.temporal_time_constraint = None
        self.temporal_dt = None
        self.temporal_start_dt = None
        self.temporal_end_dt = None
    
    def get_k_neighbours(self, user_id, k = 20, 
                         time_constraint=None,
                         start_dt=None,
                         end_dt=None,
                         dt=None):
        
        # Apply Time Constraint and get user_correlations to each other as matrix
        if time_constraint is None:
            user_corr_matrix = self.user_corr_matrix
        else:
            user_corr_matrix = self.create_temporal_corr_matrix(time_constraint=time_constraint,
                                                           dt=dt, start_dt=start_dt, end_dt=end_dt)
        
        # Exit if matrix is None
        if user_corr_matrix is None:
            return None
        
        # Get the 'user_id's sorrelations
        user_corrs = user_corr_matrix.get(user_id)
        if user_corrs is None:
            return None

        # Drop any null, if found
        user_corrs.dropna(inplace=True)

        # Create A dataframe from not-null correlations of the 'user_id'
        users_alike = pd.DataFrame(user_corrs)

        # Rename the only column to 'correlation'
        users_alike.columns= ['correlation']

        # Sort the user correlations in descending order 
        #     so that first one is the most similar, last one least similar
        users_alike.sort_values(by='correlation', ascending=False, inplace=True)

        # Eliminate Correlation to itself by deleting first row, 
        #     since biggest corr is with itself it is in first row
        return users_alike.iloc[1:k]
    
    def predict_movie(self, user_id, movie_id, k = 10, 
                      time_constraint=None,
                      start_dt=None,
                      end_dt=None,
                      dt=None):
        
        # If a movie with movie_id not exists, predict 0
        if self.movie_ratings.loc[self.movie_ratings['item_id'] == movie_id] is None:
            print("Movie Not Found!!")
            return 0

        # Get Nearest Neighbours of the 'user_id'
        k_nearest_neighbours = self.get_k_neighbours(user_id, k=k, 
                                                     time_constraint=time_constraint,
                                                     start_dt=start_dt, end_dt=end_dt,
                                                     dt=dt)

        # Calculate Mean Centered Prediciton
        user_avg_rating = self.get_user_avg_rating(user_id)
        
        #########################################################################
        ##### IN CASE WE COULDNT FIND ENOUGH NEIGHBOURS, PREDICT AS HIS AVERAGE
        if k_nearest_neighbours is None or k_nearest_neighbours.empty:
            return user_avg_rating
        #########################################################################
        weighted_sum = 0.0
        sum_of_weights = 0.0
        for neighbour_id, data in k_nearest_neighbours.iterrows():
            # Get each neighbour's correlation 'user_id' and her rating to 'movie_id' 
            neighbour_corr = data['correlation']
            neighbour_data = self.movie_ratings.loc[(self.movie_ratings['user_id'] == neighbour_id) & (self.movie_ratings['item_id'] == movie_id)]
            # If the neighbour doesnt give rating to the movie_id, pass this around of the loop
            if neighbour_data.empty:
                continue
            neighbour_avg_rating = self.get_user_avg_rating(neighbour_id)
            neighbour_rating = float(neighbour_data.rating)
            neighbour_mean_centered_rating =  neighbour_rating - neighbour_avg_rating
            # Calculate Weighted sum and sum of weights
            weighted_sum += neighbour_mean_centered_rating * neighbour_corr
            sum_of_weights += neighbour_corr

        # Predict
        if sum_of_weights != 0:
            prediction_rating = user_avg_rating + (weighted_sum / sum_of_weights)
        else:
            #########################################################################
            ##### IN CASE WE COULDNT FIND ENOUGH NEIGHBOURS THAT GIVEN RATING TO THE movie_id, PREDICT AS HIS AVERAGE
            prediction_rating = user_avg_rating
            #########################################################################
            
        return prediction_rating
    
    def is_temporal_cache_valid(self,
                                time_constraint=None,
                                start_dt=None,
                                end_dt=None,
                                dt=None):
        
        if self.temporal_time_constraint != time_constraint or time_constraint is None:
            return False
        
        # If 'AT' constraint found and same as last calculated matrix constraints then cache valid
        if time_constraint == PredictionTimeConstraint.AT and self.temporal_dt == dt:
            return True
        
        # If 'IN' constraint found and same as last calculated matrix constraints then cache valid
        if time_constraint == PredictionTimeConstraint.IN and self.temporal_start_dt == start_dt and self.temporal_end_dt == end_dt:
            return True

        # Otherwise cache invalid
        return False
    
    def create_temporal_corr_matrix(self, time_constraint=PredictionTimeConstraint.NO,
                                          start_dt=None,
                                          end_dt=None,
                                          dt=None):
        
        # Check cache, if valid return the cache
        temporal_cache_status = self.is_temporal_cache_valid(time_constraint=time_constraint,start_dt=start_dt,end_dt=end_dt,dt=dt)
        if temporal_cache_status == True:
            return self.temporal_user_corr_matrix
        
        # No valid cache, recreate the correlations matrix 
        if time_constraint == PredictionTimeConstraint.AT and dt != None:
            user_movie_matrix = self.movie_ratings[(self.movie_ratings.timestamp < dt)].pivot_table(index='title', columns='user_id', values='rating') 
        elif time_constraint == PredictionTimeConstraint.IN and start_dt != None and end_dt != None:
            user_movie_matrix =  self.movie_ratings[(self.movie_ratings.timestamp >= start_dt) & (self.movie_ratings.timestamp < end_dt)].pivot_table(index='title', columns='user_id', values='rating') 
        else:
            return None
        
        # Create CACHE
        self.temporal_time_constraint = time_constraint
        self.temporal_start_dt = start_dt
        self.temporal_end_dt = end_dt
        self.temporal_dt = dt
        self.temporal_user_corr_matrix = user_movie_matrix.corr(method='pearson', min_periods=self.min_common_elements) 
        
        return self.temporal_user_corr_matrix
        
    def get_user_rating(self, user_id):
        return self.movie_ratings.loc[self.movie_ratings['user_id'] == user_id]

    def get_user_rating_for_movie(self, user_id, movie_id):
        try:
            rating = self.movie_ratings.loc[ (self.movie_ratings['user_id'] == user_id) & (self.movie_ratings['item_id'] == movie_id) ].values[0,2]
        except IndexError:
            rating = 0
        return rating 

    def get_rating_timestamp(self, user_id, movie_id):
        return self.movie_ratings.loc[ (self.movie_ratings['user_id'] == user_id) & (self.movie_ratings['item_id'] == movie_id) ].values[0,3]

    def get_movies_watched(self, user_id, year = 2015, month = 7, day = 15):
        return self.movie_ratings.loc[((self.movie_ratings['user_id'] == user_id)) & (self.movie_ratings.timestamp < datetime(year, month, day))][['item_id','title','rating']].set_index('item_id')

    def get_user_avg_rating(self, user_id):
        return self.movie_ratings.loc[((self.movie_ratings['user_id'] == user_id))].rating.mean()

    def get_user_avg_rating_at(self, user_id, year = 2015, month = 7, day = 15):
        return self.movie_ratings.loc[((self.movie_ratings['user_id'] == user_id)) & (self.movie_ratings.timestamp < datetime(year, month, day))].rating.mean()
    
    
    def get_movies_watched(self, user_id,
                           time_constraint=None, 
                           dt=None, start_dt=None, end_dt=None):
        if time_constraint == None or time_constraint == PredictionTimeConstraint.NO:
            return self.movie_ratings.loc[(self.movie_ratings['user_id'] == user_id)][['item_id','rating']]
        elif time_constraint == PredictionTimeConstraint.AT and dt != None:
            return self.movie_ratings.loc[(self.movie_ratings['user_id'] == user_id) & (self.movie_ratings.timestamp < dt)][['item_id','rating']]
        elif time_constraint == PredictionTimeConstraint.IN and start_dt != None and end_dt != None:
            return self.movie_ratings.loc[(self.movie_ratings['user_id'] == user_id) & (self.movie_ratings.timestamp >= start_dt) & (self.movie_ratings.timestamp < end_dt)][['item_id','rating']]
        else:
            return None                                      # Unknown time constraint, exit the function
    
    def get_random_movie_watched(self, user_id):
        return self.movie_ratings.loc[(self.movie_ratings['user_id'] == user_id)][['item_id']].iloc[0][0]
    
    def predict_movies_watched(self, user_id,
                               k = 10,
                               time_constraint=None,
                               dt = None,
                               start_dt = None,
                               end_dt = None):
        
        # Get all movies watched by a user
        movies_watched = self.get_movies_watched(user_id)

        if movies_watched is None or movies_watched.empty:
            return None

        predictions = list()
        
        number_of_predictions = 0
        for _,row in movies_watched.iterrows():
            prediction = self.predict_movie(user_id=user_id, movie_id=row['item_id'], 
                                              time_constraint=time_constraint,
                                              start_dt=start_dt, end_dt=end_dt,dt=dt)
            if number_of_predictions == k:
                break
            predictions.append([prediction,row['rating'],row['item_id']])
            number_of_predictions += 1

        predictions_df = pd.DataFrame(predictions, columns = ['prediction','rating', 'movie_id'])
        predictions_df.movie_id = predictions_df.movie_id.astype(int)
        return predictions_df.set_index('movie_id')
    
    ## Load Datasets
    @staticmethod
    def load_movielens_small():
        col_names = ['user_id', 'item_id', 'rating', 'timestamp']
        ratings = pd.read_csv(r'C:\Users\Yukawa\datasets\ml-latest-small\ratings.csv', sep=',', header=1, names=col_names)

        col_names = ['item_id', 'title', 'genres']
        movies = pd.read_csv(r'C:\Users\Yukawa\datasets\ml-latest-small\movies.csv', sep=',', header=0, names=col_names)
        
        movies['year'] = movies.title.str.extract("\((\d{4})\)", expand=True)
        movies.year = pd.to_datetime(movies.year, format='%Y')
        movies.year = movies.year.dt.year # As there are some NaN years, resulting type will be float (decimals)
        movies.title = movies.title.str[:-7]

        ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s', origin='unix')

        movie_ratings = pd.merge(ratings, movies, on='item_id')

        return movie_ratings
    
    ## Accuracy Metrics
    
    @staticmethod
    def rmse(predictions_df):
        number_of_predictions = predictions_df.count()
        sum_of_square_differences = 0.0
        for _,row in predictions_df.iterrows():
            prediction_rating = row['prediction'] 
            if prediction_rating != 0:
                sum_of_square_differences += (row['rating'] - prediction_rating) ** 2
        result = sum_of_square_differences/number_of_predictions
        return result[0]

In [3]:
movie_ratings = RecSys.load_movielens_small()

In [4]:
movie_ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title,genres,year
0,1,3,4.0,2000-07-30 18:20:47,Grumpier Old Men,Comedy|Romance,1995.0
1,6,3,5.0,1996-10-17 12:11:36,Grumpier Old Men,Comedy|Romance,1995.0
2,19,3,3.0,2000-08-08 04:07:16,Grumpier Old Men,Comedy|Romance,1995.0
3,32,3,3.0,1997-02-23 22:16:12,Grumpier Old Men,Comedy|Romance,1995.0
4,42,3,4.0,2001-07-27 08:04:05,Grumpier Old Men,Comedy|Romance,1995.0


## Most Active Users

In [5]:
active_user_ratings = pd.DataFrame(movie_ratings.groupby('user_id')['rating'].mean())
active_user_ratings['No_of_ratings'] = pd.DataFrame(movie_ratings.groupby('user_id')['rating'].count())
active_user_ratings.sort_values(by=['No_of_ratings'], ascending=False).head(10)

Unnamed: 0_level_0,rating,No_of_ratings
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
414,3.391957,2698
599,2.64205,2478
474,3.398956,2108
448,2.847371,1864
274,3.235884,1346
610,3.688556,1302
68,3.23373,1260
380,3.673235,1218
606,3.657399,1115
288,3.145972,1055


In [6]:
%reset_selective -f (^active_user_ratings$)

## Evaluator

In [81]:
evaluator = RecSys(movie_ratings_data=movie_ratings, min_common_elements=5)

In [8]:
evaluator.predict_movies_watched(user_id=414, k=10)

Unnamed: 0_level_0,prediction,rating
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,3.391957,4.0
6,3.391957,3.0
47,4.012537,4.0
50,4.248152,5.0
101,3.391957,4.0
110,3.731189,5.0
151,3.391957,5.0
157,3.391957,4.0
163,3.688742,4.0
216,3.391957,3.0


In [9]:
RecSys.rmse(evaluator.predict_movies_watched(user_id=414, k=10))

0.6274407953718544

In [10]:
evaluator.predict_movie(449,3, time_constraint=PredictionTimeConstraint.AT, dt=datetime(2015,7,15))

1.8116444731567574

In [11]:
evaluator.predict_movie(413,3, time_constraint=PredictionTimeConstraint.AT, dt=datetime(2016,7,15))

4.410714285714286

In [12]:
evaluator.predict_movie(449,3, time_constraint=PredictionTimeConstraint.IN, start_dt=datetime(2000,7,15), end_dt=datetime(2018,7,15))

2.5774288362750863

In [13]:
evaluator.predict_movie(413,3, time_constraint=PredictionTimeConstraint.IN, start_dt=datetime(2000,7,15), end_dt=datetime(2018,7,15))

4.410714285714286

In [14]:
evaluator.predict_movies_watched(user_id=182, k=10, 
                       time_constraint=PredictionTimeConstraint.IN,
                       start_dt=datetime(2000,5,12), end_dt=datetime(2005,1,15))

Unnamed: 0_level_0,prediction,rating
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6,4.150142,4.5
47,3.880246,4.0
50,4.343716,4.5
70,2.778037,4.5
110,3.881257,3.5
163,3.906408,3.5
223,3.367151,4.5
231,2.752739,0.5
235,4.455266,4.5
260,4.438914,3.5


In [15]:
RecSys.rmse(evaluator.predict_movies_watched(user_id=182, k=10, 
                       time_constraint=PredictionTimeConstraint.IN,
                       start_dt=datetime(2000,5,12), end_dt=datetime(2005,1,15)))

1.0678587059650162

In [16]:
RecSys.rmse(evaluator.predict_movies_watched(user_id=182, k=100,                       
                       time_constraint=PredictionTimeConstraint.IN,
                       start_dt=datetime(2000,5,12), end_dt=datetime(2005,1,15)))

0.9313640606019753

In [18]:
RecSys.rmse(evaluator.predict_movies_watched(user_id=182, k=1000,                       
                       time_constraint=PredictionTimeConstraint.IN,
                       start_dt=datetime(2000,5,12), end_dt=datetime(2005,1,15)))

0.9549217715816656

In [37]:
def compare_no_time_constraint_and_at_constraint(number_of_users):
    """
    For Random 'number_of_users' Users
    Calculate rmse without any time constraint
    Calculate rmse with time constraint
    return as dataframe and also include runtime's of the calculations
    """
    results = list()    
    for i in range(number_of_users):
        user_id = random.randint(1,610)
        
        st = default_timer()
        rmse_no_constraint = RecSys.rmse(evaluator.predict_movies_watched(user_id=user_id, k=10))
        runtime_no_constraint = default_timer() - st
        
        st = default_timer()
        rmse_with_constraint = RecSys.rmse(evaluator.predict_movies_watched(user_id=user_id, k=5,
                                                                       time_constraint=PredictionTimeConstraint.AT,
                                                                       dt=datetime(2004,7,15)))
        runtime_with_constraint = default_timer() - st
        
        results.append([user_id,rmse_no_constraint,runtime_no_constraint,rmse_with_constraint,runtime_with_constraint])
    results = pd.DataFrame(results)
    results.columns= ['user_id','rmse','runtime1','temporal_rmse','runtime2']
    results.set_index('user_id', inplace=True)
    return results

In [39]:
compare_no_time_constraint_and_at_constraint(50)

Unnamed: 0_level_0,rmse,runtime1,temporal_rmse,runtime2
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
516,0.089278,0.332686,0.33944,0.196627
500,1.115556,0.286346,1.612488,0.166787
209,0.228505,0.301505,0.267551,0.011877
592,1.769964,0.272082,1.266543,0.131064
445,0.459233,0.253761,0.756122,0.011045
195,0.29294,0.245544,0.694105,0.132518
283,0.324467,0.251963,0.150248,0.153927
34,1.355353,0.24786,1.842672,0.010997
205,0.20245,0.284707,0.19273,0.010841
471,0.107062,0.281091,0.265625,0.011401


In [24]:

print(max_year - min_year)

24


In [28]:
possible_years[1996] += 1
possible_years[1997] += 1
possible_years[1998] += 1

In [29]:
print(possible_years)

defaultdict(<class 'int'>, {1996: 2, 1997: 1, 1998: 1})


In [42]:
def find_best_year_constraint(n, rmse_diff_limit):
    possible_years = defaultdict(int)
    min_year = movie_ratings['timestamp'].min().year + 5 ## No need to scan first 5 years, so less ratings
    max_year = datetime.now().year - 5                   ## No need to scan last 5 years, almost all, no meaning of filter
    
    # For random 'n' users
    for i in range(n):
        # Choose user
        user_id = random.randint(1,610)
        # Find Best Year
        for year in range(min_year, max_year):
            rmse_no_constraint = RecSys.rmse(evaluator.predict_movies_watched(user_id=user_id, k=10))
            rmse_with_constraint = RecSys.rmse(evaluator.predict_movies_watched(user_id=user_id, k=5, dt=datetime(year,1,15),
                                                                   time_constraint=PredictionTimeConstraint.AT))
            if abs(rmse_no_constraint - rmse_with_constraint) <= rmse_diff_limit:
                possible_years[year] += 1
    return possible_years

In [45]:
best_years = find_best_year_constraint(n = 5, rmse_diff_limit = 0.1)
print(best_years)

defaultdict(<class 'int'>, {2001: 2, 2002: 3, 2003: 3, 2004: 3, 2005: 3, 2006: 2, 2007: 1, 2008: 1, 2009: 1, 2010: 1, 2011: 1, 2013: 1, 2014: 1})


In [82]:
def compare_decay_intervals(n):
    results = list()
    min_year = movie_ratings['timestamp'].min().year
    max_year = datetime.now().year    
        
    # For random 'n' users
    for i in range(n):
        # Choose user
        user_id = random.randint(1,610)
        usr_avg_rating = evaluator.get_user_avg_rating(user_id)
        chosen_movie_id = evaluator.get_random_movie_watched(user_id)
        # Calculate RMSE for each period in between [2 and 6)
        for period in range(2,6):
            curr_year = min_year
            predictions = list()
            while (curr_year+period) <= max_year:
                # Predict Random Movie'Forrest Gump'-> 356
                prediction = evaluator.predict_movie(user_id=user_id, movie_id=chosen_movie_id, k=5, 
                                                                    start_dt=datetime(curr_year,1,15), 
                                                                    end_dt = datetime(max_year,1,15),
                                                                    time_constraint=PredictionTimeConstraint.IN)
                # If no neighbours found, we predict as his average rating, but here we can not count them, pass them
                if prediction == usr_avg_rating:
                    curr_year += period
                    continue
                rating = evaluator.get_user_rating_for_movie(user_id=user_id,movie_id=356)
                print(f"Period={period} interval=({curr_year},{curr_year+period}) prediction={prediction} actual={rating} user_id={user_id} movie_id={chosen_movie_id}")
                curr_year += period
        print("\n")
        

In [86]:
evaluator.get_random_movie_watched(66)

47

In [88]:
evaluator.get_user_rating_for_movie(66,47)

5.0

In [89]:
compare_decay_intervals(5)

Period=2 interval=(1996,1998) prediction=2.970460033652823 actual=3.0 user_id=430 movie_id=356
Period=2 interval=(1998,2000) prediction=3.272166900230005 actual=3.0 user_id=430 movie_id=356
Period=2 interval=(2000,2002) prediction=3.0 actual=3.0 user_id=430 movie_id=356
Period=2 interval=(2002,2004) prediction=4.8081825499319155 actual=3.0 user_id=430 movie_id=356
Period=3 interval=(1996,1999) prediction=2.970460033652823 actual=3.0 user_id=430 movie_id=356
Period=3 interval=(1999,2002) prediction=3.0 actual=3.0 user_id=430 movie_id=356
Period=3 interval=(2002,2005) prediction=4.8081825499319155 actual=3.0 user_id=430 movie_id=356
Period=4 interval=(1996,2000) prediction=2.970460033652823 actual=3.0 user_id=430 movie_id=356
Period=4 interval=(2000,2004) prediction=3.0 actual=3.0 user_id=430 movie_id=356
Period=5 interval=(1996,2001) prediction=2.970460033652823 actual=3.0 user_id=430 movie_id=356
Period=5 interval=(2001,2006) prediction=4.0378430709629365 actual=3.0 user_id=430 movie_i

In [47]:
RecSys.rmse(evaluator.predict_movies_watched(user_id=413, k=5, start_dt=datetime(2010,1,15), end_dt = datetime(2012,1,15),
                                                                   time_constraint=PredictionTimeConstraint.IN))

evaluator.predict_movie(449,3, time_constraint=PredictionTimeConstraint.IN, start_dt=datetime(2000,7,15), end_dt=datetime(2018,7,15))

0.34725765306122464

In [None]:
def find_best_at_constraint(number_of_users):
    
    results = list()    
    for i in range(number_of_users):
        user_id = random.randint(1,610)
        
        st = default_timer()
        rmse_no_constraint = RecSys.rmse(evaluator.predict_movies_watched(user_id=user_id, k=10))
        runtime_no_constraint = default_timer() - st
        
        st = default_timer()
        rmse_with_constraint = RecSys.rmse(evaluator.predict_movies_watched(user_id=user_id, k=5,
                                                                       time_constraint=PredictionTimeConstraint.AT,
                                                                       dt=datetime(2010,7,15)))
        runtime_with_constraint = default_timer() - st
        
        results.append([user_id,rmse_no_constraint,runtime_no_constraint,rmse_with_constraint,runtime_with_constraint])
    results = pd.DataFrame(results)
    results.columns= ['user_id','rmse','runtime1','temporal_rmse','runtime2']
    results.set_index('user_id', inplace=True)
    return results

In [None]:
st = default_timer()
RecSys.rmse(evaluator.predict_movies_watched(user_id=449, k=10))
runtime = default_timer() - st
print(runtime)

In [None]:
compare_no_time_constraint_and_at_constraint()