# Basic Recommendation on Movielens Dataset

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil
from enum import auto, Enum
from datetime import datetime
from collections import defaultdict
import random
%matplotlib inline
sns.set_style('white')

## Load Dataset

In [2]:
col_names = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv(r'C:\Users\Yukawa\datasets\ml-latest-small\ratings.csv', sep=',', header=1, names=col_names)
ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,3,4.0,964981247
1,1,6,4.0,964982224
2,1,47,5.0,964983815
3,1,50,5.0,964982931
4,1,70,3.0,964982400


In [3]:
col_names = ['item_id', 'title', 'genres']
movies = pd.read_csv(r'C:\Users\Yukawa\datasets\ml-latest-small\movies.csv', sep=',', header=0, names=col_names)
movies.head()

Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies['year'] = movies.title.str.extract("\((\d{4})\)", expand=True)
movies.year = pd.to_datetime(movies.year, format='%Y')
movies.year = movies.year.dt.year # As there are some NaN years, resulting type will be float (decimals)
movies.title = movies.title.str[:-7]

In [5]:
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s', origin='unix')

In [6]:
movie_ratings = pd.merge(ratings, movies, on='item_id')
movie_ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title,genres,year
0,1,3,4.0,2000-07-30 18:20:47,Grumpier Old Men,Comedy|Romance,1995.0
1,6,3,5.0,1996-10-17 12:11:36,Grumpier Old Men,Comedy|Romance,1995.0
2,19,3,3.0,2000-08-08 04:07:16,Grumpier Old Men,Comedy|Romance,1995.0
3,32,3,3.0,1997-02-23 22:16:12,Grumpier Old Men,Comedy|Romance,1995.0
4,42,3,4.0,2001-07-27 08:04:05,Grumpier Old Men,Comedy|Romance,1995.0


## Dataset Analysis

### Current Dataset Info

In [7]:
print("Raw data size: ", ratings.shape,
    "\nNumber of Unique users: ", len(ratings['user_id'].unique()),
    "\nNumber of Unique movies: ", len(ratings['item_id'].unique()),
    "\nNumber of Unique ratings: ", len(ratings['rating'].unique()),     # 0, 0.5, 1, 1.5, ... 5.0
    "\nUnique ratings: ", ratings['rating'].sort_values().unique(),
     )

Raw data size:  (100835, 4) 
Number of Unique users:  610 
Number of Unique movies:  9724 
Number of Unique ratings:  10 
Unique ratings:  [0.5 1.  1.5 2.  2.5 3.  3.5 4.  4.5 5. ]


### Most Active Users

In [8]:
active_user_ratings = pd.DataFrame(movie_ratings.groupby('user_id')['rating'].mean())
active_user_ratings['No_of_ratings'] = pd.DataFrame(movie_ratings.groupby('user_id')['rating'].count())
active_user_ratings.sort_values(by=['No_of_ratings'], ascending=False).head(20)

Unnamed: 0_level_0,rating,No_of_ratings
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
414,3.391957,2698
599,2.64205,2478
474,3.398956,2108
448,2.847371,1864
274,3.235884,1346
610,3.688556,1302
68,3.23373,1260
380,3.673235,1218
606,3.657399,1115
288,3.145972,1055


## Most Rated Movies

In [9]:
mean_ratings = pd.DataFrame(movie_ratings.groupby('title')['rating'].mean())
mean_ratings['No_of_ratings'] = pd.DataFrame(movie_ratings.groupby('title')['rating'].count())
movies_with_mean_ratings = pd.merge(movies[['item_id','title']], mean_ratings, on='title').set_index('item_id')
movies_with_mean_ratings.columns = ['title', 'mean_rating', 'No_of_ratings']
movies_with_mean_ratings.sort_values(by=['No_of_ratings'], ascending=False).head(20)

Unnamed: 0_level_0,title,mean_rating,No_of_ratings
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
356,Forrest Gump,4.164134,329
318,"Shawshank Redemption, The",4.429022,317
296,Pulp Fiction,4.197068,307
593,"Silence of the Lambs, The",4.16129,279
2571,"Matrix, The",4.192446,278
260,Star Wars: Episode IV - A New Hope,4.231076,251
480,Jurassic Park,3.75,238
110,Braveheart,4.031646,237
589,Terminator 2: Judgment Day,3.970982,224
527,Schindler's List,4.225,220


## Recommendation System Framework For Decay

In [10]:
class PredictionTimeConstraint(Enum):
    AT = auto()
    IN = auto()
    NO = auto()

In [11]:
class RecSys:
    def __init__(self, movie_ratings_data, min_common_elements):
        self.movie_ratings = movie_ratings_data
        self.min_common_elements = min_common_elements
        # User Movie Matrix, Rows-> Movie_Titles :: Columns -> User_IDs :: Values -> User Ratings
        self.user_movie_matrix = self.movie_ratings.pivot_table(index='title', columns='user_id', values='rating')
        self.user_corr_matrix = self.user_movie_matrix.corr(method='pearson', min_periods=min_common_elements)
        
        # Temporal User Correlations Matrix CACHE details
        # Since analysis requires us do more than one predictions with same dates save the user_corr_matrix for performance
        self.temporal_user_corr_matrix = None
        self.temporal_time_constraint = None
        self.temporal_dt = None
        self.temporal_start_dt = None
        self.temporal_end_dt = None
    
    def get_k_neighbours(self, user_id, k = 20, 
                         time_constraint=None,
                         start_dt=None,
                         end_dt=None,
                         dt=None):
        
        # Apply Time Constraint and get user_correlations to each other as matrix
        if time_constraint is None:
            user_corr_matrix = self.user_corr_matrix
        else:
            user_corr_matrix = self.create_temporal_corr_matrix(time_constraint=time_constraint,
                                                           dt=dt, start_dt=start_dt, end_dt=end_dt)
        
        # Exit if matrix is None
        if user_corr_matrix is None:
            return None
        
        # Get the 'user_id's sorrelations
        user_corrs = user_corr_matrix.get(user_id)
        if user_corrs is None:
            return None

        # Drop any null, if found
        user_corrs.dropna(inplace=True)

        # Create A dataframe from not-null correlations of the 'user_id'
        users_alike = pd.DataFrame(user_corrs)

        # Rename the only column to 'correlation'
        users_alike.columns= ['correlation']

        # Sort the user correlations in descending order 
        #     so that first one is the most similar, last one least similar
        users_alike.sort_values(by='correlation', ascending=False, inplace=True)

        # Eliminate Correlation to itself by deleting first row, 
        #     since biggest corr is with itself it is in first row
        return users_alike.iloc[1:k]
    
    def predict_movie(self, user_id, movie_id, k = 20, 
                      time_constraint=None,
                      start_dt=None,
                      end_dt=None,
                      dt=None):
        
        # If a movie with movie_id not exists, predict 0
        if self.movie_ratings.loc[self.movie_ratings['item_id'] == movie_id] is None:
            print("Movie Not Found!!")
            return 0

        # Get Nearest Neighbours of the 'user_id'
        k_nearest_neighbours = self.get_k_neighbours(user_id, k=k, 
                                                     time_constraint=time_constraint,
                                                     start_dt=start_dt, end_dt=end_dt,
                                                     dt=dt)

        # Calculate Mean Centered Prediciton
        user_avg_rating = self.get_user_avg_rating(user_id)
        
        #########################################################################
        ##### IN CASE WE COULDNT FIND ENOUGH NEIGHBOURS, PREDICT AS HIS AVERAGE
        if k_nearest_neighbours is None or k_nearest_neighbours.empty:
            return user_avg_rating
        #########################################################################
        weighted_sum = 0.0
        sum_of_weights = 0.0
        for neighbour_id, data in k_nearest_neighbours.iterrows():
            # Get each neighbour's correlation 'user_id' and her rating to 'movie_id' 
            neighbour_corr = data['correlation']
            neighbour_data = self.movie_ratings.loc[(self.movie_ratings['user_id'] == neighbour_id) & (self.movie_ratings['item_id'] == movie_id)]
            # If the neighbour doesnt give rating to the movie_id, pass this around of the loop
            if neighbour_data.empty:
                continue
            neighbour_avg_rating = self.get_user_avg_rating(neighbour_id)
            neighbour_rating = float(neighbour_data.rating)
            neighbour_mean_centered_rating =  neighbour_rating - neighbour_avg_rating
            # Calculate Weighted sum and sum of weights
            weighted_sum += neighbour_mean_centered_rating * neighbour_corr
            sum_of_weights += neighbour_corr

        # Predict
        if sum_of_weights != 0:
            prediction_rating = user_avg_rating + (weighted_sum / sum_of_weights)
        else:
            #########################################################################
            ##### IN CASE WE COULDNT FIND ENOUGH NEIGHBOURS THAT GIVEN RATING TO THE movie_id, PREDICT AS HIS AVERAGE
            prediction_rating = user_avg_rating
            #########################################################################
            
        return prediction_rating
    
    def is_temporal_cache_valid(self,
                                time_constraint=None,
                                start_dt=None,
                                end_dt=None,
                                dt=None):
        
        if self.temporal_time_constraint != time_constraint or time_constraint is None:
            return False
        
        # If 'AT' constraint found and same as last calculated matrix constraints then cache valid
        if time_constraint == PredictionTimeConstraint.AT and self.temporal_dt == dt:
            return True
        
        # If 'IN' constraint found and same as last calculated matrix constraints then cache valid
        if time_constraint == PredictionTimeConstraint.IN and self.temporal_start_dt == start_dt and self.temporal_end_dt == end_dt:
            return True

        # Otherwise cache invalid
        return False
    
    def create_temporal_corr_matrix(self, time_constraint=PredictionTimeConstraint.NO,
                                          start_dt=None,
                                          end_dt=None,
                                          dt=None):
        
        # Check cache, if valid return the cache
        temporal_cache_status = self.is_temporal_cache_valid(time_constraint=time_constraint,start_dt=start_dt,end_dt=end_dt,dt=dt)
        if temporal_cache_status == True:
            return self.temporal_user_corr_matrix
        
        # No valid cache, recreate the correlations matrix 
        if time_constraint == PredictionTimeConstraint.AT and dt != None:
            user_movie_matrix = self.movie_ratings[(self.movie_ratings.timestamp < dt)].pivot_table(index='title', columns='user_id', values='rating') 
        elif time_constraint == PredictionTimeConstraint.IN and start_dt != None and end_dt != None:
            user_movie_matrix =  self.movie_ratings[(self.movie_ratings.timestamp >= start_dt) & (self.movie_ratings.timestamp < end_dt)].pivot_table(index='title', columns='user_id', values='rating') 
        else:
            return None
        
        # Create CACHE
        self.temporal_time_constraint = time_constraint
        self.temporal_start_dt = start_dt
        self.temporal_end_dt = end_dt
        self.temporal_dt = dt
        self.temporal_user_corr_matrix = user_movie_matrix.corr(method='pearson', min_periods=self.min_common_elements) 
        
        return self.temporal_user_corr_matrix
        
    def get_user_rating(self, user_id):
        return self.movie_ratings.loc[self.movie_ratings['user_id'] == user_id]

    def get_user_rating_for_movie(self, user_id, movie_id):
        return self.movie_ratings.loc[ (self.movie_ratings['user_id'] == user_id) & (self.movie_ratings['item_id'] == movie_id) ].values[0,2]

    def get_rating_timestamp(self, user_id, movie_id):
        return self.movie_ratings.loc[ (self.movie_ratings['user_id'] == user_id) & (self.movie_ratings['item_id'] == movie_id) ].values[0,3]

    def get_movies_watched(self, user_id, year = 2015, month = 7, day = 15):
        return self.movie_ratings.loc[((self.movie_ratings['user_id'] == user_id)) & (self.movie_ratings.timestamp < datetime(year, month, day))][['item_id','title','rating']].set_index('item_id')

    def get_user_avg_rating(self, user_id):
        return self.movie_ratings.loc[((self.movie_ratings['user_id'] == user_id))].rating.mean()

    def get_user_avg_rating_at(self, user_id, year = 2015, month = 7, day = 15):
        return self.movie_ratings.loc[((self.movie_ratings['user_id'] == user_id)) & (self.movie_ratings.timestamp < datetime(year, month, day))].rating.mean()
    
    
    def get_movies_watched(self, user_id,
                           time_constraint=None, 
                           dt=None, start_dt=None, end_dt=None):
        if time_constraint == None or time_constraint == PredictionTimeConstraint.NO:
            return self.movie_ratings.loc[(self.movie_ratings['user_id'] == user_id)][['item_id','rating']]
        elif time_constraint == PredictionTimeConstraint.AT and dt != None:
            return self.movie_ratings.loc[(self.movie_ratings['user_id'] == user_id) & (self.movie_ratings.timestamp < dt)][['item_id','rating']]
        elif time_constraint == PredictionTimeConstraint.IN and start_dt != None and end_dt != None:
            return self.movie_ratings.loc[(self.movie_ratings['user_id'] == user_id) & (self.movie_ratings.timestamp >= start_dt) & (self.movie_ratings.timestamp < end_dt)][['item_id','rating']]
        else:
            return None                                      # Unknown time constraint, exit the function
        
    def predict_movies_watched(self, user_id,
                               k = 10,
                               time_constraint=None,
                               dt = None,
                               start_dt = None,
                               end_dt = None):
        
        # Get all movies watched by a user
        movies_watched = self.get_movies_watched(user_id)

        if movies_watched is None or movies_watched.empty:
            return None

        predictions = list()
        
        number_of_predictions = 0
        for _,row in movies_watched.iterrows():
            prediction = self.predict_movie(user_id=user_id, movie_id=row['item_id'], 
                                              time_constraint=time_constraint,
                                              start_dt=start_dt, end_dt=end_dt,dt=dt)
            if number_of_predictions == k:
                break
            predictions.append([prediction,row['rating'],row['item_id']])
            number_of_predictions += 1

        predictions_df = pd.DataFrame(predictions, columns = ['prediction','rating', 'movie_id'])
        predictions_df.movie_id = predictions_df.movie_id.astype(int)
        return predictions_df.set_index('movie_id')
    
    @staticmethod
    def rmse(predictions_df):
        number_of_predictions = predictions_df.count()
        sum_of_square_differences = 0.0
        for _,row in predictions_df.iterrows():
            prediction_rating = row['prediction'] 
            if prediction_rating != 0:
                sum_of_square_differences += (row['rating'] - prediction_rating) ** 2
        result = sum_of_square_differences/number_of_predictions
        return result[0]

In [12]:
evaluator = RecSys(movie_ratings,20)

In [13]:
evaluator.predict_movies_watched(user_id=414, k=10)

Unnamed: 0_level_0,prediction,rating
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
3,3.391957,4.0
6,3.391957,3.0
47,4.14796,4.0
50,4.286183,5.0
101,3.391957,4.0
110,3.731189,5.0
151,3.391957,5.0
157,3.391957,4.0
163,3.713386,4.0
216,3.391957,3.0


In [14]:
RecSys.rmse(evaluator.predict_movies_watched(user_id=414, k=10))

0.6225668481250979

In [15]:
evaluator.predict_movie(449,3, time_constraint=PredictionTimeConstraint.AT, dt=datetime(2015,7,15))

3.3135593974587114

In [16]:
evaluator.predict_movie(413,3, time_constraint=PredictionTimeConstraint.AT, dt=datetime(2016,7,15))

4.410714285714286

In [17]:
evaluator.predict_movie(449,3, time_constraint=PredictionTimeConstraint.IN, start_dt=datetime(2000,7,15), end_dt=datetime(2018,7,15))

3.024993801250808

In [19]:
evaluator.predict_movie(413,3, time_constraint=PredictionTimeConstraint.IN, start_dt=datetime(2000,7,15), end_dt=datetime(2018,7,15))

4.262240539438904

In [20]:
evaluator.predict_movies_watched(user_id=182, k=10, 
                       time_constraint=PredictionTimeConstraint.IN,
                       start_dt=datetime(2000,5,12), end_dt=datetime(2005,1,15))

Unnamed: 0_level_0,prediction,rating
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6,3.834643,4.5
47,4.174495,4.0
50,4.457002,4.5
70,2.778037,4.5
110,3.948099,3.5
163,3.689912,3.5
223,3.588909,4.5
231,3.312023,0.5
235,4.455266,4.5
260,4.642807,3.5


In [21]:
RecSys.rmse(evaluator.predict_movies_watched(user_id=182, k=10, 
                       time_constraint=PredictionTimeConstraint.IN,
                       start_dt=datetime(2000,5,12), end_dt=datetime(2005,1,15)))

1.3722583014611394

In [22]:
RecSys.rmse(evaluator.predict_movies_watched(user_id=182, k=100,                       
                       time_constraint=PredictionTimeConstraint.IN,
                       start_dt=datetime(2000,5,12), end_dt=datetime(2005,1,15)))

1.0556128111180385

In [23]:
RecSys.rmse(evaluator.predict_movies_watched(user_id=182, k=1000,                       
                       time_constraint=PredictionTimeConstraint.IN,
                       start_dt=datetime(2000,5,12), end_dt=datetime(2005,1,15)))

0.9881378316234875