# Basic Recommendation on Movielens Dataset

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil
from datetime import datetime
from collections import defaultdict
%matplotlib inline
sns.set_style('white')

## Load Dataset

In [2]:
col_names = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv(r'C:\Users\Yukawa\datasets\ml-latest-small\ratings.csv', sep=',', header=1, names=col_names)
ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,3,4.0,964981247
1,1,6,4.0,964982224
2,1,47,5.0,964983815
3,1,50,5.0,964982931
4,1,70,3.0,964982400


In [3]:
col_names = ['item_id', 'title', 'genres']
movies = pd.read_csv(r'C:\Users\Yukawa\datasets\ml-latest-small\movies.csv', sep=',', header=0, names=col_names)
movies.head()

Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies['year'] = movies.title.str.extract("\((\d{4})\)", expand=True)
movies.year = pd.to_datetime(movies.year, format='%Y')
movies.year = movies.year.dt.year # As there are some NaN years, resulting type will be float (decimals)
movies.title = movies.title.str[:-7]

In [5]:
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s', origin='unix')

In [6]:
movie_ratings = pd.merge(ratings, movies, on='item_id')
movie_ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title,genres,year
0,1,3,4.0,2000-07-30 18:20:47,Grumpier Old Men,Comedy|Romance,1995.0
1,6,3,5.0,1996-10-17 12:11:36,Grumpier Old Men,Comedy|Romance,1995.0
2,19,3,3.0,2000-08-08 04:07:16,Grumpier Old Men,Comedy|Romance,1995.0
3,32,3,3.0,1997-02-23 22:16:12,Grumpier Old Men,Comedy|Romance,1995.0
4,42,3,4.0,2001-07-27 08:04:05,Grumpier Old Men,Comedy|Romance,1995.0


In [7]:
x = movie_ratings.loc[(movie_ratings['user_id'] == 19)].rating.mean()
print(x)

2.607396870554765


## Dataset Analysis

### Current Dataset Info

In [8]:
print("Raw data size: ", ratings.shape,
    "\nNumber of Unique users: ", len(ratings['user_id'].unique()),
    "\nNumber of Unique movies: ", len(ratings['item_id'].unique()),
    "\nNumber of Unique ratings: ", len(ratings['rating'].unique()),     # 0, 0.5, 1, 1.5, ... 5.0
    "\nUnique ratings: ", ratings['rating'].sort_values().unique(),
     )

Raw data size:  (100835, 4) 
Number of Unique users:  610 
Number of Unique movies:  9724 
Number of Unique ratings:  10 
Unique ratings:  [0.5 1.  1.5 2.  2.5 3.  3.5 4.  4.5 5. ]


### Most Active Users

In [9]:
active_user_ratings = pd.DataFrame(movie_ratings.groupby('user_id')['rating'].mean())
active_user_ratings['No_of_ratings'] = pd.DataFrame(movie_ratings.groupby('user_id')['rating'].count())
active_user_ratings.sort_values(by=['No_of_ratings'], ascending=False).head(20)

Unnamed: 0_level_0,rating,No_of_ratings
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
414,3.391957,2698
599,2.64205,2478
474,3.398956,2108
448,2.847371,1864
274,3.235884,1346
610,3.688556,1302
68,3.23373,1260
380,3.673235,1218
606,3.657399,1115
288,3.145972,1055


## Most Rated Movies

In [10]:
mean_ratings = pd.DataFrame(movie_ratings.groupby('title')['rating'].mean())
mean_ratings['No_of_ratings'] = pd.DataFrame(movie_ratings.groupby('title')['rating'].count())
movies_with_mean_ratings = pd.merge(movies[['item_id','title']], mean_ratings, on='title').set_index('item_id')
movies_with_mean_ratings.columns = ['title', 'mean_rating', 'No_of_ratings']
movies_with_mean_ratings.sort_values(by=['No_of_ratings'], ascending=False).head(20)

Unnamed: 0_level_0,title,mean_rating,No_of_ratings
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
356,Forrest Gump,4.164134,329
318,"Shawshank Redemption, The",4.429022,317
296,Pulp Fiction,4.197068,307
593,"Silence of the Lambs, The",4.16129,279
2571,"Matrix, The",4.192446,278
260,Star Wars: Episode IV - A New Hope,4.231076,251
480,Jurassic Park,3.75,238
110,Braveheart,4.031646,237
589,Terminator 2: Judgment Day,3.970982,224
527,Schindler's List,4.225,220


## Predict Assuming System at The Given Specific Time

In [11]:
def get_user_correlations_at(movie_ratings_data, user_id, 
                             min_common_elements=20, 
                             year=2000, month=1, day=15):

    """
        Get the correlations of the 'user_id' to all other users assuming SYSTEM IS AT THE GIVEN DATE.
        Correlation is calculated if and only if both of them rated 'min_common_elements' number of elements in common.
    """
    
    # User Movie Matrix, Rows-> Movie_Titles :: Columns -> User_IDs :: Values -> User Ratings
    # Only those ratings given before 'year/month/day' is included.
    user_movie_matrix = movie_ratings_data[(movie_ratings_data.timestamp < datetime(year, month, day))].pivot_table(index='title', columns='user_id', values='rating') 
    
    # Create a matrix of correlations :: Each row is a user, the columns are the all other users, values are the pearson corrs
    corrs = user_movie_matrix.corr(method='pearson', min_periods=min_common_elements)
    
    try:
        # Get the correlations of 'user_id'
        user_corrs = corrs[user_id]
    except(KeyError):
        # No Correlations found
        return None

    # Drop any null if found
    user_corrs.dropna(inplace=True)
    
    # Create A dataframe from not-null correlations of the 'user_id'
    users_alike = pd.DataFrame(user_corrs)
    
    # Rename the only column to 'correlation'
    users_alike.columns= ['correlation']
    
    # Sort the user correlations in descending order so that first one is the most similar, last one least similar
    users_alike.sort_values(by='correlation', ascending=False, inplace=True)
    
    # Eliminate Correlation to itself by deleting first row, since biggest corr is with itself it is in first row
    users_alike = users_alike.iloc[1:] 
    
    return users_alike

In [12]:
def predict_movie_at(movie_ratings_data, user_id, movie_id, 
                     min_common_elements = 20,
                     year = 2015, month = 7, day = 15,
                     default_value=-2):
    
    # If a movie with movie_id not exists, return -1
    if movie_ratings_data.loc[movie_ratings_data['item_id'] == movie_id] is None:
        return -1
    
    # Calculate User Correlations at given system date
    k_nearest_neighbours = get_user_correlations_at(movie_ratings_data, user_id, min_common_elements, year, month, day)

    # If no neighbour is found call the user supplied function
    if k_nearest_neighbours is None:
        return default_value
    
    weighted_sum = 0.0
    sum_of_weights = 0.0
    for neighbour_id, data in k_nearest_neighbours.iterrows():
        # Get each neighbour's correlation 'user_id' and her rating to 'movie_id' 
        neighbour_corr = data['correlation']
        neighbour_data = movie_ratings_data.loc[(movie_ratings_data['user_id'] == neighbour_id) & (movie_ratings['item_id'] == movie_id)]
        if neighbour_data.empty:
            continue
        neighbour_rating = float(neighbour_data.rating)
        # Calculate Weighted sum and sum of weights
        weighted_sum += neighbour_rating * neighbour_corr
        sum_of_weights += neighbour_corr
    
    # Predict
    if sum_of_weights != 0.0:
        prediction_rating = weighted_sum / sum_of_weights
    else:
        prediction_rating = default_value
    return prediction_rating

In [13]:
x = predict_movie_at(movie_ratings,449,3, year=2015, month=1,day=1)
print(x)

3.148452477590854


## Predict Assuming System in Given Time Interval

In [14]:
def get_user_correlations_in(movie_ratings_data, user_id, 
                             min_common_elements=20, 
                             start_year=1995, start_month=1, start_day=15,
                             end_year=2000, end_month=1, end_day=15):

    """
        Get the correlations of the 'user_id' to all other users assuming rating collection started at 'start_year' ended at 'end_year'
        Correlation is calculated if and only if both of them rated 'min_common_elements' number of elements in common.
    """
    
    # User Movie Matrix, Rows-> Movie_Titles :: Columns -> User_IDs :: Values -> User Ratings
    # Only those ratings given before 'year/month/day' is included.
    user_movie_matrix = movie_ratings_data[(movie_ratings_data.timestamp >= datetime(start_year, start_month, start_day)) & (movie_ratings_data.timestamp < datetime(end_year, end_month, end_day))].pivot_table(index='title', columns='user_id', values='rating') 
    
    # Create a matrix of correlations :: Each row is a user, the columns are the all other users, values are the pearson corrs
    corrs = user_movie_matrix.corr(method='pearson', min_periods=min_common_elements)
    
    try:
        # Get the correlations of 'user_id'
        user_corrs = corrs[user_id]
    except(KeyError):
        # No Correlations found
        return None

    # Drop any null if found
    user_corrs.dropna(inplace=True)
    
    # Create A dataframe from not-null correlations of the 'user_id'
    users_alike = pd.DataFrame(user_corrs)
    
    # Rename the only column to 'correlation'
    users_alike.columns= ['correlation']
    
    # Sort the user correlations in descending order so that first one is the most similar, last one least similar
    users_alike.sort_values(by='correlation', ascending=False, inplace=True)
    
    # Eliminate Correlation to itself by deleting first row, since biggest corr is with itself it is in first row
    users_alike = users_alike.iloc[1:] 
    
    return users_alike

In [15]:
def predict_movie_in(movie_ratings_data, user_id, movie_id, 
                     min_common_elements = 20,
                     start_year=1995, start_month=1, start_day=15,
                     end_year=2000, end_month=1, end_day=15,
                     default_value=-2):
    
    # If a movie with movie_id not exists, return -1
    if movie_ratings_data.loc[movie_ratings_data['item_id'] == movie_id] is None:
        return -1
    
    # Calculate User Correlations at given system date
    k_nearest_neighbours = get_user_correlations_in(movie_ratings_data, user_id, min_common_elements, 
                                                    start_year, start_month, start_day,
                                                    end_year, end_month, end_day)
    
    # If no neighbour is found call the user supplied function
    if k_nearest_neighbours is None:
        return default_value
    
    weighted_sum = 0.0
    sum_of_weights = 0.0
    for neighbour_id, data in k_nearest_neighbours.iterrows():
        # Get each neighbour's correlation 'user_id' and her rating to 'movie_id' 
        neighbour_corr = data['correlation']
        neighbour_data = movie_ratings_data.loc[(movie_ratings_data['user_id'] == neighbour_id) & (movie_ratings['item_id'] == movie_id)]
        if neighbour_data.empty:
            continue
        neighbour_rating = float(neighbour_data.rating)
        # Calculate Weighted sum and sum of weights
        weighted_sum += neighbour_rating * neighbour_corr
        sum_of_weights += neighbour_corr
    
    # Predict
    if sum_of_weights != 0.0:
        prediction_rating = weighted_sum / sum_of_weights
    else:
        prediction_rating = default_value
    return prediction_rating

In [16]:
predict_movie_in(movie_ratings,
                 user_id=449,
                 movie_id=3, 
                 min_common_elements = 2,
                 start_year=2000, start_month=7, start_day=15, 
                 end_year=2020, end_month=7, end_day=15,
                 default_value=0)

3.0909869004190123

In [17]:
predict_movie_in(movie_ratings,
                 user_id=449,
                 movie_id=3, 
                 min_common_elements = 2,
                 start_year=2005, start_month=7, start_day=15, 
                 end_year=2010, end_month=7, end_day=15,
                 default_value=0)

0

## Let's Predict Ratings of All Movies Watched by a User

In [18]:
def get_movies_watched(movie_ratings_data, user_id, year = 2015, month = 7, day = 15):
    return movie_ratings.loc[((movie_ratings_data['user_id'] == user_id)) & (movie_ratings_data.timestamp < datetime(year, month, day))][['title','rating']]

In [19]:
print(get_movies_watched(movie_ratings, 449).head(10))

                             title  rating
515            Usual Suspects, The     4.5
3515              Schindler's List     4.0
6177                       Platoon     3.5
8053             Full Metal Jacket     3.0
10496  Back to the Future Part III     2.5
10838     Honey, I Shrunk the Kids     2.0
11746           American History X     4.5
11951                     Rushmore     4.0
12257                 Office Space     4.5
12844                   Mummy, The     2.5


In [20]:
def get_user_avg_rating_at(movie_ratings_data, user_id, year = 2015, month = 7, day = 15):
    return movie_ratings.loc[((movie_ratings_data['user_id'] == user_id)) & (movie_ratings_data.timestamp < datetime(year, month, day))].rating.mean()

In [21]:
print(get_user_avg_rating_at(movie_ratings, 449))

3.289473684210526


In [22]:
def predict_all_watched_movies(movie_ratings_data, user_id,
                               min_common_elements = 20, 
                               year = 2015, month = 7, day = 15, default_value=0):
    movies_watched = ratings.loc[((ratings['user_id'] == user_id)) & (ratings.timestamp < datetime(year, month, day))][['item_id','rating']]
    
    number_of_predictions = 0.0
    sum_of_square_differences = 0.0
    for _,row in movies_watched.iterrows():
        prediction = predict_movie_at(movie_ratings_data,
                                      user_id=user_id,
                                      movie_id=row['item_id'],
                                      year=year, month=month, day=day,
                                      min_common_elements=min_common_elements, 
                                      default_value=default_value)
        sum_of_square_differences += (row['rating'] - prediction) ** 2
        number_of_predictions += 1
        print(f"Rating={row['rating']}, Predicted={prediction}, Movie_id={row['item_id']}")
    rmse = sum_of_square_differences/number_of_predictions
    print(f"RMSE = {rmse}")

In [23]:
predict_all_watched_movies(movie_ratings, user_id=449, min_common_elements=2, year=2020)

Rating=4.0, Predicted=4.004435607830772, Movie_id=32.0
Rating=4.5, Predicted=4.543283422452302, Movie_id=50.0
Rating=0.5, Predicted=2.211862502290493, Movie_id=186.0
Rating=4.0, Predicted=3.992030909911633, Movie_id=293.0
Rating=5.0, Predicted=4.9974596746872075, Movie_id=318.0
Rating=3.5, Predicted=2.772287114397526, Movie_id=370.0
Rating=4.0, Predicted=4.482440390340926, Movie_id=527.0
Rating=1.5, Predicted=1.8923940053128374, Movie_id=762.0
Rating=3.5, Predicted=4.201817764969903, Movie_id=1090.0
Rating=3.0, Predicted=3.9466712659220256, Movie_id=1222.0
Rating=4.0, Predicted=4.210045181468951, Movie_id=1262.0
Rating=1.5, Predicted=2.7401432863869135, Movie_id=1917.0
Rating=1.0, Predicted=2.7680539598611325, Movie_id=2006.0
Rating=2.5, Predicted=3.247699656572003, Movie_id=2011.0
Rating=2.5, Predicted=3.1218841434360605, Movie_id=2012.0
Rating=2.0, Predicted=2.6115456410899465, Movie_id=2054.0
Rating=2.5, Predicted=2.9206018522437174, Movie_id=2302.0
Rating=3.0, Predicted=3.911586145

In [35]:
predict_all_watched_movies(movie_ratings, user_id=449, min_common_elements=2, year=2003)

Rating=4.0, Predicted=3.9039833652000735, Movie_id=32.0
Rating=4.5, Predicted=4.598229754264022, Movie_id=50.0
Rating=0.5, Predicted=2.3429443827127425, Movie_id=186.0
Rating=4.0, Predicted=3.7144254713814138, Movie_id=293.0
Rating=5.0, Predicted=5.183872953178719, Movie_id=318.0
Rating=3.5, Predicted=3.057438706999848, Movie_id=370.0
Rating=4.0, Predicted=4.717354829210552, Movie_id=527.0
Rating=1.5, Predicted=1.729218551154083, Movie_id=762.0
Rating=3.5, Predicted=4.237291708454862, Movie_id=1090.0
Rating=3.0, Predicted=4.006670089401794, Movie_id=1222.0
Rating=4.0, Predicted=4.531856620854257, Movie_id=1262.0
Rating=1.5, Predicted=2.4932114405032384, Movie_id=1917.0
Rating=1.0, Predicted=2.767780666656726, Movie_id=2006.0
Rating=2.5, Predicted=2.6550344992653936, Movie_id=2011.0
Rating=2.5, Predicted=3.045961570283972, Movie_id=2012.0
Rating=2.0, Predicted=2.7684976864075823, Movie_id=2054.0
Rating=2.5, Predicted=3.09535605439345, Movie_id=2302.0
Rating=3.0, Predicted=3.825579310405

In [24]:
print("First Movie Rating: ", movie_ratings['timestamp'].min() )
print("Today: ", datetime.now())

First Movie Rating:  1996-03-29 18:36:55
Today:  2020-03-31 10:08:58.782117


In [25]:
predict_movie_at(movie_ratings,user_id=449,movie_id=3,
                 year=2015, month=7, day=15,
                 min_common_elements=2, default_value=0)

3.1139499556033217

In [26]:
predict_movie_at(movie_ratings,user_id=449,movie_id=260,
                 year=2010, month=7, day=15,
                 min_common_elements=2, default_value=0)

4.381345234725266

In [57]:
def get_user_rating(ratings_data, user_id):
    return movie_ratings.loc[ratings_data['user_id'] == user_id]

In [58]:
def get_user_rating_for_movie(ratings_data, user_id, movie_id):
    return movie_ratings.loc[ (ratings_data['user_id'] == user_id) & (ratings_data['item_id'] == movie_id) ].values[0,2]

In [59]:
def get_rating_timestamp(ratings_data, user_id, movie_id):
    return movie_ratings.loc[ (ratings_data['user_id'] == user_id) & (ratings_data['item_id'] == movie_id) ].values[0,3]

In [63]:
get_user_rating(ratings,449).head()

Unnamed: 0,user_id,item_id,rating,timestamp,title,genres,year
70518,322,8228,3.5,2008-08-02 11:07:13,"Maltese Falcon, The (a.k.a. Dangerous Female)",Mystery,1931.0
70519,346,8228,3.0,2005-12-05 16:19:03,"Maltese Falcon, The (a.k.a. Dangerous Female)",Mystery,1931.0
70520,387,8228,4.0,2004-09-11 04:15:06,"Maltese Falcon, The (a.k.a. Dangerous Female)",Mystery,1931.0
70521,397,8228,4.0,2007-06-16 04:16:03,"Maltese Falcon, The (a.k.a. Dangerous Female)",Mystery,1931.0
70522,474,8228,4.0,2004-05-24 12:59:22,"Maltese Falcon, The (a.k.a. Dangerous Female)",Mystery,1931.0


In [64]:
get_rating_timestamp(ratings,449,32)

Timestamp('2008-08-02 11:07:13')

In [65]:
get_user_rating_for_movie(ratings,449,32)

3.5