In [1]:
import pandas as pd
import os

In [3]:
# All paths in this notebook are built relative to the current working directory
cwd = os.getcwd()

# Read the raw ratings table from <cwd>/data/ratings.csv
ratings_csv_path = os.path.join(cwd, 'data', 'ratings.csv')
all_ratings = pd.read_csv(ratings_csv_path)

# Cutoff timestamp at the 0.90 quantile of all rating timestamps:
# ratings before this cutoff are meant for training, the rest for evaluation
timestamp = all_ratings['timestamp'].quantile(q=0.90)

In [19]:
# A rating of zero is considered invalid, so only strictly positive ratings are kept
has_positive_rating = all_ratings['rating'] > 0.0
all_ratings = all_ratings.loc[has_positive_rating]

# Training set: ratings at or before the cutoff timestamp.
# Evaluation set: starts out as the full (filtered) ratings table and is narrowed below.
at_or_before_cutoff = all_ratings['timestamp'] <= timestamp
train_ratings = all_ratings.loc[at_or_before_cutoff]
evaluation_ratings = all_ratings

train_size = len(train_ratings)
print(f"Number of ratings in the Training Set: {train_size}")
evaluation_size = len(evaluation_ratings)
print(
    f"Number of ratings in the Evaluation Set Before Removing Users and Movies not Present in Training Set: {evaluation_size}")

# Keep only evaluation ratings whose user also appears in the training set
train_user_ids = train_ratings['userId'].unique().tolist()
user_is_known = evaluation_ratings['userId'].isin(train_user_ids)
evaluation_ratings = evaluation_ratings.loc[user_is_known]
evaluation_size = len(evaluation_ratings)
print(
    f"Number of ratings in the Evaluation Set After Removing Users not Present in Training Set: {evaluation_size}")

# Keep only evaluation ratings whose movie also appears in the training set
train_movie_ids = train_ratings['movieId'].unique().tolist()
movie_is_known = evaluation_ratings['movieId'].isin(train_movie_ids)
evaluation_ratings = evaluation_ratings.loc[movie_is_known]
evaluation_size = len(evaluation_ratings)
print(
    f"Number of ratings in the Evaluation Set After Removing Movies not Present in Training Set: {evaluation_size}")

Number of ratings in the Training Set: 23421860
Number of ratings in the Evaluation Set Before Removing Users and Movies not Present in Training Set: 26024289
Number of ratings in the Evaluation Set After Removing Users not Present in Training Set: 23857492
Number of ratings in the Evaluation Set After Removing Movies not Present in Training Set: 23811904


In [20]:
# Summary statistics comparing the training and evaluation sets.
# Distinct user / movie counts are computed once and reused in the messages below.
evaluation_user_count = len(evaluation_ratings['userId'].unique())
evaluation_movie_count = len(evaluation_ratings['movieId'].unique())
train_user_count = len(train_ratings['userId'].unique())
train_movie_count = len(train_ratings['movieId'].unique())

same_user_count = train_user_count == evaluation_user_count
same_movie_count = train_movie_count == evaluation_movie_count

# The evaluation set was derived from the full ratings table, so the ratings it
# holds beyond the training set are the difference of the two lengths.
held_out_rating_count = len(evaluation_ratings) - len(train_ratings)

print(f"Total Number of Users: {evaluation_user_count}")
print(f"Total Number of Movies: {evaluation_movie_count}")
print(
    f"Evaluation and Training Sets have the Same Number of Users: {same_user_count}")
print(
    f"Evaluation and Training Sets have the Same Number of Movies: {same_movie_count}")
print(
    f"Number of Ratings Available in Evaluation Set for Final Evaluation: {held_out_rating_count}")

Total Number of Users: 252361
Total Number of Movies: 36305
Evaluation and Training Sets have the Same Number of Users: True
Evaluation and Training Sets have the Same Number of Movies: True
Number of Ratings Available in Evaluation Set for Final Evaluation: 390044


In [21]:
# Save Training Dataset and Evaluation Dataset as CSV files under <cwd>/data/processed.
# exist_ok=True creates the directory (and any missing parents) if needed and is a
# no-op when it already exists, avoiding the separate check-then-create step.
processed_dir = os.path.join(cwd, 'data', 'processed')
os.makedirs(processed_dir, exist_ok=True)

# index=False: the DataFrame index is not written as a column
train_ratings.to_csv(os.path.join(processed_dir, 'train_ratings.csv'), index=False)
evaluation_ratings.to_csv(os.path.join(processed_dir, 'evaluation_ratings.csv'), index=False)

In [22]:
# Lookup tables for the per-user rating data:
#   * movie_mappings.csv - maps every movieId in the evaluation set (column 'rating_id')
#     to a consecutive integer 'id', assigned in ascending movieId order
#   * users.csv          - the sorted list of userIds in the evaluation set (column 'id')
# The per-user rating files themselves are written in a later cell.
movie_ids = sorted(evaluation_ratings['movieId'].unique().tolist())

print("Creating Movie Mappings")
# String movieId -> integer id, for lookups keyed by the id's text form
movie_mappings_dict = {
    str(rating_id): int_id for int_id, rating_id in enumerate(movie_ids)
}
# The same mapping as records, ready to become a DataFrame
movie_mappings = [
    {'id': int_id, 'rating_id': rating_id} for int_id, rating_id in enumerate(movie_ids)
]
# Next unused integer id, i.e. the number of mapped movies
movie_int_id = len(movie_ids)

movie_mappings_df = pd.DataFrame(movie_mappings)
movie_mappings_path = os.path.join(cwd, 'data', 'processed', 'movie_mappings.csv')
movie_mappings_df.to_csv(movie_mappings_path, index=False)

print("Creating Users Dataset")

# Distinct userIds of the evaluation set in ascending order
user_ids = sorted(evaluation_ratings['userId'].unique())

users_pd = pd.DataFrame(user_ids, columns=['id'])
users_path = os.path.join(cwd, 'data', 'processed', 'users.csv')
users_pd.to_csv(users_path, index=False)

Creating Movie Mappings
Creating Users Dataset


In [23]:
# Average rating per movie over the evaluation set, written to
# <cwd>/data/processed/movies_average_ratings.csv with columns 'id' and 'rating'.
print("Calculating Movie Average Ratings")

# One vectorised pass computes each movie's rating total and rating count,
# instead of converting every rating row to a Python dict and updating a
# running average row by row.
# sort=False keeps the movies in order of first appearance, i.e. the same
# order as evaluation_ratings['movieId'].unique().
movie_stats = (
    evaluation_ratings
    .groupby('movieId', sort=False)['rating']
    .agg(['sum', 'count'])
)

# movieId -> {'ratings': total of ratings, 'users': number of ratings, 'average': mean rating}
# .tolist() yields plain Python ints/floats. The guard keeps 'average' at 0 for a
# movie with no counted ratings rather than dividing by zero.
movies_ratings = {
    movie_id: {
        'ratings': total,
        'users': users,
        'average': total / users if users else 0,
    }
    for movie_id, total, users in zip(
        movie_stats.index.tolist(),
        movie_stats['sum'].tolist(),
        movie_stats['count'].tolist(),
    )
}

ratings_info = [
    {'id': movie_id, 'rating': stats['average']}
    for movie_id, stats in movies_ratings.items()
]

# Explicit columns so the header is written even when there are no movies
data_frame = pd.DataFrame(ratings_info, columns=['id', 'rating'])
data_frame.to_csv(os.path.join(cwd, 'data', 'processed', 'movies_average_ratings.csv'), index=False)


Calculating Movie Average Ratings


In [24]:
# Because of the file being huge, we will save every user's ratings separately:
# one CSV per user in train_user_ratings/ and one in evaluation_user_ratings/,
# each named <userId>.csv.
print("Creating User-Ratings")
train_dir = os.path.join(cwd, 'data', 'processed', 'train_user_ratings')
evaluation_dir = os.path.join(cwd, 'data', 'processed', 'evaluation_user_ratings')
# makedirs also creates data/processed if it is missing; exist_ok makes re-runs safe
os.makedirs(train_dir, exist_ok=True)
os.makedirs(evaluation_dir, exist_ok=True)

# Group each table by user ONCE: userId -> positional row indices of that user's ratings.
# This replaces a full boolean-mask scan of both tables for every single user.
train_rows_by_user = train_ratings.groupby('userId', sort=False).indices
evaluation_rows_by_user = evaluation_ratings.groupby('userId', sort=False).indices
# Selector for a user with no rows in a table: yields an empty frame that still
# has the columns, so a header-only CSV is written (as the mask-based filter did)
no_rows = slice(0, 0)

total_users = len(user_ids)
for users_done, user_id in enumerate(user_ids, start=1):
    # Progress report every 1000 users, with a line break every 50000
    if users_done % 1000 == 0:
        print(f"{total_users - users_done} Users Left", end='-')
    if users_done % 50000 == 0:
        print("\n")
    train_user_ratings = train_ratings.iloc[train_rows_by_user.get(user_id, no_rows)]
    evaluation_user_ratings = evaluation_ratings.iloc[evaluation_rows_by_user.get(user_id, no_rows)]
    file_name = os.path.join(train_dir, str(user_id) + '.csv')
    train_user_ratings.to_csv(file_name, index=False)
    file_name = os.path.join(evaluation_dir, str(user_id) + '.csv')
    evaluation_user_ratings.to_csv(file_name, index=False)

print("All Done")


Creating User-Ratings
251361 Users Left-250361 Users Left-249361 Users Left-248361 Users Left-247361 Users Left-246361 Users Left-245361 Users Left-244361 Users Left-243361 Users Left-242361 Users Left-241361 Users Left-240361 Users Left-239361 Users Left-238361 Users Left-237361 Users Left-236361 Users Left-235361 Users Left-234361 Users Left-233361 Users Left-232361 Users Left-231361 Users Left-230361 Users Left-229361 Users Left-228361 Users Left-227361 Users Left-226361 Users Left-225361 Users Left-224361 Users Left-223361 Users Left-222361 Users Left-221361 Users Left-220361 Users Left-219361 Users Left-218361 Users Left-217361 Users Left-216361 Users Left-215361 Users Left-214361 Users Left-213361 Users Left-212361 Users Left-211361 Users Left-210361 Users Left-209361 Users Left-208361 Users Left-207361 Users Left-206361 Users Left-205361 Users Left-204361 Users Left-203361 Users Left-202361 Users Left-

201361 Users Left-200361 Users Left-199361 Users Left-198361 Users Left-1973