In [59]:
import random
from functools import lru_cache
import os
import math

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import tensorflow as tf
from tensorflow import keras

import sklearn.preprocessing
from sklearn.preprocessing import LabelEncoder

In [60]:
# `__file__` is not available inside a notebook kernel, so anchor every data path at
# the kernel's current working directory (symlinks resolved).
fileDir = os.path.realpath(os.getcwd())
# Ratings table; only the three columns used by the rest of the notebook are read.
rating_path = os.path.join(fileDir, '../data/ratings.csv')
df = pd.read_csv(rating_path, usecols=['userId', 'movieId', 'rating'])

In [61]:
# Movie metadata sits next to the ratings file. Note that `mdf` is bound to the very
# same DataFrame object as `movie_df` (an alias, not a copy).
movie_path = os.path.join(fileDir, '../data/movies.csv')
movie_df = mdf = pd.read_csv(movie_path)

In [62]:
# Sanity check: frame dimensions first, then the number of distinct raw titles.
print(mdf.shape)
mdf['title'].nunique()

(58098, 3)


58020

In [63]:
# Guard against running this cell twice on the same frame: a second pass would
# overwrite `orig_movieId` with the already-compacted ids and refit the encoder on
# them, silently breaking the id mapping used further down.
assert 'orig_movieId' not in df.columns, (
    "df has already been preprocessed; reload the ratings before re-running this cell"
)

# Shuffle (reproducibly)
df = df.sample(frac=1, random_state=1).reset_index(drop=True)

# Partitioning train/val according to behaviour of keras.Model.fit() when called with
# validation_split kwarg (which is to take validation data from the end as a contiguous
# chunk)
val_split = .05
n_ratings = len(df)
n_train = math.floor(n_ratings * (1-val_split))
itrain = df.index[:n_train]
ival = df.index[n_train:]

# Compactify movie ids: map the raw ids onto consecutive integers 0..n_movies-1,
# keeping the raw ids in `orig_movieId`.
movie_id_encoder = LabelEncoder()
# XXX: Just fitting globally for simplicity. See movie_helpers.py for more 'principled'
# approach. I don't think there's any realistically useful data leakage here though.
orig_movieIds = df['movieId']
df['orig_movieId'] = orig_movieIds
df['movieId'] = movie_id_encoder.fit_transform(df['movieId'])

# Add centred target variable. The mean is taken over the training rows only, then
# subtracted from every row (train and validation alike).
df['y'] = df['rating'] - df.loc[itrain, 'rating'].mean()

# Flag: set to a truthy value to also add a min-max scaled copy of the target.
SCALE = 0
if SCALE:
    # Add version of target variable scaled to [0, 1]; the scaler is fitted on the
    # training rows only and then applied to all rows.
    yscaler = sklearn.preprocessing.MinMaxScaler()
    yscaler.fit(df.loc[itrain, 'rating'].values.reshape(-1, 1))
    df['y_unit_scaled'] = yscaler.transform(df['rating'].values.reshape(-1, 1))


In [64]:
def munge_title(title):
    """Return a display-friendly version of a raw movie title.

    Two transformations are applied, in order:

    1. Everything from the last ' (' onwards is dropped (for titles such as
       'Toy Story (1995)' this removes the trailing year).
    2. If the remainder ends with ', The', ', A' or ', An', that article is moved
       to the front ('Shining, The' -> 'The Shining').

    At most one article is moved: the function returns as soon as a suffix matches,
    so a title like 'Thing, An, A' becomes 'A Thing, An' rather than having a
    second article rotated to the front as well.
    """
    paren_start = title.rfind(' (')
    if paren_start != -1:
        title = title[:paren_start]
    for article in ('The', 'A', 'An'):
        suffix = ', {}'.format(article)
        if title.endswith(suffix):
            # Stop after the first match so only the trailing article is moved.
            return article + ' ' + title[:-len(suffix)]
    return title

def get_year(title):
    """Extract the release year from a raw title such as 'Toy Story (1995)'.

    The year is read from the four characters that follow the last '(' in the
    title. It is accepted only if those are exactly four ASCII digits.

    Returns the year as an int, or 0 when no year can be found. In that case the
    offending title is also printed (tab-terminated) so it can be inspected.

    Titles with no parenthesis at all always yield 0, even when the title itself
    starts with four digits (e.g. '2046').
    """
    paren = title.rfind('(')
    # Without a '(' there is no year field; do not fall back to the start of the title.
    candidate = title[paren + 1:paren + 5] if paren != -1 else ''
    if len(candidate) == 4 and all(ch in '0123456789' for ch in candidate):
        return int(candidate)
    print(title, end='\t')
    return 0


# Work on a private copy of the raw movie table. The edits below are done in place,
# and starting from `movie_df` each time keeps that frame untouched and lets this
# cell be re-run without tripping over its own earlier changes.
mdf = movie_df.copy()

# XXX: hack
# Append an 'x' to the title of movie 64997 (presumably to keep it distinct from
# another entry carrying the same title -- TODO confirm). The assert documents the
# title this hack expects to find.
hack_rows = mdf.movieId == 64997
assert mdf.loc[hack_rows, 'title'].iloc[0] == 'War of the Worlds (2005)'
mdf.loc[hack_rows, 'title'] = 'War of the Worlds (2005)x'

mdf['orig_movieId'] = mdf['movieId']
n_orig = len(mdf)

# There are some movies listed in movie.csv which have no ratings. Drop them.
# (The encoder was fitted on the ratings, so its classes are exactly the rated ids.)
whitelist = set(movie_id_encoder.classes_)
mdf = mdf[mdf['movieId'].isin(whitelist)].copy()
print("Went from {} movies to {} after filtering out movies with no ratings".format(
    n_orig, len(mdf)
))

# New, compact movie Ids
mdf['movieId'] = movie_id_encoder.transform(mdf['movieId'].values)

mdf = mdf.sort_values(by='movieId').reset_index(drop=True)

# By default use original title field (which includes year of release) as unique key
mdf['key'] = mdf['title']

# Year must be parsed before the title is munged, since munging strips it.
mdf['year'] = mdf['title'].map(get_year)
mdf['old_title'] = mdf['title']
mdf['title'] = mdf['title'].map(munge_title)

# For movies whose munged title are unique, use it as their key
title_counts = mdf.groupby('title').size()
unique_titles = title_counts.index[title_counts == 1]
unique_ids = mdf.index[mdf.title.isin(unique_titles)]
mdf.loc[unique_ids, 'key'] = mdf.loc[unique_ids, 'title']

# Per-movie rating statistics, looked up explicitly by compact movieId rather than
# relying on the row index of `mdf` happening to coincide with the movieId.
ratings_by_movie = df.groupby('movieId')['rating']
mean_ratings = ratings_by_movie.mean()
mdf['n_ratings'] = mdf['movieId'].map(ratings_by_movie.size())
mdf['mean_rating'] = mdf['movieId'].map(mean_ratings)

Went from 58098 movies to 53889 after filtering out movies with no ratings
Category 6: Day of Destruction	Babylon 5	Millions Game, The (Das Millionenspiel)	Bicycle, Spoon, Apple (Bicicleta, cullera, poma)	Brazil: In the Shadow of the Stadiums	Slaying the Badger	Tatort: Im Schmerz geboren	Terrible Joe Moran	The Court-Martial of Jackie Robinson	In Our Garden	Stephen Fry In America - New World	Two: The Story of Roman & Nyro	A Year Along the Abandoned Road	Body/Cialo	Polskie gówno	The Third Reich: The Rise & Fall	My Own Man	Moving Alan	Michael Laudrup - en Fodboldspiller	Blueberry Hill	One Night Only	Doli Saja Ke Rakhna	The Dead Lands	C'mon, Let's Live a Little	For a Book of Dollars	Bad Boys 3	The Moon and the Sun	Señorita Justice	Red Victoria	Vaastupurush	Sierra Leone's Refugee All Stars	L'uomo della carità	Wolves In The Snow	Rosamunde Pilcher - September	The Ritual	The Expedition	Danielův svět	Filmage: The Story of Descendents/All	About Sarah	Swallows and Amazons	Ready Player One	Los ton

In [67]:
# Destinations for the preprocessed ratings and movie tables.
ratings_preprocessed_content_path = os.path.join(fileDir, '../processed_data/ratings_content.csv')
movies_preprocessed_content_path = os.path.join(fileDir, '../processed_data/movies_content.csv')

# DataFrame.to_csv() does not create missing directories, so make sure the target
# folder exists before anything is written.
for _out_path in (ratings_preprocessed_content_path, movies_preprocessed_content_path):
    os.makedirs(os.path.dirname(_out_path), exist_ok=True)

In [68]:
# Persist the preprocessed ratings; the row index is not written.
df.to_csv(path_or_buf=ratings_preprocessed_content_path, index=False)

In [69]:
# Persist the preprocessed movie table; the row index is not written.
mdf.to_csv(path_or_buf=movies_preprocessed_content_path, index=False)

In [65]:
# Dimensions of the preprocessed ratings frame, followed by its first five rows.
print(df.shape)
df.head(n=5)

(27753444, 5)


Unnamed: 0,userId,movieId,rating,orig_movieId,y
0,99948,11043,4.0,45517,0.469583
1,151227,1181,4.5,1207,0.969583
2,22361,12300,5.0,56367,1.469583
3,36147,3082,4.0,3168,0.469583
4,143305,11792,5.0,52281,1.469583


In [66]:
# Dimensions of the movie table and number of distinct munged titles (one per line),
# followed by the first five rows.
print(mdf.shape, mdf['title'].nunique(), sep='\n')
mdf.head(n=5)

(53889, 9)
50530


Unnamed: 0,movieId,title,genres,orig_movieId,key,year,old_title,n_ratings,mean_rating
0,0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1,Toy Story,1995,Toy Story (1995),68469,3.886649
1,1,Jumanji,Adventure|Children|Fantasy,2,Jumanji,1995,Jumanji (1995),27143,3.246583
2,2,Grumpier Old Men,Comedy|Romance,3,Grumpier Old Men,1995,Grumpier Old Men (1995),15585,3.173981
3,3,Waiting to Exhale,Comedy|Drama|Romance,4,Waiting to Exhale,1995,Waiting to Exhale (1995),2989,2.87454
4,4,Father of the Bride Part II,Comedy,5,Father of the Bride Part II,1995,Father of the Bride Part II (1995),15474,3.077291
