In [1]:
import numpy as np
import pandas as pd
import datetime

## Create loss tensor

In [2]:
# Load the movie metadata and the user ratings, then attach each movie's
# title and genres to its ratings. The left join keeps every rating row even
# when its movieId has no match in movies.csv.
movie_pd = pd.read_csv('movies.csv')
ratings_raw = pd.read_csv('ratings.csv')
rating_pd = ratings_raw.merge(movie_pd, how="left", on="movieId", sort=False)
rating_pd

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),Drama|Horror|Thriller
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),Action|Crime|Thriller
100833,610,168250,5.0,1494273047,Get Out (2017),Horror
100834,610,168252,5.0,1493846352,Logan (2017),Action|Sci-Fi


In [3]:
# Derive the calendar date ('YYYY-MM-DD' string) of each rating from its Unix
# timestamp (seconds since the epoch).
# The conversion is done explicitly in UTC: datetime.fromtimestamp() uses the
# kernel's local time zone, so the same timestamp could land on different dates
# on different machines and silently change every per-date grouping downstream.
# pd.to_datetime is also vectorized, unlike a row-wise .apply().
rating_pd['date'] = (
    pd.to_datetime(rating_pd['timestamp'], unit='s', utc=True)
      .dt.strftime('%Y-%m-%d')
)
rating_pd

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,date
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2000-07-30
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,2000-07-30
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller,2000-07-30
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,2000-07-30
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,2000-07-30
...,...,...,...,...,...,...,...
100831,610,166534,4.0,1493848402,Split (2017),Drama|Horror|Thriller,2017-05-03
100832,610,168248,5.0,1493850091,John Wick: Chapter Two (2017),Action|Crime|Thriller,2017-05-03
100833,610,168250,5.0,1494273047,Get Out (2017),Horror,2017-05-08
100834,610,168252,5.0,1493846352,Logan (2017),Action|Sci-Fi,2017-05-03


In [4]:
# Split the pipe-delimited genre string into a list, then emit one row per
# (rating, genre) pair and order the rows by date, user and genre.
#
# The isinstance guard keeps this cell re-runnable: after the first run the
# column already holds lists, and an unconditional row.split('|') would raise
# AttributeError on them. It also lets a missing genre (NaN produced by the
# left merge when a movieId has no match in movies.csv) pass through untouched
# instead of crashing the cell.
rating_pd['genres'] = rating_pd['genres'].apply(
    lambda row: row.split('|') if isinstance(row, str) else row
)

all_pd = rating_pd.explode('genres', ignore_index=True)
all_pd = all_pd.sort_values(['date', 'userId', 'genres'])
all_pd

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,date
182375,429,151,4.0,828124616,Rob Roy (1995),Action,1996-03-29
182386,429,165,4.0,828124615,Die Hard: With a Vengeance (1995),Action,1996-03-29
182389,429,168,5.0,828124616,First Knight (1995),Action,1996-03-29
182395,429,185,5.0,828124616,"Net, The (1995)",Action,1996-03-29
182403,429,204,4.0,828124616,Under Siege 2: Dark Territory (1995),Action,1996-03-29
...,...,...,...,...,...,...,...
221438,514,5246,1.5,1537757059,Smokey and the Bandit II (1980),Action,2018-09-24
221440,514,5247,2.5,1537757040,Smokey and the Bandit (1977),Action,2018-09-24
221439,514,5246,1.5,1537757059,Smokey and the Bandit II (1980),Comedy,2018-09-24
221441,514,5247,2.5,1537757040,Smokey and the Bandit (1977),Comedy,2018-09-24


In [5]:
# Genre whose exploded (rating, genre) rows have the highest mean rating.
best_genre = all_pd.groupby('genres')['rating'].mean().idxmax()
best_genre

'Film-Noir'

In [6]:
# Turn each rating into a loss by reflecting it around the rating range and
# scaling by the maximum rating: the highest rating maps to min/max and the
# lowest rating maps to 1.
ratings = all_pd['rating']
rating_max = ratings.max()
rating_min = ratings.min()
loss = (rating_max + rating_min - ratings) / rating_max
all_pd['loss'] = loss
all_pd

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,date,loss
182375,429,151,4.0,828124616,Rob Roy (1995),Action,1996-03-29,0.3
182386,429,165,4.0,828124615,Die Hard: With a Vengeance (1995),Action,1996-03-29,0.3
182389,429,168,5.0,828124616,First Knight (1995),Action,1996-03-29,0.1
182395,429,185,5.0,828124616,"Net, The (1995)",Action,1996-03-29,0.1
182403,429,204,4.0,828124616,Under Siege 2: Dark Territory (1995),Action,1996-03-29,0.3
...,...,...,...,...,...,...,...,...
221438,514,5246,1.5,1537757059,Smokey and the Bandit II (1980),Action,2018-09-24,0.8
221440,514,5247,2.5,1537757040,Smokey and the Bandit (1977),Action,2018-09-24,0.6
221439,514,5246,1.5,1537757059,Smokey and the Bandit II (1980),Comedy,2018-09-24,0.8
221441,514,5247,2.5,1537757040,Smokey and the Bandit (1977),Comedy,2018-09-24,0.6


In [7]:
# Mean loss for every (date, user, genre) combination. The three grouping keys
# become the levels of the result's index and 'loss' is its only column.
group_keys = ['date', 'userId', 'genres']
rec_loss = all_pd.groupby(group_keys)[['loss']].mean()
rec_loss

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,loss
date,userId,genres,Unnamed: 3_level_1
1996-03-29,429,Action,0.323529
1996-03-29,429,Adventure,0.316667
1996-03-29,429,Animation,0.166667
1996-03-29,429,Children,0.300000
1996-03-29,429,Comedy,0.319048
...,...,...,...
2018-09-23,514,Sci-Fi,0.550000
2018-09-23,514,Thriller,0.600000
2018-09-24,514,Action,0.700000
2018-09-24,514,Comedy,0.700000


In [8]:
# sanity check
# Per-genre average of the (date, user, genre) mean losses, computed once and
# reused by all three assertions.
genre_mean_loss = rec_loss.groupby('genres').mean()['loss']

# The genre with the lowest loss should be the best-rated genre found earlier.
# Compare against best_genre instead of a hard-coded genre name so the check
# keeps working when the input data changes.
# NOTE(review): best_genre averages the raw ratings while genre_mean_loss
# averages per-(date, user, genre) means, so the two weightings differ; a
# mismatch here is worth investigating but is not necessarily a bug.
assert genre_mean_loss.idxmin() == best_genre, (
    f"lowest-loss genre {genre_mean_loss.idxmin()!r} "
    f"does not match best-rated genre {best_genre!r}"
)
assert genre_mean_loss.max() <= 1, "per-genre mean loss exceeds 1"
assert genre_mean_loss.min() >= 0, "per-genre mean loss is negative"

In [9]:
# Persist the per-(date, user, genre) loss table next to the notebook.
output_path = './MovieLens_loss.pkl'
rec_loss.to_pickle(output_path)