In [1]:
import pandas as pd
import numpy as np

movies_df = pd.read_csv(
    'movies.csv'
)
ratings_df = pd.read_csv(
    'ratings.csv'
)
ratings_df = pd.merge(
    ratings_df,
    movies_df,
    on = 'movieId'
)
ratings_df['genres'] = ratings_df['genres'].apply(
    lambda row: row.split('|')
)
ratings_df = ratings_df.explode('genres').sort_values(['timestamp', 'userId', 'genres'])
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
2387,429,349,3.0,828124615,Clear and Present Danger (1994),Action
4249,429,592,5.0,828124615,Batman (1989),Action
29093,429,165,4.0,828124615,Die Hard: With a Vengeance (1995),Action
31903,429,434,4.0,828124615,Cliffhanger (1993),Action
57892,429,420,2.0,828124615,Beverly Hills Cop III (1994),Action
...,...,...,...,...,...,...
87722,514,5247,2.5,1537757040,Smokey and the Bandit (1977),Action
87722,514,5247,2.5,1537757040,Smokey and the Bandit (1977),Comedy
87714,514,5246,1.5,1537757059,Smokey and the Bandit II (1980),Action
87714,514,5246,1.5,1537757059,Smokey and the Bandit II (1980),Comedy


In [2]:
ratings_gb = ratings_df.groupby(['userId', 'genres'])['rating'].aggregate(list).reset_index()
ratings_gb['rating'] = ratings_gb['rating'].apply(
    lambda row: [(v, i) for i, v in enumerate(row)]
)
ratings_gb = ratings_gb.explode('rating')
ratings_gb['epoch'] = ratings_gb['rating'].apply(
    lambda row: row[-1]
)
ratings_gb['rating'] = ratings_gb['rating'].apply(
    lambda row: row[0]
)
ratings_gb.reset_index(inplace=True)
ratings_gb

Unnamed: 0,index,userId,genres,rating,epoch
0,0,1,Action,5.0,0
1,0,1,Action,4.0,1
2,0,1,Action,4.0,2
3,0,1,Action,5.0,3
4,0,1,Action,4.0,4
...,...,...,...,...,...
274475,10026,610,Western,4.0,28
274476,10026,610,Western,5.0,29
274477,10026,610,Western,5.0,30
274478,10026,610,Western,3.0,31


In [3]:
all_df = pd.DataFrame(
    [{
        'userId': list(range(1, ratings_gb['userId'].max()+1)),
        'genres': list(ratings_gb['genres'].unique()),
        'epoch': list(range(ratings_gb['epoch'].max()+1))
    }]
)
all_df = all_df.explode(['epoch'])
all_df = all_df.explode(['userId'])
all_df = all_df.explode(['genres'])
all_df = pd.merge(
    all_df,
    ratings_gb,
    how='left',
    on = ['userId', 'genres', 'epoch']
)
all_df.fillna(
    value = (ratings_df['rating'].max() + ratings_df['rating'].min()) / 2,
    inplace = True
)
all_df

Unnamed: 0,userId,genres,epoch,index,rating
0,1,Action,0,0.00,5.00
1,1,Adventure,0,1.00,5.00
2,1,Animation,0,2.00,5.00
3,1,Children,0,3.00,5.00
4,1,Comedy,0,4.00,4.00
...,...,...,...,...,...
15969795,610,War,1308,2.75,2.75
15969796,610,Western,1308,2.75,2.75
15969797,610,Documentary,1308,2.75,2.75
15969798,610,IMAX,1308,2.75,2.75


In [5]:
ratings = all_df['rating'].to_numpy(dtype=np.float32)
ratings = ratings.reshape(
    ratings_gb['epoch'].max()+1,
    ratings_gb['userId'].max(),
    len(ratings_gb['genres'].unique()),
    order='C'
)
ratings.shape

(1309, 610, 20)

In [6]:
all_genres = all_df.head(20)['genres'].values.tolist()
all_genres

['Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western',
 'Documentary',
 'IMAX',
 '(no genres listed)']

In [12]:
idx = np.random.randint(0, ratings_gb.shape[0])
genres = all_df.head(20)['genres'].values.tolist()
epoch = ratings_gb['epoch'][idx]
i = ratings_gb['userId'][idx] - 1
k = genres.index(ratings_gb['genres'][idx])
assert ratings[epoch, i, k] == ratings_gb['rating'][idx]
assert ratings_df.shape[0] == len(np.where(ratings != 2.75)[0])

In [13]:
arm_id = np.argmax(
    np.mean(
        np.sum(ratings, axis=0),
        axis = 0
    )
)
print("The most popular genre is "+all_df.head(20)['genres'].values.tolist()[arm_id])

The most popular genre is Drama
