In [None]:
import pandas as pd
import numpy as np

In [None]:
# MovieLens "ml-latest" dataset; the file formats are described at
# https://files.grouplens.org/datasets/movielens/ml-latest-README.html
movies_df = pd.read_csv('ml-latest/movies.csv')
ratings_df = pd.read_csv('ml-latest/ratings.csv')

# Attach each movie's title and genres to its ratings (pd.merge defaults to
# an inner join, so ratings without a matching movieId are dropped).
ratings_df = pd.merge(ratings_df, movies_df, on='movieId')
del movies_df  # only the merged frame is used from here on

# `genres` is a single '|'-separated string per movie. Split it with the
# vectorized string accessor (instead of a Python-level lambda per row), then
# give every genre its own row so ratings can be grouped by genre.
ratings_df['genres'] = ratings_df['genres'].str.split('|')
ratings_df = (
    ratings_df
    .explode('genres')
    # Chronological order; later per-group rating lists inherit this order.
    .sort_values(['timestamp', 'userId', 'genres'])
)

In [None]:
# Spot-check: ratings by user 697 on movies that have no genre listed.
ratings_df.loc[
    ratings_df['userId'].eq(697)
    & ratings_df['genres'].eq('(no genres listed)')
]

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
26443116,697,143387,0.5,1516118571,Pitch Perfect 3 (2017),(no genres listed)
26444984,697,171495,5.0,1516118786,Cosmos,(no genres listed)
26445141,697,171749,4.5,1516118810,Death Note: Desu nôto (2006–2007),(no genres listed)
26445251,697,172591,5.0,1516118833,The Godfather Trilogy: 1972-1990 (1992),(no genres listed)
26445676,697,176601,5.0,1516118848,Black Mirror,(no genres listed)
24230940,697,141866,3.0,1517077209,Green Room (2015),(no genres listed)


## Find the most popular genre (by average rating)

In [None]:
# Mean rating per genre: a one-column frame ('rating') indexed by genre name.
# A movie with several genres contributes its ratings to each of them,
# because the frame holds one row per (rating, genre).
average_rating = ratings_df.groupby('genres')[['rating']].mean()

In [None]:
# Display the per-genre mean rating table computed in the previous cell.
average_rating

Unnamed: 0_level_0,rating
genres,Unnamed: 1_level_1
(no genres listed),3.291397
Action,3.462566
Adventure,3.513284
Animation,3.607309
Children,3.423893
Comedy,3.419787
Crime,3.683522
Documentary,3.71086
Drama,3.676371
Fantasy,3.507208


## Select the users who have rated movies in every genre

In [None]:
# Number of distinct genres each user has rated at least once.
userId_ratedGenres = ratings_df.groupby('userId')['genres'].nunique()

# Keep only users with more than 19 distinct genres.
rated_enough = userId_ratedGenres.gt(19)
selected_users = userId_ratedGenres.loc[rated_enough].index

# Trim to a perfect-square number of users, keeping those at the end of the
# index (the user count is later written as 58**2 in the reshape).
square_side = int(len(selected_users) ** .5)
selected_users = selected_users[-(square_side ** 2):]

# Restrict the ratings to the selected users and release the full frame.
index = ratings_df['userId'].isin(selected_users)
selected_df = ratings_df[index]
del ratings_df

In [None]:
# One row per (userId, genres) pair, holding that user's ratings for the
# genre as a Python list. Lists keep the row order of `selected_df`.
selected_ratings = (
    selected_df
    .groupby(['userId', 'genres'])['rating']
    .agg(list)
    .reset_index()
)
del selected_df  # the per-rating rows are no longer needed

## Smoothing the ratings

In [None]:
def stretch(v, target):
    """Stretch a 1-D sequence to exactly ``target`` elements by repetition.

    Every element of ``v`` is repeated ``target // len(v)`` times, in order,
    and the remaining ``target % len(v)`` slots are filled with copies of the
    last element.

    Example: ``stretch([1, 2, 3], 8)`` gives
    ``[1., 1., 2., 2., 3., 3., 3., 3.]``.

    Parameters
    ----------
    v : sequence or 1-D numpy array of numbers
        Values to stretch. Must be non-empty.
    target : int
        Length of the returned array.

    Returns
    -------
    numpy.ndarray
        Array of length ``target``. The padding segment is built from
        ``np.ones`` (float), so integer input is promoted to float.

    Raises
    ------
    ValueError
        If ``v`` is empty (there is no value to repeat).

    Notes
    -----
    When ``target < len(v)`` the repeat count is zero, so the result is
    simply ``target`` copies of ``v[-1]``.
    """
    n = len(v)
    if n == 0:
        # Previously this surfaced as an opaque ZeroDivisionError.
        raise ValueError('stretch() requires a non-empty sequence')

    repeats, remainder = divmod(target, n)
    res = np.concatenate(
        (
            np.repeat(v, repeats),
            # Pad the tail with the last value so the length is exactly `target`.
            np.ones(remainder) * v[-1],
        ),
        axis=0
    )
    return res

# Example: stretch([1, 2, 3], 8) returns
# array([1., 1., 2., 2., 3., 3., 3., 3.])

In [None]:
# Length of the longest rating list; every list is stretched to this length
# so that all (userId, genres) pairs end up with equally long arrays.
# Computed once here rather than inside the lambda, where it was re-evaluated
# for every row.
max_rating_count = selected_ratings['rating'].apply(len).max()

selected_ratings['rating_smooth'] = selected_ratings['rating'].apply(
    lambda row: stretch(row, max_rating_count)
)

# Keep only the keys and the stretched ratings; the raw lists are dropped.
selected_ratings = selected_ratings[['userId', 'genres', 'rating_smooth']]
selected_ratings

Unnamed: 0,userId,genres,rating_smooth
0,697,(no genres listed),"[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, ..."
1,697,Action,"[4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, ..."
2,697,Adventure,"[4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, ..."
3,697,Animation,"[4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, ..."
4,697,Children,"[4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, ..."
...,...,...,...
67275,283000,Romance,"[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."
67276,283000,Sci-Fi,"[1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, 1.5, ..."
67277,283000,Thriller,"[4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, ..."
67278,283000,War,"[5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, ..."


## Generate the recommendation losses and save them

In [None]:
# Stack the stretched arrays into one 2-D array: one row per (userId, genres)
# pair, one column per position in the stretched sequence.
ratings = np.array(selected_ratings['rating_smooth'].to_list())

# Derive the target shape from the data instead of hard-coding
# (12800, 58**2, 20), which only fits one particular dataset snapshot.
n_genres = selected_ratings['genres'].nunique()
n_positions = ratings.shape[1]
assert ratings.shape[0] % n_genres == 0, (
    'row count is not a multiple of the number of genres'
)

# Column-major flatten walks down each column first, so after the reshape
# axis 0 is the position in the stretched sequence. Axes 1 and 2 split the
# rows into (user, genre); this assumes the rows are ordered by userId and
# then genres, with every user having one row per genre.
ratings = ratings.flatten('F').reshape((n_positions, -1, n_genres))

In [None]:
# Turn ratings into losses: reflect each rating within the observed
# [min, max] range (so the highest rating gets the lowest loss), then scale
# by the maximum rating.
highest_rating = np.max(ratings)
lowest_rating = np.min(ratings)
losses = (highest_rating + lowest_rating - ratings) / highest_rating

# Persist the loss array for later use.
np.save('movielens.npy', losses)

In [None]:
# Genre labels taken from the first 20 rows of the table.
genres = selected_ratings['genres'].head(20)
genres.to_list()

['(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']