In [141]:
import pandas as pd

# Importing dataframes
#df_tags = pd.read_csv('../data/raw/ml-25m/tags.csv')
df_gscores = pd.read_csv('../data/raw/ml-25m/genome-scores.csv')
df_gtags = pd.read_csv('../data/raw/ml-25m/genome-tags.csv')
#df_links = pd.read_csv('../data/raw/ml-25m/links.csv')
df_movies = pd.read_csv('../data/raw/ml-25m/movies.csv')
df_ratings = pd.read_csv('../data/raw/ml-25m/ratings.csv')

In [142]:
# Create combined dataframe from the smaller dataframes and cleaning it
df = pd.merge(df_movies, df_gscores, how="left", on='movieId')
df_gscores = pd.merge(df, df_gtags, how="left", on='tagId')
df_gscores.drop(columns=['tagId'], inplace=True)

In [143]:
df.head()

Unnamed: 0,movieId,title,genres,tagId,relevance
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,0.02875
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2.0,0.02375
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,0.0625
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0,0.07575
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,0.14075


In [144]:
# Keep only the top 40 tags for each movie
df_gscores = df_gscores.sort_values(by=['movieId', 'relevance'], ascending=[True, False])
df_gscores = df_gscores.groupby('movieId', group_keys=False).apply(lambda x: x.nlargest(40, 'relevance')).reset_index(drop=True)

In [None]:
# Aggregate the tags and relevances
df_gscores = df_gscores.groupby(['movieId', 'title', 'genres']).agg({'relevance': list, 'tag': list}).reset_index()

In [None]:
# Merge and clean more
df_ratings.drop(columns=['timestamp', 'title'], inplace=True)
df = pd.merge(df_gscores, df_ratings, on='movieId', how='left')
df.dropna(inplace=True)
df['userId'] = df['userId'].astype(int)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25000095 entries, 0 to 25003470
Data columns (total 7 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   title      object 
 2   genres     object 
 3   relevance  object 
 4   tag        object 
 5   userId     int32  
 6   rating     float64
dtypes: float64(1), int32(1), int64(1), object(4)
memory usage: 1.4+ GB


In [None]:
# Count the occurrences of each movieId and filter out movieIds that occur less than 2000 times
movie_counts = df['movieId'].value_counts()
movie_ids_to_keep = movie_counts[movie_counts >= 2000].index
df = df[df['movieId'].isin(movie_ids_to_keep)]


In [None]:
df.head()

Unnamed: 0,movieId,title,genres,relevance,tag,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.99925, 0.99875, 0.99575, 0.98575, 0.98425, ...","[toys, computer animation, pixar animation, ki...",2,3.5
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.99925, 0.99875, 0.99575, 0.98575, 0.98425, ...","[toys, computer animation, pixar animation, ki...",3,4.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.99925, 0.99875, 0.99575, 0.98575, 0.98425, ...","[toys, computer animation, pixar animation, ki...",4,3.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.99925, 0.99875, 0.99575, 0.98575, 0.98425, ...","[toys, computer animation, pixar animation, ki...",5,4.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.99925, 0.99875, 0.99575, 0.98575, 0.98425, ...","[toys, computer animation, pixar animation, ki...",8,4.0


In [None]:
import numpy as np

# Function to sample rows based on the logarithm to base 1.004 of the number of rows
def sample_log_base_1_004(group):
    n_rows = len(group)
    sample_size = int(np.round(np.log(n_rows) / np.log(1.004)))
    return group.sample(n=min(sample_size, n_rows))

# Group by 'movieId' and apply the sampling function
df = df.groupby('movieId', group_keys=False).apply(sample_log_base_1_004)

df['movieId'].value_counts()


  df = df.groupby('movieId', group_keys=False).apply(sample_log_base_1_004)


movieId
356       2833
318       2833
296       2827
593       2809
2571      2804
          ... 
8966      1905
86298     1905
4291      1904
6890      1904
100714    1904
Name: count, Length: 2428, dtype: int64

In [None]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 5273559 entries, 48137 to 24962653
Data columns (total 7 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   title      object 
 2   genres     object 
 3   relevance  object 
 4   tag        object 
 5   userId     int32  
 6   rating     float64
dtypes: float64(1), int32(1), int64(1), object(4)
memory usage: 301.8+ MB


Unnamed: 0,movieId,title,genres,relevance,tag,userId,rating
48137,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.99925, 0.99875, 0.99575, 0.98575, 0.98425, ...","[toys, computer animation, pixar animation, ki...",136843,4.0
54244,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.99925, 0.99875, 0.99575, 0.98575, 0.98425, ...","[toys, computer animation, pixar animation, ki...",153907,3.0
35688,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.99925, 0.99875, 0.99575, 0.98575, 0.98425, ...","[toys, computer animation, pixar animation, ki...",101322,3.5
12334,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.99925, 0.99875, 0.99575, 0.98575, 0.98425, ...","[toys, computer animation, pixar animation, ki...",34935,3.0
2707,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.99925, 0.99875, 0.99575, 0.98575, 0.98425, ...","[toys, computer animation, pixar animation, ki...",7683,3.5


In [None]:
# Safe the dataframe locally

df.to_csv('../data/processed/ml-25m/preprocessed_data_movielens.csv', index=False)