In [None]:
import pandas as pd

# Load the raw ml-25m CSV tables used by the rest of the notebook.
# tags.csv and links.csv are currently not loaded (kept here commented out):
#df_tags = pd.read_csv('../data/raw/ml-25m/tags.csv')
#df_links = pd.read_csv('../data/raw/ml-25m/links.csv')
# The files are read one after another, in the order listed.
df_gscores, df_gtags, df_movies, df_ratings = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/raw/ml-25m/genome-scores.csv',
        '../data/raw/ml-25m/genome-tags.csv',
        '../data/raw/ml-25m/movies.csv',
        '../data/raw/ml-25m/ratings.csv',
    )
)

In [None]:
# Build the combined dataframe: left-join the per-movie tag scores onto the
# movies (key: movieId), then left-join the tag lookup table (key: tagId).
# The tagId column is dropped once the lookup columns have been joined in.
# NOTE: df_gscores is read here and then rebound to the merged result, so this
# cell must not be re-run on its own without re-running the load cell first.
df = df_movies.merge(df_gscores, how="left", on='movieId')
df_gscores = df.merge(df_gtags, how="left", on='tagId').drop(columns=['tagId'])

In [None]:
# Keep only the top 40 tags (highest relevance) for each movie.
# After sorting by movieId ascending and relevance descending, the first
# TOP_TAGS_PER_MOVIE rows of every movieId group are exactly its most relevant
# tags, so groupby().head() replaces the much slower per-group Python lambda
# calling nlargest(). It also avoids groupby.apply operating on the grouping
# column, which newer pandas versions deprecate (the 'movieId' column would
# otherwise be missing from the result there).
TOP_TAGS_PER_MOVIE = 40
df_gscores = (
    df_gscores
    .sort_values(by=['movieId', 'relevance'], ascending=[True, False])
    .groupby('movieId')
    .head(TOP_TAGS_PER_MOVIE)
    .reset_index(drop=True)
)

In [None]:
# Collapse to one row per movie: the relevance values and the tags of each
# (movieId, title, genres) group are collected into Python lists.
movie_key_columns = ['movieId', 'title', 'genres']
tags_per_movie = df_gscores.groupby(movie_key_columns)
df_gscores = tags_per_movie.agg({'relevance': list, 'tag': list})
# Turn the group keys back into ordinary columns.
df_gscores = df_gscores.reset_index()

In [None]:
# Merge the per-movie tag table with the ratings and clean the result.
OUTLIER_USER_ID = 75309  # user treated as an outlier and excluded below

# Drop the timestamp column without mutating in place; errors='ignore' keeps
# this cell re-runnable (an in-place drop raises KeyError on the second run
# because the column is already gone).
df_ratings = df_ratings.drop(columns=['timestamp'], errors='ignore')

# Left-join the ratings onto the movies, discard rows with missing values
# (e.g. movies without any rating, whose userId/rating are NaN after the left
# join) and restore userId to an integer dtype (NaNs made it float).
df = (
    pd.merge(df_gscores, df_ratings, on='movieId', how='left')
    .dropna()
    .astype({'userId': int})
)

# Remove the outlier user
df = df[df['userId'] != OUTLIER_USER_ID]

In [None]:
# Keep only the movies whose movieId occurs at least 2000 times in df;
# movies with fewer rows are filtered out.
movie_counts = df['movieId'].value_counts()
has_enough_rows = movie_counts.ge(2000)
movie_ids_to_keep = movie_counts.loc[has_enough_rows].index
df = df.loc[df['movieId'].isin(movie_ids_to_keep)]


In [None]:
import numpy as np

# Function to sample rows based on the logarithm (default base 1.004) of the number of rows
def sample_log_base_1_004(group, base=1.004, random_state=None):
    """Randomly sample about log_base(len(group)) rows from ``group``.

    The sample size is ``round(log(n_rows) / log(base))``, capped at
    ``n_rows``. When the logarithmic size is at least the group size, the
    whole group is returned (in shuffled order).

    Parameters
    ----------
    group : pandas.DataFrame
        The rows to sample from (e.g. one group of a ``groupby``).
    base : float, default 1.004
        Base of the logarithm; must be greater than 1.
    random_state : optional
        Forwarded to ``DataFrame.sample`` to make the draw reproducible.
        ``None`` keeps the previous behaviour (NumPy's global random state).

    Returns
    -------
    pandas.DataFrame
        The sampled rows; an empty frame if ``group`` is empty.

    Raises
    ------
    ValueError
        If ``base`` is not greater than 1.
    """
    if base <= 1:
        raise ValueError("base must be greater than 1")

    n_rows = len(group)
    if n_rows == 0:
        # log(0) is -inf and cannot be converted to an int; nothing to sample.
        return group.iloc[0:0]

    # Change of base: log_base(n) = ln(n) / ln(base)
    sample_size = int(np.round(np.log(n_rows) / np.log(base)))
    return group.sample(n=min(sample_size, n_rows), random_state=random_state)

# Group by 'movieId' and apply the sampling function.
# Seed NumPy's global random state first: the sampling function calls
# DataFrame.sample without a random_state, which draws from that global state,
# so without a seed every run would produce (and later save) a different sample.
SAMPLING_SEED = 42
np.random.seed(SAMPLING_SEED)

# Select all columns explicitly (including 'movieId') so the grouping column is
# still passed to the function and kept in the result; newer pandas versions
# deprecate the implicit inclusion of grouping columns in groupby.apply, and
# the line below relies on df['movieId'] being present.
df = df.groupby('movieId', group_keys=False)[list(df.columns)].apply(sample_log_base_1_004)

# Show how many rows remain per movie after sampling
df['movieId'].value_counts()


In [None]:
# Save the dataframe locally as CSV (the row index is not written)
output_csv_path = '../data/processed/preprocessed_data_movielens.csv'
df.to_csv(output_csv_path, index=False)

# More space-efficient alternatives for storing the result locally
# (sizes as noted by the original author):
# import joblib
# joblib.dump(df, '../data/processed/preprocessed_data_movielens.pkl') # about 247 MB
# df.to_parquet('../data/processed/preprocessed_data_movielens.parquet', compression='snappy') # about 67 MB

In [None]:
# Test whether the outlier user is still in the dataframe.
# The id is kept in one variable so the check and the printed message cannot
# disagree (the message previously named user 75390 while checking 75309).
outlier_user_id = 75309
user_exists = df['userId'].isin([outlier_user_id]).any()
print(f"User {outlier_user_id} exists in DataFrame:", user_exists)