In [1]:
import pandas as pd

# Directory holding the raw MovieLens 25M CSV files, relative to this notebook.
# Kept in one place so the location only has to be changed once.
RAW_DIR = '../data/raw/ml-25m'

# Load every raw table into its own dataframe; nothing is transformed here.
df_tags = pd.read_csv(f'{RAW_DIR}/tags.csv')
df_gscores = pd.read_csv(f'{RAW_DIR}/genome-scores.csv')
df_gtags = pd.read_csv(f'{RAW_DIR}/genome-tags.csv')
df_links = pd.read_csv(f'{RAW_DIR}/links.csv')
df_movies = pd.read_csv(f'{RAW_DIR}/movies.csv')
df_ratings = pd.read_csv(f'{RAW_DIR}/ratings.csv')

In [2]:
# Combine the movie metadata with the genome scores and the genome tag names.
# Step 1: attach every (tagId, relevance) pair to its movie; the left join keeps
#         movies that have no genome scores (their score columns become NaN).
# Step 2: look up the readable tag name for each tagId, then discard the id.
# NOTE: this cell rebinds `df_gscores` to the combined frame, which no longer has
#       a `tagId` column, so it cannot be re-run without reloading the raw data.
df = df_movies.merge(df_gscores, how="left", on='movieId')
df_gscores = (
    df
    .merge(df_gtags, how="left", on='tagId')
    .drop(columns=['tagId'])
)

In [3]:
# Preview the first rows of the combined genome-score frame.
# NOTE(review): the recorded output below lists the columns movieId/tagId/relevance,
# but the previous cell drops `tagId` and adds title/genres/tag, so that output looks
# stale (captured before the merge) - re-run the notebook from a fresh kernel.
df_gscores.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375
2,1,3,0.0625
3,1,4,0.07575
4,1,5,0.14075


In [4]:
# Keep only the most relevant tags for each movie.
# The frame is ordered by movieId (ascending) and relevance (descending), so the
# first TOP_TAGS_PER_MOVIE rows of every movieId group are exactly its highest
# scoring tags. `groupby().head()` selects those rows directly and keeps the
# sorted order, which avoids the slow per-group `apply(lambda ...)` call and the
# pandas warning it raises for operating on the grouping column.
TOP_TAGS_PER_MOVIE = 40
df_gscores = (
    df_gscores
    .sort_values(by=['movieId', 'relevance'], ascending=[True, False])
    .groupby('movieId')
    .head(TOP_TAGS_PER_MOVIE)
    .reset_index(drop=True)  # discard the pre-sort row labels
)

  df_gscores = df_gscores.groupby('movieId', group_keys=False).apply(lambda x: x.nlargest(40, 'relevance')).reset_index(drop=True)


In [5]:
# The same movieId refers to the same movie between `ratings.csv`, `tags.csv`, `movies.csv`, and `links.csv`
# Build one wide frame: movie -> its ratings -> the tag the same user applied to
# that movie at the same timestamp -> the genome tag id for that tag text.
# NOTE(review): `df` is reassigned by the next cell before it is read again, so the
# result of this cell appears to be unused - confirm whether the cell is still needed.
df = pd.merge(df_movies, df_ratings, how="left", on=['movieId'])
df = pd.merge(df, df_tags, on=['movieId', 'userId', 'timestamp'], how='left')
df = pd.merge(df, df_gtags, on=['tag'], how='left')
# `df_gscores` no longer has a `tagId` column at this point (it was dropped when the
# tag names were attached), so joining on `tagId` raises a KeyError. Join on the tag
# name instead, and bring in only `relevance` so title/genres are not duplicated.
# Rows without a tag name are removed from the lookup so NaN keys never match.
tag_relevance = df_gscores[['movieId', 'tag', 'relevance']].dropna(subset=['tag'])
df = pd.merge(df, tag_relevance, on=['movieId', 'tag'], how='left')

In [6]:
# Merge and clean more
# Drop the rating timestamp, which is not needed in the final dataset.
# `errors='ignore'` plus reassignment (instead of `inplace=True`) lets this cell be
# re-run: a second run no longer fails because `timestamp` is already gone.
df_ratings = df_ratings.drop(columns=['timestamp'], errors='ignore')

# Attach every rating of a movie to that movie's tag rows, remove rows with any
# missing value (e.g. movies without ratings or without genome tags), and store
# userId as an integer again (the left join turns it into float when NaNs appear).
# NOTE(review): the recorded `df.info()` output further down shows list-valued
# `relevance`/`tag` columns, which suggests `df_gscores` was aggregated to one row
# per movie before this merge; that step is not visible in this notebook - confirm.
df = (
    pd.merge(df_gscores, df_ratings, on='movieId', how='left')
    .dropna()
    .astype({'userId': int})
)

In [7]:
# Print the column dtypes, row count and memory footprint of the merged frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25000095 entries, 0 to 25003470
Data columns (total 7 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   title      object 
 2   genres     object 
 3   relevance  object 
 4   tag        object 
 5   userId     int32  
 6   rating     float64
dtypes: float64(1), int32(1), int64(1), object(4)
memory usage: 1.4+ GB


In [8]:
# Keep only the movies that appear in at least 2000 rows of `df`.
# 1. count how many rows each movieId has,
# 2. collect the ids whose count reaches the threshold,
# 3. retain just the rows belonging to those ids.
movie_counts = df['movieId'].value_counts()
movie_ids_to_keep = movie_counts.loc[movie_counts.ge(2000)].index
is_popular_movie = df['movieId'].isin(movie_ids_to_keep)
df = df.loc[is_popular_movie]


In [9]:
# Show how many rows each genre combination has.
# NOTE(review): the recorded output below contains genre combinations with a count
# of 1, which should not be possible after the previous cell kept only movies with
# at least 2000 rows - the output was probably captured before that filter; re-run.
df['genres'].value_counts()

genres
Drama                                   1766138
Comedy                                  1513066
Comedy|Romance                           902164
Drama|Romance                            796458
Comedy|Drama                             721506
                                         ...   
Adventure|Children|Drama|War                  1
Adventure|Fantasy|Western                     1
Action|Comedy|Documentary|Thriller            1
Adventure|Fantasy|Sci-Fi|Western              1
Comedy|Horror|Mystery|Sci-Fi|Western          1
Name: count, Length: 1639, dtype: int64

In [10]:
##### Deadlines:
- Report 1. Data viz (5 graphs) + preprocessing. Friday, May 17
- Report 2. Modelling. Friday, June 14
- Final report. Friday, June 21


  df = df.groupby('movieId', group_keys=False).apply(sample_log_base_1_004)


movieId
356       2833
318       2833
296       2827
593       2809
2571      2804
          ... 
8966      1905
86298     1905
4291      1904
6890      1904
100714    1904
Name: count, Length: 2428, dtype: int64

In [11]:
# Print the frame summary (dtypes, row count, memory) and display the first rows.
# NOTE(review): the recorded outputs around this cell mention a per-movie sampling
# step (`sample_log_base_1_004`) whose code is not visible in this notebook; the row
# count shown below depends on it - confirm the step is restored before re-running.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 5273559 entries, 26201 to 24963297
Data columns (total 7 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   title      object 
 2   genres     object 
 3   relevance  object 
 4   tag        object 
 5   userId     int32  
 6   rating     float64
dtypes: float64(1), int32(1), int64(1), object(4)
memory usage: 301.8+ MB


Unnamed: 0,movieId,title,genres,relevance,tag,userId,rating
26201,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.99925, 0.99875, 0.99575, 0.98575, 0.98425, ...","[toys, computer animation, pixar animation, ki...",74244,4.0
19183,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.99925, 0.99875, 0.99575, 0.98575, 0.98425, ...","[toys, computer animation, pixar animation, ki...",54322,4.0
37420,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.99925, 0.99875, 0.99575, 0.98575, 0.98425, ...","[toys, computer animation, pixar animation, ki...",106130,4.5
15349,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.99925, 0.99875, 0.99575, 0.98575, 0.98425, ...","[toys, computer animation, pixar animation, ki...",43484,3.5
5934,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[0.99925, 0.99875, 0.99575, 0.98575, 0.98425, ...","[toys, computer animation, pixar animation, ki...",16874,4.0


In [12]:
# Save the preprocessed dataframe locally as CSV (without the row index).
df.to_csv('../data/processed/preprocessed_data_movielens.csv', index=False)