In [1]:
from pathlib import Path

import tensorflow_datasets as tfds

In [2]:
# Dataset size (options: 100k, 1m, 10m, 20m, 25m)
# The value is interpolated into the TFDS dataset name
# (`movielens/{DATASET_SIZE}-ratings`) and into every output file name
# (`data/{DATASET_SIZE}-*.parquet`).
# NOTE(review): confirm that each listed size exists as a TFDS
# `movielens/<size>-ratings` config before relying on it.
# NOTE(review): the users cell selects demographic columns (`user_gender`,
# `user_zip_code`, ...); presumably only some sizes provide them - verify
# against the TFDS catalog before switching away from '1m'.
DATASET_SIZE: str = '1m'

In [3]:
def _decode_if_bytes(cell):
    """Return `cell` decoded as UTF-8 when it is `bytes`; otherwise return it unchanged."""
    if isinstance(cell, bytes):
        return cell.decode('utf-8')
    return cell


# MovieLens is not split into `train` and `test` sets by default, so TFDS
# places every record in the `train` split. The full split is loaded here and
# is meant to be split manually.
# With `with_info=True`, `tfds.load` also returns a `tfds.core.DatasetInfo`
# object holding dataset metadata (version, description, homepage, citation,
# etc.).
ratings_dataset, ratings_dataset_info = tfds.load(
    name=f'movielens/{DATASET_SIZE}-ratings',
    split='train',
    with_info=True,
)

# Convert the tf.data.Dataset into a DataFrame, then turn every bytes cell
# into a string (all other cells pass through unchanged).
df = tfds.as_dataframe(ratings_dataset, ratings_dataset_info).map(_decode_if_bytes)

2024-09-04 12:44:10.823098: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-04 12:44:10.873805: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-04 12:44:10.873850: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-04 12:44:10.874986: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-04 12:44:10.883079: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-04 12:44:10.884416: I tensorflow/core/platform/cpu_feature_guard.cc:1

### Ratings Dataset

In [4]:
# Build the ratings table: one row per distinct
# (user_id, movie_id, timestamp, user_rating) combination found in `df`.

ratings_df_columns = [
    'user_id',
    'movie_id',
    'timestamp',
    'user_rating',
]

# `drop_duplicates()` returns a new frame, so `ratings_df` is never an
# in-place-modified selection of `df`. That avoids chained-assignment warnings
# and keeps this cell safe to re-run.
ratings_df = df.loc[:, ratings_df_columns].drop_duplicates()

# Make sure the output directory exists so the export also works on a fresh
# checkout.
Path('data').mkdir(parents=True, exist_ok=True)
ratings_df.to_parquet(f'data/{DATASET_SIZE}-ratings.parquet', compression='brotli')

### Movies Dataset

In [5]:
# Many of the movie titles in the dataset contain their release years.
# Extracting this detail and treating it as a separate feature is more
# valuable than leaving it combined with the title.

movies_df_columns = [
    'movie_id',
    'movie_title',
    'movie_genres',
]

# The table is built in a single chain so that no column is ever assigned on a
# selection of `df` (which can trigger chained-assignment warnings) and so that
# re-running the cell always starts again from `df`.
# `assign` evaluates its keyword arguments in order, so the release year is
# read from the original title before the title itself is cleaned.
movies_df = (
    df.loc[:, movies_df_columns]
    .assign(
        # Extract the release year (four digits in parentheses) into its own
        # column; titles without such a pattern get a missing value.
        movie_release_year=lambda frame: frame['movie_title'].str.extract(
            r'\((\d{4})\)', expand=False
        ),
        # Remove the release year, and the whitespace around it, from the title.
        movie_title=lambda frame: frame['movie_title'].str.replace(
            r'\s*\(\d{4}\)\s*', '', regex=True
        ),
        # Convert the genres into a tuple.
        movie_genres=lambda frame: frame['movie_genres'].apply(tuple),
    )
    # Remove duplicates
    .drop_duplicates()
)

# Make sure the output directory exists so the export also works on a fresh
# checkout.
Path('data').mkdir(parents=True, exist_ok=True)
movies_df.to_parquet(f'data/{DATASET_SIZE}-movies.parquet', compression='brotli')

### Users Dataset

In [6]:
# Extract the user features into a separate DataFrame.
# Output columns: user_id, user_gender, user_zip_code, user_bucketized_age,
# user_occupation_label
# (`raw_user_age` and `user_occupation_text` are deliberately not selected.)

users_df_columns = [
    'user_id',
    'user_gender',
    'user_zip_code',
    'bucketized_user_age',
    'user_occupation_label',
]

# The table is built in a single chain instead of with in-place operations on
# a selection of `df`, which avoids chained-assignment warnings and keeps this
# cell safe to re-run.
users_df = (
    df.loc[:, users_df_columns]
    # Give the age bucket the same `user_` prefix as the other user columns.
    .rename(columns={'bucketized_user_age': 'user_bucketized_age'})
    # Cast booleans to integers (vectorised instead of a per-row lambda).
    .assign(user_gender=lambda frame: frame['user_gender'].astype('int64'))
    # Remove duplicates
    .drop_duplicates()
)

# Make sure the output directory exists so the export also works on a fresh
# checkout.
Path('data').mkdir(parents=True, exist_ok=True)
users_df.to_parquet(f'data/{DATASET_SIZE}-users.parquet', compression='brotli')