# Constants

In [1]:
# Directory the setup cell passes to os.chdir(); being relative, it is resolved
# against the kernel's current working directory at the moment that cell runs.
WORK_DIR_PATH = ".."

# Packages

In [2]:
import warnings
# NOTE(review): with no category or module given, this hides every warning
# raised for the rest of the session (deprecations, pandas assignment warnings,
# ...). Consider narrowing the filter to the specific warnings that are noisy.
warnings.filterwarnings("ignore")

In [3]:
import os

# CUDA debugging flag; nothing in the visible cells uses CUDA, so this only
# affects libraries imported later in the same kernel.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Remember the directory the kernel started in the first time this cell runs.
# WORK_DIR_PATH is relative, so a bare os.chdir(WORK_DIR_PATH) would climb one
# more level on every re-run of the cell; anchoring the target to the start
# directory makes the cell idempotent while still honouring later edits to
# WORK_DIR_PATH (an absolute WORK_DIR_PATH is used as-is by os.path.join).
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()

os.chdir(os.path.abspath(os.path.join(_NOTEBOOK_START_DIR, WORK_DIR_PATH)))
print(f"DIRECTORY: {os.getcwd()}")

DIRECTORY: c:\Users\jayar\Desktop\바탕 화면\REPO\PROJECT\M1-PJT_DL


In [4]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder

In [5]:
from EXPERIMENT.UTILS.constants import (
    DEFAULT_USER_COL,
    DEFAULT_ITEM_COL,
    DEFAULT_RATING_COL,
)

# Load data

In [6]:
# Read the two raw tables from the project's _data directory (paths are
# relative to the working directory set in the setup cell).
movies = pd.read_csv(filepath_or_buffer="./_data/movies.csv")
ratings = pd.read_csv(filepath_or_buffer="./_data/ratings.csv")

# Timestamp

In [7]:
# Interpret the raw values as seconds since the Unix epoch and replace the
# column with pandas datetimes.
epoch_seconds = ratings["timestamp"]
ratings["timestamp"] = pd.to_datetime(epoch_seconds, unit="s")

In [8]:
# Derive one calendar feature column per datetime component, in this order.
timestamp_parts = ratings["timestamp"].dt
for part in ("year", "month", "day", "weekday"):
    ratings[part] = getattr(timestamp_parts, part)

In [9]:
# Sanity check: number of missing values per column after adding the date parts.
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
year         0
month        0
day          0
weekday      0
dtype: int64

In [10]:
# Preview the first rows to confirm the datetime conversion and derived columns.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,year,month,day,weekday
0,1,1,4.0,2000-07-30 18:45:03,2000,7,30,6
1,1,3,4.0,2000-07-30 18:20:47,2000,7,30,6
2,1,6,4.0,2000-07-30 18:37:04,2000,7,30,6
3,1,47,5.0,2000-07-30 19:03:35,2000,7,30,6
4,1,50,5.0,2000-07-30 18:48:51,2000,7,30,6


# Launch

In [11]:
# Take the first four-digit number wrapped in parentheses from each title.
# Titles without such a group get NaN; matched values stay strings.
launch_year = movies["title"].str.extract(r'\((\d{4})\)', expand=False)
movies["launch"] = launch_year

In [14]:
# Missing values per column; NaN in 'launch' marks titles where the
# parenthesised four-digit pattern did not match.
movies.isna().sum()

movieId     0
title       0
genres      0
launch     13
dtype: int64

In [12]:
# Attach each movie's launch value to its ratings; the left join keeps every
# rating row even when the movie has no launch value.
launch_col = ['movieId', 'launch']
ratings = pd.merge(ratings, movies.loc[:, launch_col], how='left', on='movieId')

In [15]:
# Remove every rating whose movie has no extracted launch value.
has_no_launch = movies["launch"].isna()
drop_movies = list(movies.loc[has_no_launch, "movieId"])
mask = ~ratings["movieId"].isin(drop_movies)
ratings = ratings.loc[mask]

In [16]:
# Preview the ratings with the merged 'launch' column.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,year,month,day,weekday,launch
0,1,1,4.0,2000-07-30 18:45:03,2000,7,30,6,1995
1,1,3,4.0,2000-07-30 18:20:47,2000,7,30,6,1995
2,1,6,4.0,2000-07-30 18:37:04,2000,7,30,6,1995
3,1,47,5.0,2000-07-30 19:03:35,2000,7,30,6,1995
4,1,50,5.0,2000-07-30 18:48:51,2000,7,30,6,1995


# Multi-Hot Encoding

In [17]:
# Turn the pipe-delimited genre string into a list of genre names per movie.
genre_strings = movies["genres"]
movies["genre_list"] = genre_strings.str.split(pat="|")

In [18]:
# Learn the genre vocabulary, then build the 0/1 indicator matrix
# (one row per movie, one column per genre in mlb.classes_).
mlb = MultiLabelBinarizer()
mlb.fit(movies["genre_list"])
mhe = mlb.transform(movies["genre_list"])

In [19]:
# Wrap the multi-hot matrix in a DataFrame with one "genres_<name>" column per
# class. The frame is built on movies.index so that the movieId assignment
# below, which pandas aligns by index label, pairs each indicator row with the
# movie it was computed from even if `movies` no longer has a default
# 0..n-1 index (e.g. after filtering rows).
genres = pd.DataFrame(
    data=mhe,
    columns=[f"genres_{genre}" for genre in mlb.classes_],
    index=movies.index,
)
genres["movieId"] = movies["movieId"]

In [20]:
# Preview the multi-hot genre indicators together with their movieId.
genres.head()

Unnamed: 0,genres_(no genres listed),genres_Action,genres_Adventure,genres_Animation,genres_Children,genres_Comedy,genres_Crime,genres_Documentary,genres_Drama,genres_Fantasy,...,genres_Horror,genres_IMAX,genres_Musical,genres_Mystery,genres_Romance,genres_Sci-Fi,genres_Thriller,genres_War,genres_Western,movieId
0,0,0,1,1,1,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,0,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,2
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,3
3,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,4
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5


In [21]:
# Attach the genre indicator columns to every rating via its movieId.
ratings = pd.merge(left=ratings, right=genres, how='left', on='movieId')

In [22]:
# Remove every rating whose movie is flagged "(no genres listed)".
is_unlabelled = genres["genres_(no genres listed)"].eq(1)
drop_movies = list(genres.loc[is_unlabelled, "movieId"])
mask = ~ratings["movieId"].isin(drop_movies)
ratings = ratings.loc[mask]

In [23]:
# Preview the ratings with the merged genre indicator columns.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,year,month,day,weekday,launch,genres_(no genres listed),...,genres_Film-Noir,genres_Horror,genres_IMAX,genres_Musical,genres_Mystery,genres_Romance,genres_Sci-Fi,genres_Thriller,genres_War,genres_Western
0,1,1,4.0,2000-07-30 18:45:03,2000,7,30,6,1995,0,...,0,0,0,0,0,0,0,0,0,0
1,1,3,4.0,2000-07-30 18:20:47,2000,7,30,6,1995,0,...,0,0,0,0,0,1,0,0,0,0
2,1,6,4.0,2000-07-30 18:37:04,2000,7,30,6,1995,0,...,0,0,0,0,0,0,0,1,0,0
3,1,47,5.0,2000-07-30 19:03:35,2000,7,30,6,1995,0,...,0,0,0,0,1,0,0,1,0,0
4,1,50,5.0,2000-07-30 18:48:51,2000,7,30,6,1995,0,...,0,0,0,0,1,0,0,1,0,0


# Label Encoding

In [24]:
# Fit one LabelEncoder per categorical column and keep it in encoder_dict
# (keyed by column name), then overwrite each column with its integer codes.
encoded_cols = ["userId", "movieId", "year", "month", "day", "weekday", "launch"]
encoder_dict = {col: LabelEncoder().fit(ratings[col]) for col in encoded_cols}

for col, encoder in encoder_dict.items():
    ratings[col] = encoder.transform(ratings[col])

In [25]:
# Preview the ratings after the id and date columns were replaced by integer codes.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,year,month,day,weekday,launch,genres_(no genres listed),...,genres_Film-Noir,genres_Horror,genres_IMAX,genres_Musical,genres_Mystery,genres_Romance,genres_Sci-Fi,genres_Thriller,genres_War,genres_Western
0,0,0,4.0,2000-07-30 18:45:03,4,6,29,6,82,0,...,0,0,0,0,0,0,0,0,0,0
1,0,2,4.0,2000-07-30 18:20:47,4,6,29,6,82,0,...,0,0,0,0,0,1,0,0,0,0
2,0,5,4.0,2000-07-30 18:37:04,4,6,29,6,82,0,...,0,0,0,0,0,0,0,1,0,0
3,0,43,5.0,2000-07-30 19:03:35,4,6,29,6,82,0,...,0,0,0,0,1,0,0,1,0,0
4,0,46,5.0,2000-07-30 18:48:51,4,6,29,6,82,0,...,0,0,0,0,1,0,0,1,0,0


# Drop and rename columns

In [27]:
# Discard the raw timestamp and the "(no genres listed)" indicator column.
drop_col = ["timestamp", "genres_(no genres listed)"]
ratings = ratings.drop(labels=drop_col, axis="columns")

In [28]:
# Map the raw column names onto the project's default user/item/rating names.
col_dict = {
    "userId": DEFAULT_USER_COL,
    "movieId": DEFAULT_ITEM_COL,
    "rating": DEFAULT_RATING_COL,
}

ratings = ratings.rename(col_dict, axis="columns")

# Save

In [29]:
# Write the processed ratings table to CSV without the row index.
# NOTE(review): encoder_dict is not persisted anywhere in the visible cells, so
# the integer codes in this file cannot be mapped back to the original
# ids/years unless the fitted encoders are saved alongside it.
ratings.to_csv('./_data/movielens.csv', index=False)