# 1. Import libraries

In [None]:
import sqlite3
import polars as pl
import numpy as np

# 2. Feature engineering

- First, let's load the data

In [None]:
# Open the Letterboxd SQLite database; every query in this cell reuses this connection.
conn = sqlite3.connect("/home/leminhohoho/repos/movie-lens/db/letterboxd_2.db")

# User/movie interactions, restricted to rows that carry a rating.
activities = pl.read_database("SELECT * FROM users_and_movies WHERE rating IS NOT NULL", connection=conn)

# One row per movie with all of its stored columns.
movies = pl.read_database("SELECT * FROM movies", connection=conn)

# Genres: collapse the (movie_id, genre name) pairs into one lowercase,
# de-duplicated, alphabetically sorted list per movie.
genres_and_movies = pl.read_database("SELECT movie_id, name FROM genres_and_movies JOIN genres ON genres.id = genre_id", connection=conn)
genres_per_movie = (
    genres_and_movies
    .group_by("movie_id")
    .agg(pl.col("name").str.to_lowercase().alias("genres_name"))
).with_columns(pl.col("genres_name").list.unique().list.sort())

# Cast: only crew rows whose role is 'Actor', at most five lowercase names per movie.
# NOTE(review): the query has no ORDER BY, so which five names survive depends on the
# row order SQLite happens to return - confirm whether a billing-order column exists.
casts_and_movies = pl.read_database(
    "SELECT movie_id, name FROM crews_and_movies JOIN (SELECT * FROM crews WHERE role = 'Actor') ON id = crew_id", 
    connection=conn,
)
casts_per_movie = (
    casts_and_movies
    .group_by("movie_id")
    .agg(pl.col("name").str.to_lowercase().alias("casts_name"))
).with_columns(pl.col("casts_name").list.head(5))

# Languages: same treatment as genres (lowercase, unique, sorted list per movie).
languages_and_movies = pl.read_database("SELECT movie_id, language FROM languages_and_movies", connection=conn)
languages_per_movie = (
    languages_and_movies
    .group_by("movie_id")
    .agg(pl.col("language").str.to_lowercase().alias("languages_name"))
).with_columns(pl.col("languages_name").list.unique().list.sort())

# Releases: keep exactly one row per movie - the chronologically earliest release.
# The previous query (`SELECT movie_id, date ... GROUP BY movie_id`) selected a bare,
# non-aggregated column, for which SQLite returns the date of an arbitrary row in
# each group, so the resulting features were not reproducible.
# The date is parsed into a temporary column purely for ordering; the original
# string is kept in `date` so that downstream parsing code keeps working unchanged.
# Unparseable dates become null (strict=False) and are sorted last.
releases = (
    pl.read_database("SELECT movie_id, date FROM releases", connection=conn)
    .with_columns(
        pl.col("date")
        .str.strptime(pl.Date, format="%d %b %Y", strict=False)
        .alias("_release_day")
    )
    .sort(["movie_id", "_release_day"], nulls_last=True)
    .unique(subset="movie_id", keep="first", maintain_order=True)
    .drop("_release_day")
)

# Show every column (tbl_cols=-1) of each loaded table for a quick sanity check.
with pl.Config(tbl_cols=-1):
    print(activities)
    print(movies)
    print(genres_per_movie)
    print(casts_per_movie)
    print(languages_per_movie)
    print(releases)

## 2.1 Encoding features

- First we will encode `duration` & `date`:
    - For `duration`, we will use log and tanh to normalize it
    - For `date`, we will split it into `year`, `month` and `day`, then normalize each one individually

In [None]:

# Columns removed from the encoded table: link/asset URLs plus the raw release
# date, which is replaced by the enc_year / enc_month / enc_day features.
dropped_movie_columns = ["url", "poster_url", "backdrop_url", "trailer_url", "date"]

# Release date string ("%d %b %Y") parsed strictly and labelled as UTC.
# The expression keeps the column name `date`, so it overwrites the raw string.
release_date_utc = (
    pl.col("date")
    .str.strptime(pl.Datetime, format="%d %b %Y", strict=True)
    .dt.replace_time_zone("UTC")
)

enc_movies = (
    movies
    # Left joins keep every movie even when a lookup table has no row for it.
    .join(genres_per_movie, left_on="id", right_on="movie_id", how="left")
    .join(languages_per_movie, left_on="id", right_on="movie_id", how="left")
    .join(releases, left_on="id", right_on="movie_id", how="left")
    .join(casts_per_movie, left_on="id", right_on="movie_id", how="left")
    .with_columns(
        release_date_utc,
        # Squash the duration: log10 first, then tanh.
        pl.col("duration").log(base=10).tanh(),
    )
    .with_columns(
        # Scale each date component by a fixed divisor.
        enc_year=pl.col("date").dt.year() / 3000,
        enc_month=pl.col("date").dt.month() / 12,
        enc_day=pl.col("date").dt.day() / 31,
    )
    .drop(dropped_movie_columns)
)

print(enc_movies)
print(enc_movies.select("duration", "enc_year", "enc_month", "enc_day").describe())

- Now we will encode user activities

In [None]:
# Encode the rated activities.
# The semi join keeps only activities whose movie exists in `enc_movies` and adds
# no columns from it.
enc_activities = activities.join(
    enc_movies, left_on="movie_id", right_on="id", how="semi"
).with_columns(
    # Keep the 19-character "YYYY-MM-DDTHH:MM:SS" prefix and append "Z" so the string
    # matches the parse format. (Slicing only 18 characters cut the seconds field
    # short.) strict=False turns unparseable values into nulls instead of raising.
    (pl.col("date").str.slice(0, 19) + "Z")
        .str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%SZ", strict=False)
        .dt.replace_time_zone("UTC") 
).with_columns(
    # Scale each timestamp component by a fixed divisor.
    (pl.col("date").dt.year() / 3000).alias("enc_year"),
    (pl.col("date").dt.month() / 12).alias("enc_month"),
    (pl.col("date").dt.day() / 31).alias("enc_day"),
    # Hours run 0-23, so divide by 24 (the earlier divisor of 31 was the day divisor).
    (pl.col("date").dt.hour() / 24).alias("enc_hour"),
    (pl.col("date").dt.minute() / 60).alias("enc_minute"),
    # Flag rows without a rating before the rating column itself is rescaled.
    pl.col("rating").is_null().alias("rating_missing"),
    # NOTE(review): dividing by 5 presumes ratings are on a 0-5 scale - confirm.
    pl.col("rating") / 5,
).drop(["date", "is_watch"])

# Show every column (tbl_cols=-1) and summary statistics of the main encoded features.
with pl.Config(tbl_cols=-1):
    print(enc_activities)
    print(enc_activities[["rating", "enc_year", "enc_month", "enc_day"]].describe())

# 3. Save dataset

In [None]:
# Persist both encoded tables as Parquet files, movies first and then activities.
for encoded_frame, parquet_path in (
    (enc_movies, "/home/leminhohoho/repos/movie-lens/ml/data/encoded_movies_2.parquet"),
    (enc_activities, "/home/leminhohoho/repos/movie-lens/ml/data/encoded_activities_2.parquet"),
):
    encoded_frame.write_parquet(parquet_path)