# 1. Import libraries

In [None]:
import sqlite3
import polars as pl
import gensim.downloader as gsdl
import numpy as np

# 2. Feature engineering

- First, let's load the data.

In [None]:
# Location of the SQLite database that holds the raw tables read below.
DB_PATH = "/home/leminhohoho/repos/movie-lens/db/letterboxd.db"

# Read every raw table up front and then release the connection. The
# try/finally closes it even when one of the queries fails, so the database
# handle is not left open for the lifetime of the kernel.
conn = sqlite3.connect(DB_PATH)
try:
    # One row per (user, movie) interaction.
    activities = pl.read_database("SELECT * FROM users_and_movies", connection=conn)

    movies = pl.read_database("SELECT * FROM movies", connection=conn)

    # Genre names resolved through the genres lookup table.
    genres_and_movies = pl.read_database(
        "SELECT movie_id, name FROM genres_and_movies JOIN genres ON genres.id = genre_id",
        connection=conn,
    )

    languages_and_movies = pl.read_database(
        "SELECT movie_id, language FROM languages_and_movies", connection=conn
    )

    # NOTE(review): `date` is neither aggregated nor part of the GROUP BY, so
    # which release row supplies it for a movie is left to SQLite. Confirm that
    # an arbitrary release date per movie is acceptable, or select one explicitly.
    releases = pl.read_database("SELECT movie_id, date FROM releases GROUP BY movie_id", connection=conn)
finally:
    conn.close()

# Collapse to one row per movie: lower-cased genre names gathered into a list.
genres_per_movie = (
    genres_and_movies
    .group_by("movie_id")
    .agg(pl.col("name").str.to_lowercase().alias("genres_name"))
)

# Same shape for languages: one row per movie with a list of lower-cased names.
languages_per_movie = (
    languages_and_movies
    .group_by("movie_id")
    .agg(pl.col("language").str.to_lowercase().alias("languages_name"))
)

# Show every column of each frame instead of polars' truncated default view.
with pl.Config(tbl_cols=-1):
    print(activities)
    print(movies)
    print(genres_per_movie)
    print(languages_per_movie)
    print(releases)

## 2.1 Encoding features

- Since the sets of genres and languages are not fixed, they cannot serve as a fixed vocabulary for the model. Instead, we will vectorize them using pre-trained embedding models.
- First, let's load the encoder and the embedding model.

In [None]:
# Load the pretrained "word2vec-google-news-300" vectors through gensim's downloader API.
# `word_vecs` is the lookup table used below to embed genre and language names.
# NOTE(review): the first call presumably downloads the model and later calls
# reuse a local copy — confirm before running on a machine without network access.
word_vecs = gsdl.load("word2vec-google-news-300")

- Since language names have variations (e.g. `greek (modern)`) that are not present in the embedding model's vocabulary, passing them unmodified would raise an error. We therefore split such names into their component words so that they can be embedded.

In [None]:
import re

def embed_languages(languages):
    """Embed a collection of language names as one averaged word vector.

    Each name is first looked up in ``word_vecs`` as-is. A name that is not in
    the vocabulary (e.g. ``greek (modern)``) is split into its component words
    and represented by the mean vector of those words instead.

    Args:
        languages: Iterable of language-name strings.

    Returns:
        The element-wise mean (``np.mean(..., axis=0)``) of the per-language
        vectors.
    """
    vecs = []

    for lang in languages:
        try:
            vecs.append(word_vecs.get_vector(lang))
        except KeyError:
            # Split on anything that is not a letter or digit. `[\W_]` is
            # Unicode-aware, so an accented name such as "guaraní" stays one
            # word instead of being cut at the non-ASCII letter (an ASCII-only
            # class would produce fragments that may match unrelated vocabulary
            # entries). Empty pieces left by leading/trailing punctuation are
            # dropped.
            words = [word for word in re.split(r"[\W_]+", lang) if word]
            # Keep the key list non-empty for get_mean_vector even when the
            # name contains no letters or digits at all.
            vecs.append(word_vecs.get_mean_vector(words or [lang]))

    return np.mean(vecs, axis=0)

- Now we will embed the `genres` and `languages`, and convert the release date to a scaled numeric timestamp.

In [None]:

# Attach the per-movie genre list, language list and release date to every
# movie, derive the numeric features, then discard the raw source columns.
enc_movies = (
    movies
    .join(genres_per_movie, left_on="id", right_on="movie_id", how="left")
    .join(languages_per_movie, left_on="id", right_on="movie_id", how="left")
    .join(releases, left_on="id", right_on="movie_id", how="left")
    .with_columns(
        # Mean word vector over a movie's genre names, stored as a float list.
        pl.col("genres_name")
        .map_elements(word_vecs.get_mean_vector)
        .map_elements(lambda vec: vec.tolist(), return_dtype=pl.List(pl.Float32))
        .alias("embd_genres"),
        # Same idea for languages, using the out-of-vocabulary-aware helper.
        pl.col("languages_name")
        .map_elements(embed_languages)
        .map_elements(lambda vec: vec.tolist(), return_dtype=pl.List(pl.Float32))
        .alias("embd_languages"),
        # (Token encodings of `desc` and `name` were sketched here but are disabled.)
        # Release date text ("%d %b %Y") -> UTC timestamp, scaled down by 10**16.
        (
            pl.col("date")
            .str.strptime(pl.Datetime, format="%d %b %Y", strict=True)
            .dt.replace_time_zone("UTC")
            .dt.timestamp()
            / 10**16
        ).alias("enc_release_date"),
        # Squash the duration with log10 followed by tanh (overwrites `duration`).
        pl.col("duration").log(base=10).tanh(),
    )
    .drop([
        "url",
        "poster_url",
        "backdrop_url",
        "trailer_url",
        "date",
        "genres_name",
        "languages_name",
        "desc",
    ])
    # Movies without any genre embedding are excluded from the dataset.
    .filter(pl.col("embd_genres").is_not_null())
)

# Impute the remaining nulls and record which rows were imputed.
# The fills and the *_missing flags must stay in this single with_columns call:
# every expression reads the frame as it was before the call, so the flags
# still see the nulls that the fills replace.
enc_movies = enc_movies.with_columns(
    # Fills: column mean for numerics, the "unknown" word vector for languages.
    pl.col("duration").fill_null(pl.col("duration").mean()),
    pl.col("embd_languages").fill_null(pl.lit(word_vecs.get_vector("unknown").tolist())),
    pl.col("enc_release_date").fill_null(pl.col("enc_release_date").mean()),
    # Flags: True where the value above was originally null.
    pl.col("duration").is_null().alias("duration_missing"),
    pl.col("enc_release_date").is_null().alias("releases_date_missing"),
)

print(enc_movies)
print(enc_movies.drop("name").describe())

- Now we will encode user activities

In [None]:
# Keep only activities whose movie made it into `enc_movies` (semi join: rows
# of `activities` are filtered, no columns from `enc_movies` are added).
enc_activities = activities.join(
    enc_movies, left_on="movie_id", right_on="id", how="semi"
).with_columns(
    # (A token encoding of `review` was sketched here but is disabled.)
    (
        # "YYYY-MM-DDTHH:MM:SS" is 19 characters long. Keep exactly that prefix
        # so the seconds field is complete, then append "Z" to match the parse
        # format. A shorter slice would drop the last seconds digit.
        (pl.col("date").str.slice(0, 19) + "Z")
        # strict=False turns unparseable values into nulls; they are imputed
        # and flagged in the next step.
        .str.strptime(pl.Datetime, format="%Y-%m-%dT%H:%M:%SZ", strict=False)
        .dt.replace_time_zone("UTC")
        # Scale the timestamp down by 10**16, as done for the release dates.
        .dt.timestamp()
        / 10**16
    ).alias("enc_time")
).drop(["date", "review"])

# Impute nulls and record which rows were imputed. Fills and flags share one
# with_columns call so that the flags are computed from the pre-fill values.
enc_activities = enc_activities.with_columns(
    # Missing ratings get the mean rating; the result is divided by 5.
    pl.col("rating").fill_null(pl.col("rating").mean()) / 5,
    pl.col("rating").is_null().alias("rating_missing"),
    pl.col("enc_time").fill_null(pl.col("enc_time").mean()),
    pl.col("enc_time").is_null().alias("time_missing")
)


# Show every column, then summary statistics without the id columns.
with pl.Config(tbl_cols=-1):
    print(enc_activities)
    print(enc_activities.drop(["movie_id", "user_id"]).describe())

# 3. Save dataset

In [None]:
# Persist both encoded frames as Parquet files for later use.
# NOTE(review): the destinations are absolute, machine-specific paths; the
# target directory presumably has to exist already — confirm.
enc_movies.write_parquet("/home/leminhohoho/repos/movie-lens/ml/data/encoded_movies.parquet")
enc_activities.write_parquet("/home/leminhohoho/repos/movie-lens/ml/data/encoded_activities.parquet")