# 1. Import libraries

In [None]:
import polars as pl
import torch
from sentence_transformers import SentenceTransformer

# 2. Import dataset

In [None]:
# Load the movie table and the user-activity table from their parquet files.
movies_path = "/home/leminhohoho/repos/movie-lens/ml/data/encoded_movies_2.parquet"
activities_path = "/home/leminhohoho/repos/movie-lens/ml/data/encoded_activities_2.parquet"

enc_movies = pl.read_parquet(movies_path)
enc_activities = pl.read_parquet(activities_path)

# Preview both frames. A negative tbl_cols tells polars to print every column
# instead of truncating wide tables; the setting only applies inside the block.
with pl.Config(tbl_cols=-1):
    for frame in (enc_movies, enc_activities):
        print(frame)

# 3. Feature engineering

## 3.1 Load SBERT model

In [None]:
# Load the pretrained sentence-embedding model used by the feature cells below.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
sbert = SentenceTransformer(model_name)

# Smoke test: embed one sample title. As the last expression of the cell, the
# resulting vector is displayed by the notebook.
sample_title = "Star wars: a new hope"
sbert.encode(sample_title)

## 3.2 Embedding movies' features

In [None]:
def _encode_text(text: str) -> list[float]:
    """Return the SBERT embedding of *text* as a plain Python list of floats."""
    return sbert.encode(text).tolist()


def _embed(expr: pl.Expr, alias: str) -> pl.Expr:
    """Embed every non-null string produced by *expr* and name the result *alias*.

    ``return_dtype`` is given explicitly so the output column is a list of
    floats even when polars has no non-null value to infer the dtype from.
    ``skip_nulls=True`` leaves null inputs as null; they are replaced later in
    the notebook.

    NOTE: the model is invoked once per row, which is slow on large tables.
    """
    return expr.map_elements(
        _encode_text,
        return_dtype=pl.List(pl.Float64),
        skip_nulls=True,
    ).alias(alias)


# Replace each raw text column with its embedding. List-valued columns are
# first flattened into a single comma-separated string.
enc_movies_2 = enc_movies.with_columns(
    _embed(pl.col("name"), "enc_name"),
    _embed(pl.col("desc"), "enc_desc"),
    _embed(pl.col("genres_name").list.join(", "), "enc_genres"),
    _embed(pl.col("languages_name").list.join(", "), "enc_languages"),
    _embed(pl.col("casts_name").list.join(", "), "enc_casts"),
).drop(["name", "desc", "genres_name", "languages_name", "casts_name"])

enc_movies_2

## 3.3 Embedding user activities' features

In [None]:
# Replace the free-text review with its SBERT embedding.
# return_dtype is given explicitly so the column is a list of floats even when
# polars has no non-null value to infer the dtype from. skip_nulls=True leaves
# missing reviews as null; they are replaced later in the notebook.
# NOTE: the model is invoked once per row, which is slow on large tables.
enc_activities_2 = enc_activities.with_columns(
    pl.col("review")
        .map_elements(
            lambda text: sbert.encode(text).tolist(),
            return_dtype=pl.List(pl.Float64),
            skip_nulls=True,
        )
        .alias("enc_review"),
).drop("review")

enc_activities_2

## 3.4 Replacing null features

In [None]:
# Placeholder for missing text features: the SBERT embedding of "unknown".
unknown_embedding = sbert.encode("unknown").tolist()

# Movie columns grouped by the default that replaces their nulls.
zero_filled_cols = ("duration", "enc_year", "enc_month", "enc_day")
embedding_filled_cols = ("enc_desc", "enc_genres", "enc_casts", "enc_languages")

# NOTE(review): "enc_name" is not among the filled columns, so a movie with a
# null name keeps a null embedding - confirm names are never missing.
enc_movies_2 = enc_movies_2.with_columns(
    pl.col(*zero_filled_cols).fill_null(0),
    pl.col(*embedding_filled_cols).fill_null(unknown_embedding),
)

# Missing ratings become -1 (presumably a "no rating" sentinel - confirm with
# the consumer of this table) and missing reviews get the placeholder
# embedding. Rows are then ordered by ascending user_id.
enc_activities_2 = (
    enc_activities_2
    .with_columns(
        pl.col("rating").fill_null(-1),
        pl.col("enc_review").fill_null(unknown_embedding),
    )
    .sort("user_id")
)

# Print both cleaned frames with every column visible.
with pl.Config(tbl_cols=-1):
    for frame in (enc_movies_2, enc_activities_2):
        print(frame)

# 4. Save dataset

In [None]:
# Persist both engineered tables as parquet.
# NOTE(review): these are the same paths the notebook loads its input from, so
# saving overwrites the source files. A second run would then no longer find
# the raw text columns that were dropped - confirm this is intended.
outputs = (
    (enc_movies_2, "/home/leminhohoho/repos/movie-lens/ml/data/encoded_movies_2.parquet"),
    (enc_activities_2, "/home/leminhohoho/repos/movie-lens/ml/data/encoded_activities_2.parquet"),
)
for frame, path in outputs:
    frame.write_parquet(path)