In [1]:
import pandas as pd
from scipy.linalg import triu
# pip install --upgrade scipy
from gensim.models import Word2Vec
# pip install gensim --only-binary :all:
import numpy as np

# Read the preprocessed MovieLens CSV into the dataframe used by the following cells.
input_csv_path = '../data/processed/preprocessed_data_movielens.csv'
df = pd.read_csv(input_csv_path)


In [2]:
# Fit a Word2Vec model using the 'tag' column as the training corpus.
# NOTE(review): df['tag'] is loaded by read_csv, so its items are presumably
# plain strings rather than token lists; gensim iterates over each item, which
# for a string means character by character -- confirm this is intended.
model = Word2Vec(
    sentences=df['tag'],
    vector_size=100,  # dimensionality of every learned vector
    window=5,         # maximum distance between a token and its context
    min_count=1,      # keep every token, however rare
    sg=1,             # 1 selects skip-gram training
)

# Function to get the embedding for each tag
def get_embedding(tags, model):
    """Return the mean embedding of the tokens in ``tags``.

    Parameters
    ----------
    tags : iterable
        Tokens to look up in ``model.wv``. A plain string is iterated
        character by character, exactly like any other iterable.
    model : object
        Trained model exposing ``model.wv``, which must support
        ``token in model.wv``, ``model.wv[token]`` and
        ``model.wv.vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors that were found. When ``tags`` is
        empty, or none of its tokens are known to the model, a float32 zero
        vector of length ``model.wv.vector_size`` is returned instead of
        raising ``ZeroDivisionError`` / ``KeyError``.
    """
    # Tokens missing from the vocabulary are skipped rather than aborting the
    # whole row; with min_count=1 training every token is present anyway.
    vectors = [model.wv[tag] for tag in tags if tag in model.wv]
    if not vectors:
        return np.zeros(model.wv.vector_size, dtype=np.float32)
    # Plain sum / count keeps the result bit-identical to the previous
    # implementation whenever every token is found.
    return sum(vectors) / len(vectors)

# Build the 'tag_embedding' column from the 'tag' column.
# The embedding depends only on the value in 'tag', and many rows share the
# same value, so each distinct value is embedded once and then looked up per
# row instead of re-averaging the vectors for every row.
# Rows with equal 'tag' values therefore reference the same array object.
embedding_by_tag = {
    tags: get_embedding(tags, model)
    for tags in df['tag'].unique()
}
df['tag_embedding'] = df['tag'].apply(lambda tags: embedding_by_tag[tags])

In [3]:
# 'tag' has been turned into 'tag_embedding'; remove it together with 'relevance'.
df = df.drop(['tag', 'relevance'], axis=1)

# Store each embedding as float32 to reduce memory use. The cast is applied
# only when the first row holds a NumPy array.
first_embedding = df['tag_embedding'].iloc[0]
if isinstance(first_embedding, np.ndarray):
    df['tag_embedding'] = df['tag_embedding'].apply(lambda vec: vec.astype(np.float32))

In [4]:
# Turn the pipe-separated genre string of every row into a list of genres.
df['genres'] = df['genres'].map(lambda genre_string: genre_string.split('|'))

# One indicator column per genre: explode the lists into one row per genre,
# one-hot encode them, then sum over the original row index so that each
# original row is represented by exactly one row again.
df_genres = (
    pd.get_dummies(df['genres'].explode())
    .groupby(level=0)
    .sum()
)

# Attach the genre indicator columns to the main dataframe (index-aligned).
df = df.join(df_genres)

In [5]:
# Remove the 'title' and 'genres' columns one after the other.
for column_name in ('title', 'genres'):
    df = df.drop(column_name, axis=1)

# Preview the first rows of the resulting dataframe.
df.head()

Unnamed: 0,movieId,userId,rating,tag_embedding,Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,144188,2.5,"[0.05508393, 0.08796407, -0.15735401, -0.10904...",0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,13198,3.0,"[0.05508393, 0.08796407, -0.15735401, -0.10904...",0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,49836,4.0,"[0.05508393, 0.08796407, -0.15735401, -0.10904...",0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,32754,4.0,"[0.05508393, 0.08796407, -0.15735401, -0.10904...",0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,78445,5.0,"[0.05508393, 0.08796407, -0.15735401, -0.10904...",0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Persist the transformed dataframe as Snappy-compressed Parquet, because the
# Word2Vec step takes over 90 minutes.
output_parquet_path = '../data/processed/preprocessed_data_with_embeddings_and_ohe.parquet'
df.to_parquet(output_parquet_path, compression='snappy')
