In [1]:
import pandas as pd
from scipy.linalg import triu
# pip install --upgrade scipy
from gensim.models import Word2Vec
# pip install gensim --only-binary :all:
import numpy as np

# Load the preprocessed data set from CSV (path is relative to the notebook's
# working directory). Later cells rely on the columns 'tag', 'relevance',
# 'title' and 'genres' being present.
df = pd.read_csv('../data/processed/preprocessed_data_movielens.csv')


In [2]:
# Prepare tags for Word2Vec model training
def _to_tag_tokens(value):
    """Return the tag tokens contained in `value` as a list of strings.

    - A string is split on whitespace.
    - A value that is already tokenized (list / tuple / ndarray) is returned
      as a list, so re-running this cell on the already-converted column does
      not raise ``AttributeError``.
    - Anything else (e.g. NaN for a missing tag) becomes an empty list, which
      get_embedding later maps to a zero vector.
    """
    if isinstance(value, str):
        return value.split()
    if isinstance(value, (list, tuple, np.ndarray)):
        return list(value)
    return []


df['tag'] = df['tag'].apply(_to_tag_tokens)
tags_list = df['tag'].tolist()

# Train Word2Vec model for tags:
#   vector_size=100 -> 100-dimensional vectors
#   window=5        -> context window of 5 tokens
#   min_count=1     -> keep every token, even ones seen only once
#   sg=1            -> skip-gram instead of CBOW
# NOTE(review): df appears to hold one row per rating, so every movie's tag
# list is repeated once per rating in `tags_list`. Training on the distinct
# per-movie tag lists would be much faster, but it changes token frequencies
# and therefore the resulting vectors -- confirm before changing.
model = Word2Vec(sentences=tags_list, vector_size=100, window=5, min_count=1, sg=1)

# Function to get the mean embedding of a list of tags
def get_embedding(tags, model):
    """Average the word vectors of all tags the model knows.

    Args:
        tags: iterable of tag tokens.
        model: object exposing ``wv`` (supports ``in`` and ``[]`` lookup by
            token) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of the known tags, or a zero
        vector of length ``model.vector_size`` when none of the tags is known.
    """
    known_tags = [tag for tag in tags if tag in model.wv]
    if not known_tags:
        # No tag has a vector: fall back to an all-zero embedding
        return np.zeros(model.vector_size)
    return np.mean([model.wv[tag] for tag in known_tags], axis=0)

# Compute one averaged tag vector per row of the 'tag' column; Series.apply
# forwards the `model` keyword argument to get_embedding on every call
df['tag_embedding'] = df['tag'].apply(get_embedding, model=model)

In [3]:
# Remove the raw 'tag', 'relevance' and 'title' columns
df = df.drop(['tag', 'relevance', 'title'], axis=1)

# Store the embeddings as float32 to reduce memory use
# (only attempted when the first entry is a NumPy array)
if isinstance(df['tag_embedding'].iloc[0], np.ndarray):
    df['tag_embedding'] = df['tag_embedding'].apply(
        lambda vector: vector.astype(np.float32)
    )

In [4]:
# Turn each pipe-delimited genre string into a list of genre names
df['genres'] = df['genres'].apply(lambda genre_string: genre_string.split('|'))
# Collect every genre name that occurs anywhere in the column
unique_genres = {
    genre_name
    for genre_names in df['genres']
    for genre_name in genre_names
}
print("There are", len(unique_genres), "distinct genres in df")

There are 19 distinct genres in df


In [5]:
# One-hot encode the 'genres' column.
# explode() emits one entry per (row, genre) while keeping the original index
# label, so summing the dummy columns per index label gives one indicator row
# per original row. This replaces apply(pd.Series).stack(), which builds a
# Series object for every row and is very slow and memory-hungry on large
# frames. Rows with an empty genre list now get all-zero indicators instead of
# NaN after the join.
genres_dummies = pd.get_dummies(df['genres'].explode()).groupby(level=0).sum()

# Combine the one-hot encoded genres with the original DataFrame
# (join aligns the two frames on their index)
df = df.drop(columns=['genres'])
df = df.join(genres_dummies)

df.head()

Unnamed: 0,movieId,userId,rating,tag_embedding,Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,144188,2.5,"[0.30380347, 0.23757882, -0.09688654, 0.218109...",0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,13198,3.0,"[0.30380347, 0.23757882, -0.09688654, 0.218109...",0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,49836,4.0,"[0.30380347, 0.23757882, -0.09688654, 0.218109...",0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,32754,4.0,"[0.30380347, 0.23757882, -0.09688654, 0.218109...",0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,1,78445,5.0,"[0.30380347, 0.23757882, -0.09688654, 0.218109...",0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Persist the transformed DataFrame so that later work can load it from disk
# instead of repeating the Word2Vec step, which takes over 90 minutes.
# index=False leaves the DataFrame index out of the Parquet file.
df.to_parquet('../data/processed/preprocessed_data_with_embeddings_and_ohe.parquet', index=False)
