In [4]:
import pandas as pd
from scipy.linalg import triu
# pip install --upgrade scipy
from gensim.models import Word2Vec
# pip install gensim --only-binary :all:
import numpy as np

# Load the preprocessed MovieLens table produced by an earlier pipeline step
df = pd.read_csv(filepath_or_buffer='../data/processed/preprocessed_data_movielens.csv')


In [2]:
# Fit a skip-gram (sg=1) Word2Vec model on the 'tag' column.
# NOTE(review): `df` comes straight from a CSV, so confirm each 'tag' cell is a
# list of tokens and not a plain string -- Word2Vec iterates every sentence
# element by element, and a string would be consumed character by character.
model = Word2Vec(
    sentences=df['tag'],
    vector_size=100,
    window=5,
    min_count=1,
    sg=1,
)

# Function to get the embedding for each tag
def get_embedding(tags, model):
    """Return the mean word vector of the tokens in ``tags``.

    Parameters
    ----------
    tags : sized iterable of str
        Tokens to look up in ``model.wv``. Note that a plain string is
        iterated character by character.
    model : object
        Trained Word2Vec-style model exposing ``wv`` (token -> vector lookup)
        and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise average of the looked-up vectors, or an all-zero float32
        vector of length ``model.vector_size`` when ``tags`` is empty.

    Raises
    ------
    KeyError
        Propagated from the ``model.wv`` lookup when a token is unknown.
    """
    if len(tags) == 0:
        # Previously `sum(...) / len(tags)` raised ZeroDivisionError here;
        # a zero vector keeps the column shape uniform for untagged rows.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean([model.wv[tag] for tag in tags], axis=0)

# Compute one averaged embedding per row from its 'tag' value;
# `model` is forwarded to get_embedding as the second positional argument
df['tag_embedding'] = df['tag'].apply(get_embedding, args=(model,))

In [3]:
# Remove the raw 'tag', 'relevance' and 'title' columns
df = df.drop(['tag', 'relevance', 'title'], axis=1)

# Re-cast every embedding to float32 to keep the column compact.
# Only the first row is inspected to decide whether the column holds ndarrays.
if isinstance(df['tag_embedding'].iloc[0], np.ndarray):
    df['tag_embedding'] = df['tag_embedding'].map(
        lambda vec: vec.astype(np.float32)
    )

In [5]:
# Turn each pipe-delimited genre string into a list of genre names
df['genres'] = df['genres'].map(lambda genre_str: genre_str.split('|'))
# Collect every genre that appears in at least one row
unique_genres = {
    genre
    for genre_list in df['genres']
    for genre in genre_list
}
print("There are", len(unique_genres), "distinct genres in df")

There are 19 distinct genres in df


In [6]:
# One-hot encode the 'genres' column: explode every genre list into one row per
# (original row, genre) pair, build indicator columns, then collapse back to a
# single row per original index label.
# `.sum(level=0)` was removed in pandas 2.0; `groupby(level=0).sum()` is the
# supported equivalent. `explode()` replaces the much slower
# `apply(pd.Series).stack()` expansion.
genres_dummies = pd.get_dummies(df['genres'].explode()).groupby(level=0).sum()

# Combine the one-hot encoded genres with the original DataFrame
df = df.drop(columns=['genres'])
df = df.join(genres_dummies)

# NOTE(review): the former `ace_tools` display call was removed -- `ace_tools`
# appears to be a hosted-sandbox helper rather than an installable package, so
# importing it fails in a regular kernel. The preview below serves the same
# purpose.

# Display the updated DataFrame
df.head()


In [7]:
# Persist the processed frame so the Word2Vec step (over 90 min) does not have
# to be repeated; the index is not written to the file.
df.to_parquet(
    path='../data/processed/preprocessed_data_with_embeddings_and_ohe.parquet',
    index=False,
)


In [11]:
# Preview the first rows of the frame.
# NOTE(review): the stored output below lists a `genres_embedding` column that
# no cell visible here creates -- the output looks stale; re-run to confirm.
df.head()

Unnamed: 0,movieId,genres_embedding,userId,rating,tag_embedding
0,1,[ 0.29733244 -0.46585637 0.01453016 0.345878...,144188,2.5,[ 7.64919892e-02 -8.15666839e-02 -7.72340521e-...
1,1,[ 0.29733244 -0.46585637 0.01453016 0.345878...,13198,3.0,[ 7.64919892e-02 -8.15666839e-02 -7.72340521e-...
2,1,[ 0.29733244 -0.46585637 0.01453016 0.345878...,49836,4.0,[ 7.64919892e-02 -8.15666839e-02 -7.72340521e-...
3,1,[ 0.29733244 -0.46585637 0.01453016 0.345878...,32754,4.0,[ 7.64919892e-02 -8.15666839e-02 -7.72340521e-...
4,1,[ 0.29733244 -0.46585637 0.01453016 0.345878...,78445,5.0,[ 7.64919892e-02 -8.15666839e-02 -7.72340521e-...
