In [1]:
import pandas as pd
from scipy.linalg import triu
# pip install --upgrade scipy
from gensim.models import Word2Vec
# pip install gensim --only-binary :all:
import numpy as np

# Load the preprocessed MovieLens table that the rest of the notebook transforms.
movielens_csv = '../data/processed/preprocessed_data_movielens.csv'
df = pd.read_csv(movielens_csv)


In [2]:
# Fit a skip-gram (sg=1) Word2Vec model on the 'tag' column; min_count=1 keeps
# every token in the vocabulary.
# NOTE(review): gensim expects each sentence to be a list of tokens. 'tag' comes
# straight from read_csv, so its values are presumably plain strings, which
# would be tokenised character by character -- confirm and split if needed.
model = Word2Vec(df['tag'], sg=1, vector_size=100, window=5, min_count=1)

# Average the Word2Vec vectors of the tokens in one 'tag' entry.
def get_embedding(tags, model):
    """Return the mean embedding of ``tags`` under ``model``.

    Parameters
    ----------
    tags : iterable of tokens
        Tokens to look up in ``model.wv``.
    model : trained Word2Vec-like object
        Must expose ``wv`` (supports ``in`` and indexing) and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens found in the
        vocabulary. When ``tags`` is empty or none of its tokens is known, a
        float32 zero vector of length ``model.vector_size`` is returned instead
        of raising ``ZeroDivisionError`` / ``KeyError``.
    """
    # Skip out-of-vocabulary tokens, mirroring mean_genre_vector's handling.
    vectors = [model.wv[tag] for tag in tags if tag in model.wv]
    if not vectors:
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# Compute one mean embedding per row from its 'tag' value, using the tag model.
df['tag_embedding'] = df['tag'].apply(get_embedding, args=(model,))

In [3]:
# 'tag' is now represented by 'tag_embedding'; remove it together with 'relevance'.
df = df.drop(['tag', 'relevance'], axis=1)

# Store embeddings as float32 to reduce memory use. The cast only runs when the
# first row actually holds a NumPy array.
first_embedding = df['tag_embedding'].iloc[0]
if isinstance(first_embedding, np.ndarray):
    df['tag_embedding'] = df['tag_embedding'].apply(lambda vec: vec.astype(np.float32))

In [4]:
# The count is taken before splitting, so it is the number of distinct
# pipe-separated genre strings (genre combinations), not of individual genres.
distinct_genre_strings = df['genres'].nunique()
print("There are", distinct_genre_strings, "distinct genres in df")

# Turn each 'A|B|C' string into the list ['A', 'B', 'C'].
df['genres'] = df['genres'].map(lambda genre_string: genre_string.split('|'))


There are 531 distinct genres in df


In [5]:
# Fit a second Word2Vec model, this time on the per-row genre lists.
# Note: this rebinds `model`, so the tag model trained earlier is no longer
# reachable under that name.
model = Word2Vec(df['genres'], workers=4, vector_size=50, window=3, min_count=1)

# Average the Word2Vec vectors of the genres attached to one row.
def mean_genre_vector(genres, w2v_model=None):
    """Return the mean embedding of ``genres``.

    Parameters
    ----------
    genres : iterable of str
        Genre names to look up.
    w2v_model : trained Word2Vec-like object, optional
        Must expose ``wv`` (supports ``in`` and indexing) and ``vector_size``.
        Defaults to the notebook-level ``model`` so existing one-argument
        calls keep working; pass it explicitly to avoid depending on whichever
        model the global name currently points to.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the genres present in the
        vocabulary, or a float32 zero vector of length ``vector_size`` when
        none is present.
    """
    if w2v_model is None:
        # Backward-compatible fallback to the global genre model.
        w2v_model = model

    # Genres missing from the vocabulary are ignored rather than raising.
    known_vectors = [w2v_model.wv[genre] for genre in genres if genre in w2v_model.wv]

    if not known_vectors:
        # float32 keeps the fallback's dtype in line with gensim's word vectors
        # (np.zeros alone would produce float64).
        return np.zeros(w2v_model.vector_size, dtype=np.float32)
    return np.mean(known_vectors, axis=0)

# Replace each row's genre list with the mean of its genre vectors.
df['genres'] = df['genres'].map(mean_genre_vector)


In [6]:
# Remove 'title' and give the embedded genres column a name that says what it
# now contains.
df = df.drop(columns='title').rename(columns={'genres': 'genres_embedding'})

In [7]:
# Save the edited dataframe, because the Word2Vec training takes over 90 min.
#
# The embedding columns hold NumPy arrays. Written as-is, each cell would go
# through str(ndarray): whitespace-separated, wrapped over several lines, and
# unaffected by float_format (which only applies to float columns). Convert
# them to single-line, comma-separated lists with 6 decimals so the file is
# compact and can be parsed back reliably.
EMBEDDING_COLUMNS = ['genres_embedding', 'tag_embedding']


def _embedding_to_text(vector):
    """Format a 1-D numeric vector as '[v0,v1,...]' with 6 decimal places."""
    return '[' + ','.join(f'{value:.6f}' for value in vector) + ']'


# assign() builds a copy, so `df` itself keeps its array-valued columns.
df.assign(
    **{column: df[column].map(_embedding_to_text) for column in EMBEDDING_COLUMNS}
).to_csv(
    '../data/processed/preprocessed_data_with_embeddings.csv',
    index=False,
    float_format='%.6f',
)

In [11]:
# Preview the first five rows of the transformed frame.
df.head(5)

Unnamed: 0,movieId,genres_embedding,userId,rating,tag_embedding
0,1,[ 0.29733244 -0.46585637 0.01453016 0.345878...,144188,2.5,[ 7.64919892e-02 -8.15666839e-02 -7.72340521e-...
1,1,[ 0.29733244 -0.46585637 0.01453016 0.345878...,13198,3.0,[ 7.64919892e-02 -8.15666839e-02 -7.72340521e-...
2,1,[ 0.29733244 -0.46585637 0.01453016 0.345878...,49836,4.0,[ 7.64919892e-02 -8.15666839e-02 -7.72340521e-...
3,1,[ 0.29733244 -0.46585637 0.01453016 0.345878...,32754,4.0,[ 7.64919892e-02 -8.15666839e-02 -7.72340521e-...
4,1,[ 0.29733244 -0.46585637 0.01453016 0.345878...,78445,5.0,[ 7.64919892e-02 -8.15666839e-02 -7.72340521e-...


In [8]:
# Load the edited dataframe.
#
# CSV stores every embedding as text, so after read_csv the embedding columns
# contain strings. Parse them back into float32 arrays. The parser accepts
# both whitespace-separated text (as produced by str(ndarray)) and
# comma-separated lists.
def _parse_embedding(text):
    """Convert '[v0 v1 ...]' or '[v0,v1,...]' into a float32 NumPy array."""
    tokens = text.strip().strip('[]').replace(',', ' ').split()
    return np.array([float(token) for token in tokens], dtype=np.float32)


df = pd.read_csv('../data/processed/preprocessed_data_with_embeddings.csv')
for embedding_column in ('genres_embedding', 'tag_embedding'):
    df[embedding_column] = df[embedding_column].map(_parse_embedding)

In [None]:
# TODO: one-hot encode genres.
# pandas references for saving a DataFrame as pickle or parquet:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_pickle.html
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_parquet.html