In [None]:
!pip3 install google-generativeai

In [None]:
import numpy as np
import pandas as pd
import math
import google.generativeai as genai
import os

# Paste your API key between the quotes for a local run (never commit it).
# Leave it empty to keep a GOOGLE_API_KEY that is already exported in the
# environment instead of overwriting it with an empty string.
_pasted_api_key = ''
if _pasted_api_key or "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = _pasted_api_key

In [None]:
# Load the cleaned ratings. `rating_df` stays as loaded; all filtering and
# encoding below happens on the copy.
rating_df = pd.read_csv('../data/cleaned/cleaned_user_rating.csv')
rating_df_copy = rating_df.copy()

# Movie metadata. The first list holds the columns this notebook has always
# loaded; the second holds the extra fields that the cell building the 'info'
# text reads (duration, rating, oscar, country). They must be loaded here too,
# otherwise that cell raises KeyError on a fresh kernel.
required_movie_columns = ['movie_id', 'title', 'genres', 'overview', 'director', 'stars', 'img_url']
extra_movie_columns = ['duration', 'rating', 'oscar', 'country']
wanted_movie_columns = set(required_movie_columns + extra_movie_columns)

# A callable `usecols` selects whichever wanted columns the file contains, so a
# file lacking one of the extra columns still loads; the required columns are
# checked explicitly to keep the original strictness.
movies_df = pd.read_csv(
    '../data/cleaned/cleaned_movies_details.csv',
    usecols=lambda column: column in wanted_movie_columns,
)
missing_movie_columns = [c for c in required_movie_columns if c not in movies_df.columns]
if missing_movie_columns:
    raise ValueError(
        f"cleaned_movies_details.csv is missing required columns: {missing_movie_columns}"
    )

# Only keep users with >= 5 ratings
user_rating_counts = rating_df_copy['user_id'].value_counts()
users_with_5_or_more_ratings = user_rating_counts[user_rating_counts >= 5].index
rating_df_copy = rating_df_copy[rating_df_copy['user_id'].isin(users_with_5_or_more_ratings)]
rating_df_copy = rating_df_copy.reset_index(drop=True)

# ID to number: category codes give every distinct id a dense integer.
# NOTE(review): movie codes are assigned from the ids present in the filtered
# ratings, not from row positions in `movies_df` -- confirm downstream code
# does not treat them as interchangeable.
rating_df_copy['user_id_number'] = rating_df_copy['user_id'].astype('category').cat.codes.values
rating_df_copy['movie_id_number'] = rating_df_copy['movie_id'].astype('category').cat.codes.values

# One row per rating: [user_id_number, movie_id_number, rating]
all_data = rating_df_copy[['user_id_number', 'movie_id_number', 'rating']].values

In [None]:
def split_ratings_by_user(train_data, random_state=420):
    """
    Split every user's ratings into train, validation and test sets (~60:20:20).

    Each user's rows are shuffled with a generator seeded by ``random_state``
    and then cut into three consecutive slices. Shuffling (rather than sorting
    by rating) keeps the split from putting a user's lowest ratings in train
    and the highest in test.

    Parameters
    ----------
    train_data : array-like, 2-D
        One row per rating. Column 0 is the user identifier used for grouping;
        the remaining columns are carried through unchanged.
    random_state : int or None, default 420
        Seed for the per-user shuffle. The same seed and input always produce
        the same split. The global NumPy random state is not touched.

    Returns
    -------
    tuple of (train, val, test) np.ndarray
        A row-wise partition of the input. For a user with ``n`` rows, train
        receives ``ceil(0.6 * n)`` rows and val up to ``ceil(0.2 * n)`` rows;
        one row is always held back for test when any row is left after the
        train slice (the two ceilings alone can use up all ``n`` rows, e.g.
        for n = 6 or 7). Empty input yields three empty arrays.
    """
    data = np.asarray(train_data)
    if len(data) == 0:
        # np.vstack cannot stack an empty list, so answer empty input directly.
        return data.copy(), data.copy(), data.copy()

    rng = np.random.default_rng(random_state)

    # Group rows by user with a single stable sort instead of building one
    # boolean mask per user.
    order = np.argsort(data[:, 0], kind="stable")
    grouped = data[order]
    _, starts = np.unique(grouped[:, 0], return_index=True)
    stops = np.append(starts[1:], len(grouped))

    train_data_list = []
    val_data_list = []
    test_data_list = []

    for start, stop in zip(starts, stops):
        user_ratings = grouped[start:stop]
        total_ratings = len(user_ratings)
        user_ratings = user_ratings[rng.permutation(total_ratings)]

        train_end = math.ceil(total_ratings * 0.6)
        val_end = train_end + math.ceil(total_ratings * 0.2)
        # Reserve the last row for test, but never move the boundary back
        # into the train slice.
        val_end = max(train_end, min(val_end, total_ratings - 1))

        train_data_list.append(user_ratings[:train_end])
        val_data_list.append(user_ratings[train_end:val_end])
        test_data_list.append(user_ratings[val_end:])

    return (
        np.vstack(train_data_list),
        np.vstack(val_data_list),
        np.vstack(test_data_list),
    )

In [None]:
# Split every user's ratings, then store the three arrays in one compressed
# archive under the keys 'train', 'val' and 'test'.
train_data, val_data, test_data = split_ratings_by_user(all_data)

# NOTE(review): the inputs are read from '../data/cleaned/' but the result is
# written to '../cleaned_data/' -- confirm this target directory is intended.
# np.savez_compressed does not create missing directories, so create it first.
os.makedirs('../cleaned_data', exist_ok=True)
np.savez_compressed('../cleaned_data/data.npz', train=train_data, val=val_data, test=test_data)

In [None]:
# Placeholder written into each metadata column wherever a movie has no value.
missing_value_defaults = {
    'genres': 'Unknown',
    'director': 'Unknown',
    'stars': 'Unknown',
    'duration': 0,
    'rating': 0.0,
    'oscar': 0,
    'country': 'Unknown',
    'overview': 'No overview available',
}
for column_name, placeholder in missing_value_defaults.items():
    movies_df[column_name] = movies_df[column_name].fillna(placeholder)

# One labelled piece per field, each carrying its own line break; the numeric
# fields are converted to text before being concatenated.
info_pieces = [
    'Genres: ' + movies_df['genres'] + '\n',
    'Director: ' + movies_df['director'] + '\n',
    'Cast: ' + movies_df['stars'] + '\n',
    'Duration: ' + movies_df['duration'].astype(str) + ' minutes \n',
    'Average rating: ' + movies_df['rating'].astype(str) + '\n',
    'Number of oscars: ' + movies_df['oscar'].astype(str) + '\n',
    'Country: ' + movies_df['country'] + '\n',
    'Overview: ' + movies_df['overview'],
]

# Glue the pieces together in order to form the per-movie description.
info_text = info_pieces[0]
for piece in info_pieces[1:]:
    info_text = info_text + piece
movies_df['info'] = info_text

In [None]:
# Send every movie's 'info' text to the embedding model and keep the vectors
# returned under the 'embedding' key of the response.
info_list = movies_df['info'].to_list()
embedding_response = genai.embed_content(
    content=info_list,
    model="models/text-embedding-004",
)
overview_embedding_list = embedding_response['embedding']

# Store the vectors as one array under the key 'embeds'.
embedding_matrix = np.array(overview_embedding_list)
np.savez('embeddings.npz', embeds=embedding_matrix)