In [17]:
import pandas as pd
import numpy as np
import re
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.functional import relu
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertModel, BertTokenizer
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
pd.set_option('display.max_columns',200,'display.max_rows',300, 'display.max_colwidth',None)
warnings.filterwarnings("ignore")

# Parameters

In [2]:
# embedding size for nn embedding
embedding_size=32

# batch size for BERT
batch_size=8

# movie category embedding size
category_embedding_size = 8

# transformer chunk size
chunk_size = 100


# EDA

In [3]:
# movies table
raw_movies_info_df=pd.read_csv('../interview_recsys_by_Zhouzi_Hu/data/ml-1m/movies.dat',\
                               sep='\t',\
                               names=["movie_info"],\
                               encoding='latin-1'\
                            )


for index,row in raw_movies_info_df.iterrows():
    l=row["movie_info"].split('::')
    res_l=list(filter(None,l))
    raw_title=res_l[1]
    title=re.sub(r'\([^)]*\)', '',res_l[1])
    title=title.split(', ')
    reversed_title=title[::-1]
    raw_movies_info_df.at[index,'movie_id']=res_l[0]  
    raw_movies_info_df.at[index,'Movie_name']=' '.join(reversed_title)
    raw_movies_info_df.at[index,'category']=res_l[2]

raw_movies_info_df['year']=raw_movies_info_df["movie_info"].str.extract(r'\((\d+)\)')
raw_movies_info_df['original_name']=raw_movies_info_df["movie_info"].str.extract(r'\((?![0-9]+\))([^)]+)\)')
movies_info_df=raw_movies_info_df[['movie_id','Movie_name','category','year','original_name']]
movies_info_df = movies_info_df.astype(str).applymap(lambda x: x.strip())
movies_info_df = movies_info_df.astype(str).applymap(lambda x: x.replace("\\n", ""))
movies_info_df = movies_info_df.astype(str).applymap(lambda x: x.replace("  ", " ")) 
movies_info_df['Movie_name']=movies_info_df['Movie_name'].str.replace("  ", " ")
movies_info_df["movie_id"] = movies_info_df["movie_id"].apply(int)

genres = ["Action", "Adventure", "Animation", "Children's", "Comedy", "Crime"]
genres += ["Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical"]
genres += ["Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]

for genre in genres:
    movies_info_df[genre] = movies_info_df["category"].apply(
        lambda values: int(genre in values.split("|"))
    )

movies_info_df.head()

Unnamed: 0,movie_id,Movie_name,category,year,original_name,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story,Animation|Children's|Comedy,1995,,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,Adventure|Children's|Fantasy,1995,,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,Comedy|Romance,1995,,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,Comedy|Drama,1995,,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II,Comedy,1995,,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
# Rating table 
ratings=pd.read_csv(\
    '../interview_2_recsys 2/data/ml-1m/ratings.dat',\
    sep='::',\
    names=["user_id", "movie_id", "rating", "unix_timestamp"],
    encoding='latin-1'\
)
ratings["rating"] = ratings["rating"].apply(lambda x: float(x))

del ratings["unix_timestamp"]
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5.0
1,1,661,3.0
2,1,914,3.0
3,1,3408,4.0
4,1,2355,5.0


In [5]:
# User table
users=pd.read_csv(\
    '../interview_2_recsys 2/data/ml-1m/users.dat.txt',\
    sep='::',\
    names=["user_id", "sex", "age_group", "occupation", "zip_code"],
    encoding='latin-1'\
)

users["age_group"] = users["age_group"].apply(lambda x: f"age_{x}")
users["occupation"] = users["occupation"].apply(lambda x: f"job_{x}")
users["zip_code"] = users["zip_code"].apply(lambda x: f"zc_{x}")

users.head(3)

Unnamed: 0,user_id,sex,age_group,occupation,zip_code
0,1,F,age_1,job_10,zc_48067
1,2,M,age_56,job_16,zc_70072
2,3,M,age_25,job_15,zc_55117


In [6]:
# Movie meta table
raw_movies_meta_df=pd.read_csv('../interview_2_recsys 2/data/ml-1m/movies_metadata.csv')
raw_movies_meta_df['Year']=raw_movies_meta_df['release_date'].str.split('-').str.get(0).str.strip()
raw_movies_meta_df['title']=raw_movies_meta_df['title'].str.strip()
raw_movies_meta_df['overview']=raw_movies_meta_df['overview']\
    .str.strip()\
        .str.replace("\\n", "")\
            .str.replace("  ", " ")

In [7]:
# Grouped movies_meta table based on "Year" and "title"
grouped=raw_movies_meta_df.groupby(['title','Year'])['overview'].first().reset_index()
grouped.head(3)

Unnamed: 0,title,Year,overview
0,!Women Art Revolution,2010,"Through intimate interviews, provocative art, and rare, historical film and video footage, this feature documentary reveals how art addressing political consequences of discrimination and violence, the Feminist Art Revolution radically transformed the art and culture of our times."
1,#1 Cheerleader Camp,2010,A pair of horny college guys get summer jobs at a sexy cheerleader camp.
2,#Horror,2015,"Inspired by actual events, a group of 12 year old girls face a night of horror when the compulsive addiction of an online social media game turns a moment of cyber bullying into a night of insanity."


In [8]:
# join meta table with movies.dat for overview info
merged=movies_info_df.merge(grouped,left_on=['Movie_name','year'],right_on=['title','Year'],how='left')\
     .merge(grouped,left_on=['original_name','year'],right_on=['title','Year'],how='left'
)

merged['overview']=np.where(\
    merged['overview_x'].isna(),\
        merged['overview_y'],\
            merged['overview_x']\
                )
                
movie_df_p1=merged.query('overview.notna()')[\
    ['movie_id','Movie_name','year','category','original_name','overview']\
        ]

# there a great portion (700) of movie records from meta table has wrong release year, so gonna need a second concat
grouped2=raw_movies_meta_df.groupby('title')['overview'].first().reset_index()
movie_df_p2=merged.query('overview.isna()')[['movie_id','Movie_name','year','category','original_name']]\
    .merge(grouped2,left_on='Movie_name',right_on='title',how='left').drop(columns='title')

movie_df_r=pd.concat([movie_df_p1,movie_df_p2],ignore_index=True)
movie_df_r.head(2)

Unnamed: 0,movie_id,Movie_name,year,category,original_name,overview
0,1,Toy Story,1995,Animation|Children's|Comedy,,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences."
1,2,Jumanji,1995,Adventure|Children's|Fantasy,,"When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures."


# Embedding user&movie interactions

In [9]:
# Merge user ratings with user information
user_ratings_with_info_df=pd.merge(users, ratings, on='user_id')
user_ratings_with_info_df=user_ratings_with_info_df.merge(\
    movies_info_df[['movie_id', 'year', 'Action',
       'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']],on='movie_id')
user_ratings_with_info_df['year']=user_ratings_with_info_df['year'].apply(int)


# Unique years and create a dictionary to map them to indices
unique_years = user_ratings_with_info_df['year'].sort_values().unique()
year_to_index = {year: index for index, year in enumerate(unique_years)}

# Map 'year' to indices
user_ratings_with_info_df['year_index'] = user_ratings_with_info_df['year'].map(year_to_index)
user_ratings_with_info_df.head(3)

Unnamed: 0,user_id,sex,age_group,occupation,zip_code,movie_id,rating,year,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year_index
0,1,F,age_1,job_10,zc_48067,1193,5.0,1975,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,55
1,2,M,age_56,job_16,zc_70072,1193,5.0,1975,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,55
2,12,M,age_25,job_12,zc_32793,1193,4.0,1975,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,55


In [10]:
# Create a dictionary to map user IDs to their corresponding features
user_info_dict = users.set_index('user_id')\
    [['age_group', 'occupation', 'sex','zip_code']].\
        to_dict(orient='index'\
                )

# Unique user and movie id
unique_user_ids = user_ratings_with_info_df['user_id'].unique()
unique_movie_ids = user_ratings_with_info_df['movie_id'].unique()

# Convert data to PyTorch tensors
user_ids_tensor = torch.tensor(user_ratings_with_info_df['user_id'].values, dtype=torch.long)
movie_ids_tensor = torch.tensor(user_ratings_with_info_df['movie_id'].values, dtype=torch.long)
ratings_tensor = torch.tensor(user_ratings_with_info_df['rating'].values, dtype=torch.float)

# User Embedding Layer
user_embedding_layer = nn.Embedding(\
    num_embeddings=max(unique_user_ids) + 1, \
        embedding_dim=embedding_size\
            )

user_embedded = user_embedding_layer(user_ids_tensor)

# Movie Embedding Layer
movie_embedding_layer = nn.Embedding(\
    num_embeddings=max(unique_movie_ids) + 1, \
        embedding_dim=embedding_size\
            )

movie_embedded = movie_embedding_layer(movie_ids_tensor)

# Movie Category Embedding Layer
category_columns = [\
    'Action', 'Adventure', 'Animation', 'Children\'s', \
        'Comedy', 'Crime', 'Documentary', 'Drama',
                    'Fantasy', 'Film-Noir', 'Horror', 'Musical', \
                        'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'\
                            ]

movie_categories_tensor = torch.tensor(\
    user_ratings_with_info_df[category_columns].values, \
        dtype=torch.float\
            )

num_categories = movie_categories_tensor.shape[1]  
movie_category_embedding_layer = nn.Embedding(\
    num_embeddings=2, embedding_dim=category_embedding_size\
        ) 

category_embeddings = movie_category_embedding_layer(movie_categories_tensor.long())

# Aggregate category embeddings for each user
user_category_embeddings = category_embeddings.sum(dim=1)

# Movie Year Embedding Layer
year_tensor = torch.tensor(user_ratings_with_info_df['year_index'].values, dtype=torch.long)
year_embedding_layer = nn.Embedding(num_embeddings=len(unique_years), embedding_dim=embedding_size)
year_embedding = year_embedding_layer(year_tensor)

# Combine user and movie embeddings
embedding_input = torch.cat([user_embedded, movie_embedded,user_category_embeddings,year_embedding], dim=1)

# Convert categorical features to numeric representations
age_tensor = torch.tensor(\
    user_ratings_with_info_df['age_group'].astype('category').cat.codes.values, dtype=torch.long)
occupation_tensor = torch.tensor(\
    user_ratings_with_info_df['occupation'].astype('category').cat.codes.values, dtype=torch.long)
gender_tensor = torch.tensor(\
    user_ratings_with_info_df['sex'].astype('category').cat.codes.values, dtype=torch.long)
zipcode_tensor = torch.tensor(\
    user_ratings_with_info_df['zip_code'].astype('category').cat.codes.values, dtype=torch.long)

# Define embedding layers for categorical features
embedding_layer_age = nn.Embedding(\
    num_embeddings=user_ratings_with_info_df['age_group'].nunique(), \
        embedding_dim=embedding_size\
            )

embedding_layer_occupation = nn.Embedding(\
    num_embeddings=user_ratings_with_info_df['occupation'].nunique(), \
        embedding_dim=embedding_size\
            )

embedding_layer_gender = nn.Embedding(\
    num_embeddings=user_ratings_with_info_df['sex'].nunique(), \
        embedding_dim=embedding_size\
            )

embedding_layer_zipcode = nn.Embedding(\
    num_embeddings=user_ratings_with_info_df['zip_code'].nunique(), \
        embedding_dim=embedding_size\
            )

# Convert categorical features to numeric representations using embedding layers
age_embedding = embedding_layer_age(age_tensor)
occupation_embedding = embedding_layer_occupation(occupation_tensor)
gender_embedding = embedding_layer_gender(gender_tensor)
zipcode_embedding = embedding_layer_zipcode(zipcode_tensor)

# Concatenate or sum embeddings
categorical_embeddings = torch.cat(\
    [age_embedding, occupation_embedding, gender_embedding, zipcode_embedding], \
        dim=1\
            )

# Combine with embedding_input
combined_input = torch.cat([embedding_input, categorical_embeddings], dim=1)


# GPU Setting

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
torch.cuda.empty_cache()

Using device: cuda


# Transformer Layer

In [13]:
chunk_size = 100
num_chunks = combined_input.size(0) // chunk_size + 1
agg_result = torch.zeros_like(combined_input)
# Transformer Layer
transformer_layer = nn.Transformer(d_model=combined_input.size(1), nhead=2, num_encoder_layers=1)

# Process input in chunks
for i in tqdm(range(num_chunks), desc="Processing Chunks", unit="chunk"):
    start_idx = i * chunk_size
    end_idx = (i + 1) * chunk_size
    input_chunk = combined_input[start_idx:end_idx]

    # Pass the chunk through the transformer layer
    transformer_output_chunk = transformer_layer(input_chunk, input_chunk)

    # Aggregate the transformer_output_chunk using sum
    agg_result[start_idx:end_idx] += transformer_output_chunk

# Calculate the mean of aggregated results
final_result = agg_result / num_chunks



Processing Chunks:   5%|▌         | 517/10003 [00:12<03:47, 41.77chunk/s]


KeyboardInterrupt: 

# BERT Embedding

In [10]:
# BERT input df
df_set_BERT=movie_df_r.query('overview.notna()')[['movie_id','overview']].reset_index(drop=True)
df_set_BERT.shape

(3457, 2)

In [19]:
# Load pre-trained BERT model and tokenizer
model_name='bert-base-uncased'
tokenizer=BertTokenizer.from_pretrained(model_name)
model=BertModel.from_pretrained(model_name)
model.eval()
model.to('cuda')

# Tokenize overview col
df_set_BERT['tokens_BERT'] = df_set_BERT['overview'].apply(\
    lambda x: tokenizer.encode(x, add_special_tokens=True)\
)


# Assuming df_set_BERT['tokens_BERT'] contains the tokenized sequences
max_seq_length = df_set_BERT['tokens_BERT'].apply(len).max()

# Move tokens to GPU
tokens_on_gpu = [torch.tensor(tokens).to('cuda') for tokens in df_set_BERT['tokens_BERT']]

# Pad sequences and create padded_tokens tensor on GPU
padded_tokens = pad_sequence([\
    torch.cat([tokens, \
        torch.zeros(max_seq_length - len(tokens)).to('cuda')]) for tokens in tokens_on_gpu], \
            batch_first=True\
                )

# Convert padded_tokens to Long type
padded_tokens = padded_tokens.type(torch.LongTensor).to('cuda')

# Create DataLoader for batching
data_loader = DataLoader(TensorDataset(padded_tokens), batch_size=batch_size, shuffle=False)

# Initialize an empty list to store hidden states
all_hidden_states = []

# Process batches
with torch.no_grad(),tqdm(total=len(data_loader)) as pbar:
    for batch_tokens in data_loader:
        # Ensure batch_tokens is a PyTorch tensor
        batch_tokens = batch_tokens[0].to('cuda')

        outputs = model(batch_tokens)

        # Extract all 12 layers of hidden states
        hidden_states = outputs.last_hidden_state

        # Get the last 2 layers and take the average along the layer-axis
        last_2_layer_hidden_states = torch.mean(hidden_states[-2:], axis=0)

        # Mean pooling along the token-axis
        sentence_embedding = torch.mean(last_2_layer_hidden_states, axis=1)

        # Append the embeddings to the list
        all_hidden_states.append(sentence_embedding)

        # Update the progress bar
        pbar.update(1)

# Concatenate the embeddings
df_set_BERT['sentence_embedding'] = torch.cat(all_hidden_states, dim=0)[:len(df_set_BERT)].tolist()

100%|██████████| 433/433 [00:14<00:00, 30.14it/s]


# 3xDense Layers with ReLU

In [None]:
# Convert BERT embeddings to tensor
bert_embeddings = torch.tensor(df_set_BERT['sentence_embedding'].tolist()).to('cuda')

# Concatenate BERT embeddings and transformer output
combined_features = torch.cat([bert_embeddings, final_result], dim=1)

# Define three dense layers with ReLU activation
dense_layer1 = nn.Linear(combined_features.size(1), 512)
dense_layer2 = nn.Linear(512, 256)
dense_layer3 = nn.Linear(256, 1)

# Move dense layers to GPU
dense_layer1.to('cuda')
dense_layer2.to('cuda')
dense_layer3.to('cuda')

# Pass through dense layers with ReLU
output = relu(dense_layer1(combined_features))
output = relu(dense_layer2(output))
output = dense_layer3(output)

# Evaluation

In [None]:
# Define predicted rating
predictions=output.cpu().detach().numpy().flatten()

# Define actual rating
actual=user_ratings_with_info_df['ratings'].values

# Adding to GPU
predicted_ratings_tensor = torch.tensor(predictions, dtype=torch.float).to('cuda')
actual_ratings_tensor = torch.tensor(actual, dtype=torch.float).to('cuda')

# Calculate Mean Squared Error (MSE)
mse_loss = nn.MSELoss()(predicted_ratings_tensor, actual_ratings_tensor)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(predictions, actual)

print(f'Mean Squared Error (MSE): {mse_loss.item()}')
print(f'Mean Absolute Error (MAE): {mae}')