In [2]:
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from transformers import BertTokenizer, BertModel
import torch
import sklearn

In [10]:
nRowsRead = None # specify 'None' if want to read whole file
# movie_metadata.csv has 5044 rows in reality, but we are only loading/previewing the first 1000 rows
movies = pd.read_csv('../data/movie-lens-small/movies.csv', delimiter=',', nrows = nRowsRead)
ratings = pd.read_csv('../data/movie-lens-small/ratings.csv', delimiter=',', nrows = nRowsRead)
tags = pd.read_csv('../data/movie-lens-small/tags.csv', delimiter=',', nrows = nRowsRead)


# download movie lens images (https://github.com/antonsteenvoorden/ml1m-images)
image_url = pd.read_csv('../data/ml1m_images.csv', delimiter=',', nrows = nRowsRead)

In [11]:
pd.set_option('display.max_columns', None)

display(movies.head(5))
display(ratings.head(5))
display(tags.head(5))
display(image_url.head(5))

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


Unnamed: 0,item_id,image
0,3186,https://m.media-amazon.com/images/M/MV5BNzdjZD...
1,1270,https://m.media-amazon.com/images/M/MV5BZmU0M2...
2,1721,https://m.media-amazon.com/images/M/MV5BMDdmZG...
3,1022,https://m.media-amazon.com/images/M/MV5BMWE3Nz...
4,2340,https://m.media-amazon.com/images/M/MV5BNTc0Mz...


In [12]:
movies = movies.merge(image_url, left_on="movieId", right_on="item_id", how="left").drop('item_id', axis=1)


In [13]:
movies.head()

Unnamed: 0,movieId,title,genres,image
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
1,2,Jumanji (1995),Adventure|Children|Fantasy,https://m.media-amazon.com/images/M/MV5BZTk2Zm...
2,3,Grumpier Old Men (1995),Comedy|Romance,https://m.media-amazon.com/images/M/MV5BMjQxM2...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,https://m.media-amazon.com/images/M/MV5BYzcyMD...
4,5,Father of the Bride Part II (1995),Comedy,https://m.media-amazon.com/images/M/MV5BOTEyNz...


In [23]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [24]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Use DistilBert tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
encoder = DistilBertModel.from_pretrained('distilbert-base-uncased')

def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    outputs = encoder(**inputs)
    # DistilBERT does not output the 'pooler_output' so use mean of last hidden state
    return outputs.last_hidden_state.mean(dim=1)

def calculate_cosine_similarity(query, title):
    query_emb = get_bert_embedding(query)
    title_emb = get_bert_embedding(title)
    return cosine_similarity(query_emb.detach().numpy(), title_emb.detach().numpy())[0][0]

# Example usage
similarity = calculate_cosine_similarity("Your query here", "Movie title here")


## Feature engineering

In [14]:
merged_data = pd.merge(ratings, movies, on='movieId')
user_stats = ratings.groupby('userId')['rating'].agg(['mean', 'std']).reset_index()
user_stats.rename(columns={'mean': 'avg_rating', 'std': 'rating_stddev'}, inplace=True)


In [26]:
import numpy as np

# Assuming you have functions to calculate embeddings and user statistics
# Example function signatures:
# get_bert_embedding(text) -> np.array
# get_user_stats(user_id) -> (average_rating, std_dev)



In [27]:
import random
import string

def generate_random_string(length):
    # Choose from letters
    characters = string.ascii_letters # + string.digits + string.punctuation
    # Generate a random string of the specified length
    random_string = ''.join(random.choice(characters) for i in range(length))
    return random_string

# Example usage: Generate a random string of length 10
random_string = generate_random_string(2)
print(random_string)


YJ


In [28]:
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split

# Sort the DataFrame by 'timestamp'
sorted_data = merged_data.sort_values(by='timestamp')

# Calculate the index for splitting the data (80% for training)
split_index = int(len(sorted_data) * 0.8)

# Split the sorted DataFrame
train_data = sorted_data.iloc[:split_index]
val_data = sorted_data.iloc[split_index:]

# Function to generate the combined features
def combine_features(query, user_id, movie_title):
    query_emb = get_bert_embedding(query).detach().numpy().flatten()
    user_avg, user_std = user_stats[user_stats['userId'] == user_id][['avg_rating', 'rating_stddev']].iloc[0]
    movie_emb = get_bert_embedding(movie_title).detach().numpy().flatten()
    
    combined = np.concatenate([query_emb, [user_avg, user_std], movie_emb])
    return combined

# Function to generate features and labels lists from the DataFrame
def generate_arrays(data):
    features_list = []
    labels_list = []

    for _, row in data.iterrows():
        user_id = row['userId']
        movie_title = row['title']
        random_string = generate_random_string(2)
        query = random_string + ' ' + row['title']  # Replace this with your actual query or a method to generate queries

        # Compute feature vector
        feature_vector = combine_features(query, user_id, movie_title)
        features_list.append(feature_vector)

        # Get the corresponding label (rating)
        label = row['rating']
        labels_list.append(label)
        
    return np.array(features_list), np.array(labels_list)

In [29]:
train_features_array, train_labels_array = generate_arrays(train_data)
val_features_array, val_labels_array = generate_arrays(val_data)

In [30]:
train_features = torch.tensor(train_features_array, dtype=torch.float32)
train_labels = torch.tensor(train_labels_array, dtype=torch.float32).unsqueeze(1)
val_features = torch.tensor(val_features_array, dtype=torch.float32)
val_labels = torch.tensor(val_labels_array, dtype=torch.float32).unsqueeze(1)

In [31]:
train_dataset = TensorDataset(train_features, train_labels)
val_dataset = TensorDataset(val_features, val_labels)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

## MLP

In [32]:
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        
    def forward(self, x):
        return self.layers(x)

In [14]:
# Define the model, criterion, optimizer with L2 regularization
input_size = len(train_features_array[0])
rtr_mlp = MLP()
criterion = torch.nn.L1Loss()
optimizer = torch.optim.Adam(rtr_mlp.parameters(), lr=0.001, weight_decay=1e-5)  # Added L2 regularization

# Early stopping parameters
patience = 3
best_val_loss = float('inf')
patience_counter = 0

# Training loop with early stopping
num_epochs = 100
for epoch in range(num_epochs):
    rtr_mlp.train()  # Set model to training mode
    for batch_features, batch_labels in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = rtr_mlp(batch_features)
        loss = criterion(outputs, batch_labels)

        # Backward and optimize
        loss.backward()
        optimizer.step()

    # Validation phase
    rtr_mlp.eval()  # Set model to evaluation mode
    with torch.no_grad():
        val_loss = 0
        for batch_features, batch_labels in val_loader:
            outputs = rtr_mlp(batch_features)
            val_loss += criterion(outputs, batch_labels).item()

        val_loss /= len(val_loader)

    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {loss.item():.4f}, Validation Loss: {val_loss:.4f}')

    # Check early stopping condition
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print("Stopping early due to no improvement in validation loss.")
        break

Epoch [1/100], Training Loss: 0.7409, Validation Loss: 1.2138
Epoch [2/100], Training Loss: 0.9021, Validation Loss: 1.2037
Epoch [3/100], Training Loss: 0.7961, Validation Loss: 1.3315
Epoch [4/100], Training Loss: 0.7466, Validation Loss: 1.1415
Epoch [5/100], Training Loss: 0.7925, Validation Loss: 1.2470
Epoch [6/100], Training Loss: 0.6768, Validation Loss: 1.1813
Epoch [7/100], Training Loss: 1.0099, Validation Loss: 1.1311
Epoch [8/100], Training Loss: 0.5760, Validation Loss: 1.1594
Epoch [9/100], Training Loss: 0.6223, Validation Loss: 1.1235
Epoch [10/100], Training Loss: 0.5250, Validation Loss: 1.1422
Epoch [11/100], Training Loss: 0.8373, Validation Loss: 1.1163
Epoch [12/100], Training Loss: 0.6426, Validation Loss: 1.0980
Epoch [13/100], Training Loss: 0.8911, Validation Loss: 1.2045
Epoch [14/100], Training Loss: 0.5684, Validation Loss: 1.1020
Epoch [15/100], Training Loss: 0.7418, Validation Loss: 1.1065
Stopping early due to no improvement in validation loss.


In [17]:
def prepare_features(userId, query, movieId):
    # Extract movie title from movieId
    movie_title = movies[movies['movieId'] == movieId]['title'].iloc[0]
    print("DEBUG info:\n", f'movie_title: {movie_title};\n', f'query: {query};\n', f'userId: {userId}\n')
    
    # Generate feature vector (same method as used in training)
    feature_vector = combine_features(query, userId, movie_title)
    
    # Convert the feature vector to a single numpy array before converting to a tensor
    feature_tensor = torch.tensor(np.array([feature_vector]), dtype=torch.float32)

    return feature_tensor

def predict_score(userId, query, movieId):
    rtr_mlp.eval()  # Set it to evaluation mode

    with torch.no_grad():  # No need to track gradients for prediction
        input_features = prepare_features(userId, query, movieId)
        output = rtr_mlp(input_features)
        
        # The output is a tensor, get the scalar value
        predicted_score = output.item()

    return predicted_score

In [21]:
# Example usage
userId = 1  # Example userId
query = "Toy Story (1995)"
movieId = 1  # Example movieId

    
predicted_rating = predict_score(userId, query, movieId)
print(f"Predicted Rating: {predicted_rating}")


DEBUG info:
 movie_title: Toy Story (1995);
 query: Toy Story (1995);
 userId: 1

Predicted Rating: 3.634709596633911


In [15]:
display(user_stats.head())
display(movies.head())

Unnamed: 0,userId,avg_rating,rating_stddev
0,1,4.366379,0.800048
1,2,3.948276,0.805615
2,3,2.435897,2.090642
3,4,3.555556,1.314204
4,5,3.636364,0.990441


Unnamed: 0,movieId,title,genres,image
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
1,2,Jumanji (1995),Adventure|Children|Fantasy,https://m.media-amazon.com/images/M/MV5BZTk2Zm...
2,3,Grumpier Old Men (1995),Comedy|Romance,https://m.media-amazon.com/images/M/MV5BMjQxM2...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,https://m.media-amazon.com/images/M/MV5BYzcyMD...
4,5,Father of the Bride Part II (1995),Comedy,https://m.media-amazon.com/images/M/MV5BOTEyNz...


In [20]:
movies

Unnamed: 0,movieId,title,genres,image,event_timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,https://m.media-amazon.com/images/M/MV5BMDU2ZW...,2024-01-19 07:47:20.894504
1,2,Jumanji (1995),Adventure|Children|Fantasy,https://m.media-amazon.com/images/M/MV5BZTk2Zm...,2024-01-19 07:47:20.894504
2,3,Grumpier Old Men (1995),Comedy|Romance,https://m.media-amazon.com/images/M/MV5BMjQxM2...,2024-01-19 07:47:20.894504
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,https://m.media-amazon.com/images/M/MV5BYzcyMD...,2024-01-19 07:47:20.894504
4,5,Father of the Bride Part II (1995),Comedy,https://m.media-amazon.com/images/M/MV5BOTEyNz...,2024-01-19 07:47:20.894504
...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,,2024-01-19 07:47:20.894504
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,,2024-01-19 07:47:20.894504
9739,193585,Flint (2017),Drama,,2024-01-19 07:47:20.894504
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,,2024-01-19 07:47:20.894504


## save features and model artifacts

In [16]:
import pandas as pd
from datetime import datetime

user_stats['event_timestamp'] = datetime.now()
movies['event_timestamp'] = datetime.now()

In [17]:
# Write the DataFrame to a CSV file with header
# user_stats.to_csv('../data/feature-store/user_stats.csv', header=True, index=False)
# movies.to_csv('../data/feature-store/movies.csv', header=True, index=False)

prefix = '../feast-feature-store/main_pangolin/feature_repo/data'
# Write the DataFrame to a Parquet file
user_stats.to_parquet(f'{prefix}/user_stats.parquet', index=False)
movies.to_parquet(f'{prefix}/movies.parquet', index=False)


In [28]:
# Save the trained model
torch.save(rtr_mlp.state_dict(), '../model-artifacts/mlp_model.pth')