In [13]:
import torch.nn as nn

class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        
    def forward(self, x):
        return self.layers(x)

In [14]:
# Define the model, criterion, optimizer with L2 regularization
input_size = len(train_features_array[0])
rtr_mlp = MLP()
criterion = torch.nn.L1Loss()
optimizer = torch.optim.Adam(rtr_mlp.parameters(), lr=0.001, weight_decay=1e-5)  # Added L2 regularization

# Early stopping parameters
patience = 3
best_val_loss = float('inf')
patience_counter = 0

# Training loop with early stopping
num_epochs = 100
for epoch in range(num_epochs):
    rtr_mlp.train()  # Set model to training mode
    for batch_features, batch_labels in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = rtr_mlp(batch_features)
        loss = criterion(outputs, batch_labels)

        # Backward and optimize
        loss.backward()
        optimizer.step()

    # Validation phase
    rtr_mlp.eval()  # Set model to evaluation mode
    with torch.no_grad():
        val_loss = 0
        for batch_features, batch_labels in val_loader:
            outputs = rtr_mlp(batch_features)
            val_loss += criterion(outputs, batch_labels).item()

        val_loss /= len(val_loader)

    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {loss.item():.4f}, Validation Loss: {val_loss:.4f}')

    # Check early stopping condition
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print("Stopping early due to no improvement in validation loss.")
        break

Epoch [1/100], Training Loss: 0.7409, Validation Loss: 1.2138
Epoch [2/100], Training Loss: 0.9021, Validation Loss: 1.2037
Epoch [3/100], Training Loss: 0.7961, Validation Loss: 1.3315
Epoch [4/100], Training Loss: 0.7466, Validation Loss: 1.1415
Epoch [5/100], Training Loss: 0.7925, Validation Loss: 1.2470
Epoch [6/100], Training Loss: 0.6768, Validation Loss: 1.1813
Epoch [7/100], Training Loss: 1.0099, Validation Loss: 1.1311
Epoch [8/100], Training Loss: 0.5760, Validation Loss: 1.1594
Epoch [9/100], Training Loss: 0.6223, Validation Loss: 1.1235
Epoch [10/100], Training Loss: 0.5250, Validation Loss: 1.1422
Epoch [11/100], Training Loss: 0.8373, Validation Loss: 1.1163
Epoch [12/100], Training Loss: 0.6426, Validation Loss: 1.0980
Epoch [13/100], Training Loss: 0.8911, Validation Loss: 1.2045
Epoch [14/100], Training Loss: 0.5684, Validation Loss: 1.1020
Epoch [15/100], Training Loss: 0.7418, Validation Loss: 1.1065
Stopping early due to no improvement in validation loss.


In [17]:
def prepare_features(userId, query, movieId):
    # Extract movie title from movieId
    movie_title = movies[movies['movieId'] == movieId]['title'].iloc[0]
    print("DEBUG info:\n", f'movie_title: {movie_title};\n', f'query: {query};\n', f'userId: {userId}\n')
    
    # Generate feature vector (same method as used in training)
    feature_vector = combine_features(query, userId, movie_title)
    
    # Convert the feature vector to a single numpy array before converting to a tensor
    feature_tensor = torch.tensor(np.array([feature_vector]), dtype=torch.float32)

    return feature_tensor

def predict_score(userId, query, movieId):
    rtr_mlp.eval()  # Set it to evaluation mode

    with torch.no_grad():  # No need to track gradients for prediction
        input_features = prepare_features(userId, query, movieId)
        output = rtr_mlp(input_features)
        
        # The output is a tensor, get the scalar value
        predicted_score = output.item()

    return predicted_score

In [21]:
# Example usage
userId = 1  # Example userId
query = "Toy Story (1995)"
movieId = 1  # Example movieId

    
predicted_rating = predict_score(userId, query, movieId)
print(f"Predicted Rating: {predicted_rating}")


DEBUG info:
 movie_title: Toy Story (1995);
 query: Toy Story (1995);
 userId: 1

Predicted Rating: 3.634709596633911


In [23]:
display(user_stats.head())
display(movies.head())

Unnamed: 0,userId,avg_rating,rating_stddev
0,1,4.366379,0.800048
1,2,3.948276,0.805615
2,3,2.435897,2.090642
3,4,3.555556,1.314204
4,5,3.636364,0.990441


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


## save features and model artifacts

In [27]:
# Write the DataFrame to a CSV file with header
# user_stats.to_csv('../data/feature-store/user_stats.csv', header=True, index=False)
# movies.to_csv('../data/feature-store/movies.csv', header=True, index=False)
# Save the trained model
# torch.save(rtr_mlp.state_dict(), '../model-artifacts/mlp_model.pth')

In [3]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import torch
import torch.nn as nn
import numpy as np
import pandas as pd

# Use DistilBert tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
encoder = DistilBertModel.from_pretrained('distilbert-base-uncased')

def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
    outputs = encoder(**inputs)
    # DistilBERT does not output the 'pooler_output' so use mean of last hidden state
    return outputs.last_hidden_state.mean(dim=1)

# Define the MLP model class
class MLP(nn.Module):
    def __init__(self, input_size):
        super(MLP, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        
    def forward(self, x):
        return self.layers(x)

# Function to generate the combined features
def combine_features(query, user_id, movie_title):
    query_emb = get_bert_embedding(query).detach().numpy().flatten()
    user_avg, user_std = user_stats[user_stats['userId'] == user_id][['avg_rating', 'rating_stddev']].iloc[0]
    movie_emb = get_bert_embedding(movie_title).detach().numpy().flatten()
    
    combined = np.concatenate([query_emb, [user_avg, user_std], movie_emb])
    return combined

# Function to prepare input features for the model
def prepare_features(userId, query, movieId):
    # Extract movie title from movieId
    movie_title = movies[movies['movieId'] == movieId]['title'].iloc[0]
    print("DEBUG info:\n", f'movie_title: {movie_title};\n', f'query: {query};\n', f'userId: {userId}\n')
    
    # Generate feature vector
    feature_vector = combine_features(query, userId, movie_title)
    
    # Convert the feature vector to a tensor
    feature_tensor = torch.tensor(np.array([feature_vector]), dtype=torch.float32)

    return feature_tensor


In [4]:
def predict_score(model, userId, query, movieId):
    model.eval() # Set the model to evaluation mode
    with torch.no_grad():  # No need to track gradients for prediction
        input_features = prepare_features(userId, query, movieId)
        output = model(input_features)
        
        # The output is a tensor, get the scalar value
        predicted_score = output.item()
    
    return predicted_score

## load the model

In [6]:
# Create a new instance of your MLP model
input_size = 768 * 2 + 2 # Example size: 2 * BERT embeddings + user avg and stddev

loaded_model = MLP(input_size)

# Load the model's state dictionary
loaded_model.load_state_dict(torch.load('../model-artifacts/mlp_model.pth'))

# Set the model to evaluation mode
loaded_model.eval()

MLP(
  (layers): Sequential(
    (0): Linear(in_features=1538, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=64, bias=True)
    (3): ReLU()
    (4): Linear(in_features=64, out_features=1, bias=True)
  )
)

## load features

In [9]:
user_stats = pd.read_csv('../data/feature-store/user_stats.csv')

movies = pd.read_csv('../data/feature-store/movies.csv')

## Inference 

In [10]:
user_id = 1
query = "example query"
movie_id = 101
score = predict_score(loaded_model, user_id, query, movie_id)
print("Predicted Score:", score)

DEBUG info:
 movie_title: Bottle Rocket (1996);
 query: example query;
 userId: 1

Predicted Score: 3.241020441055298
