In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error, explained_variance_score, max_error, d2_absolute_error_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from surprise import SVD, Reader, Dataset, SVDpp, accuracy
from collections import defaultdict
import gzip
from surprise.model_selection import cross_validate, GridSearchCV

### Further Ideas:
- Use the BPE tokens to build word embeddings
    - either train Word2Vec on the tokens or use pretrained embeddings (the pretrained route, however, would need the raw review text)
    - this would let us run sentiment analysis on the food itself
- Use SVD++ to produce latent features that are then fed into the LightGBM model

### Loading

In [None]:
# Interaction splits (train / test / validation), read from the current working directory.
df_train = pd.read_csv('interactions_train.csv')
df_test = pd.read_csv('interactions_test.csv')
df_validation = pd.read_csv('interactions_validation.csv')
# Recipe-level and user-level tables; both are merged into the interactions in a later cell.
pp_recipes = pd.read_csv('PP_recipes.csv')
pp_users = pd.read_csv('PP_users.csv')

In [None]:
# Preview the first rows of the training interactions.
df_train.head()

In [None]:
def feat(df):
    """Add token-count features for a recipe's name, ingredients and steps.

    Each of the ``name_tokens``, ``ingredient_tokens`` and ``steps_tokens``
    columns is expected to hold a Python literal serialized as a string
    (e.g. ``"[1, 2, 3]"``). The top-level length of the parsed literal is
    written to ``name_embds``, ``ingredient_embds`` and ``steps_length``
    respectively; non-string cells (such as NaN) yield 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three token columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the three count columns added.

    Raises
    ------
    ValueError, SyntaxError
        If a string cell is not a valid Python literal.
    """
    # Local import so the function works no matter which notebook cells have run.
    import ast

    def _token_count(serialized):
        # Missing values arrive as float NaN rather than str; count them as empty.
        if not isinstance(serialized, str):
            return 0
        # literal_eval only parses literals, so cell contents can never execute
        # code the way the previous eval() call could.
        return len(ast.literal_eval(serialized))

    df['name_embds'] = df['name_tokens'].apply(_token_count)
    df['ingredient_embds'] = df['ingredient_tokens'].apply(_token_count)
    df['steps_length'] = df['steps_tokens'].apply(_token_count)
    return df

def add_avg_user_rating(df, user_avg_rating, fill_value=None):
    """Attach each user's average rating to an interactions frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Interactions with a ``u`` (user) column.
    user_avg_rating : pandas.DataFrame
        Lookup table with columns ``u`` and ``avg_rating``.
    fill_value : float, optional
        Value used for users missing from ``user_avg_rating``. When omitted,
        the mean of the notebook-level ``df_train['rating']`` is used, which
        is what this function always did before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` is not modified) with an ``avg_rating`` column
        that contains no missing values.
    """
    if fill_value is None:
        # Backward-compatible default: global mean rating of the training split.
        fill_value = df_train['rating'].mean()

    merged = df.merge(user_avg_rating, on='u', how='left')
    # Assign the result back instead of `merged['avg_rating'].fillna(..., inplace=True)`:
    # the chained in-place form is deprecated and has no effect under pandas Copy-on-Write.
    merged['avg_rating'] = merged['avg_rating'].fillna(fill_value)
    return merged

# Per-user mean rating over the training split, as a two-column lookup (u, avg_rating).
user_avg_rating = (
    df_train
    .groupby('u')['rating']
    .mean()
    .reset_index()
    .rename(columns={'rating': 'avg_rating'})
)

# Enrichment pipeline, applied identically to the train and validation splits:
# recipe columns -> user columns -> per-user average rating -> token-count features.
# NOTE(review): interactions are joined to recipes on `i` == `id`; confirm these
# two columns really share a key space.
df_train_enriched = (
    df_train
    .merge(pp_recipes, left_on='i', right_on='id')
    .merge(pp_users, left_on='u', right_on='u')
    .pipe(add_avg_user_rating, user_avg_rating)
    .pipe(feat)
)

df_valid_enriched = (
    df_validation
    .merge(pp_recipes, left_on='i', right_on='id')
    .merge(pp_users, left_on='u', right_on='u')
    .pipe(add_avg_user_rating, user_avg_rating)
    .pipe(feat)
)

# (user, item, rating) triples in the column order handed to Surprise below.
df_train_svd = df_train_enriched.loc[:, ['u', 'id', 'rating']]
df_valid_svd = df_valid_enriched.loc[:, ['u', 'id', 'rating']]

df_train_enriched.head()

In [None]:
# Hyper-parameters forwarded verbatim to the SVD++ constructor.
best_params = dict(
    n_factors=150,
    lr_all=0.005,
    reg_all=0.1,
    n_epochs=10,
    verbose=True,
)

# Declare the rating scale as 0..5 and wrap the training triples for Surprise.
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df_train_svd, reader)

# Use every training interaction for fitting; no internal hold-out is made here.
trainset = data.build_full_trainset()

In [None]:
# Fit SVD++ on the full training set with the hyper-parameters from best_params.
algo = SVDpp(**best_params)
algo.fit(trainset)

In [None]:
# Wrap the validation triples the same way as the training data, then convert
# them into the list of test tuples that `algo.test` consumes.
valid_data = Dataset.load_from_df(df_valid_svd, reader)
validset = valid_data.build_full_trainset().build_testset()

predictions = algo.test(validset)

# verbose=False: accuracy.rmse would otherwise print the score itself and the
# explicit print below would then report the same number a second time.
rmse = accuracy.rmse(predictions, verbose=False)

print(f"RMSE: {rmse}")

In [None]:
# def extract_latent_features(model, df, user_col, item_col):
#     user_features = {}
#     item_features = {}

#     for user_id in df[user_col].unique():
#         user_features[user_id] = model.pu[model.trainset.to_inner_uid(user_id)]
#     for item_id in df[item_col].unique():
#         item_features[item_id] = model.qi[model.trainset.to_inner_iid(item_id)]

#     return user_features, item_features

# user_features, item_features = extract_latent_features(algo, df_train_enriched, 'u', 'id')

# def add_latent_features(df, user_features, item_features):
#     df['user_latent'] = df['u'].apply(lambda x: user_features.get(x, np.zeros(len(next(iter(user_features.values()))))))
#     df['item_latent'] = df['id'].apply(lambda x: item_features.get(x, np.zeros(len(next(iter(item_features.values()))))))
#     return df

# df_train_enriched = add_latent_features(df_train_enriched, user_features, item_features)
# df_valid_enriched = add_latent_features(df_valid_enriched, user_features, item_features)

# def expand_latent_features(df):
#     latent_dim = len(df['user_latent'][0])  # Assume user and item latent dimensions are the same
#     for i in range(latent_dim):
#         df[f'user_latent_{i}'] = df['user_latent'].apply(lambda x: x[i])
#         df[f'item_latent_{i}'] = df['item_latent'].apply(lambda x: x[i])
#     return df.drop(columns=['user_latent', 'item_latent'])

# df_train_enriched = expand_latent_features(df_train_enriched)
# df_valid_enriched = expand_latent_features(df_valid_enriched)

def predict_svd_ratings(model, df):
    """Add the model's estimated rating for every (user, item) row of ``df``.

    Parameters
    ----------
    model : object
        Fitted recommender exposing ``predict(user, item)`` whose result has
        an ``est`` attribute, and ``trainset.global_mean`` for the fallback.
    df : pandas.DataFrame
        Frame with ``u`` (user) and ``id`` (item) columns. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with a new ``latent_rating`` column.
    """
    def _estimate(user, item):
        # Fall back to the global mean if the model rejects the id pair.
        try:
            return model.predict(user, item).est
        except ValueError:
            return model.trainset.global_mean

    # Walk the two id columns directly instead of df.iterrows(): iterrows builds
    # a Series per row (slow on large frames) and upcasts integer ids to float
    # whenever the frame mixes int and float columns.
    df['latent_rating'] = [
        _estimate(user, item) for user, item in zip(df['u'], df['id'])
    ]
    return df

df_train_enriched = predict_svd_ratings(algo, df_train_enriched)
df_valid_enriched = predict_svd_ratings(algo, df_valid_enriched)

df_train_enriched.head()

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import ast

# Example preprocessing
def pad_sequences(sequences, maxlen, padding='post', value=0):
    """Pad or truncate token sequences to a fixed length.

    Parameters
    ----------
    sequences : sequence
        Each element is either a string holding a Python list literal
        (e.g. ``"[1, 2, 3]"``) or an already-parsed sequence of token ids.
    maxlen : int
        Width of the output; longer sequences keep only their first
        ``maxlen`` tokens.
    padding : {'post', 'pre'}
        ``'post'`` (default) left-aligns the tokens and fills the tail with
        ``value``; ``'pre'`` right-aligns them and fills the head.
    value : scalar
        Fill value for padded positions; it also determines the array dtype.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(sequences), maxlen)``.

    Raises
    ------
    ValueError
        If ``padding`` is neither ``'post'`` nor ``'pre'``.
    """
    if padding not in ('post', 'pre'):
        raise ValueError(f"padding must be 'post' or 'pre', got {padding!r}")

    padded = np.full((len(sequences), maxlen), fill_value=value)
    for idx, seq in enumerate(sequences):
        # Serialized lists are parsed safely; parsed sequences pass through.
        if isinstance(seq, str):
            seq = ast.literal_eval(seq)
        kept = list(seq)[:maxlen]
        if not kept:
            # Nothing to copy; the row stays entirely padding.
            continue
        if padding == 'post':
            padded[idx, :len(kept)] = kept
        else:
            padded[idx, maxlen - len(kept):] = kept
    return padded

# Pad name_tokens, ingredient_tokens, and steps_tokens
# Fixed sequence lengths used when padding / truncating the token columns.
max_name_len = 10
max_steps_len = 50
# The two ingredient limits below are defined but not used anywhere in this cell.
max_ingredient_len = 20  # Per ingredient
max_ingredients = 10     # Number of ingredients

# Pad name_tokens and steps_tokens (ingredient tokens are not padded here) and
# store each padded row back on pp_recipes as a plain Python list.
pp_recipes['name_tokens_padded'] = pad_sequences(pp_recipes['name_tokens'].tolist(), max_name_len).tolist()
pp_recipes['steps_tokens_padded'] = pad_sequences(pp_recipes['steps_tokens'].tolist(), max_steps_len).tolist()

# Convert data into PyTorch-friendly format
class RecipeDataset(Dataset):
    """Map-style dataset serving padded name and steps token ids per recipe.

    ``recipes`` must provide the columns ``name_tokens_padded`` and
    ``steps_tokens_padded``; each cell is a list of token ids (assumed to be
    of equal length within a column so they stack into one array).
    """

    def __init__(self, recipes):
        # Stack the per-row lists of each column, then convert to int64 tensors.
        name_matrix = np.array(list(recipes['name_tokens_padded']))
        steps_matrix = np.array(list(recipes['steps_tokens_padded']))
        self.name_tokens = torch.tensor(name_matrix, dtype=torch.long)
        self.steps_tokens = torch.tensor(steps_matrix, dtype=torch.long)

    def __len__(self):
        # One sample per recipe row.
        return len(self.name_tokens)

    def __getitem__(self, idx):
        # Both tensors are indexed with the same position.
        return dict(
            name_tokens=self.name_tokens[idx],
            steps_tokens=self.steps_tokens[idx],
        )

class RecipeEmbeddingModel(nn.Module):
    """Predict a scalar from mean-pooled name and steps token embeddings.

    Parameters
    ----------
    vocab_size : int
        Number of rows in each embedding table; every token id must be
        smaller than this.
    embedding_dim : int
        Width of each token embedding.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # Separate tables for the two token streams; id 0 is the padding slot.
        self.name_embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.steps_embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # forward() concatenates exactly two pooled embeddings (name + steps),
        # so the head takes 2 * embedding_dim inputs. The previous factor of 3
        # made every forward pass fail with a shape mismatch.
        self.fc = nn.Sequential(
            nn.Linear(embedding_dim * 2, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        )

    def forward(self, name_tokens, steps_tokens, ingredient_tokens=None):
        """Return predictions of shape ``(batch, 1)``.

        ``name_tokens`` and ``steps_tokens`` are integer tensors of shape
        ``(batch, seq_len)``. ``ingredient_tokens`` is accepted for backward
        compatibility with three-argument callers but is not used, because
        the model has no ingredient embedding.
        """
        # Average over the sequence axis; padded positions are included in the mean.
        name_embeds = self.name_embedding(name_tokens).mean(dim=1)
        steps_embeds = self.steps_embedding(steps_tokens).mean(dim=1)
        combined = torch.cat([name_embeds, steps_embeds], dim=1)
        return self.fc(combined)


In [None]:
# Wrap the padded recipe tokens and serve them in shuffled mini-batches of 64.
dataset = RecipeDataset(pp_recipes)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# NOTE(review): vocab_size is hard-coded; nn.Embedding requires every token id
# to be < vocab_size — confirm against the largest id in the token columns.
vocab_size = 50000
embedding_dim = 128
learning_rate = 0.001
num_epochs = 10

# Model, optimizer, loss
# NOTE: the name `model` is rebound to the LightGBM booster in the final cell.
model = RecipeEmbeddingModel(vocab_size, embedding_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Regression target per recipe: the mean rating that recipe received in the
# training interactions. Previously every batch was compared against *all*
# ratings in df_train, which neither matches the batch size nor lines up with
# the recipes in the batch.
# NOTE(review): interactions are matched to recipes via df_train['i'] ==
# pp_recipes['id'], the same key pairing the merge cells use — confirm it.
recipe_mean_rating = df_train.groupby('i')['rating'].mean()
recipe_targets = torch.tensor(
    pp_recipes['id'].map(recipe_mean_rating).to_numpy(dtype='float32')
)

# Only recipes with at least one training rating have a target to learn from.
# Positions index into `dataset`, which was built from pp_recipes in row order.
trainable_idx = torch.nonzero(~torch.isnan(recipe_targets), as_tuple=True)[0]
batch_size = dataloader.batch_size

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    num_batches = 0

    # Reshuffle the rated recipes every epoch and walk them in mini-batches;
    # indexing by position keeps tokens and targets aligned.
    shuffled = trainable_idx[torch.randperm(len(trainable_idx))]
    for start in range(0, len(shuffled), batch_size):
        batch_idx = shuffled[start:start + batch_size]
        name_tokens = dataset.name_tokens[batch_idx]
        steps_tokens = dataset.steps_tokens[batch_idx]
        targets = recipe_targets[batch_idx]

        # Third positional argument is the (unused) ingredient token slot.
        outputs = model(name_tokens, steps_tokens, None)

        # squeeze(1) drops only the feature axis, so a final batch of size 1
        # still yields a 1-D tensor matching `targets`.
        loss = criterion(outputs.squeeze(1), targets)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # max(..., 1) guards the average when no recipe has a training rating.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss/max(num_batches, 1):.4f}")


In [None]:
# Reload embedding layers
name_embedding = model.name_embedding
steps_embedding = model.steps_embedding
# ingredient_embedding = model.ingredient_embedding

# Set embeddings to evaluation mode (freeze weights)
name_embedding.eval()
steps_embedding.eval()
# ingredient_embedding.eval()

In [None]:
# Function to compute embedding features
def compute_embeddings(recipe, name_emb_layer, steps_emb_layer, ingredient_emb_layer=None):
    """Return one recipe's mean-pooled name and steps embeddings, concatenated.

    Parameters
    ----------
    recipe : mapping
        Row-like object providing ``name_tokens_padded`` and
        ``steps_tokens_padded`` (sequences of integer token ids).
    name_emb_layer, steps_emb_layer : callable
        Embedding layers mapping a 1-D LongTensor of ids to a
        ``(seq_len, dim)`` tensor.
    ingredient_emb_layer : optional
        Unused. Kept (now optional) so that both three- and four-argument
        callers work; previously it was required although never read.

    Returns
    -------
    numpy.ndarray
        1-D array: the name embedding mean followed by the steps embedding mean.
    """
    # Convert tokens to tensors
    name_tokens = torch.tensor(recipe['name_tokens_padded'], dtype=torch.long)
    steps_tokens = torch.tensor(recipe['steps_tokens_padded'], dtype=torch.long)

    # Inference only: no autograd graph is needed, and .numpy() requires
    # tensors that do not track gradients.
    with torch.no_grad():
        name_emb = name_emb_layer(name_tokens).mean(dim=0).numpy()
        steps_emb = steps_emb_layer(steps_tokens).mean(dim=0).numpy()

    return np.concatenate([name_emb, steps_emb])


# Apply to all recipes
# Embed every recipe row; the two embedding layers are forwarded to
# compute_embeddings as extra positional arguments after the row itself.
embedding_features = pp_recipes.apply(
    compute_embeddings,
    axis=1,
    args=(name_embedding, steps_embedding),
)

# Expand the per-recipe vectors into one column per dimension, named
# embedding_feature_0, embedding_feature_1, ...
embedding_features_df = (
    pd.DataFrame(embedding_features.tolist())
    .add_prefix('embedding_feature_')
)

# Append the embedding columns to the recipe table (rows are aligned on index).
pp_recipes = pd.concat([pp_recipes, embedding_features_df], axis=1)

In [None]:
def add_recipe_embeddings(df, recipe_embeddings):
    """Left-join the recipe embedding columns onto an interactions frame.

    Only the ``id`` key and the ``embedding_feature_*`` columns of
    ``recipe_embeddings`` are joined. Joining the whole recipe table onto a
    frame that already carries recipe columns would duplicate them with
    ``_x`` / ``_y`` suffixes (including ``id`` itself).

    Parameters
    ----------
    df : pandas.DataFrame
        Interactions. If it already has an ``id`` column, that is the join
        key; otherwise its ``i`` column is matched against the recipes'
        ``id``, as before.
    recipe_embeddings : pandas.DataFrame
        Recipe table with an ``id`` column and ``embedding_feature_*`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the embedding columns appended; rows without a
        matching recipe get NaN. ``df`` itself is not modified.
    """
    embedding_cols = [
        col for col in recipe_embeddings.columns
        if str(col).startswith('embedding_feature_')
    ]
    slim = recipe_embeddings[['id'] + embedding_cols]

    # Drop embedding columns left over from an earlier call so that re-running
    # the cell replaces them instead of producing suffixed duplicates.
    stale = [col for col in embedding_cols if col in df.columns]
    if stale:
        df = df.drop(columns=stale)

    if 'id' in df.columns:
        return df.merge(slim, on='id', how='left')
    return df.merge(slim, left_on='i', right_on='id', how='left')

# Join the recipe table (which now includes the embedding feature columns)
# onto both enriched interaction frames.
df_train_enriched = add_recipe_embeddings(df_train_enriched, pp_recipes)
df_valid_enriched = add_recipe_embeddings(df_valid_enriched, pp_recipes)

In [None]:
import lightgbm as lgb

# Hand-picked base features plus every column whose name contains 'latent'
# (e.g. the SVD++ rating estimate added by predict_svd_ratings).
base_features = ['u', 'id', 'name_embds', 'ingredient_embds', 'steps_length', 'calorie_level', 'avg_rating']
latent_features = [col for col in df_train_enriched.columns if 'latent' in col]
features = base_features + latent_features  # + embedding_feature_cols
target = 'rating'

# Keep the native numeric dtypes. Casting the feature matrix to int truncated
# the float features (avg_rating, latent_rating) to whole numbers and threw
# away most of their signal.
X_train = df_train_enriched[features]
y_train = df_train_enriched[target]
X_val = df_valid_enriched[features]
y_val = df_valid_enriched[target]

train_data = lgb.Dataset(X_train, label=y_train)
# reference=train_data ties the validation set to the training Dataset.
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# Set LightGBM parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,
    'num_leaves': 63,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}

# Train model
# NOTE: `model` previously referred to the PyTorch embedding model; it is
# rebound to the LightGBM booster from here on.
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_data],
    num_boost_round=3000,
)

# NOTE(review): no early-stopping callback is configured in this call, so
# confirm that best_iteration selects the intended number of boosting rounds.
y_val_pred = model.predict(X_val, num_iteration=model.best_iteration)

# root_mean_squared_error (already imported at the top of the notebook)
# replaces the deprecated mean_squared_error(..., squared=False).
rmse_val = root_mean_squared_error(y_val, y_val_pred)

print(f"Validation RMSE: {rmse_val:.4f}")