In [97]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error, explained_variance_score, max_error, d2_absolute_error_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from surprise import SVD, Reader, Dataset, SVDpp, accuracy
from collections import defaultdict
import gzip
from surprise.model_selection import cross_validate, GridSearchCV

### Further Ideas:
- use the BPE tokens and create word embeddings from them
    - using Word2Vec or using pretrained which will use the raw reviews however
    - this means we can create sentiment analysis based on the food itself
- SVDpp makes latent features that are put into LightGBM model

### Loading

In [98]:
df_train = pd.read_csv('interactions_train.csv')
df_test = pd.read_csv('interactions_test.csv')
df_validation = pd.read_csv('interactions_validation.csv')
pp_recipes = pd.read_csv('PP_recipes.csv')
pp_users = pd.read_csv('PP_users.csv')

In [99]:
df_train.head()

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,2046,4684,2000-02-25,5.0,22095,44367
1,2046,517,2000-02-25,5.0,22095,87844
2,1773,7435,2000-03-13,5.0,24732,138181
3,1773,278,2000-03-13,4.0,24732,93054
4,2046,3431,2000-04-07,5.0,22095,101723


In [None]:
def feat(df):
    df['name_embds'] = df['name_tokens'].apply(lambda x: len(eval(x)) if isinstance(x, str) else 0)
    df['ingredient_embds'] = df['ingredient_tokens'].apply(lambda x: len(eval(x)) if isinstance(x, str) else 0)
    df['steps_length'] = df['steps_tokens'].apply(lambda x: len(eval(x)) if isinstance(x, str) else 0)
    return df

def add_avg_user_rating(df, user_avg_rating):
    df = df.merge(user_avg_rating, on='u', how='left')
    df['avg_rating'].fillna(df_train['rating'].mean(), inplace=True)
    return df

user_avg_rating = df_train.groupby('u')['rating'].mean().reset_index()
user_avg_rating.columns = ['u', 'avg_rating']

df_train_enriched = df_train.merge(pp_recipes, left_on='i', right_on='id')
df_train_enriched = df_train_enriched.merge(pp_users, left_on='u', right_on='u')

df_valid_enriched = df_validation.merge(pp_recipes, left_on='i', right_on='id')
df_valid_enriched = df_valid_enriched.merge(pp_users, left_on='u', right_on='u')

df_train_enriched = add_avg_user_rating(df_train_enriched, user_avg_rating)
df_valid_enriched = add_avg_user_rating(df_valid_enriched, user_avg_rating)

df_train_enriched = feat(df_train_enriched)
df_valid_enriched = feat(df_valid_enriched)

df_train_svd = df_train_enriched[['u', 'id', 'rating']]
df_valid_svd = df_valid_enriched[['u', 'id', 'rating']]

df_train_enriched.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['avg_rating'].fillna(df_train['rating'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['avg_rating'].fillna(df_train['rating'].mean(), inplace=True)


In [None]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df_train_svd, reader)

trainset = data.build_full_trainset()

best_params = {
    'n_factors': 100,
    'lr_all': 0.005,
    'reg_bu': 0.3,
    'reg_yj': 0.02,
    'reg_bi': 0.03,
    'reg_pu': 0.6,
    'reg_qi': 0.4,
    'n_epochs': 150,
    "verbose": True
}

In [None]:
algo = SVDpp(**best_params)
algo.fit(trainset)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x17b9e9370>

In [None]:
valid_data = Dataset.load_from_df(df_valid_svd, reader)
validset = valid_data.build_full_trainset().build_testset()

predictions = algo.test(validset)

rmse = accuracy.rmse(predictions, verbose=True)

print(f"RMSE: {rmse}")

RMSE: 1.3065
RMSE: 1.306490910166436


In [None]:
def extract_latent_features(model, df, user_col, item_col):
    user_features = {}
    item_features = {}

    for user_id in df[user_col].unique():
        user_features[user_id] = model.pu[model.trainset.to_inner_uid(user_id)]
    for item_id in df[item_col].unique():
        item_features[item_id] = model.qi[model.trainset.to_inner_iid(item_id)]

    return user_features, item_features

user_features, item_features = extract_latent_features(algo, df_train_enriched, 'u', 'id')

def add_latent_features(df, user_features, item_features):
    df['user_latent'] = df['u'].apply(lambda x: user_features.get(x, np.zeros(len(next(iter(user_features.values()))))))
    df['item_latent'] = df['id'].apply(lambda x: item_features.get(x, np.zeros(len(next(iter(item_features.values()))))))
    return df

df_train_enriched = add_latent_features(df_train_enriched, user_features, item_features)
df_valid_enriched = add_latent_features(df_valid_enriched, user_features, item_features)

# Combine latent features with metadata for LightGBM
def expand_latent_features(df):
    latent_dim = len(df['user_latent'][0])  # Assume user and item latent dimensions are the same
    for i in range(latent_dim):
        df[f'user_latent_{i}'] = df['user_latent'].apply(lambda x: x[i])
        df[f'item_latent_{i}'] = df['item_latent'].apply(lambda x: x[i])
    return df.drop(columns=['user_latent', 'item_latent'])

df_train_enriched = expand_latent_features(df_train_enriched)
df_valid_enriched = expand_latent_features(df_valid_enriched)

  df['user_latent'] = df['u'].apply(lambda x: user_features.get(x, np.zeros(len(next(iter(user_features.values()))))))
  df['item_latent'] = df['id'].apply(lambda x: item_features.get(x, np.zeros(len(next(iter(item_features.values()))))))
  df['user_latent'] = df['u'].apply(lambda x: user_features.get(x, np.zeros(len(next(iter(user_features.values()))))))
  df['item_latent'] = df['id'].apply(lambda x: item_features.get(x, np.zeros(len(next(iter(item_features.values()))))))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

features = ['u','id','name_embds', 'ingredient_embds', 'steps_length', 'calorie_level', 'avg_rating']
features = features + [col for col in df_train_enriched.columns if 'latent' in col]
target = 'rating'

X_train = df_train_enriched[features].astype("int")
y_train = df_train_enriched[target].astype("int")
X_val = df_valid_enriched[features].astype("int")
y_val = df_valid_enriched[target].astype("int")

train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# Set LightGBM parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,
    'num_leaves': 100,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 5,
    'verbose': 1
}

# Train model
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_data],
    num_boost_round=1000,
)
y_val_pred = model.predict(X_val, num_iteration=model.best_iteration)

rmse_val = mean_squared_error(y_val, y_val_pred, squared=False)

print(f"Validation RMSE: {rmse_val:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001637 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 796
[LightGBM] [Info] Number of data points in the train set: 319572, number of used features: 7
[LightGBM] [Info] Start training from score 4.573426
Validation RMSE: 1.3055


