In [75]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error, explained_variance_score, max_error, d2_absolute_error_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from surprise import SVD, Reader, Dataset, SVDpp, accuracy
from collections import defaultdict
import gzip
from surprise.model_selection import cross_validate, GridSearchCV

### Further Ideas:
- use the BPE tokens and create word embeddings from them
    - using Word2Vec or using pretrained which will use the raw reviews however
    - this means we can create sentiment analysis based on the food itself
- SVDpp makes latent features that are put into LightGBM model

### Loading

In [76]:
df_train = pd.read_csv('interactions_train.csv')
df_test = pd.read_csv('interactions_test.csv')
df_validation = pd.read_csv('interactions_validation.csv')
pp_recipes = pd.read_csv('PP_recipes.csv')
pp_users = pd.read_csv('PP_users.csv')

In [77]:
df_train.head()

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,2046,4684,2000-02-25,5.0,22095,44367
1,2046,517,2000-02-25,5.0,22095,87844
2,1773,7435,2000-03-13,5.0,24732,138181
3,1773,278,2000-03-13,4.0,24732,93054
4,2046,3431,2000-04-07,5.0,22095,101723


In [79]:
def feat(df):
    df['name_embds'] = df['name_tokens'].apply(lambda x: len(eval(x)) if isinstance(x, str) else 0)
    df['ingredient_embds'] = df['ingredient_tokens'].apply(lambda x: len(eval(x)) if isinstance(x, str) else 0)
    df['steps_length'] = df['steps_tokens'].apply(lambda x: len(eval(x)) if isinstance(x, str) else 0)
    return df

def add_avg_user_rating(df, user_avg_rating):
    df = df.merge(user_avg_rating, on='u', how='left')
    df['avg_rating'].fillna(df_train['rating'].mean(), inplace=True)
    return df

user_avg_rating = df_train.groupby('u')['rating'].mean().reset_index()
user_avg_rating.columns = ['u', 'avg_rating']

df_train_enriched = df_train.merge(pp_recipes, left_on='i', right_on='id')
df_train_enriched = df_train_enriched.merge(pp_users, left_on='u', right_on='u')

df_valid_enriched = df_validation.merge(pp_recipes, left_on='i', right_on='id')
df_valid_enriched = df_valid_enriched.merge(pp_users, left_on='u', right_on='u')

df_train_enriched = add_avg_user_rating(df_train_enriched, user_avg_rating)
df_valid_enriched = add_avg_user_rating(df_valid_enriched, user_avg_rating)

df_train_enriched = feat(df_train_enriched)
df_valid_enriched = feat(df_valid_enriched)

df_train_svd = df_train_enriched[['u', 'id', 'rating']]
df_valid_svd = df_valid_enriched[['u', 'id', 'rating']]

df_train_enriched.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['avg_rating'].fillna(df_train['rating'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['avg_rating'].fillna(df_train['rating'].mean(), inplace=True)


Unnamed: 0,user_id,recipe_id,date,rating,u,i_x,id,i_y,name_tokens,ingredient_tokens,...,ingredient_ids,techniques_y,items,n_items,ratings,n_ratings,avg_rating,name_embds,ingredient_embds,steps_length
0,2046,3431,2000-04-07,5.0,22095,101723,101723,5963,"[40480, 5811, 13569, 9540, 40481]","[[10444, 6020], [5541, 1322], [2507, 6878], [3...",...,"[5006, 6426, 790, 7922, 7679, 7655, 1297, 2251...","[3, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[44367, 87844, 101723, 134551]",4,"[5.0, 5.0, 5.0, 5.0]",4,5.0,5,14,148
1,2312,1232,2000-10-17,4.0,1674,15498,15498,11051,"[40480, 27102, 22942, 40481]","[[17869, 6020], [2056, 11332], [3484, 11332], ...",...,"[7557, 3440, 7982, 5010, 2131, 1318, 553, 7850...","[57, 2, 2, 17, 45, 0, 0, 11, 1, 45, 2, 5, 2, 1...","[127175, 151793, 15498, 14380, 96573, 107132, ...",153,"[5.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, ...",153,4.575163,4,12,47
2,2625,471,2000-10-18,3.0,20667,35144,35144,100513,"[40480, 9472, 744, 272, 12519, 25, 573, 10825,...","[[12519, 25, 573, 10825], [8289], [664, 260, 6...",...,"[8021, 1803, 7233, 1329, 1119, 3502, 7620, 725...","[1, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, ...","[35144, 105632, 37468]",3,"[3.0, 0.0, 5.0]",3,2.666667,11,12,105
3,2178,4366,2000-11-04,5.0,6065,50924,50924,93558,"[40480, 1265, 4405, 25388, 5639, 40481]","[[1657, 17004, 17918], [15473, 8361], [6953], ...",...,"[7887, 332, 6270, 800, 840, 4969, 5850, 6526, ...","[14, 0, 0, 3, 7, 0, 0, 3, 0, 8, 0, 0, 0, 0, 1,...","[3172, 50924, 135342, 59617, 62652, 23801, 339...",33,"[3.0, 5.0, 3.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0, ...",33,4.606061,6,9,119
4,3794,7508,2000-12-11,4.0,12353,10114,10114,143805,"[40480, 1631, 252, 6609, 15022, 13473, 10734, ...","[[15022], [1061, 494, 813, 2141], [6953], [132...",...,"[3184, 4623, 6270, 4287]","[5, 0, 0, 1, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, ...","[10114, 37830, 63320, 103800, 73972, 106137, 1...",7,"[4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0]",7,4.857143,13,4,41


In [80]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df_train_svd, reader)

trainset = data.build_full_trainset()

best_params = {
    'n_factors': 100,
    'lr_all': 0.01,
    'reg_bu': 0.3,
    'reg_yj': 0.02,
    'reg_bi': 0.03,
    'reg_pu': 0.6,
    'reg_qi': 0.4,
    'n_epochs': 100,
    "verbose": False
}

In [None]:
algo = SVDpp(**best_params)
algo.fit(trainset)

In [None]:
valid_data = Dataset.load_from_df(df_valid_svd, reader)
validset = valid_data.build_full_trainset().build_testset()

predictions = algo.test(validset)

rmse = accuracy.rmse(predictions, verbose=True)

print(f"RMSE: {rmse}")

RMSE: 1.2937
RMSE: 1.293678666337787


In [None]:
def extract_latent_features(model, df, user_col, item_col):
    user_features = {}
    item_features = {}

    for user_id in df[user_col].unique():
        user_features[user_id] = model.pu[model.trainset.to_inner_uid(user_id)]
    for item_id in df[item_col].unique():
        item_features[item_id] = model.qi[model.trainset.to_inner_iid(item_id)]

    return user_features, item_features

user_features, item_features = extract_latent_features(algo, df_train_enriched, 'u', 'i')

def add_latent_features(df, user_features, item_features):
    df['user_latent'] = df['u'].apply(lambda x: user_features.get(x, np.zeros(len(next(iter(user_features.values()))))))
    df['item_latent'] = df['i'].apply(lambda x: item_features.get(x, np.zeros(len(next(iter(item_features.values()))))))
    return df

df_train_enriched = add_latent_features(df_train_enriched, user_features, item_features)
df_valid_enriched = add_latent_features(df_valid_enriched, user_features, item_features)

# Combine latent features with metadata for LightGBM
def expand_latent_features(df):
    latent_dim = len(df['user_latent'][0])  # Assume user and item latent dimensions are the same
    for i in range(latent_dim):
        df[f'user_latent_{i}'] = df['user_latent'].apply(lambda x: x[i])
        df[f'item_latent_{i}'] = df['item_latent'].apply(lambda x: x[i])
    return df.drop(columns=['user_latent', 'item_latent'])

df_train_enriched = expand_latent_features(df_train_enriched)
df_valid_enriched = expand_latent_features(df_valid_enriched)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

features = ['u','id','name_embds', 'ingredient_embds', 'steps_length', 'calorie_level', 'avg_rating']
features = features + [col for col in df_train_enriched.columns if 'latent' in col]
target = 'rating'

X_train = df_train_enriched[features].astype("int")
y_train = df_train_enriched[target].astype("int")
X_val = df_valid_enriched[features].astype("int")
y_val = df_valid_enriched[target].astype("int")

train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# Set LightGBM parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,
    'num_leaves': 100,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 5,
    'verbose': 1
}

# Train model
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_data],
    num_boost_round=5000,
)
y_val_pred = model.predict(X_val, num_iteration=model.best_iteration)

rmse_val = mean_squared_error(y_val, y_val_pred, squared=False)

print(f"Validation RMSE: {rmse_val:.4f}")