In [62]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error, explained_variance_score, max_error, d2_absolute_error_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from surprise import SVD, Reader, Dataset, SVDpp, accuracy
from collections import defaultdict
import gzip
from surprise.model_selection import cross_validate, GridSearchCV

In [63]:
df_train = pd.read_csv('interactions_train.csv')
df_test = pd.read_csv('interactions_test.csv')
df_validation = pd.read_csv('interactions_validation.csv')
pp_recipes = pd.read_csv('PP_recipes.csv')
pp_users = pd.read_csv('PP_users.csv')

In [64]:
def feat(df):
    df['name_embds'] = df['name_tokens'].apply(lambda x: len(eval(x)) if isinstance(x, str) else 0)
    df['ingredient_embds'] = df['ingredient_tokens'].apply(lambda x: len(eval(x)) if isinstance(x, str) else 0)
    df['steps_length'] = df['steps_tokens'].apply(lambda x: len(eval(x)) if isinstance(x, str) else 0)
    return df

def add_avg_user_rating(df, user_avg_rating):
    df = df.merge(user_avg_rating, on='u', how='left')
    df['avg_rating'].fillna(df_train['rating'].mean(), inplace=True)
    return df

user_avg_rating = df_train.groupby('u')['rating'].mean().reset_index()
user_avg_rating.columns = ['u', 'avg_rating']

df_train_enriched = df_train.merge(pp_recipes, left_on='i', right_on='id')
df_train_enriched = df_train_enriched.merge(pp_users, left_on='u', right_on='u')

df_valid_enriched = df_validation.merge(pp_recipes, left_on='i', right_on='id')
df_valid_enriched = df_valid_enriched.merge(pp_users, left_on='u', right_on='u')

df_train_enriched = add_avg_user_rating(df_train_enriched, user_avg_rating)
df_valid_enriched = add_avg_user_rating(df_valid_enriched, user_avg_rating)

df_train_enriched = feat(df_train_enriched)
df_valid_enriched = feat(df_valid_enriched)

df_train_svd = df_train_enriched[['u', 'id', 'rating']]
df_valid_svd = df_valid_enriched[['u', 'id', 'rating']]

df_train_enriched.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['avg_rating'].fillna(df_train['rating'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['avg_rating'].fillna(df_train['rating'].mean(), inplace=True)


Unnamed: 0,user_id,recipe_id,date,rating,u,i_x,id,i_y,name_tokens,ingredient_tokens,...,ingredient_ids,techniques_y,items,n_items,ratings,n_ratings,avg_rating,name_embds,ingredient_embds,steps_length
0,2046,3431,2000-04-07,5.0,22095,101723,101723,5963,"[40480, 5811, 13569, 9540, 40481]","[[10444, 6020], [5541, 1322], [2507, 6878], [3...",...,"[5006, 6426, 790, 7922, 7679, 7655, 1297, 2251...","[3, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[44367, 87844, 101723, 134551]",4,"[5.0, 5.0, 5.0, 5.0]",4,5.0,5,14,148
1,2312,1232,2000-10-17,4.0,1674,15498,15498,11051,"[40480, 27102, 22942, 40481]","[[17869, 6020], [2056, 11332], [3484, 11332], ...",...,"[7557, 3440, 7982, 5010, 2131, 1318, 553, 7850...","[57, 2, 2, 17, 45, 0, 0, 11, 1, 45, 2, 5, 2, 1...","[127175, 151793, 15498, 14380, 96573, 107132, ...",153,"[5.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, ...",153,4.575163,4,12,47
2,2625,471,2000-10-18,3.0,20667,35144,35144,100513,"[40480, 9472, 744, 272, 12519, 25, 573, 10825,...","[[12519, 25, 573, 10825], [8289], [664, 260, 6...",...,"[8021, 1803, 7233, 1329, 1119, 3502, 7620, 725...","[1, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, ...","[35144, 105632, 37468]",3,"[3.0, 0.0, 5.0]",3,2.666667,11,12,105
3,2178,4366,2000-11-04,5.0,6065,50924,50924,93558,"[40480, 1265, 4405, 25388, 5639, 40481]","[[1657, 17004, 17918], [15473, 8361], [6953], ...",...,"[7887, 332, 6270, 800, 840, 4969, 5850, 6526, ...","[14, 0, 0, 3, 7, 0, 0, 3, 0, 8, 0, 0, 0, 0, 1,...","[3172, 50924, 135342, 59617, 62652, 23801, 339...",33,"[3.0, 5.0, 3.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0, ...",33,4.606061,6,9,119
4,3794,7508,2000-12-11,4.0,12353,10114,10114,143805,"[40480, 1631, 252, 6609, 15022, 13473, 10734, ...","[[15022], [1061, 494, 813, 2141], [6953], [132...",...,"[3184, 4623, 6270, 4287]","[5, 0, 0, 1, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, ...","[10114, 37830, 63320, 103800, 73972, 106137, 1...",7,"[4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0]",7,4.857143,13,4,41


In [65]:
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(df_train_svd, reader)

trainset = data.build_full_trainset()

best_params = {
    'n_factors': 150,
    'lr_all': 0.005,
    'reg_all': 0.1,
    'n_epochs': 10,
    "verbose": True
}
algo = SVDpp(**best_params)
algo.fit(trainset)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x1696a9d90>

In [66]:
valid_data = Dataset.load_from_df(df_valid_svd, reader)
validset = valid_data.build_full_trainset().build_testset()

predictions = algo.test(validset)

rmse = accuracy.rmse(predictions, verbose=True)

print(f"RMSE: {rmse}")

RMSE: 1.3062
RMSE: 1.3061806434343644


In [67]:
def predict_svd_ratings(model, df):
    predictions = []
    for _, row in df.iterrows():
        user = row['u']
        item = row['id']
        try:
            pred = model.predict(user, item).est
        except ValueError:
            pred = model.trainset.global_mean
        predictions.append(pred)
    df['latent_rating'] = predictions
    return df

df_train_enriched = predict_svd_ratings(algo, df_train_enriched)
df_valid_enriched = predict_svd_ratings(algo, df_valid_enriched)

df_train_enriched.head()

Unnamed: 0,user_id,recipe_id,date,rating,u,i_x,id,i_y,name_tokens,ingredient_tokens,...,techniques_y,items,n_items,ratings,n_ratings,avg_rating,name_embds,ingredient_embds,steps_length,latent_rating
0,2046,3431,2000-04-07,5.0,22095,101723,101723,5963,"[40480, 5811, 13569, 9540, 40481]","[[10444, 6020], [5541, 1322], [2507, 6878], [3...",...,"[3, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...","[44367, 87844, 101723, 134551]",4,"[5.0, 5.0, 5.0, 5.0]",4,5.0,5,14,148,4.868973
1,2312,1232,2000-10-17,4.0,1674,15498,15498,11051,"[40480, 27102, 22942, 40481]","[[17869, 6020], [2056, 11332], [3484, 11332], ...",...,"[57, 2, 2, 17, 45, 0, 0, 11, 1, 45, 2, 5, 2, 1...","[127175, 151793, 15498, 14380, 96573, 107132, ...",153,"[5.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 4.0, ...",153,4.575163,4,12,47,4.375769
2,2625,471,2000-10-18,3.0,20667,35144,35144,100513,"[40480, 9472, 744, 272, 12519, 25, 573, 10825,...","[[12519, 25, 573, 10825], [8289], [664, 260, 6...",...,"[1, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, ...","[35144, 105632, 37468]",3,"[3.0, 0.0, 5.0]",3,2.666667,11,12,105,4.208406
3,2178,4366,2000-11-04,5.0,6065,50924,50924,93558,"[40480, 1265, 4405, 25388, 5639, 40481]","[[1657, 17004, 17918], [15473, 8361], [6953], ...",...,"[14, 0, 0, 3, 7, 0, 0, 3, 0, 8, 0, 0, 0, 0, 1,...","[3172, 50924, 135342, 59617, 62652, 23801, 339...",33,"[3.0, 5.0, 3.0, 5.0, 4.0, 5.0, 5.0, 5.0, 5.0, ...",33,4.606061,6,9,119,4.266259
4,3794,7508,2000-12-11,4.0,12353,10114,10114,143805,"[40480, 1631, 252, 6609, 15022, 13473, 10734, ...","[[15022], [1061, 494, 813, 2141], [6953], [132...",...,"[5, 0, 0, 1, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, ...","[10114, 37830, 63320, 103800, 73972, 106137, 1...",7,"[4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0]",7,4.857143,13,4,41,4.350914


In [78]:
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler

# features = ['u','id','name_embds', 'ingredient_embds', 'steps_length', 'calorie_level', 'avg_rating']
features = ['u','id', 'avg_rating','n_items', 'steps_length', 'name_embds', 'ingredient_embds']
# latent_features = [col for col in df_train_enriched.columns if 'latent' in col]
# embedding_feature_cols = [col for col in df_train.columns if 'embedding_feature_' in col]
features = features \
        # + latent_features \
        # + embedding_feature_cols
print(features)
target = 'rating'

X_train = df_train_enriched[features].astype("int")
y_train = df_train_enriched[target].astype("int")

# Prepare validation data (X_val and y_val)
X_val = df_valid_enriched[features].astype("int")
y_val = df_valid_enriched[target].astype("int")

# scaler = StandardScaler()

# X_train['latent_rating'] = scaler.fit_transform(X_train[['latent_rating']])

# X_val['latent_rating'] = scaler.transform(X_val[['latent_rating']])

train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

# Set LightGBM parameters
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting_type': 'gbdt',
    'learning_rate': 0.01,
    'num_leaves': 103,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 1
}

# Train model
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_data],
    num_boost_round=1000,
)
y_val_pred = model.predict(X_val, num_iteration=model.best_iteration)

rmse_val = mean_squared_error(y_val, y_val_pred, squared=False)

print(f"Validation RMSE: {rmse_val:.4f}")

['u', 'id', 'avg_rating', 'n_items', 'steps_length', 'name_embds', 'ingredient_embds']
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001666 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1037
[LightGBM] [Info] Number of data points in the train set: 319572, number of used features: 7
[LightGBM] [Info] Start training from score 4.573426
Validation RMSE: 1.3016




In [79]:
importance = model.feature_importance(importance_type='split')  # or 'gain'

feature_names = model.feature_name()
import pandas as pd
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importance
}).sort_values(by='Importance', ascending=False)

print(feature_importance_df)


            Feature  Importance
0                 u       22472
3           n_items       20416
4      steps_length       20229
1                id       19177
6  ingredient_embds        8645
5        name_embds        7216
2        avg_rating        3845
