# Data import

In [53]:
!gdown https://drive.google.com/uc?id=1lNlp9iRAiYfDxwishtN0tONqjI4FZCIs

Downloading...
From: https://drive.google.com/uc?id=1lNlp9iRAiYfDxwishtN0tONqjI4FZCIs
To: /content/data.zip
100% 211M/211M [00:07<00:00, 28.8MB/s]


In [54]:
!unzip data.zip

Archive:  data.zip
   creating: data/
  inflating: data/RAW_interactions.csv  
  inflating: data/RAW_recipes.csv    
  inflating: data/interactions_test.csv  
  inflating: data/interactions_train.csv  


In [56]:
import pandas as pd

Path = "./data/"

# Read interactions_test.csv
df_interactions_test = pd.read_csv(Path + 'interactions_test.csv')

# Read interactions_train.csv
df_interactions_train = pd.read_csv(Path + 'interactions_train.csv')

# Read RAW_interactions.csv
df_raw_interactions = pd.read_csv(Path + 'RAW_interactions.csv')

# Read RAW_recipes.csv
df_raw_recipes = pd.read_csv(Path + 'RAW_recipes.csv')


In [57]:
# Show range of user_id and recipe_id for df_interactions_test
print("df_interactions_test:")
print("user_id range:", df_interactions_test['user_id'].min(), "-", df_interactions_test['user_id'].max())
print("recipe_id range:", df_interactions_test['recipe_id'].min(), "-", df_interactions_test['recipe_id'].max())
print()

# Show range of user_id and recipe_id for df_interactions_train
print("df_interactions_train:")
print("user_id range:", df_interactions_train['user_id'].min(), "-", df_interactions_train['user_id'].max())
print("recipe_id range:", df_interactions_train['recipe_id'].min(), "-", df_interactions_train['recipe_id'].max())
print()

# Show range of user_id and recipe_id for df_raw_interactions
print("df_raw_interactions:")
print("user_id range:", df_raw_interactions['user_id'].min(), "-", df_raw_interactions['user_id'].max())
print("recipe_id range:", df_raw_interactions['recipe_id'].min(), "-", df_raw_interactions['recipe_id'].max())


df_interactions_test:
user_id range: 1533 - 2002254807
recipe_id range: 120 - 537716

df_interactions_train:
user_id range: 1533 - 2002312797
recipe_id range: 38 - 537458

df_raw_interactions:
user_id range: 1533 - 2002372706
recipe_id range: 38 - 537716


In [4]:
# Delete 'u' and 'i' columns from df_interactions_test
df_interactions_test.drop(['u', 'i'], axis=1, inplace=True)

# Delete 'u' and 'i' columns from df_interactions_train
df_interactions_train.drop(['u', 'i'], axis=1, inplace=True)

In [5]:
# List of DataFrames with names
dataframes = [
    ("df_interactions_test", df_interactions_test),
    ("df_interactions_train", df_interactions_train),
    ("df_raw_interactions", df_raw_interactions),
    ("df_raw_recipes", df_raw_recipes)
]

# Apply describe() and print shape for each DataFrame
for name, df in dataframes:
    print(f"DataFrame: {name}")
    print(f"Shape: {df.shape}")
    print(df.describe())
    print('\n')  # Add a newline for separation

DataFrame: df_interactions_test
Shape: (12455, 4)
            user_id      recipe_id        rating
count  1.245500e+04   12455.000000  12455.000000
mean   2.912689e+07  209323.124528      4.213087
std    2.334357e+08  135001.832923      1.338503
min    1.533000e+03     120.000000      0.000000
25%    1.698420e+05   94616.000000      4.000000
50%    3.829540e+05  195040.000000      5.000000
75%    8.016370e+05  314928.500000      5.000000
max    2.002255e+09  537716.000000      5.000000


DataFrame: df_interactions_train
Shape: (698901, 4)
            user_id      recipe_id         rating
count  6.989010e+05  698901.000000  698901.000000
mean   1.247694e+07  156173.409849       4.574090
std    1.525031e+08  126594.880211       0.959022
min    1.533000e+03      38.000000       0.000000
25%    1.059880e+05   53169.000000       4.000000
50%    2.301020e+05  116484.000000       5.000000
75%    4.801950e+05  234516.000000       5.000000
max    2.002313e+09  537458.000000       5.000000


Dat

In [6]:
# Print first 5 rows for each DataFrame
for name, df in dataframes:
    print(f"DataFrame: {name}")
    print(f"Shape: {df.shape}")
    print(df.head(5))
    print('\n')  # Add a newline for separation

DataFrame: df_interactions_test
Shape: (12455, 4)
   user_id  recipe_id        date  rating
0     8937      44551  2005-12-23     4.0
1    56680     126118  2006-10-07     4.0
2   349752     219596  2008-04-12     0.0
3   628951      82783  2007-11-13     2.0
4    92816     435013  2013-07-31     3.0


DataFrame: df_interactions_train
Shape: (698901, 4)
   user_id  recipe_id        date  rating
0     2046       4684  2000-02-25     5.0
1     2046        517  2000-02-25     5.0
2     1773       7435  2000-03-13     5.0
3     1773        278  2000-03-13     4.0
4     2046       3431  2000-04-07     5.0


DataFrame: df_raw_interactions
Shape: (1132367, 5)
   user_id  recipe_id        date  rating  \
0    38094      40893  2003-02-17       4   
1  1293707      40893  2011-12-21       5   
2     8937      44394  2002-12-01       4   
3   126440      85009  2010-02-27       5   
4    57222      85009  2011-10-01       5   

                                              review  
0  Great with

# Data preprocessing

In [7]:
ratings = df_interactions_train
ratings.shape

(698901, 4)

In [8]:
# Delete 'date' column from df_interactions_train
ratings = ratings.drop(['date'], axis=1)
ratings.head(5)

Unnamed: 0,user_id,recipe_id,rating
0,2046,4684,5.0
1,2046,517,5.0
2,1773,7435,5.0
3,1773,278,4.0
4,2046,3431,5.0


In [9]:
nombre_utilisateurs = ratings['user_id'].nunique()
nombre_utilisateurs

25076

In [10]:
# Calculate the number of votes per recipe
recipe_votes = ratings.groupby('recipe_id')['rating'].count()

# Sort recipes by the number of votes in descending order
top_rated_recipes = recipe_votes.sort_values(ascending=False)

# Select the top 200 rated recipes
top_200_recipes = top_rated_recipes.head(200)

# Calculate the number of votes per user
user_votes = ratings.groupby('user_id')['rating'].count()

# Sort users by the number of votes in descending order
top_voting_users = user_votes.sort_values(ascending=False)

# Select the top 4000 voting users
top_200_users = top_voting_users.head(4000)

# Filter the ratings DataFrame based on the selected recipes and users
trainset = ratings[
    ratings['recipe_id'].isin(top_200_recipes.index) &
    ratings['user_id'].isin(top_200_users.index)
]


In [11]:
trainset.shape

(29900, 3)

In [39]:
# testset = df_interactions_test.drop(['date'], axis=1)
# testset.shape

(12455, 3)

In [40]:
# testset = testset[testset['user_id'].isin(trainset['user_id']) & testset['recipe_id'].isin(trainset['recipe_id'])]


In [42]:
test_ratio = 0.2
testset = trainset.copy()
test_size = int(len(trainset) * test_ratio)  # Set the desired size of the test set
testset = testset.sample(test_size, replace=False, random_state=42)  # Randomly sample the test set

In [43]:
assert set(testset.user_id.unique()).issubset(trainset.user_id.unique())
assert set(testset.recipe_id.unique()).issubset(trainset.recipe_id.unique())

# Model

In [44]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from statistics import mean

In [45]:
user_list = trainset.user_id.unique()
item_list = trainset.recipe_id.unique()
user2id = {w: i for i, w in enumerate(user_list)}
item2id = {w: i for i, w in enumerate(item_list)}

In [46]:
class Ratings_Datset(Dataset):
    def __init__(self, df):
        self.df = df.reset_index()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        user = user2id[self.df['user_id'][idx]]
        user = torch.tensor(user, dtype=torch.long)
        item = item2id[self.df['recipe_id'][idx]]
        item = torch.tensor(item, dtype=torch.long)
        rating = torch.tensor(self.df['rating'][idx], dtype=torch.float)
        return user, item, rating


trainloader = DataLoader(Ratings_Datset(trainset), batch_size=512, shuffle=True ,num_workers=2)
testloader = DataLoader(Ratings_Datset(testset), batch_size=64, num_workers=2)

In [47]:
class NCF(nn.Module):

    def __init__(self, n_users, n_items, n_factors=8):
        super().__init__()
        self.user_embeddings = torch.nn.Embedding(n_users, n_factors)
        self.item_embeddings = torch.nn.Embedding(n_items, n_factors)
        self.predictor = torch.nn.Sequential(
            nn.Linear(in_features=n_factors*2, out_features=64),
            nn.Linear(in_features=64, out_features=32),
            nn.Linear(in_features=32, out_features=1),
            nn.Sigmoid()
        )


    def forward(self, user, item):


        u = self.user_embeddings(user)
        i = self.item_embeddings(item)

        # Concat the two embedding layers
        z = torch.cat([u, i], dim=-1)
        return self.predictor(z)

# Training and test procedures

In [48]:
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
from statistics import mean
import math


def train(model, optimizer, trainloader, epochs=30):
    criterion = nn.MSELoss(reduction='mean')
    t = tqdm(range(epochs))
    for epoch in t:
        corrects = 0
        total = 0
        train_loss = []
        for users, items, r in trainloader:
            users = users.cuda()
            items = items.cuda()
            r = r.cuda() / 5
            y_hat = model(users, items)
            loss = criterion(y_hat, r.unsqueeze(1).float())
            train_loss.append(loss.item())
            total += r.size(0)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            t.set_description(f"loss: {mean(train_loss)}")


def test(model, testloader, m_eval=False):
    running_mae = 0
    with torch.no_grad():
        corrects = 0
        total = 0
        for users, items, r in testloader:
            users = users.cuda()
            items = items.cuda()
            y = r.cuda() / 5
            y_hat = model(users, items).flatten()
            error = torch.abs(y_hat - y).sum().data

            running_mae += error
            total += y.size(0)

    mae = running_mae/total
    return mae * 5

# Model training

In [49]:
n_users = trainset['user_id'].nunique()
n_items = trainset['recipe_id'].nunique()
model = NCF(n_users, n_items).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
train(model, optimizer, trainloader, epochs=5)


  0%|          | 0/5 [00:00<?, ?it/s]

# Model Testing

In [50]:
test(model, testloader)

tensor(0.5823, device='cuda:0')

In [51]:
users, recipes, r = next(iter(testloader))
users = users.cuda()
recipes = recipes.cuda()
r = r.cuda()

y = model(users, recipes)*5
print("ratings", r[:10].data)
print("predictions:", y.flatten()[:10].data)

ratings tensor([5., 5., 5., 5., 5., 5., 4., 5., 5., 5.], device='cuda:0')
predictions: tensor([4.5461, 4.7016, 4.7616, 4.6514, 4.5324, 4.5484, 4.6179, 4.6419, 4.7085,
        4.7034], device='cuda:0')


# Model weights submit

In [52]:
torch.save(model.state_dict(), 'NCF_model_weights.pth')