# Data import

In [None]:
!gdown https://drive.google.com/uc?id=1lNlp9iRAiYfDxwishtN0tONqjI4FZCIs

In [4]:
!unzip data.zip

Archive:  data.zip

   creating: data/

  inflating: data/RAW_interactions.csv  

  inflating: data/RAW_recipes.csv    

  inflating: data/interactions_test.csv  

  inflating: data/interactions_train.csv  


In [1]:
import pandas as pd

Path = "./data/"

# Read interactions_train.csv
df_interactions = pd.read_csv(Path + 'interactions_train.csv')
# df_interactions = pd.read_csv('/kaggle/input/train-data/interactions_train.csv')

In [2]:
# Show range of user_id and recipe_id for df_interactions
print("df_interactions:")
print("user_id range:", df_interactions['user_id'].min(), "-", df_interactions['user_id'].max())
print("recipe_id range:", df_interactions['recipe_id'].min(), "-", df_interactions['recipe_id'].max())
print()


df_interactions:
user_id range: 1533 - 2002312797
recipe_id range: 38 - 537458



In [29]:
# Delete 'u' and 'i' columns from df_interactions
df_interactions.drop(['u', 'i'], axis=1, inplace=True)

In [30]:
df_interactions.head(3)

Unnamed: 0,user_id,recipe_id,date,rating
0,2046,4684,2000-02-25,5.0
1,2046,517,2000-02-25,5.0
2,1773,7435,2000-03-13,5.0


# Data preprocessing

In [31]:
from os import replace
# Delete 'date' column from df_interactions_train
df_interactions.drop(['date'], axis=1, inplace=True)

In [32]:
df_interactions

Unnamed: 0,user_id,recipe_id,rating
0,2046,4684,5.0
1,2046,517,5.0
2,1773,7435,5.0
3,1773,278,4.0
4,2046,3431,5.0
...,...,...,...
698896,926904,457971,5.0
698897,2002312797,27208,5.0
698898,1290903,131607,5.0
698899,226867,363072,5.0


In [33]:
unique_user_ids = df_interactions.user_id.unique()
unique_recipe_ids = df_interactions.recipe_id.unique()

trainset = df_interactions[['user_id', 'recipe_id', 'rating']].groupby('user_id', as_index=False).first()
trainset = pd.concat([trainset, df_interactions[['user_id', 'recipe_id', 'rating']].groupby('recipe_id', as_index=False).first()])

supplement_data = pd.concat([trainset, df_interactions[['user_id', 'recipe_id', 'rating']]]).drop_duplicates(keep=False)
testset = supplement_data.sample(frac=0.1, replace=False)
additional_trainset = supplement_data[~supplement_data.index.isin(testset.index)]
trainset = pd.concat([trainset, additional_trainset])


In [34]:
trainset.shape

(650852, 3)

In [35]:
testset.shape

(51653, 3)

# Model

In [36]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from statistics import mean

In [37]:
user_list = trainset.user_id.unique()
item_list = trainset.recipe_id.unique()
user2id = {w: i for i, w in enumerate(user_list)}
item2id = {w: i for i, w in enumerate(item_list)}

In [29]:
class Ratings_Datset(Dataset):
    def __init__(self, df):
        self.df = df.reset_index()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        user = user2id[self.df['user_id'][idx]]
        user = torch.tensor(user, dtype=torch.long)
        item = item2id[self.df['recipe_id'][idx]]

        item = torch.tensor(item, dtype=torch.long)
        rating = torch.tensor(self.df['rating'][idx], dtype=torch.float)
        return user, item, rating


trainloader = DataLoader(Ratings_Datset(trainset), batch_size=512, shuffle=True ,num_workers=2)
testloader = DataLoader(Ratings_Datset(testset), batch_size=64, shuffle=True)

In [30]:
class NCF(nn.Module):

    def __init__(self, n_users, n_items, n_factors=8):
        super().__init__()
        self.user_embeddings = torch.nn.Embedding(n_users, n_factors)
        self.item_embeddings = torch.nn.Embedding(n_items, n_factors)
        self.predictor = torch.nn.Sequential(
            nn.Linear(in_features=n_factors*2, out_features=64),
            nn.Linear(in_features=64, out_features=32),
            nn.Linear(in_features=32, out_features=1),
            nn.Sigmoid()
        )


    def forward(self, user, item):


        u = self.user_embeddings(user)
        i = self.item_embeddings(item)

        # Concat the two embedding layers
        z = torch.cat([u, i], dim=-1)
        return self.predictor(z)

# Training and test procedures

In [31]:
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
from statistics import mean
import math


def train(model, optimizer, trainloader, epochs=30):
    criterion = nn.MSELoss(reduction='mean')
    t = tqdm(range(epochs))
    for epoch in t:
        corrects = 0
        total = 0
        train_loss = []
        for users, items, r in trainloader:
            users = users.cuda()
            items = items.cuda()
            r = r.cuda() / 5
            y_hat = model(users, items)
            loss = criterion(y_hat, r.unsqueeze(1).float())
            train_loss.append(loss.item())
            total += r.size(0)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            t.set_description(f"loss: {mean(train_loss)}")


def test(model, testloader, m_eval=False):
    running_mae = 0
    with torch.no_grad():
        corrects = 0
        total = 0
        for users, items, r in testloader:
            users = users.cuda()
            items = items.cuda()
            y = r.cuda() / 5
            y_hat = model(users, items).flatten()
            error = torch.abs(y_hat - y).sum().data

            running_mae += error
            total += y.size(0)

    mae = running_mae/total
    return mae * 5

# Model training

In [32]:
n_users = trainset['user_id'].nunique()
n_items = trainset['recipe_id'].nunique()
model = NCF(n_users, n_items).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
train(model, optimizer, trainloader, epochs=5)


  0%|          | 0/5 [00:00<?, ?it/s]

# Model Testing

In [33]:
test(model, testloader)

tensor(0.5587, device='cuda:0')

In [34]:
users, recipes, r = next(iter(testloader))
users = users.cuda()
recipes = recipes.cuda()
r = r.cuda()

y = model(users, recipes)*5
print("ratings", r[:10].data)
print("predictions:", y.flatten()[:10].data)

ratings tensor([4., 5., 5., 5., 5., 5., 5., 4., 5., 5.], device='cuda:0')
predictions: tensor([4.5235, 4.8889, 4.7887, 4.6720, 4.7919, 4.8360, 3.8849, 3.2089, 4.3854,
        4.6956], device='cuda:0')


# Retrain the model on the whole trainset

In [35]:
trainset = df_interactions

In [None]:
user_list = trainset.user_id.unique()
item_list = trainset.recipe_id.unique()
user2id = {w: i for i, w in enumerate(user_list)}
item2id = {w: i for i, w in enumerate(item_list)}

In [36]:
import pickle

# Save the dictionaries as a pickle file
with open('mappings.pkl', 'wb') as file:
    pickle.dump({'user_list': user_list, 'item_list': item_list, 'user2id': user2id, 'item2id': item2id}, file)


In [37]:
class Ratings_Datset(Dataset):
    def __init__(self, df):
        self.df = df.reset_index()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        user = user2id[self.df['user_id'][idx]]
        user = torch.tensor(user, dtype=torch.long)
        item = item2id[self.df['recipe_id'][idx]]

        item = torch.tensor(item, dtype=torch.long)
        rating = torch.tensor(self.df['rating'][idx], dtype=torch.float)
        return user, item, rating


trainloader = DataLoader(Ratings_Datset(trainset), batch_size=512, shuffle=True ,num_workers=2)

In [38]:
class NCF(nn.Module):

    def __init__(self, n_users, n_items, n_factors=8):
        super().__init__()
        self.user_embeddings = torch.nn.Embedding(n_users, n_factors)
        self.item_embeddings = torch.nn.Embedding(n_items, n_factors)
        self.predictor = torch.nn.Sequential(
            nn.Linear(in_features=n_factors*2, out_features=64),
            nn.Linear(in_features=64, out_features=32),
            nn.Linear(in_features=32, out_features=1),
            nn.Sigmoid()
        )


    def forward(self, user, item):


        u = self.user_embeddings(user)
        i = self.item_embeddings(item)

        # Concat the two embedding layers
        z = torch.cat([u, i], dim=-1)
        return self.predictor(z)

In [39]:
n_users = trainset['user_id'].nunique()
n_items = trainset['recipe_id'].nunique()
model = NCF(n_users, n_items).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
train(model, optimizer, trainloader, epochs=5)


  0%|          | 0/5 [00:00<?, ?it/s]

# Model weights submit

In [40]:
# torch.save(model.state_dict(), '/kaggle/working/weights.pth')
torch.save(model.state_dict(), 'weights.pth')

### Absent recipe_id in test_script.csv

In [10]:
import pandas as pd

# Load test_script.csv into a DataFrame
test_script = pd.read_csv('/kaggle/input/test-script/test_script.csv')

# Extract user_id values from test_script DataFrame
test_user_ids = test_script['user_id'].unique()

# Check if all user_id values from the test set are in the trainset
all_user_ids_present = all(user_id in trainset['user_id'].values for user_id in test_user_ids)

# Print the result
if all_user_ids_present:
    print("All user_id values from the test set are present in the trainset.")
else:
    missing_user_ids = [user_id for user_id in test_user_ids if user_id not in trainset['user_id'].values]
    print("Some user_id values from the test set are missing in the trainset.")
    print("Missing user_ids:", missing_user_ids)


All user_id values from the test set are present in the trainset.


In [11]:
# Load test_script.csv into a DataFrame
test_script = pd.read_csv('/kaggle/input/test-script/test_script.csv')

# Extract recipe_id values from test_script DataFrame
test_recipe_ids = test_script['recipe_id'].unique()

# Check if all recipe_id values from the test set are in the trainset
all_recipe_ids_present = all(recipe_id in trainset['recipe_id'].values for recipe_id in test_recipe_ids)

# Print the result
if all_recipe_ids_present:
    print("All recipe_id values from the test set are present in the trainset.")
else:
    missing_recipe_ids = [recipe_id for recipe_id in test_recipe_ids if recipe_id not in trainset['recipe_id'].values]
    print("Some recipe_id values from the test set are missing in the trainset.")
    print("Missing recipe_ids:", missing_recipe_ids)

Some recipe_id values from the test set are missing in the trainset.
Missing recipe_ids: [33627, 75307, 100961, 154105, 14525, 132931, 378255, 303527, 180890, 359362]


In [22]:
# Load test_script.csv into a DataFrame
test_script = pd.read_csv('/kaggle/input/test-script/test_script.csv')

# Extract recipe_id values from test_script DataFrame
test_recipe_ids = test_script['i'].unique()

# Check if all recipe_id values from the test set are in the trainset
missing_recipe_ids = [recipe_id for recipe_id in test_recipe_ids if recipe_id not in trainset['recipe_id'].values]

# Print the result
if not missing_recipe_ids:
    print("All recipe_id values from the test set are present in the trainset.")
else:
    print("Some recipe_id values from the test set are missing in the trainset.")
    print("Missing recipe_ids:")
    print(missing_recipe_ids)

Some recipe_id values from the test set are missing in the trainset.
Missing recipe_ids:
[177317, 165555, 177453, 142367, 162220, 174635, 11059]


In [17]:
154105 in trainset['recipe_id'].unique()

False

In [21]:
154105 in trainset['recipe_id'].unique()

False