# Imports

In [2]:
!pip install LightFM



In [3]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k
from lightfm.cross_validation import random_train_test_split

# Help functions

In [None]:
def precision_recall_at_k(actual, predicted, k=10):
    actual_set = set(actual[:k])
    predicted_set = set(predicted[:k])
    intersection = len(actual_set & predicted_set)

    precision = intersection / k if k else 0
    recall = intersection / len(actual_set) if actual_set else 0

    return precision, recall

# Datasets preparation

In [5]:
def load_json(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        return json.load(f)

In [6]:
users_file = "users.json"
history_file = "history.json"
products_file = "products.json"

In [7]:
users_data = load_json(users_file)["data"]
history_data = load_json(history_file)["data"]
products_data = load_json(products_file)["data"]

In [8]:
users_list = []
for user_id, user_info in users_data.items():
    if "__collections__" in user_info and "adresses" in user_info["__collections__"]:
        for address_id, address_info in user_info["__collections__"]["adresses"].items():
            users_list.append({
                "user_id": user_id,
                "name": user_info.get("name", ""),
                "street": address_info.get("street", ""),
                "house": address_info.get("house", ""),
                "entrance": address_info.get("buildingEntrance", ""),
                "apartment": address_info.get("doorNumber", ""),
            })

users_df = pd.DataFrame(users_list)


In [9]:
users_df.head()

Unnamed: 0,user_id,name,street,house,entrance,apartment
0,00Pcnq3nkRQGsC2ikBgYNEs26xq1,Наталья Меженина,Мингажева,107,3.0,94
1,02tX1BaL17Vw5lJgecyp1Fw2qvm2,Наталья,Королева,15,,106
2,04sDdtrpuCYF74FJqzgC,ильшат,Георгия Мушникова,9,6.0,246
3,02BZCYNc81PAcKXirv7g,Артур Ринатович,Кремлевская,76,1.0,1
4,01bHeClE6YiTqmXmd3Le,Ильдар Ямалеев,дагестанская,27,3.0,83


In [10]:
products_list = []
for city_id, city_info in products_data.items():
    if "__collections__" in city_info and "products" in city_info["__collections__"]:
        for product_id, product_info in city_info["__collections__"]["products"].items():
            products_list.append({
                "product_id": product_id,
                "name": product_info.get("name", ""),
                "price": product_info.get("price", 0),
                "category": product_info["categoryRef"]["__ref__"].split("/")[-1] if "categoryRef" in product_info else None,
                "discount": product_info.get("allowDiscount", False),
                "image": product_info.get("iconImage", ""),
                "description": product_info.get("description", "")
            })

products_df = pd.DataFrame(products_list)


In [11]:
products_df.head()

Unnamed: 0,product_id,name,price,category,discount,image,description
0,00f99b1f69e97c12569ff27c1d793b23,Японика,999,Ri8FZcAkRLBplNv6hkKb,True,https://filadelffia.ru/uploads/2020/02/sets/Ya...,"Дайкон, С креветкой, Лосось хот, Окунь хот, Ми..."
1,037ea244ac4bc82697c1be85b48b5830,Кентуки,249,haQ10FrMZHTNwd6b0UWQ,True,https://filadelffia.ru/uploads/2020/02/coldrol...,"Тигровые креветки в кляре, сливочный сыр Creme..."
2,07853274ac3383709e9997b729273638,Экзотический сок,150,NWls3mSdIqvCEOYCUBSD,True,http://www.filadelffia.ru/uploads/2016/Napitki...,Вкус экзотических фруктов
3,0888cf628febddd1de519fb637939a28,Яблочный сок,150,NWls3mSdIqvCEOYCUBSD,True,http://www.filadelffia.ru/uploads/2016/Napitki...,Очень вкусный яблочный сок
4,0a972e39117c432d1392f478a54b8dc2,Креветка хот,269,oVtEe6HzDRUbUGZ0ODZN,True,https://filadelffia.ru/uploads/2020/02/hotroll...,"Тигровые креветки, сырный соус, унаги соус"


In [12]:
orders_list = []
for order_id, order_info in history_data.items():
    user_ref = None
    if isinstance(order_info.get("userRef"), dict):
        user_ref = order_info["userRef"].get("__ref__", "").split("/")[-1]

    if "__collections__" in order_info and "products" in order_info["__collections__"]:
        for product_id, product_info in order_info["__collections__"]["products"].items():
            product_ref = None
            if isinstance(product_info.get("productRef"), dict):
                product_ref = product_info["productRef"].get("__ref__", "").split("/")[-1]

            orders_list.append({
                "order_id": order_id,
                "user_id": user_ref,
                "product_id": product_ref,
                "count": product_info.get("count", 1),
                "price": order_info.get("price", 0),
                "address": order_info.get("adress", ""),
                "order_date": order_info.get("orderDate", {}).get("__time__", None)
            })

orders_df = pd.DataFrame(orders_list)

In [13]:
orders_df.head()

Unnamed: 0,order_id,user_id,product_id,count,price,address,order_date
0,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,ceabfdcb7757af418ee9e5cb2d0bb0ba,1,1401,Муксинова 2/3,2022-05-06T11:25:00.692839Z
1,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,3e65b63c9122072f16f374775f826cc0,1,1401,Муксинова 2/3,2022-05-06T11:25:00.692839Z
2,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,rQtl1nmQvL8JUWwCaUeC,1,1401,Муксинова 2/3,2022-05-06T11:25:00.692839Z
3,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,c01feb26a59ba7ea12b096bc04002449,1,1401,Муксинова 2/3,2022-05-06T11:25:00.692839Z
4,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,yqiObhaBaxhfov3ujHPX,2,1401,Муксинова 2/3,2022-05-06T11:25:00.692839Z


In [14]:
merged_df = orders_df.merge(users_df, on="user_id", how="left").merge(products_df, on="product_id", how="left")

merged_df["family_id"] = (
    merged_df["street"] + "_" + merged_df["house"].astype(str) + "_" +
    merged_df["entrance"].astype(str) + "_" + merged_df["apartment"].astype(str)
)

In [15]:
merged_df.head()

Unnamed: 0,order_id,user_id,product_id,count,price_x,address,order_date,name_x,street,house,entrance,apartment,name_y,price_y,category,discount,image,description,family_id
0,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,ceabfdcb7757af418ee9e5cb2d0bb0ba,1,1401,Муксинова 2/3,2022-05-06T11:25:00.692839Z,Раиса,Муксинова,2/3,3,124,Сливочный Лосось Хот,369.0,Uv2uq1jnf1NC7JI13V56,True,https://filadelffia.ru/uploads/2023/03/19/roll...,"Фарерский лосось, сливочный сыр Cremette, сырн...",Муксинова_2/3_3_124
1,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,3e65b63c9122072f16f374775f826cc0,1,1401,Муксинова 2/3,2022-05-06T11:25:00.692839Z,Раиса,Муксинова,2/3,3,124,,,,,,,Муксинова_2/3_3_124
2,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,rQtl1nmQvL8JUWwCaUeC,1,1401,Муксинова 2/3,2022-05-06T11:25:00.692839Z,Раиса,Муксинова,2/3,3,124,Имбирь,25.0,BFvVAGV9rV6Fkp1DZTwm,True,https://filadelffia.ru/uploads/2023/10/18/sous...,"Чуть сладкий, чуть острый - отборный вкусный и...",Муксинова_2/3_3_124
3,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,c01feb26a59ba7ea12b096bc04002449,1,1401,Муксинова 2/3,2022-05-06T11:25:00.692839Z,Раиса,Муксинова,2/3,3,124,Филадельфия Лайт,389.0,v0ufIpn2ogrwKL6hvhgj,True,http://filadelffia.ru/uploads/2021/08/Filka.png,"Фарерский лосось, сливочный сыр Cremette, огурец",Муксинова_2/3_3_124
4,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,yqiObhaBaxhfov3ujHPX,2,1401,Муксинова 2/3,2022-05-06T11:25:00.692839Z,Раиса,Муксинова,2/3,3,124,Васаби,20.0,BFvVAGV9rV6Fkp1DZTwm,True,http://filadelffia.ru//uploads/2023/10/18/sous...,Острый премиальный васаби fumiko,Муксинова_2/3_3_124


In [16]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 587700 entries, 0 to 587699
Data columns (total 19 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   order_id     587700 non-null  object 
 1   user_id      520635 non-null  object 
 2   product_id   447449 non-null  object 
 3   count        587700 non-null  int64  
 4   price_x      587700 non-null  int64  
 5   address      587700 non-null  object 
 6   order_date   587700 non-null  object 
 7   name_x       514536 non-null  object 
 8   street       514536 non-null  object 
 9   house        514536 non-null  object 
 10  entrance     514536 non-null  object 
 11  apartment    514536 non-null  object 
 12  name_y       365183 non-null  object 
 13  price_y      365183 non-null  float64
 14  category     365183 non-null  object 
 15  discount     365183 non-null  object 
 16  image        365183 non-null  object 
 17  description  365183 non-null  object 
 18  family_id    514536 non-

In [17]:
merged_df.isna().sum()

Unnamed: 0,0
order_id,0
user_id,67065
product_id,140251
count,0
price_x,0
address,0
order_date,0
name_x,73164
street,73164
house,73164


In [18]:
merged_df.shape

(587700, 19)

In [19]:
merged_df.columns


Index(['order_id', 'user_id', 'product_id', 'count', 'price_x', 'address',
       'order_date', 'name_x', 'street', 'house', 'entrance', 'apartment',
       'name_y', 'price_y', 'category', 'discount', 'image', 'description',
       'family_id'],
      dtype='object')

In [20]:
final = merged_df.copy()

In [21]:
final = final.drop(columns={'image', 'street', 'house', 'entrance', 'apartment', 'address'})

In [22]:
final = final.rename(columns={
    'price_x': 'order_price',
    'name_x': 'client_name',
    'name_y': 'product_name',
    'price_y': 'product_price'
    })

In [23]:
final.head()

Unnamed: 0,order_id,user_id,product_id,count,order_price,order_date,client_name,product_name,product_price,category,discount,description,family_id
0,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,ceabfdcb7757af418ee9e5cb2d0bb0ba,1,1401,2022-05-06T11:25:00.692839Z,Раиса,Сливочный Лосось Хот,369.0,Uv2uq1jnf1NC7JI13V56,True,"Фарерский лосось, сливочный сыр Cremette, сырн...",Муксинова_2/3_3_124
1,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,3e65b63c9122072f16f374775f826cc0,1,1401,2022-05-06T11:25:00.692839Z,Раиса,,,,,,Муксинова_2/3_3_124
2,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,rQtl1nmQvL8JUWwCaUeC,1,1401,2022-05-06T11:25:00.692839Z,Раиса,Имбирь,25.0,BFvVAGV9rV6Fkp1DZTwm,True,"Чуть сладкий, чуть острый - отборный вкусный и...",Муксинова_2/3_3_124
3,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,c01feb26a59ba7ea12b096bc04002449,1,1401,2022-05-06T11:25:00.692839Z,Раиса,Филадельфия Лайт,389.0,v0ufIpn2ogrwKL6hvhgj,True,"Фарерский лосось, сливочный сыр Cremette, огурец",Муксинова_2/3_3_124
4,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,yqiObhaBaxhfov3ujHPX,2,1401,2022-05-06T11:25:00.692839Z,Раиса,Васаби,20.0,BFvVAGV9rV6Fkp1DZTwm,True,Острый премиальный васаби fumiko,Муксинова_2/3_3_124


In [24]:
final.to_csv('final.csv', index=False)

Categories id\

0f7kp5wcm73AVoS7u71s - Акции\
0kqAvq7OTumJ85PyW8nr - Закуски и салаты\
58QZ5QsfQy1K8PT0jbks - сеты\
BFvVAGV9rV6Fkp1DZTwm - соуса\
NOn81HMZPYytNvEou0Jr - напитки и десерты\
Uv2uq1jnf1NC7JI13V56 - запеченые роллы\
gCzCO39jbY1RRyfti8yE - пицца\
v0ufIpn2ogrwKL6hvhgj - роллы


## Removing Nans

In [25]:
final.isna().sum()

Unnamed: 0,0
order_id,0
user_id,67065
product_id,140251
count,0
order_price,0
order_date,0
client_name,73164
product_name,222517
product_price,222517
category,222517


# Working with merged dataset

In [26]:
merged = pd.read_csv('final.csv')
merged.head()

Unnamed: 0,order_id,user_id,product_id,count,order_price,order_date,client_name,product_name,product_price,category,discount,description,family_id
0,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,ceabfdcb7757af418ee9e5cb2d0bb0ba,1,1401,2022-05-06T11:25:00.692839Z,Раиса,Сливочный Лосось Хот,369.0,Uv2uq1jnf1NC7JI13V56,True,"Фарерский лосось, сливочный сыр Cremette, сырн...",Муксинова_2/3_3_124
1,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,3e65b63c9122072f16f374775f826cc0,1,1401,2022-05-06T11:25:00.692839Z,Раиса,,,,,,Муксинова_2/3_3_124
2,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,rQtl1nmQvL8JUWwCaUeC,1,1401,2022-05-06T11:25:00.692839Z,Раиса,Имбирь,25.0,BFvVAGV9rV6Fkp1DZTwm,True,"Чуть сладкий, чуть острый - отборный вкусный и...",Муксинова_2/3_3_124
3,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,c01feb26a59ba7ea12b096bc04002449,1,1401,2022-05-06T11:25:00.692839Z,Раиса,Филадельфия Лайт,389.0,v0ufIpn2ogrwKL6hvhgj,True,"Фарерский лосось, сливочный сыр Cremette, огурец",Муксинова_2/3_3_124
4,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,yqiObhaBaxhfov3ujHPX,2,1401,2022-05-06T11:25:00.692839Z,Раиса,Васаби,20.0,BFvVAGV9rV6Fkp1DZTwm,True,Острый премиальный васаби fumiko,Муксинова_2/3_3_124


In [27]:
merged.isna().sum()

Unnamed: 0,0
order_id,0
user_id,67065
product_id,140251
count,0
order_price,0
order_date,0
client_name,73164
product_name,222517
product_price,222517
category,222517


In [29]:
merged_copy = merged.copy()
merged_copy = merged_copy.dropna(subset=['product_name', 'family_id'])
merged_copy.isna().sum()

Unnamed: 0,0
order_id,0
user_id,0
product_id,0
count,0
order_price,0
order_date,0
client_name,0
product_name,0
product_price,0
category,0


In [30]:
merged_copy[:3]

Unnamed: 0,order_id,user_id,product_id,count,order_price,order_date,client_name,product_name,product_price,category,discount,description,family_id
0,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,ceabfdcb7757af418ee9e5cb2d0bb0ba,1,1401,2022-05-06T11:25:00.692839Z,Раиса,Сливочный Лосось Хот,369.0,Uv2uq1jnf1NC7JI13V56,True,"Фарерский лосось, сливочный сыр Cremette, сырн...",Муксинова_2/3_3_124
2,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,rQtl1nmQvL8JUWwCaUeC,1,1401,2022-05-06T11:25:00.692839Z,Раиса,Имбирь,25.0,BFvVAGV9rV6Fkp1DZTwm,True,"Чуть сладкий, чуть острый - отборный вкусный и...",Муксинова_2/3_3_124
3,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,c01feb26a59ba7ea12b096bc04002449,1,1401,2022-05-06T11:25:00.692839Z,Раиса,Филадельфия Лайт,389.0,v0ufIpn2ogrwKL6hvhgj,True,"Фарерский лосось, сливочный сыр Cremette, огурец",Муксинова_2/3_3_124


In [37]:
merged_copy.to_csv('final_without_nans.csv')

## Подсчет "холодных" и "горячих" пользователей

In [38]:
merged_without_nans = pd.read_csv('final_without_nans.csv')

In [39]:
num_families = merged_without_nans["family_id"].nunique()

num_dishes = merged_without_nans["product_id"].nunique()

avg_orders_per_family = merged_without_nans.groupby("family_id")["order_id"].nunique().mean()

orders_per_family = merged_without_nans.groupby("family_id")["order_id"].nunique()

cold_users_ratio = (orders_per_family <= 3).sum() / num_families

{
    "Всего заказов": len(merged_without_nans),
    "Уникальных семей": num_families,
    "Уникальных блюд": num_dishes,
    "Среднее число заказов на семью": avg_orders_per_family,
    "Процент холодных пользователей (≤3 заказа)": cold_users_ratio * 100
}

{'Всего заказов': 361321,
 'Уникальных семей': 12372,
 'Уникальных блюд': 120,
 'Среднее число заказов на семью': np.float64(7.247009376010346),
 'Процент холодных пользователей (≤3 заказа)': np.float64(50.840607824118976)}

# Building lightFM model

In [27]:
merged_without_nans = pd.read_csv('final_without_nans.csv')

In [28]:
excluded_categories = [
    "0kqAvq7OTumJ85PyW8nr",  # Закуски и салаты
    "58QZ5QsfQy1K8PT0jbks",  # Сеты
    "BFvVAGV9rV6Fkp1DZTwm",  # Соусы
    "NOn81HMZPYytNvEou0Jr"   # Напитки и десерты
]
merged_without_nans["ingredients_list"] = merged_without_nans.apply(
    lambda row: str(row["description"]).split(", ") if row["category"] not in excluded_categories else "", axis=1)


In [29]:
df = merged_without_nans.copy()

users = df['family_id'].unique()
items = df['product_id'].unique()

df["ingredients_list"] = df["ingredients_list"].apply(lambda lst: [i.strip() for i in lst if i.strip() != ""])

In [30]:
all_ingredients_tags = []
for i in range(len(df)):
  row_ingredients = df.loc[i, "ingredients_list"]

  if isinstance(row_ingredients, list):
      tags = []
      for ingredient in row_ingredients:
          if isinstance(ingredient, str) and ingredient.strip():
              tags.append(f"ingredient:{ingredient.strip()}")
          else:
            all_ingredients_tags.append(f"ingredient:None")
      all_ingredients_tags.append(tags)
  else:
    print(row_ingredients)
    all_ingredients_tags.append(f"ingredient:None")

df["ingredient_tags"] = all_ingredients_tags

s = set()
for tag in all_ingredients_tags:
  for elem in tag:
    s.add(elem)

s.add('ingredient:None')

In [31]:
dataset = Dataset()
dataset.fit(users=users, items=items, item_features=s)

In [32]:
(interactions, _) = dataset.build_interactions(
    (row["family_id"], row["product_id"]) for _, row in df.iterrows()
)

item_features_data = []
for product_id in items:
    sub = df[df["product_id"] == product_id]
    tags = set()
    for tag_list in sub["ingredient_tags"]:
        tags.update(tag_list)

    if not tags:
        tags.add("ingredient:None")

    item_features_data.append((product_id, list(tags)))

In [33]:
item_features_matrix = dataset.build_item_features(item_features_data)

train, test = random_train_test_split(
    interactions,
    test_percentage=0.2,
    random_state=42
)

model = LightFM(loss="warp")
model.fit(train, item_features=item_features_matrix, epochs=10, num_threads=4)


<lightfm.lightfm.LightFM at 0x7bd6dfc7e590>

In [34]:
precision = precision_at_k(
    model, test, k=10,
    item_features=item_features_matrix
).mean()
print(f"✅ Precision@10 (только по ингредиентам): {precision:.4f}")

✅ Precision@10 (только по ингредиентам): 0.2545


генерируется топ-10 рекомендаций для семьи\
0.25 неплохой результат при условии, что в датасете всего 94 блюда, некоторые из которых исключены из финальных фич по обучения (соусы, напитки)\
так же почти 76% клиентов - холодные

## LightFM with cats

In [48]:
cat_df = df.copy()

In [49]:
cat_df["category_tag"] = cat_df["category"].astype(str).str.strip().apply(lambda c: f"category:{c}")
cat_df.head(1)

Unnamed: 0.1,Unnamed: 0,order_id,user_id,product_id,count,order_price,order_date,client_name,product_name,product_price,category,discount,description,family_id,ingredients_list,ingredient_tags,category_tag
0,0,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,ceabfdcb7757af418ee9e5cb2d0bb0ba,1,1401,2022-05-06T11:25:00.692839Z,Раиса,Сливочный Лосось Хот,369.0,Uv2uq1jnf1NC7JI13V56,True,"Фарерский лосось, сливочный сыр Cremette, сырн...",Муксинова_2/3_3_124,"[Фарерский лосось, сливочный сыр Cremette, сыр...","[ingredient:Фарерский лосось, ingredient:сливо...",category:Uv2uq1jnf1NC7JI13V56


In [50]:
cat_df["item_tags"] = cat_df.apply(lambda row: row["ingredient_tags"] + [row["category_tag"]], axis=1)

In [51]:
all_tags = set()
for tag_list in cat_df["item_tags"]:
    all_tags.update(tag_list)

In [52]:
cat_dataset = Dataset()
cat_dataset.fit(users=users, items=items, item_features=all_tags)

(interactions, _) = cat_dataset.build_interactions(
    (row["family_id"], row["product_id"]) for _, row in df.iterrows()
)

item_features_data_cat = []
for product_id in items:
    group = cat_df[df["product_id"] == product_id]
    tags = set()
    for tag_list in group["item_tags"]:
        tags.update(tag_list)
    item_features_data_cat.append((product_id, list(tags)))

item_features_cat_matrix = cat_dataset.build_item_features(item_features_data_cat)


In [53]:
item_features_cat_matrix = cat_dataset.build_item_features(item_features_data_cat)

cat_model = LightFM(loss="warp")
cat_model.fit(train, item_features=item_features_cat_matrix, epochs=10, num_threads=4)

<lightfm.lightfm.LightFM at 0x7d950e10d290>

In [54]:
precision = precision_at_k(
    cat_model, test, k=10,
    item_features=item_features_cat_matrix
).mean()
print(f"✅ Precision@10 (ingr + cats): {precision:.4f}")

✅ Precision@10 (ingr + cats): 0.2511


категории не улучшили метрику

## LightFM with converting sets

In [55]:
set_category_id = '58QZ5QsfQy1K8PT0jbks'

set_df = df[df["category"] == set_category_id].copy()
product_ingredients_map = df.set_index("product_name")["ingredients_list"].to_dict()

In [56]:
def extract_ingredients_from_set_description(description):
    try:
        roll_names = [r.strip() for r in description.split(",") if r.strip()]
        ingredients = []
        for roll in roll_names:
            if roll in product_ingredients_map:
                ingredients += product_ingredients_map[roll]
        return list(set(ingredients))  # убираем дубли
    except Exception:
        return []


In [57]:
set_df["ingredients_list"] = set_df["description"].apply(extract_ingredients_from_set_description)

In [58]:
needed_set = set_df[["product_id", "ingredients_list"]]
needed_set.head()

Unnamed: 0,product_id,ingredients_list
35,f8e1a135a5026ef2d4582f7b4f32b87b,"[Японский омлет, кунжут, Фарерский лосось жаре..."
126,bJeZcEZEzR03ZMXLZmgm,"[огурец, сырный соус La Paulina, сливочный сыр..."
127,bJeZcEZEzR03ZMXLZmgm,"[огурец, сырный соус La Paulina, сливочный сыр..."
128,bJeZcEZEzR03ZMXLZmgm,"[огурец, сырный соус La Paulina, сливочный сыр..."
129,bJeZcEZEzR03ZMXLZmgm,"[огурец, сырный соус La Paulina, сливочный сыр..."


In [None]:
merged_df = df.merge(needed_set, on="product_id", how="left")

In [None]:
merged_df["ingredients_list"] = merged_df.apply(
    lambda row: list(set(
        (row["ingredients_list_x"] if isinstance(row["ingredients_list_x"], list) else []) +
        (row["ingredients_list_y"] if isinstance(row["ingredients_list_y"], list) else [])
    )),
    axis=1
)

merged_df.drop(columns=["ingredients_list_x", "ingredients_list_y"], inplace=True)
merged_df.head(1)

Unnamed: 0.1,Unnamed: 0,order_id,user_id,product_id,count,order_price,order_date,client_name,phone,product_name,product_price,category,discount,description,family_id,ingredient_tags,ingredients_list
0,0,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,ceabfdcb7757af418ee9e5cb2d0bb0ba,1,1401,2022-05-06T11:25:00.692839Z,Раиса,79373254767,Сливочный Лосось Хот,369.0,Uv2uq1jnf1NC7JI13V56,True,"Фарерский лосось, сливочный сыр Cremette, сырн...",Муксинова_2/3_3_124,"[ingredient:Фарерский лосось, ingredient:сливо...","[сырный соус La Paulina, сливочный сыр Cremett..."


In [None]:
all_ingredients_tags = []
for i in range(len(merged_df)):
  row_ingredients = merged_df.loc[i, "ingredients_list"]

  if isinstance(row_ingredients, list):
      tags = []
      for ingredient in row_ingredients:
          if isinstance(ingredient, str) and ingredient.strip():
              tags.append(f"ingredient:{ingredient.strip()}")
      all_ingredients_tags.append(tags)
  else:
      all_ingredients_tags.append([])  # пустой список, если нет данных

merged_df["ingredient_tags"] = all_ingredients_tags

s = set()
for tag in all_ingredients_tags:
  for elem in tag:
    s.add(elem)

In [None]:
sets_dataset = Dataset()
sets_dataset.fit(users=users, items=items, item_features=s)

In [None]:
(interactions, _) = sets_dataset.build_interactions(
    (row["family_id"], row["product_id"]) for _, row in df.iterrows()
)

# Создание item_features
item_features_sets_data = []
for product_id, group in df.groupby("product_id"):
    tags = set()
    for tag_list in group["ingredient_tags"]:
        tags.update(tag_list)
    item_features_sets_data.append((product_id, list(tags)))


In [None]:
item_features_sets_matrix = sets_dataset.build_item_features(item_features_sets_data)

sets_model = LightFM(loss="warp")
sets_model.fit(train, item_features=item_features_sets_matrix, epochs=10, num_threads=4)

<lightfm.lightfm.LightFM at 0x7b144633ea50>

In [None]:
precision = precision_at_k(
    sets_model, test, k=10,
    item_features=item_features_sets_matrix
).mean()
print(f"✅ Precision@10 (ingr+sets): {precision:.4f}")

✅ Precision@10 (ingr+sets): 0.1702


## LightFM with ingr + cats + sets

In [None]:
full_df = merged_df.copy()
full_df.head(1)

Unnamed: 0.1,Unnamed: 0,order_id,user_id,product_id,count,order_price,order_date,client_name,phone,product_name,product_price,category,discount,description,family_id,ingredient_tags,ingredients_list
0,0,00233FF8-4A50-4684-B625-AF14A4D94A1F,gduNZA9mjrcIECN9QgP4shbAlNV2,ceabfdcb7757af418ee9e5cb2d0bb0ba,1,1401,2022-05-06T11:25:00.692839Z,Раиса,79373254767,Сливочный Лосось Хот,369.0,Uv2uq1jnf1NC7JI13V56,True,"Фарерский лосось, сливочный сыр Cremette, сырн...",Муксинова_2/3_3_124,"[ingredient:сырный соус La Paulina, ingredient...","[сырный соус La Paulina, сливочный сыр Cremett..."


In [None]:
full_df["category_tag"] = full_df["category"].apply(lambda c: f"category:{c}")


In [None]:
full_df["item_tags"] = full_df.apply(lambda row: row["ingredient_tags"] + [row["category_tag"]], axis=1)

In [None]:
all_tags = set()
for tag_list in full_df["item_tags"]:
    all_tags.update(tag_list)

In [None]:
cat_dataset = Dataset()
cat_dataset.fit(users=users, items=items, item_features=all_tags)

(interactions, _) = cat_dataset.build_interactions(
    (row["family_id"], row["product_id"]) for _, row in df.iterrows()
)

item_features_data_cat = []
for product_id in items:
    group = cat_df[df["product_id"] == product_id]
    tags = set()
    for tag_list in group["item_tags"]:
        tags.update(tag_list)
    item_features_data_cat.append((product_id, list(tags)))

item_features_cat_matrix = cat_dataset.build_item_features(item_features_data_cat)


In [None]:
full_dataset = Dataset()
full_dataset.fit(users=users, items=items, item_features=all_tags)

(interactions, _) = full_dataset.build_interactions(
    (row["family_id"], row["product_id"]) for _, row in df.iterrows()
)

item_features_data_full = []
for product_id in items:
    group = cat_df[df["product_id"] == product_id]
    tags = set()
    for tag_list in group["item_tags"]:
        tags.update(tag_list)
    item_features_data_full.append((product_id, list(tags)))

item_features_full_matrix = full_dataset.build_item_features(item_features_data_full)

In [None]:
item_features_full_matrix = cat_dataset.build_item_features(item_features_data_full)

full_model = LightFM(loss="warp")
full_model.fit(train, item_features=item_features_full_matrix, epochs=10, num_threads=4)

<lightfm.lightfm.LightFM at 0x7b130218fb10>

In [None]:
precision = precision_at_k(
    full_model, test, k=10,
    item_features=item_features_full_matrix
).mean()
print(f"✅ Precision@10 (ingr + cats + sets): {precision:.4f}")

✅ Precision@10 (ingr + cats + sets): 0.1707


Precision@10 (ingr): 0.1564\
Precision@10 (ingr + cats): 0.1557\
Precision@10 (ingr + sets): 0.1569\
Precision@10 (ingr + cats + sets): 0.1560

## LightFM predict

In [48]:
merged_without_nans = pd.read_csv('final_without_nans.csv')

In [49]:
df = merged_without_nans.copy()

users = df['family_id'].unique()
items = df['product_id'].unique()

In [50]:
excluded_categories = [
  "0kqAvq7OTumJ85PyW8nr",  # Закуски и салаты
  "58QZ5QsfQy1K8PT0jbks",  # Сеты
  "BFvVAGV9rV6Fkp1DZTwm",  # Соусы
  "NOn81HMZPYytNvEou0Jr"   # Напитки и десерты
]

df["ingredients_list"] = df.apply(
  lambda row: str(row["description"]).split(", ") if row["category"] not in excluded_categories else [],
  axis=1
)

    # Clean ingredients
df["ingredients_list"] = df["ingredients_list"].apply(
  lambda lst: [i.strip() for i in lst if isinstance(i, str) and i.strip() != ""]
)

    # Create ingredient tags
df["ingredient_tags"] = df["ingredients_list"].apply(
  lambda ingredients: [f"ingredient:{ingredient}" for ingredient in ingredients]
)

In [51]:
users = df['family_id'].unique()
items = df['product_id'].unique()

dataset = Dataset()
dataset.fit(
    users=users,
    items=items,
    item_features=set(tag for sublist in df["ingredient_tags"] for tag in sublist)
)

(interactions, weights) = dataset.build_interactions(
    (row["family_id"], row["product_id"]) for _, row in df.iterrows()
)

item_features_data = []
for product_id in items:
    product_data = df[df["product_id"] == product_id]
    tags = set(tag for sublist in product_data["ingredient_tags"] for tag in sublist)
    item_features_data.append((product_id, list(tags)))

item_features = dataset.build_item_features(item_features_data)


In [52]:
train, test = random_train_test_split(
    interactions,
    test_percentage=0.2,
    random_state=42
)

model = LightFM(loss="warp", random_state=42)
model.fit(
    train,
    item_features=item_features,
    epochs=10,
    num_threads=4
)

precision = precision_at_k(
    model, test, k=10,
    item_features=item_features
).mean()
print(f"Precision@10: {precision:.4f}")


Precision@10: 0.2549


In [45]:
def get_recommendations(family_id, model, dataset, df, n_recs=10):
    all_item_ids = list(df['product_id'].unique())
    known_items = set(df[df['family_id'] == family_id]['product_id'])
    user_id_map, _, item_id_map, _ = dataset.mapping()
    if family_id not in user_id_map:
        print(f"Family {family_id} not found in the model. Returning popular items.")
        popular_items = df['product_id'].value_counts().head(n_recs).index.tolist()
        return df[df['product_id'].isin(popular_items)][['product_id', 'product_name', 'description']].drop_duplicates()

    valid_items = [item for item in all_item_ids if item in item_id_map]
    if not valid_items:
        raise ValueError("No valid items found in the model mapping")

    scores = model.predict(
        user_ids=user_id_map[family_id],
        item_ids=[item_id_map[item] for item in valid_items],
        item_features=item_features_matrix
    )

    recommendations = pd.DataFrame({
        'product_id': valid_items,
        'score': scores
    })

    recommendations = recommendations[~recommendations['product_id'].isin(known_items)]

    top_recs = recommendations.sort_values('score', ascending=False).head(n_recs)
    top_recs = top_recs.merge(
        df[['product_id', 'product_name', 'description']].drop_duplicates(),
        on='product_id',
        how='left'
    )

    return top_recs

In [53]:
sample_family = df['family_id'].iloc[321]
sample_family

'Юрия Гагарина_25_1_12'

In [54]:
recommendations = get_recommendations(sample_family, model, dataset, df)

if recommendations is not None:
    print("Recommendations for family:", sample_family)
    print(recommendations)

Recommendations for family: Юрия Гагарина_25_1_12
                         product_id     score            product_name  \
0  e3632be7aa229eac059e985428342a05  0.590857               С лососем   
1  5c3aa3b3bddd9a284f5471e7175e36c1 -0.452029       Зеленый чай 0,5 л   
2  5d75b29d8c2760fa9cdb6a96e4b008ec -1.274980                   Нияма   
3              9Y0JOxbAlArA5Sdy8jVN -1.486959  Чука с ореховым соусом   
4  f8e1a135a5026ef2d4582f7b4f32b87b -1.504880              Запеченный   
5              EbCcJeRTbXicvvFZ4dJq -1.563463           Наггетсы 6 шт   
6  07240213129fc1a6167a05eee5ca8911 -1.636657        Цыпленок барбекю   
7              aPrHMI53UhgjAkn8Oj9z -1.652522           Унаги Темпура   
8              Ex41b7vfTrCRqxQkH61E -1.796444           Картофель ФРИ   
9  97f417a8c9a03cb6dfefcffd3113c286 -1.808857                 Кентуки   

                                         description  
0                Фарерский лосось увеличенная порция  
1                             Освежа

# Model export

In [None]:
user_id_map, _, item_id_map, reverse_item_id_map = dataset.mapping()

In [None]:
import pickle
import scipy.sparse

with open("lightfm_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("mappings.pkl", "wb") as f:
    pickle.dump({
        "user_map": user_id_map,
        "item_map": item_id_map,
        "reverse_item_map": reverse_item_id_map
    }, f)

scipy.sparse.save_npz("item_features_matrix.npz", item_features_matrix)

# Model import

In [None]:
import pickle
import numpy as np
import scipy.sparse
from lightfm import LightFM

with open("lightfm_model.pkl", "rb") as f:
    new_model = pickle.load(f)

with open("mappings.pkl", "rb") as f:
    mappings = pickle.load(f)

item_features_matrix = scipy.sparse.load_npz("item_features_matrix.npz")

user_id_map = mappings["user_map"]
item_id_map = mappings["item_map"]
reverse_item_id_map = mappings["reverse_item_map"]


In [55]:
def get_recommendations_for_saved(
    family_id,
    model,
    user_id_map,
    item_id_map,
    df,
    n_recs=10
):
    all_item_ids = list(df['product_id'].unique())
    known_items = set(df[df['family_id'] == family_id]['product_id'])

    if family_id not in user_id_map:
        print(f"Family {family_id} not found in the model. Returning popular items.")
        popular_items = df['product_id'].value_counts().head(n_recs).index.tolist()
        return df[df['product_id'].isin(popular_items)][['product_id', 'product_name', 'description']].drop_duplicates()

    valid_items = [item for item in all_item_ids if item in item_id_map]
    if not valid_items:
        raise ValueError("No valid items found in the model mapping")

    scores = model.predict(
        user_ids=user_id_map[family_id],
        item_ids=[item_id_map[item] for item in valid_items],
        item_features=item_features_matrix
    )

    recommendations = pd.DataFrame({
        'product_id': valid_items,
        'score': scores
    })

    recommendations = recommendations[~recommendations['product_id'].isin(known_items)]

    top_recs = recommendations.sort_values('score', ascending=False).head(n_recs)

    top_recs = top_recs.merge(
        df[['product_id', 'product_name', 'description']].drop_duplicates(),
        on='product_id',
        how='left'
    )

    return top_recs

In [35]:
sample_family = df['family_id'].iloc[123]
sample_family

'Таллинская_6_1_7'

In [None]:
recommendations = get_recommendations_for_saved(
    sample_family,
    new_model,
    user_id_map,
    item_id_map,
    df
)

if recommendations is not None:
    print("Recommendations for family:", sample_family)
    print(recommendations)