In [1]:
import pandas as pd
import numpy as np
import polars as pl
import glob
from tqdm import tqdm
from collections import defaultdict
import matplotlib.pyplot as plt

from src.gen_cand_utils import *
from src.validation import validate_recommendations, plot_metrics_by_k

In [6]:
tracker_files = glob.glob('data/final_apparel_tracker_data_08_action_widget/*/*.parquet')
order_files = glob.glob('data/final_apparel_orders_data_07/*/*.parquet')
test_user_ids = pd.read_parquet('data/ml_ozon_recsys_test.snappy.parquet')
test_user_ids_set = set(test_user_ids['user_id'].to_list())

In [13]:
cat = pl.read_parquet('submits/top100_FINAL_2.parquet')

In [14]:
top100_list = (
    cat
    .sort(["user_id", "score"], descending=[False, True])  # сортируем по score внутри user
    .group_by("user_id")
    .agg([
        pl.col("item_id").head(100).alias("top100_items")  # берём первые 100 item_id
    ]).filter(pl.col('user_id').is_in(test_user_ids_set))
)

In [15]:
gen_submit(top100_list.rename({"top100_items":'item_id'}).to_pandas(), 'catboost_v_5')

Формирование submission: 100%|██████████| 470347/470347 [00:17<00:00, 27235.46it/s]


In [9]:
top100_list[0,1]

86451006
201283307
10885660
164516883
140634651
…
174853854
298949789
183857215
186981750
56246241


In [3]:
candidate_last_favorite_items = get_last_favorite_items(mode='submit', n=200, min_date='2025-05-07')
last_viewed_items = get_last_viewed_items(mode='submit', n=50, min_date='2025-06-15')
candidate_last_viewed_def_items = get_last_viewed_def_items(mode='submit', n=200, min_date='2025-06-15')
# last_viewed_items = pl.read_parquet('candidate_cache/last_viewed_items.parquet')

nn_path = glob.glob('nearest_neighbors/*.parquet')
nn_df = pl.read_parquet(nn_path, columns=['item_id', 'neighbor_item_id', 'rank'])

k_values = {
        'item_1': 6,
        'item_2': 3,
        'item_3': 2,
        # 'item_4': 10,
        # 'item_5': 5,
        # 'item_6': 3,
        # 'item_7': 2,
    }
candidate_neighbors_of_viewed_items = get_neighbors_of_viewed_items(last_viewed_items, nn_df, k_values)

cooccur_neighbors_of_all_items = pl.read_parquet('cooccurrence_neighbors.parquet')
candidate_cooccur_neighbors = get_cooccur_neighbors_of_last_delivered_items(cooccur_neighbors_of_all_items, mode='submit', min_date='2025-06-21')
popular_items = get_popular_items(n=400, min_date='2025-05-21')
popular_df = pl.DataFrame({'item_id': popular_items})

candidate_processed_items = get_processed_items(mode='submit', n_last=100, min_date='2025-06-15')


Gen last favorite items: 100%|██████████| 3000/3000 [22:00<00:00,  2.27it/s] 
Gen last viewed items: 100%|██████████| 3000/3000 [13:46<00:00,  3.63it/s] 
Gen last favorite items: 100%|██████████| 3000/3000 [12:29<00:00,  4.00it/s] 
Gen cooccur neighbors of delivered items: 100%|██████████| 33/33 [00:07<00:00,  4.23it/s]
Gen processed items: 100%|██████████| 33/33 [00:08<00:00,  3.70it/s]


In [8]:
candidates = (candidates
    .group_by("user_id")
    .agg([
        pl.col("item_id").head(100).alias("top100_items")  # берём первые 100 item_id
    ]).filter(pl.col('user_id').is_in(test_user_ids_set))
)

In [3]:
cand = pl.read_csv('submits/candidates_before_cb.csv')

In [4]:
dfs_to_merge = [candidate_last_favorite_items, candidate_processed_items, candidate_last_viewed_def_items, candidate_neighbors_of_viewed_items, candidate_cooccur_neighbors]
popular_items = get_popular_items(n=400, min_date='2025-05-21')
candidates = unite_candidates_exploded(dfs_to_merge, popular_items, n=400)
gen_submit(candidates, name=f'candidates_exploded')

Формирование submission:   0%|          | 0/188138800 [00:00<?, ?it/s]


TypeError: 'numpy.int64' object is not iterable

In [6]:
candidates.to_parquet('candidates_exploded_v2.parquet')

In [7]:
candidate_last_favorite_items.write_parquet('candidate_cache/last_favotite_items.parquet')
last_viewed_items.write_parquet('candidate_cache/last_viewed_items.parquet')
candidate_neighbors_of_viewed_items.write_parquet('candidate_cache/neighbors_of_viewed_items.parquet')
candidate_cooccur_neighbors.write_parquet('candidate_cache/cooccur_neighbors.parquet')
candidate_processed_items.write_parquet('candidate_cache/processed_items.parquet')
candidate_last_viewed_def_items.write_parquet('candidate_cache/last_viewed_def_items.parquet')

In [3]:
candidate_last_favorite_items = pl.read_parquet('candidate_cache/last_favotite_items.parquet')
last_viewed_items = pl.read_parquet('candidate_cache/last_viewed_items.parquet')
candidate_neighbors_of_viewed_items = pl.read_parquet('candidate_cache/neighbors_of_viewed_items.parquet')
cooccur_neighbors_of_all_items = pl.read_parquet('cooccurrence_neighbors.parquet')
candidate_cooccur_neighbors = pl.read_parquet('candidate_cache/cooccur_neighbors.parquet')
popular_items = pl.read_parquet('candidate_cache/popular_items.parquet')['item_id'].to_list()
candidate_processed_items = pl.read_parquet('candidate_cache/processed_items.parquet')

## Leave-last-N train/val split

In [None]:
orders_df_delivered = pl.read_parquet('data/orders_df_delivered.parquet', columns=['user_id', 'item_id', 'created_timestamp'])

Получаем время разделения на train/val - после cutoff_time пользователь совершает три покупки.

In [None]:
leave_last_n = 3

# Получаем временные метки отсечки для каждого пользователя
user_cutoff_time = (
    orders_df_delivered
    .sort(['user_id', 'created_timestamp'])
    .group_by('user_id')
    .agg(
        pl.col('created_timestamp')
        .tail(leave_last_n)  # Берем последние n заказов
        .min()
        .alias('cutoff_time')
    )
)

In [None]:
val_n_orders = (
    orders_df_delivered
    .join(user_cutoff_time, on='user_id')
    .filter(pl.col('created_timestamp') >= pl.col('cutoff_time'))
    .sort(['user_id', 'created_timestamp'])
    .group_by('user_id')
    .agg(
        pl.col('item_id')
        .alias('item_ids')
    )
).to_pandas()

## Last n favorite items

### получаем

In [None]:
candidate_last_favorite_items = get_last_favorite_items(mode='train', user_cutoff_time=user_cutoff_time)

Processing files: 100%|██████████| 2800/2800 [00:43<00:00, 63.88it/s]


In [4]:
candidate_last_favorite_items.write_parquet('candidate_cache/last_favotite_items.parquet')


In [9]:
candidate_last_favorite_items = pl.read_parquet('candidate_cache/last_favotite_items.parquet')

## Last n viewed items

In [None]:
last_viewed_items = get_last_viewed_items(mode='train', n=7, user_cutoff_time=user_cutoff_time)

Processing files: 100%|██████████| 2800/2800 [05:22<00:00,  8.68it/s]


In [5]:
last_viewed_items.write_parquet('candidate_cache/last_viewed_items.parquet')

In [13]:
last_viewed_items = pl.read_parquet('candidate_cache/last_viewed_items.parquet')

## Ищем похожие товары из просматриваемых по косинусной близости

### получение

In [15]:
last_viewed_items = pl.read_parquet('candidate_cache/last_viewed_items.parquet')

nn_path = glob.glob('nearest_neighbors/*.parquet')
nn_df = pl.read_parquet(nn_path, columns=['item_id', 'neighbor_item_id', 'rank'])

k_values = {
        'item_1': 20,
        'item_2': 17,
        'item_3': 15,
        'item_4': 10,
        'item_5': 5,
        'item_6': 3,
        'item_7': 2,
    }

In [16]:
candidate_neighbors_of_viewed_items = get_neighbors_of_viewed_items(last_viewed_items, nn_df, k_values)

In [6]:
candidate_neighbors_of_viewed_items.write_parquet('candidate_cache/neighbors_of_viewed_items.parquet')

In [18]:
candidate_neighbors_of_viewed_items = pl.read_parquet('candidate_cache/neighbors_of_viewed_items.parquet')

## Получаем co-occurrence

### получение

In [23]:
cooccur_neighbors_of_all_items = pl.read_parquet('cooccurrence_neighbors.parquet')

In [None]:
candidate_cooccur_neighbors = get_cooccur_neighbors_of_last_delivered_items(cooccur_neighbors_of_all_items, mode='train', user_cutoff_time=user_cutoff_time)

Processing files: 100%|██████████| 33/33 [00:26<00:00,  1.26it/s]


In [7]:
candidate_cooccur_neighbors.write_parquet('candidate_cache/cooccur_neighbors.parquet')

In [27]:
candidate_cooccur_neighbors = pl.read_parquet('candidate_cache/cooccur_neighbors.parquet')

## popular_items

### получение

In [3]:
popular_items = get_popular_items(n=100)
popular_df = pl.DataFrame({'item_id': popular_items})

In [8]:
popular_df.write_parquet('candidate_cache/popular_items.parquet')

In [5]:
popular_items = pl.read_parquet('candidate_cache/popular_items.parquet')['item_id'].to_list()

## status = processed_orders

### получение

In [None]:
candidate_processed_items = get_processed_items(mode='train', user_cutoff_time=user_cutoff_time)

Processing files: 100%|██████████| 33/33 [00:09<00:00,  3.35it/s]


In [9]:
candidate_processed_items.write_parquet('candidate_cache/processed_items.parquet')

In [37]:
candidate_processed_items = pl.read_parquet('candidate_cache/processed_items.parquet')

## Объединяем кандидатов. Добиваем до 200 popular items

In [None]:
# candidate_last_favorite_items = pl.read_parquet('candidate_cache/last_favotite_items.parquet')
# candidate_neighbors_of_viewed_items = pl.read_parquet('candidate_cache/neighbors_of_viewed_items.parquet')
# candidate_cooccur_neighbors = pl.read_parquet('candidate_cache/cooccur_neighbors.parquet')
# candidate_processed_items = pl.read_parquet('candidate_cache/processed_items.parquet')
# popular_items = pl.read_parquet('candidate_cache/popular_items.parquet')['item_id'].to_list()


### объединяем

In [None]:
dfs_to_merge = [candidate_processed_items, candidate_last_favorite_items, candidate_neighbors_of_viewed_items, candidate_cooccur_neighbors]

candidates = unite_candidates(dfs_to_merge=[], n=100)

NameError: name 'candidate_processed_items' is not defined