In [9]:
import glob
import polars as pl
import pandas as pd
import numpy as np
from tqdm import tqdm
from scipy.sparse import csr_matrix
import implicit
import matplotlib.pyplot as plt
from implicit.evaluation import train_test_split, precision_at_k, ndcg_at_k, mean_average_precision_at_k

In [2]:
tracker_files = glob.glob('data/final_apparel_tracker_data_08_action_widget/*/*.parquet')
order_files = glob.glob('data/final_apparel_orders_data_07/*/*.parquet')
items_files = glob.glob('data/ml_ozon_recsys_train_final_apparel_items_data/*.parquet')
test_user_ids = pd.read_parquet('data/ml_ozon_recsys_test.snappy.parquet')
test_user_ids_set = set(test_user_ids['user_id'].to_list())

In [3]:
test_user_ids_list = test_user_ids['user_id'].to_list()

In [4]:
item_ids_from_may = pl.read_parquet('item_ids_from_may.parquet')
item_ids_from_may = item_ids_from_may['item_id'].to_list()

### Создание модели

In [4]:
def get_weight(action_type):
    weight_map = {
        "delivered_orders": 8.0,
        "canceled_orders": -4.0,
        "proccesed_orders": 4.0,
        "to_cart": 3.0,
        "favorite": 2.5,
        "view_description": 2.0,
        "review_view": 1.8,
        "page_view": 1.0,
        "unfavorite": -1.0,
        "remove": -2.0
    }
    return weight_map.get(action_type, 0.0)

In [None]:
all_dfs = []
for i in tqdm(range(len(tracker_files)), desc='Processing files'):
    file_path = tracker_files[i]

    df = pl.read_parquet(
        file_path, 
        columns=['user_id', 'item_id', 'action_type', 'timestamp']
    ).filter((pl.col('timestamp') > pl.datetime(2025, 5, 21)) &
             (pl.col('user_id').is_in(test_user_ids_set)) &
             (pl.col('item_id').is_in(item_ids_from_may))
    ).with_columns(
    weight=pl.col("action_type").map_elements(get_weight, return_dtype=pl.Float32)
    ).drop("timestamp", "action_type")

    all_dfs.append(df)
    
for i in tqdm(range(len(order_files)), desc='Processing files'):
    file_path = order_files[i]

    df = pl.read_parquet(
        file_path, 
        columns=['user_id', 'item_id', 'created_timestamp', 'last_status']
    ).filter((pl.col('created_timestamp') > pl.datetime(2025, 5, 21)) &
             (pl.col('user_id').is_in(test_user_ids_set)) &
             (pl.col('item_id').is_in(item_ids_from_may))
    ).with_columns(
    weight=pl.col("last_status").map_elements(get_weight, return_dtype=pl.Float32)
    ).drop("created_timestamp", "last_status")
    all_dfs.append(df)

df_with_weights = pl.concat(all_dfs)

Processing files: 100%|██████████| 2800/2800 [49:46<00:00,  1.07s/it] 
Processing files: 100%|██████████| 33/33 [00:42<00:00,  1.28s/it]


In [41]:
df_with_weights.write_parquet('candidate_cache/df_with_weights_for_matrix_fact')

In [5]:
df_with_weights = pl.read_parquet('candidate_cache/df_with_weights_for_matrix_fact')

In [6]:
df_agg = df_with_weights.group_by(['user_id', 'item_id']).agg(
    pl.sum('weight').alias('total_weight')
)

In [7]:
user_ids = df_agg['user_id'].unique()
user2index = {user_id: idx for idx, user_id in enumerate(user_ids)}

item_ids = df_agg['item_id'].unique()
item2index = {item_id: idx for idx, item_id in enumerate(item_ids)}
index2item = {idx: item_id for item_id, idx in item2index.items()}

rows = [user2index[user] for user in df_agg['user_id']]
cols = [item2index[item] for item in df_agg['item_id']]
values = df_agg['total_weight'].to_numpy()

interaction_matrix = csr_matrix((values, (rows, cols)), 
                                shape=(len(user_ids), len(item_ids)))

In [8]:
model = implicit.als.AlternatingLeastSquares(
    factors=50,
    iterations=50,
    regularization=0.1,
    random_state=42
)

model.fit(interaction_matrix)

  check_blas_config()
100%|██████████| 50/50 [08:01<00:00,  9.64s/it]


In [9]:
import pickle
save_data = {
    'model': model,
    'interaction_matrix': interaction_matrix,
    'matrix_shape': interaction_matrix.shape
}

with open('als_full_data.pkl', 'wb') as f:
    pickle.dump(save_data, f)

### Генерация предсказаний

In [10]:
def get_popular_items(df_agg, n=100):
    popular_items = df_agg.group_by('item_id').agg(
        pl.sum('total_weight').alias('popularity_score')
    ).sort('popularity_score', descending=True)['item_id'].head(n).to_list()
    return popular_items

In [None]:
all_recommendations = {}
popular_items = get_popular_items(df_agg, n=100)

for user_id in tqdm(test_user_ids_list, desc='Generating recommendations'):
    if user_id not in user2index:
        all_recommendations[user_id] = popular_items
        continue
    
    user_idx = user2index[user_id]
    user_items = interaction_matrix[user_idx, :]
    recommendations = model.recommend(
        user_idx, 
        user_items,
        N=100,
        filter_already_liked_items=True
    )
    item_indices, scores = recommendations
    recommended_items = [index2item[idx] for idx in item_indices]

    if len(recommended_items) < 100:
        recommended_items.extend(popular_items[:100 - len(recommended_items)])
    
    all_recommendations[user_id] = recommended_items[:100]

Generating recommendations: 100%|██████████| 470347/470347 [5:55:06<00:00, 22.07it/s]      


In [22]:
df.to_parquet("recommendations_from_matrix_fact.parquet")

In [20]:
submission_data = []
for _, row in tqdm(df.iterrows(), total=len(df), desc="Формирование submission"):
    submission_data.append({
        'user_id': row['user_id'],
        'item_id_1 item_id_2 ... item_id_100': ' '.join(map(str, row['item_id']))
    })
submission_df = pd.DataFrame(submission_data)
submission_df.to_csv(f'submits/matrix_fact.csv', index=False)

Формирование submission: 100%|██████████| 470347/470347 [00:15<00:00, 30121.36it/s]


### Отладка, поиск гиперпараметров

In [13]:
def evaluate_subset(model, train, test, users, K=10):
    """Считаем precision, ndcg и map только по подмножеству пользователей"""
    prec_list, ndcg_list, mapk_list = [], [], []
    for u in users:

        if test[u].nnz == 0:
            continue
        # создаём маленькие матрицы только для одного пользователя
        train_u = train[u]
        test_u = test[u]

        # implicit считает метрику для всех строк матрицы,
        # поэтому передаём матрицу из одного пользователя
        prec_list.append(precision_at_k(model, train_u, test_u, K=K))
        ndcg_list.append(ndcg_at_k(model, train_u, test_u, K=K))
        mapk_list.append(mean_average_precision_at_k(model, train_u, test_u, K=K))

    return (
        np.mean(prec_list),
        np.mean(ndcg_list),
        np.mean(mapk_list),
    )

In [14]:
train, test = train_test_split(interaction_matrix, train_percentage=0.8, random_state=42)

iterations_list = [5, 10, 20, 30, 40]
precisions, ndcgs, mapks = [], [], []
rng = np.random.default_rng(42)
sample_users = np.random.choice(train.shape[0], size=10000, replace=False)

for iters in iterations_list:
    model = implicit.als.AlternatingLeastSquares(
        factors=50,
        iterations=iters,
        regularization=0.1,
        random_state=42
    )
    model.fit(train)
    

    prec, ndcg, mapk = evaluate_subset(model, train, test, sample_users, K=10)

    precisions.append(prec)
    ndcgs.append(ndcg)
    mapks.append(mapk)
    print(f"iters={iters}: precision@10={prec:.4f}, ndcg@10={ndcg:.4f}, MAP@10={mapk:.4f}")

# Рисуем график
plt.plot(iterations_list, precisions, marker="o", label="Precision@10")
plt.plot(iterations_list, ndcgs, marker="s", label="NDCG@10")
plt.plot(iterations_list, mapks, marker="^", label="MAP@10")
plt.xlabel("Iterations")
plt.ylabel("Score")
plt.title("Metrics vs Iterations")
plt.legend()
plt.grid(True)
plt.show()


100%|██████████| 5/5 [00:46<00:00,  9.21s/it]
100%|██████████| 1/1 [00:00<00:00, 50.07it/s]
100%|██████████| 1/1 [00:00<00:00, 39.72it/s]
100%|██████████| 1/1 [00:00<00:00, 47.62it/s]
100%|██████████| 1/1 [00:00<00:00, 45.28it/s]
100%|██████████| 1/1 [00:00<00:00, 50.97it/s]
100%|██████████| 1/1 [00:00<00:00, 49.45it/s]
100%|██████████| 1/1 [00:00<00:00, 44.99it/s]
100%|██████████| 1/1 [00:00<00:00, 47.85it/s]
100%|██████████| 1/1 [00:00<00:00, 47.57it/s]
100%|██████████| 1/1 [00:00<00:00, 50.77it/s]
100%|██████████| 1/1 [00:00<00:00, 45.96it/s]
100%|██████████| 1/1 [00:00<00:00, 50.09it/s]
100%|██████████| 1/1 [00:00<00:00, 49.59it/s]
100%|██████████| 1/1 [00:00<00:00, 46.87it/s]
100%|██████████| 1/1 [00:00<00:00, 49.15it/s]
100%|██████████| 1/1 [00:00<00:00, 47.43it/s]
100%|██████████| 1/1 [00:00<00:00, 49.43it/s]
100%|██████████| 1/1 [00:00<00:00, 44.50it/s]
100%|██████████| 1/1 [00:00<00:00, 51.18it/s]
100%|██████████| 1/1 [00:00<00:00, 48.04it/s]
100%|██████████| 1/1 [00:00<00:00,

iters=5: precision@10=0.0023, ndcg@10=0.0018, MAP@10=0.0005


100%|██████████| 10/10 [02:29<00:00, 14.91s/it]
100%|██████████| 1/1 [00:00<00:00, 38.05it/s]
100%|██████████| 1/1 [00:00<00:00, 36.06it/s]
100%|██████████| 1/1 [00:00<00:00, 35.29it/s]
100%|██████████| 1/1 [00:00<00:00, 35.04it/s]
100%|██████████| 1/1 [00:00<00:00, 36.45it/s]
100%|██████████| 1/1 [00:00<00:00, 38.55it/s]
100%|██████████| 1/1 [00:00<00:00, 33.32it/s]
100%|██████████| 1/1 [00:00<00:00, 36.51it/s]
100%|██████████| 1/1 [00:00<00:00, 31.94it/s]
100%|██████████| 1/1 [00:00<00:00, 29.64it/s]
100%|██████████| 1/1 [00:00<00:00, 21.37it/s]
100%|██████████| 1/1 [00:00<00:00, 34.43it/s]
100%|██████████| 1/1 [00:00<00:00, 29.87it/s]
100%|██████████| 1/1 [00:00<00:00, 41.32it/s]
100%|██████████| 1/1 [00:00<00:00, 41.29it/s]
100%|██████████| 1/1 [00:00<00:00, 44.93it/s]
100%|██████████| 1/1 [00:00<00:00, 40.23it/s]
100%|██████████| 1/1 [00:00<00:00, 44.03it/s]
100%|██████████| 1/1 [00:00<00:00, 42.30it/s]
100%|██████████| 1/1 [00:00<00:00, 45.01it/s]
100%|██████████| 1/1 [00:00<00:0

iters=10: precision@10=0.0023, ndcg@10=0.0020, MAP@10=0.0007


MemoryError: Unable to allocate 395. MiB for an array with shape (2069110, 50) and data type float32