In [4]:
import numpy as np
import pandas as pd

# IMPORTANT: This notebook works on a sampled CHUNK of the test set, not on the full test week.

In [6]:
# Load the full transaction log. The path is relative to the notebook's working directory.
transactions = pd.read_csv('../personalized-fashion-recommendations/transactions_train.csv')
# Show (rows, columns) as the cell output.
transactions.shape

(31788324, 5)

In [7]:
# Hold out the last week of the log as the test window. `t_dat` is compared against a
# plain string, so the boundary test below is a lexicographic comparison.
split_date = '2020-09-16'

is_train_row = transactions['t_dat'].lt(split_date)
is_test_row = transactions['t_dat'].ge(split_date)

train_set = transactions.loc[is_train_row]
test_set = transactions.loc[is_test_row]  # '2020-09-16' - '2020-09-22'

# Sanity check: first and last date that ended up in the test window.
test_set['t_dat'].min(), test_set['t_dat'].max()

('2020-09-16', '2020-09-22')

In [8]:
# Customers that already appear before the split date count as "old" users.
train_users = set(train_set["customer_id"].unique())

# Partition the test-week rows by whether the customer has any earlier history.
has_history = test_set["customer_id"].isin(train_users)
test_old = test_set.loc[has_history]
test_new = test_set.loc[~has_history]

# Share of test rows (transactions, not distinct customers) in each group.
total_test_rows = len(test_set)
p_old = len(test_old) / total_test_rows
p_new = len(test_new) / total_test_rows
print(f"Old users: {p_old:.2%}, New users: {p_new:.2%}")

# Collect a chunk whose old/new mix mirrors the full test week.
chunk_size = 10000  # flexible count
n_old = int(chunk_size * p_old)
n_new = chunk_size - n_old

chunk_old = test_old.sample(n=n_old, random_state=42)
chunk_new = test_new.sample(n=n_new, random_state=42)

sampled_parts = [chunk_old, chunk_new]
chunk_of_test_set = pd.concat(sampled_parts, ignore_index=True)

print(chunk_of_test_set.shape)

Old users: 92.45%, New users: 7.55%
(10000, 5)


In [9]:
# Persist the sampled test chunk (without the positional index) to the working directory.
chunk_of_test_set.to_csv('chunk_of_test_set.csv', index=False)

In [73]:
# Keep only the pre-split transactions of customers that appear in the sampled test chunk,
# then save that slice next to the test chunk.
lookup_set = set(chunk_of_test_set['customer_id'])
belongs_to_chunk_customer = train_set['customer_id'].isin(lookup_set)
ttr1 = train_set.loc[belongs_to_chunk_customer].copy()
ttr1.to_csv('chunk_of_train_set.csv', index=False)

In [74]:
# Number of pre-split transactions kept for the chunk's customers.
len(ttr1)

672999

### Add Target. Build data for the model training. 

Tasks:
Add a target (0/1) for (customer_id, article_id).
1 if the product was purchased by the user in the validation week (2020-09-16 → 2020-09-22).
0 — otherwise.
Prepare the data for training (split X, y).

In [5]:
# Load the candidate (customer, article) pairs with their features.
# NOTE(review): this file is not written by any cell visible in this notebook —
# presumably it is produced by a separate candidate-generation step; confirm.
candidates_features = pd.read_csv('candidates_features_chunk.csv')

In [10]:
# 1. Create a target during the validation week

# Every distinct (customer, article) pair bought in the sampled test chunk is a positive.
purchased_pairs = chunk_of_test_set[["customer_id", "article_id"]].drop_duplicates()
val_purchases = purchased_pairs.assign(label=1)

In [11]:
# Attach the validation-week label to every candidate pair.
# A `label` column left over from a previous run of this cell is dropped first: without
# that, re-running the cell makes merge() emit label_x / label_y and the fillna line
# below fails with a KeyError.
# validate="many_to_one" makes pandas raise if val_purchases ever holds a repeated
# (customer_id, article_id) key, which would silently duplicate candidate rows.
candidates_features = (
    candidates_features
    .drop(columns=["label"], errors="ignore")
    .merge(
        val_purchases,
        on=["customer_id", "article_id"],
        how="left",
        validate="many_to_one",
    )
)
# Pairs without a matching purchase get NaN from the left join -> label 0.
candidates_features["label"] = candidates_features["label"].fillna(0).astype(int)


In [2]:
# 2. X (features) and y (target)

# Columns kept out of the feature matrix: the two identifier columns plus
# club_member_status and fashion_news_frequency. The label is removed as well.
drop_cols = [
    "customer_id",
    "article_id",
    "club_member_status",
    "fashion_news_frequency",
]
X = candidates_features.drop(columns=[*drop_cols, "label"])

In [28]:

# Same statement as the last line of the previous cell, executed again in its own cell.
# It relies on `drop_cols` still holding the list defined there.
X = candidates_features.drop(columns=drop_cols + ["label"])

In [29]:
# Target vector: the 0/1 label column of the candidate table.
y = candidates_features["label"]

In [30]:
# Preview the first five rows of the feature matrix.
X.head(5)

Unnamed: 0,age,num_unique_items,num_purchases,mean_price_x,max_price,mean_channel_x,product_code,section_no,department_no,product_type_no,sales_per_week,mean_price_y,mean_channel_y
0,23.0,57.0,63.0,0.030586,0.06778,1.936508,733803,51,1643,274,22.0,0.01001,2.0
1,23.0,57.0,63.0,0.030586,0.06778,1.936508,733803,51,1643,274,14.0,0.011615,2.0
2,23.0,57.0,63.0,0.030586,0.06778,1.936508,733803,51,1643,274,10.0,0.013407,2.0
3,23.0,57.0,63.0,0.030586,0.06778,1.936508,732842,57,1772,272,149.0,0.06549,1.785235
4,23.0,57.0,63.0,0.030586,0.06778,1.936508,732842,57,1772,272,113.0,0.063689,1.681416


In [31]:
# Summarise the labelled candidate table and the class balance of the target.
n_positive = y.sum()
n_negative = (y == 0).sum()

print("Dataset shape:", candidates_features.shape)
print("Positive samples:", n_positive)
print("Negative samples:", n_negative)
print("Feature matrix shape:", X.shape)

Dataset shape: (3086119, 18)
Positive samples: 1054
Negative samples: 3085065
Feature matrix shape: (3086119, 13)


### Step 4: Model Training.

In [18]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split

In [32]:
# For simplicity 80/20 split
# Rows are split at random with a fixed seed and stratified on the label, so both
# parts keep the same share of positives.
# NOTE(review): the split is by row, so rows of one customer can land in both parts — confirm this is acceptable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [33]:
# 2. Adjust class weights
# ------------------------
# Positives are up-weighted by a quarter of the negative/positive ratio (at least 1).
pos = y_train.sum()
neg = len(y_train) - pos
scale_pos_weight = max(1, int(neg / pos / 4))  # original note: "the author used 100 instead of the theoretical ~394" — NOTE(review): those figures do not match this cell's printed scale_pos_weight=731; confirm which setting is intended

print(f"Positive: {pos}, Negative: {neg}, scale_pos_weight={scale_pos_weight}")

Positive: 843, Negative: 2468052, scale_pos_weight=731


In [34]:
# 3. CatBoostClassifier
# ------------------------
# Wrap both splits in CatBoost pools so they can be passed to fit() as-is.
train_pool = Pool(data=X_train, label=y_train)
valid_pool = Pool(data=X_valid, label=y_valid)

# All hyper-parameters in one place; positives are re-weighted with scale_pos_weight.
catboost_params = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=8,
    eval_metric="AUC",
    scale_pos_weight=scale_pos_weight,
    random_seed=42,
    verbose=100,
    task_type="CPU",
)
model = CatBoostClassifier(**catboost_params)

In [35]:
# 4. Training
# ------------------------
# The validation pool drives best-iteration selection (use_best_model=True). In the log
# below the validation AUC peaks at iteration 41 and then falls for the remaining
# iterations, so the model is shrunk to 42 trees.
model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

0:	test: 0.7205534	best: 0.7205534 (0)	total: 182ms	remaining: 3m 2s
100:	test: 0.7837104	best: 0.8142085 (41)	total: 11.7s	remaining: 1m 44s
200:	test: 0.7536942	best: 0.8142085 (41)	total: 23.3s	remaining: 1m 32s
300:	test: 0.7267304	best: 0.8142085 (41)	total: 35s	remaining: 1m 21s
400:	test: 0.7111738	best: 0.8142085 (41)	total: 46.6s	remaining: 1m 9s
500:	test: 0.7030318	best: 0.8142085 (41)	total: 58.4s	remaining: 58.2s
600:	test: 0.6931081	best: 0.8142085 (41)	total: 1m 10s	remaining: 46.6s
700:	test: 0.6866038	best: 0.8142085 (41)	total: 1m 22s	remaining: 35s
800:	test: 0.6792764	best: 0.8142085 (41)	total: 1m 34s	remaining: 23.4s
900:	test: 0.6692636	best: 0.8142085 (41)	total: 1m 46s	remaining: 11.7s
999:	test: 0.6666996	best: 0.8142085 (41)	total: 1m 58s	remaining: 0us

bestTest = 0.8142085126
bestIteration = 41

Shrink model to first 42 iterations.


<catboost.core.CatBoostClassifier at 0x2d35a66d0>

In [36]:
# 6. Evaluation
# ------------------------
from sklearn.metrics import roc_auc_score

# Probability of the positive class (column 1) for every validation row.
y_pred = model.predict_proba(X_valid)[:, 1]
auc = roc_auc_score(y_valid, y_pred)
print("Validation AUC:", auc)

Validation AUC: 0.8142085125707638


In [44]:
# Frame used for scoring: every candidate column except club_member_status and
# fashion_news_frequency (identifiers and label are kept here).
# The column list is passed inline on purpose: rebinding the notebook-wide `drop_cols`
# to this shorter list would make the earlier `X = candidates_features.drop(...)` cell
# keep customer_id / article_id as features if it were re-run afterwards.
data = candidates_features.drop(columns=["club_member_status", "fashion_news_frequency"])

 

### Model predictions for all candidates.

In [45]:
# 1. Model predictions for all candidates
# data: dataframe with candidate features
# model: trained CatBoost

# Columns that are not model inputs. "pred_prob" is listed too so that this cell can be
# re-run: on a second run `data` already carries the score column written below, and it
# must not be fed back to the model as an extra feature. errors='ignore' keeps the
# first run working, when that column does not exist yet.
non_feature_cols = ["customer_id", "article_id", "label", "pred_prob"]
data["pred_prob"] = model.predict_proba(data.drop(columns=non_feature_cols, errors='ignore'))[:, 1]


In [46]:
# 2. We take the top 12 products for each client with a history

# Order candidates by customer, best score first, and keep the first 12 rows per customer.
ranked_candidates = data.sort_values(["customer_id", "pred_prob"], ascending=[True, False])
top_rows_per_customer = ranked_candidates.groupby("customer_id").head(12)

# Collapse the kept rows into one ordered list of article ids per customer.
user_candidates = (
    top_rows_per_customer
    .groupby("customer_id")["article_id"]
    .agg(list)
    .reset_index(name="predictions")
)

In [48]:
# Preview the per-customer prediction lists.
user_candidates.head(5)

Unnamed: 0,customer_id,predictions
0,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,"[448509014, 933989002, 448509001, 933989001, 7..."
1,0026ebdd70715d8fa2befa14dfed317a1ffe5451aba839...,"[751471001, 909370001, 915529003, 896152002, 9..."
2,003ca8034fe32b9bab8e1c03d74c972abd80dccf84a302...,"[889370002, 889370001, 706016001, 915529003, 9..."
3,00465ec96dd32dca19f85108cbce142de6667a7ace8208...,"[909370001, 915526001, 915529003, 706016001, 8..."
4,004c3751ed6f9dfc98b870291c95be6702d3afa97d9467...,"[706016001, 891375001, 909370001, 915529003, 9..."


In [47]:
# customer_id -> set of article ids that customer bought in the sampled test chunk.
ground_truth = {
    customer: set(articles)
    for customer, articles in chunk_of_test_set.groupby('customer_id')['article_id']
}

In [49]:
# 3. New clients (no history)

# Customer ids in the "old" (seen before the split date) and "new" parts of the sampled chunk.
# NOTE(review): existing_customers is not referenced by any other visible cell.
existing_customers = set(chunk_old["customer_id"])
new_customers = set(chunk_new["customer_id"])


In [51]:
# Global fallback list: the 12 articles with the highest mean predicted probability
# over all candidate rows (this ranks by model score, not by sales).
mean_prob_by_article = data.groupby("article_id")["pred_prob"].mean()
articles_best_first = mean_prob_by_article.sort_values(ascending=False)
top12_global = articles_best_first.head(12).index.tolist()


In [52]:
# 4. Build DataFrame for new clients
# Every customer without history gets the same global top-12 list.
# - sorted(): iterating a set of strings gives an order that can change between kernel
#   sessions, so the rows are put in a fixed order to keep the output reproducible.
# - list(top12_global): each row gets its own copy; `[top12_global] * n` would make all
#   rows share one list object, so editing one row in place would change every row.
new_customer_ids = sorted(new_customers)
new_cust_df = pd.DataFrame({
    "customer_id": new_customer_ids,
    "predictions": [list(top12_global) for _ in new_customer_ids],
})


In [53]:

# Stack the model-ranked lists and the fallback lists into one prediction table.
# NOTE(review): assumes no customer_id appears in both frames — confirm.
final_submission = pd.concat([user_candidates, new_cust_df], ignore_index=True)


In [None]:
# 6. Save to CSV
#final_submission.to_csv("submission.csv", index=False)

#print("Submission ready:", final_submission.shape)
#print(final_submission.head())

In [55]:
# Preview the combined prediction table.
final_submission.head(5)

Unnamed: 0,customer_id,predictions
0,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,"[448509014, 933989002, 448509001, 933989001, 7..."
1,0026ebdd70715d8fa2befa14dfed317a1ffe5451aba839...,"[751471001, 909370001, 915529003, 896152002, 9..."
2,003ca8034fe32b9bab8e1c03d74c972abd80dccf84a302...,"[889370002, 889370001, 706016001, 915529003, 9..."
3,00465ec96dd32dca19f85108cbce142de6667a7ace8208...,"[909370001, 915526001, 915529003, 706016001, 8..."
4,004c3751ed6f9dfc98b870291c95be6702d3afa97d9467...,"[706016001, 891375001, 909370001, 915529003, 9..."


In [70]:
# NOTE: map_at_k is defined in a later cell of this notebook; that cell has to be run
# before this one (the execution counts show it was).
# The labels used to fit the model and the ground truth scored here both come from
# chunk_of_test_set, and predictions are made on the same candidate rows the model was
# fitted on, so this figure is an in-sample score rather than a held-out estimate.
map12 = map_at_k(ground_truth, final_submission, k=12)
print(f"MAP@12: {map12:.4f}")

MAP@12: 0.0186


In [69]:
def map_at_k(ground_truth, user_candidates, k=12):
    """Mean Average Precision at ``k`` over every customer in ``ground_truth``.

    Parameters
    ----------
    ground_truth : dict
        Maps customer_id -> collection of article ids the customer actually bought.
    user_candidates : pandas.DataFrame
        Needs a ``customer_id`` column and a ``predictions`` column holding an ordered
        sequence of article ids (best first). If a customer occurs in several rows,
        the first row is used.
    k : int, default 12
        Only the first ``k`` predictions of each customer are scored.

    Returns
    -------
    float
        Mean of the per-customer average precision, or ``0.0`` when ``ground_truth``
        is empty.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.

    Notes
    -----
    * A customer without a row in ``user_candidates`` scores 0 instead of raising.
    * An article id repeated inside one prediction list is credited only once.
    * Each customer's score is divided by ``min(number of purchased articles, k)``.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    if not ground_truth:
        return 0.0

    # Build the customer -> predictions lookup once. Filtering the DataFrame inside the
    # loop would rescan the whole table for every customer.
    predictions_by_user = {}
    for customer, predicted in zip(user_candidates["customer_id"], user_candidates["predictions"]):
        # setdefault keeps the first row when a customer_id is duplicated.
        predictions_by_user.setdefault(customer, predicted)

    average_precisions = []

    for user, target_items in ground_truth.items():
        if not target_items:
            # Nothing to hit for this customer.
            average_precisions.append(0.0)
            continue

        candidate_items = predictions_by_user.get(user, ())

        score = 0.0
        hits = 0
        already_credited = set()

        # Only the first k predictions count; rank is 1-based.
        for rank, item in enumerate(list(candidate_items)[:k], start=1):
            if item in target_items and item not in already_credited:
                already_credited.add(item)
                hits += 1
                score += hits / rank

        average_precisions.append(score / min(len(target_items), k))

    return sum(average_precisions) / len(average_precisions)

In [71]:
# Baseline check: give every customer in the chunk the same global top-12 list.
# ground_truth = chunk_of_test_set.groupby('customer_id')['article_id'].apply(set).to_dict()

all_customers = set(chunk_of_test_set["customer_id"])

# - sorted(): iterating a set of strings gives an order that can change between kernel
#   sessions, so the rows are put in a fixed order to keep the frame reproducible.
# - list(top12_global): each row gets its own copy instead of all rows sharing one
#   list object.
all_customer_ids = sorted(all_customers)
all_cust_df = pd.DataFrame({
    "customer_id": all_customer_ids,
    "predictions": [list(top12_global) for _ in all_customer_ids],
})

In [72]:
# MAP@12 of the baseline where every customer receives the global top-12 list.
# Note that this overwrites the `map12` value computed for the model-based predictions.
map12 = map_at_k(ground_truth, all_cust_df, k=12)
print(f"MAP@12: {map12:.4f}")

MAP@12: 0.0005


In [75]:
# Preview the sampled test chunk.
chunk_of_test_set.head(5)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2020-09-16,3126eddd0d6ad89f17f9638ac436ca0d0f771b20f4b3af...,929165002,0.050831,1
1,2020-09-16,f3e776a67652b61d1ead5a9a9464ab5664c6aa637e7fe5...,879605003,0.021254,2
2,2020-09-18,0589088ff0b5c2c36c4d581e764628d8924b99807351fc...,872973001,0.042356,2
3,2020-09-18,a9639efa091195ec82c27913d7c2d592bab0c7a1136418...,915611004,0.033881,1
4,2020-09-20,8c831d83e2b945139482dce3b86eef4ca4c9a0ecc43589...,780237003,0.084729,2
