In [1]:
import numpy as np
import pandas as pd

In [2]:
transactions = pd.read_csv('../personalized-fashion-recommendations/transactions_train.csv')
transactions.shape

(31788324, 5)

In [18]:
articles = pd.read_csv('../personalized-fashion-recommendations/articles.csv')
#articles.head(5)

In [4]:
df = transactions.merge(articles[['article_id', 'product_code']], on='article_id', how='left')


In [5]:
#transactions.head(5)

In [3]:
split_date = '2020-09-16'
train_set = transactions[transactions['t_dat']<split_date]  # 
test_set = transactions[transactions['t_dat']>=split_date] # '2020-09-16' - '2020-09-22'
test_set['t_dat'].min(), test_set['t_dat'].max()

('2020-09-16', '2020-09-22')

In [13]:
# Okay, to avoid running the entire dataset through the same data, let's take a chunk from the test_set, but maintain the proportion of users who:
# - were in the train_set (old users),
# - were not in the train_set (new users).

# Step-by-step Logic Algorithm:
# 1. Find the train_users set.
# 2. Split the test_set into two subsets:
# test_old = users who were in the train,
# test_new = users who were not in the train.
# 3. Calculate the proportion of old/new users in the test.
# 4. Select a random subsample (chunk) maintaining these proportions.


In [4]:

train_users = set(train_set["customer_id"].unique())


test_old = test_set[test_set["customer_id"].isin(train_users)]
test_new = test_set[~test_set["customer_id"].isin(train_users)]

# proportiom
p_old = len(test_old) / len(test_set)
p_new = len(test_new) / len(test_set)
print(f"Old users: {p_old:.2%}, New users: {p_new:.2%}")


chunk_size = 10000  # flexibla
n_old = int(chunk_size * p_old)
n_new = chunk_size - n_old

chunk_old = test_old.sample(n_old, random_state=42)
chunk_new = test_new.sample(n_new, random_state=42)

chunk_of_test_set = pd.concat([chunk_old, chunk_new], ignore_index=True)

print(chunk_of_test_set.shape)

Old users: 92.45%, New users: 7.55%
(10000, 5)


In [6]:
chunk_of_test_set['customer_id'].nunique()

8937

In [5]:
# Let's build train_set_candidates only for those who are in test_set. 

lookup_set = set(chunk_of_test_set['customer_id'].tolist())
test_customers_in_train = train_set[train_set['customer_id'].isin(lookup_set)].copy()
test_customers_in_train['customer_id'].nunique()

8248

In [7]:
train_set = test_customers_in_train.copy()

In [14]:
len(test_customers_in_train)

672999

### EDA

In [6]:
# Frequency of purchases per user during the week?
# Overlap with prevous purchases?


user_history_items = train_set.groupby('customer_id')['article_id'].apply(set).reset_index()
target_week_items = test_set.groupby('customer_id')['article_id'].apply(set).reset_index()


In [7]:
target_week_items.head(5)

Unnamed: 0,customer_id,article_id
0,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,{624486001}
1,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,{827487003}
2,000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed...,"{757926001, 640021019, 788575004}"
3,000525e3fe01600d717da8423643a8303390a055c578ed...,{874110016}
4,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,"{158340001, 935892001, 799365027, 918171001, 9..."


In [58]:
i = 0
for idx, row  in target_week_items.iterrows():
    user_id = row['customer_id']
    articles = row['article_id']
    b = user_history_items[user_history_items['customer_id'] == user_id]
    print(user_id, )
    i+=1
    if i == 5:
        break
b

00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793
0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf4672f30b3e622fec55
000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed6396773839f6bf71a9
000525e3fe01600d717da8423643a8303390a055c578ed8a97256600baf54565
00077dbd5c4a4991e092e63893ccf29294a9d5c46e85010e95f2fc10bf9437a4


Unnamed: 0,customer_id,article_id
177,00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...,"{929744001, 826955011, 714790020, 861685001, 9..."


In [59]:
f = b['article_id'].tolist()
#articles = {756192001}

In [60]:
len(articles), len(f[0])

(14, 57)

In [61]:
common_elemnts = (articles & f[0]) # both them are sets already.
common_elemnts

{448509014}

### What we can do for customers who has no repeated purchases?
What can be done for inactive users?

For example, I looked at 5 customers, and they only had 1 or 3 purchases each. And they didn't appear in the user_history_purchases table even once.

In [62]:
# Top 12 
window_start = '2020-08-15' # 30 days 
window_df = train_set[(train_set['t_dat'] >= window_start)]
print(window_df['t_dat'].min(), window_df['t_dat'].max())

top_products = ( window_df.groupby("article_id")["customer_id"].count().sort_values(ascending=False).head(12)
                        .index.tolist())

2020-08-15 2020-09-15


In [68]:
articles & set(top_products)

{448509014, 915529003}

In [77]:
# category_popularity  - how to calculate?
#How many unique users purchased this category?
# "Top N popular products in each category"

category_popularity_users = df.groupby('product_code')['customer_id'].nunique().reset_index()
category_popularity_users.rename(columns={'customer_id': 'category_popularity_users'}, inplace=True)


top_n = 5
category_popularity_users['rank_in_category'] = category_popularity_users.groupby('product_code')['category_popularity_users'] \
                                                      .rank(method='first', ascending=False)
N = 12
top_items = category_popularity_users[category_popularity_users['rank_in_category'] <= N]


In [78]:
top_items['rank_in_category'].value_counts()

rank_in_category
1.0    46834
Name: count, dtype: int64

In [80]:
# Объединяем с категориями
# df = transactions.merge(items[['article_id', 'item_category']], on='article_id', how='left')

# Считаем количество покупок каждого товара
item_popularity = df.groupby(['product_code', 'article_id'])['customer_id'].count().reset_index()
item_popularity.rename(columns={'customer_id': 'product_code'}, inplace=True)


### Candidates

In [15]:
# Candidates1 :Top 300 Popular Products (weekly 09-16 → 09-22) or week before. or 30 days. 

N = 12
window_start = '2020-08-15' # 30 days 
window_df = train_set[(train_set['t_dat'] >= window_start)]
print(window_df['t_dat'].min(), window_df['t_dat'].max())

top_products = ( window_df.groupby("article_id")["customer_id"].count().sort_values(ascending=False).head(N)
                        .index.tolist())

2020-08-15 2020-09-15


In [16]:
# Candidates: Previously purchased items. 
user_past_items = (
    train_set.groupby("customer_id")["article_id"]
    .unique()
    .reset_index()
    .rename(columns={"article_id": "past_items"})
)

In [19]:
# Candidates: products with the same product_code.
train_set = train_set.merge(articles[["article_id", "product_code"]], on="article_id", how="left")

user_product_codes = (
    train_set.groupby("customer_id")["product_code"]
    .unique()
    .reset_index()
)

# для каждого product_code возьмём все article_id
product_code2articles = (
    articles.groupby("product_code")["article_id"]
    .unique()
    .to_dict()
)

def expand_by_product_code(product_codes):
    candidates = []
    for pc in product_codes:
        candidates.extend(product_code2articles.get(pc, []))
    return list(set(candidates))

user_product_code_items = user_product_codes.copy()
user_product_code_items["product_code_items"] = user_product_code_items["product_code"].apply(expand_by_product_code)
user_product_code_items = user_product_code_items[["customer_id", "product_code_items"]]


In [20]:
# Union Candidates 
candidates = user_past_items.merge(user_product_code_items, on="customer_id", how="outer")

def merge_candidates(row):
    items = set(top_products)  # global gloabal 
    if isinstance(row["past_items"], (list, pd.Series, pd.Index)):
        items.update(row["past_items"])
    if isinstance(row["product_code_items"], (list, pd.Series, pd.Index)):
        items.update(row["product_code_items"])
    return list(items)

candidates["candidates"] = candidates.apply(merge_candidates, axis=1)

In [21]:
# In to dataframe (customer_id, article_id). 
final_candidates = (
    candidates[["customer_id", "candidates"]]
    .explode("candidates")
    .rename(columns={"candidates": "article_id"})
    .dropna()
    .reset_index(drop=True)
)

print("Final candidates shape:", final_candidates.shape)
print(final_candidates.head())

Final candidates shape: (3086119, 2)
                                         customer_id article_id
0  00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...  733803009
1  00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...  733803011
2  00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...  733803014
3  00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...  732842001
4  00077dbd5c4a4991e092e63893ccf29294a9d5c46e8501...  732842002


In [35]:
final_candidates.to_csv('final_candidates.csv', index=False)

In [23]:
len(final_candidates)

3086119

In [15]:
len(final_candidates)

184602897

In [16]:
groups = final_candidates.groupby('customer_id')['article_id'].count().to_dict()
len(groups)

1356709

In [20]:
i = 0
for cust, cand in groups.items():
    print(cust, cand)
    i+=1
    if i==5:
        break

00000dbacae5abe5e23885899a1fa44253a17956c6d1c3d25f88aa139fdfc657 121
0000423b00ade91418cceaf3b26c6af3dd342b51fd051eec9c12fb36984420fa 419
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318 100
00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c5feb1ca5dff07c43e 19
00006413d8573cd20ed7128e53b7b13819fe5cfc2d801fe7fc0f26dd8d65a85a 84


In [21]:
train_set[train_set['customer_id']=='00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c5feb1ca5dff07c43e']

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,product_code
11576669,2019-06-09,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,742079001,0.030492,2,742079
11576670,2019-06-09,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,732413001,0.030492,2,732413


In [22]:
# It's interesting that the product_code is different for the same article_id. No. That's not true, it is different;

test_set[test_set['customer_id']=='00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2c5feb1ca5dff07c43e']

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id


### Feature Engineering. Add to each features. 

In [24]:
# 1. Customer Features. 
cust_stats = (
    train_set.groupby("customer_id")
    .agg(
        num_unique_items=("article_id", "nunique"),
        num_purchases=("article_id", "count"),
        mean_price=("price", "mean"),
        max_price=("price", "max"),
        mean_channel=("sales_channel_id", "mean"),
            )
    .reset_index()
)

In [25]:
customers = pd.read_csv('../personalized-fashion-recommendations/customers.csv')

In [26]:
cust = customers[["customer_id", "age", "club_member_status", "fashion_news_frequency"]]

In [27]:
customer_features = cust.merge(cust_stats, on="customer_id", how="left")

In [28]:
len(customer_features)

1371980

In [31]:
customer_features.head(5)

Unnamed: 0,customer_id,age,club_member_status,fashion_news_frequency,num_unique_items,num_purchases,mean_price,max_price,mean_channel
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,49.0,ACTIVE,NONE,19.0,21.0,0.030904,0.05422,1.571429
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,25.0,ACTIVE,NONE,64.0,86.0,0.030255,0.084729,1.94186
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,24.0,ACTIVE,NONE,14.0,18.0,0.039154,0.06778,2.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,54.0,ACTIVE,NONE,2.0,2.0,0.030492,0.030492,2.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,52.0,ACTIVE,Regularly,12.0,13.0,0.03613,0.059305,1.846154


In [29]:
customer_features.to_csv('customer_features.csv', index=False)

In [30]:
# 2. Articles Features. 

art = articles[["article_id", "product_code", "section_no", "department_no", "product_type_no"]]

# популярность товара
article_stats = (
    train_set.groupby("article_id")
    .agg(
        sales_per_week=("article_id", "count"),
        mean_price=("price", "mean"),
        mean_channel=("sales_channel_id", "mean"),
    )
    .reset_index()
)

article_features = art.merge(article_stats, on="article_id", how="left")

In [31]:
article_features.to_csv('article_features.csv', index=False)

In [33]:
article_features.head(5)

Unnamed: 0,article_id,product_code,section_no,department_no,product_type_no,sales_per_week,mean_price,mean_channel
0,108775015,108775,16,1676,253,10841.0,0.008142,1.770778
1,108775044,108775,16,1676,253,7247.0,0.008114,1.710087
2,108775051,108775,16,1676,253,215.0,0.00498,1.995349
3,110065001,110065,61,1339,306,1044.0,0.020219,1.375479
4,110065002,110065,61,1339,306,539.0,0.018205,1.654917


In [1]:
# 3. Customer × Article features
last_purchase = (
    train_set.groupby(["customer_id", "article_id"])["t_dat"]
    .max()
    .reset_index()
    .rename(columns={"t_dat": "last_purchase_date"})
)

In [2]:
data = pd.read_csv('final_candidates.csv')


In [3]:
article_features = pd.read_csv('article_features.csv')

In [4]:
customer_features = pd.read_csv('customer_features.csv')

In [32]:
data = final_candidates.copy()

In [33]:
# 4. Merge on 
data = data.merge(customer_features, on="customer_id", how="left")


In [34]:
data = data.merge(article_features, on="article_id", how="left")

In [35]:
for col in data.select_dtypes(include=[np.number]).columns:
    data[col] = data[col].fillna(0)

In [36]:
data.to_csv('candidates_features_chunk.csv', index=False)