In [1]:
import pandas as pd
from tqdm.auto import tqdm
import gc
from functools import reduce
from lightgbm import LGBMRanker, early_stopping

def create_customer_week_pair():
    transactions = pd.read_parquet(f'./data/sample/transactions_train_sample_{sample}.parquet')
    return transactions[(transactions.week >= start_week)].drop_duplicates(["customer_id", "week"])[["customer_id", "week"]].reset_index(drop=True)

def create_ground_truth():
    transactions = pd.read_parquet(f'./data/sample/transactions_train_sample_{sample}.parquet')
    s0 = transactions[(transactions.week >= start_week)].drop_duplicates(["customer_id", "article_id", "week"]).reset_index(drop=True)
    s0["y"] = 1
    return s0[["customer_id", "article_id", "week", "y"]]

def create_top_strategy(weekly_customer):
    transactions = pd.read_parquet(f'./data/sample/transactions_train_sample_{sample}.parquet')
    customer = pd.read_parquet(f'./data/sample/customers_sample_{sample}.parquet')
    customer.loc[customer.age == -1, "age"] = customer.age.mean()
    customer_sex = pd.read_parquet('./data/fe/customer_sex.parquet')
    top_history = pd.read_parquet('./data/fe/group_rank.parquet')
    top_history.week += 1

    transactions_with_gender = transactions \
        .merge(customer_sex, on=["customer_id"], how="left") \
        .merge(customer[["customer_id", "age"]], on=["customer_id"], how="left")
    transactions_with_gender["year"] = 2022
    transactions_with_gender["modify_year"] = transactions_with_gender["t_dat"].apply(lambda x: x.year)
    transactions_with_gender["age"] = transactions_with_gender["age"] - (transactions_with_gender["year"] - transactions_with_gender["modify_year"])
    bins = [0, 18, 25, 30, 35, 40, 45, 50, 55, 60, 999]
    labels = [i for i in range(len(bins)-1)]
    transactions_with_gender["age_group"] = pd.cut(transactions_with_gender["age"], bins=bins, labels=labels, right=False)
    s1 = weekly_customer \
            .merge(transactions_with_gender[["customer_id", "gender", "age_group", "week"]].drop_duplicates(["customer_id", "week"]), on=["customer_id", "week"]) \
            .merge(top_history, on=["week", "gender", "age_group"], how="inner")
    s1["top_strategy"] = 1
    return s1[["customer_id", "week", "top_strategy", "bestseller_rank", "article_id"]]

def create_duplicate_buy_strategy(weekly_customer):
    transactions = pd.read_parquet(f'./data/sample/transactions_train_sample_{sample}.parquet')
    output = pd.DataFrame()
    for week in tqdm(range(start_week, 105)):
        tmp = transactions[transactions.week < week]
        tmp = tmp.drop_duplicates(["customer_id", "article_id", "week"])
        last_week = tmp.groupby(["customer_id", "article_id"])["week"].last().rename("previous_buy").reset_index()
        df = tmp.groupby(["customer_id", "article_id"]).size().rename("buy_times").reset_index()
        df = df[df.buy_times > 0].reset_index(drop=True)
        df = df.merge(last_week, on=["customer_id", "article_id"])
        df["week"] = week
        output = pd.concat([output, df])
    output["duplicate_buy_strategy"] = 1
    output["week_gap"] = output["week"] - output["previous_buy"]
    return weekly_customer.merge(output, on=["customer_id", "week"], how="inner")

def create_recommendation_strategy(weekly_customer):
    recommend_counts = 3
    als = pd.read_parquet('./data/fe/als.parquet')
    als = als[als.article_id != als.recommendation_article_id].reset_index(drop=True)
    als = als.rename(columns={"score": "item_score"})
    als_for_recommendation_candidate_generator = als.groupby(["article_id"]).head(recommend_counts).reset_index(drop=True)
    transactions = pd.read_parquet(f'./data/sample/transactions_train_sample_{sample}.parquet')
    output = pd.DataFrame()
    for week in tqdm(range(start_week, 105)):
        tmp = transactions[transactions.week < week]
        tmp = tmp.drop_duplicates(["customer_id", "article_id", "week"])
        duplicate_buy_df = tmp.groupby(["customer_id", "article_id"]).size().rename("buy_times").reset_index()
        duplicate_buy_df = duplicate_buy_df[duplicate_buy_df.buy_times > 0].reset_index(drop=True)
        recommendation_candidate_df = pd.merge(duplicate_buy_df, als_for_recommendation_candidate_generator, 
                                               on=["article_id"], how="left")
        recommendation_candidate_df = recommendation_candidate_df.drop(columns=["article_id"]).rename(columns={"recommendation_article_id": "article_id"})
        recommendation_candidate_df["recommendation_strategy"] = recommendation_candidate_df.item_score
        recommendation_candidate_df["week"] = week
        recommendation_candidate_df.drop_duplicates(["customer_id", "article_id"], inplace=True)
        output = pd.concat([output, recommendation_candidate_df])
        
    return weekly_customer.merge(output[["customer_id", "article_id", "recommendation_strategy", "week"]].reset_index(drop=True), on=["customer_id", "week"], how="inner")

In [73]:
# helper functions
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np


def apk(actual, predicted, k=10, perfect=False):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if not perfect and len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0
    if perfect:
        for i, a in enumerate(actual[:k]):
            if a in predicted and a not in actual[:i]:
                score += 1     
    else:
        for i,p in enumerate(predicted):
            if p in actual and p not in predicted[:i]:
                num_hits += 1.0
                score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []
        
    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)

In [74]:
sample = 0.05
start_week = 80

In [75]:
customer_week_pair = create_customer_week_pair()
ground_truth_stragegy = create_ground_truth()
top_strategy = create_top_strategy(customer_week_pair)
duplicate_buy_strategy = create_duplicate_buy_strategy(customer_week_pair)
recommendation_strategy = create_recommendation_strategy(customer_week_pair)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=25.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=25.0), HTML(value='')))




In [76]:
candidate_merger = top_strategy \
                    .merge(duplicate_buy_strategy, on=["customer_id", "article_id", "week"], how="outer") \
                    .merge(recommendation_strategy, on=["customer_id", "article_id", "week"], how="outer")

In [77]:
top_strategy

Unnamed: 0,customer_id,week,top_strategy,bestseller_rank,article_id
0,6219998949057481,80,1,1,827681002
1,6219998949057481,80,1,2,730683041
2,6219998949057481,80,1,3,877278003
3,6219998949057481,80,1,4,714790008
4,6219998949057481,80,1,5,759871002
...,...,...,...,...,...
10689142,13584458923080438626,104,1,17,917297004
10689143,13584458923080438626,104,1,17,918171001
10689144,13584458923080438626,104,1,17,919978002
10689145,13584458923080438626,104,1,17,920610002


In [78]:
df = candidate_merger.merge(ground_truth_stragegy, on=["customer_id", "article_id", "week"], how="outer")

In [79]:
df["y"].fillna(0, inplace=True)
df["top_strategy"].fillna(0, inplace=True)
df["bestseller_rank"].fillna(999, inplace=True)
df["buy_times"].fillna(0, inplace=True)
df["duplicate_buy_strategy"].fillna(0, inplace=True)
df["week_gap"].fillna(999, inplace=True)
df["recommendation_strategy"].fillna(0, inplace=True)
del df["previous_buy"]
gc.collect()

112

In [80]:
del customer_week_pair, ground_truth_stragegy, top_strategy, duplicate_buy_strategy, recommendation_strategy, candidate_merger
gc.collect()

20

In [81]:
df.groupby(["customer_id", "week"]).article_id.count().mean()

279.43123724475413

In [82]:
df.y.mean()

0.013013388451848457

In [83]:
### read meta data ###
static_article_df = pd.read_parquet('./data/fe/static_article_df.parquet')[["article_id", "duplicate_hold_rate"]]
df = df.merge(static_article_df, on=["article_id"])
del static_article_df
gc.collect()

60

In [84]:
dynamic_article_df = pd.read_parquet('./data/fe/dynamic_article_df.parquet')
dynamic_article_df.week += 1
df = df.merge(dynamic_article_df, on=["article_id", "week"], how="left")
del dynamic_article_df
gc.collect()

20

In [85]:
customer_features = pd.read_parquet('./data/fe/customer_features.parquet')[["customer_id", "gender", "dup_rate"]]
df = df.merge(customer_features, on=["customer_id"])
del customer_features
gc.collect()

20

In [2]:
customer_article_score = pd.read_parquet('./data/fe/customer_recommendation_fully.parquet').rename(columns={"recommendation_article_id": "article_id"})
df = df.merge(customer_article_score, on=["customer_id", "article_id"], how="left")
del customer_article_score
gc.collect()

NameError: name 'df' is not defined

In [3]:
customer_article_score

Unnamed: 0,customer_id,article_id,score
0,4245900472157,715624001,0.090816
1,4245900472157,715624010,0.057217
2,4245900472157,189616006,0.050851
3,4245900472157,547780001,0.046596
4,4245900472157,803757001,0.041995
...,...,...,...
21796491,18446737527580148316,800691008,0.120424
21796492,18446737527580148316,629420007,0.104970
21796493,18446737527580148316,803757004,0.100142
21796494,18446737527580148316,715624001,0.093450


In [87]:
df.score.fillna(0, inplace=True)
df["rank"].fillna(0, inplace=True)

In [88]:
df.isna().values.any()

False

In [89]:
gc.collect()

60

In [90]:
columns_to_use = [
       'week', 'top_strategy',
       'bestseller_rank', 
       'buy_times', 'duplicate_buy_strategy', 'week_gap',
       'recommendation_strategy', 
       'duplicate_hold_rate', 'price',
       'weekly_sell_counts', 'rank', 'gender', 'dup_rate', 'score'
]
target = "y"

In [91]:
n_folds = 1
last_week = 104
params = {
    "objective":"lambdarank",
    "metric":"ndcg",
    "boosting_type":"dart",
    "n_estimators":40,
    "importance_type":'gain',
    "verbose":10,
    "learning_rate":0.1,
    "num_leaves": 40,
}


for i in range(n_folds):
    spilt_week = last_week - (n_folds - i) + 1
    training_data = df[df.week < spilt_week]
    valid_data    = df[(df.week == spilt_week) & ~((df.top_strategy == 0) & (df.duplicate_buy_strategy == 0) & (df.recommendation_strategy == 0))]
    train_baskets = training_data.groupby(['week', 'customer_id'])['article_id'].count().values
    validation_baskets = valid_data.groupby(['week', 'customer_id'])['article_id'].count().values
    train_X, train_y = training_data[columns_to_use], training_data[target]
    val_X, val_y     = valid_data[columns_to_use], valid_data[target]
    ranker = LGBMRanker(
        **params,
    )
    ranker = ranker.fit(
        train_X,
        train_y,
        group=train_baskets,
        eval_set=[(val_X, val_y)],
        eval_group=[validation_baskets],
    )
    break

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.896237
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.464629
[LightGBM] [Debug] init for col-wise cost 0.245181 seconds, init for row-wise cost 1.036074 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 2030
[LightGBM] [Info] Number of data points in the train set: 23817856, number of used features: 14
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 8
[1]	valid_0's ndcg@1: 0.719531	valid_0's ndcg@2: 0.720836	valid_0's ndcg@3: 0.723031	valid_0's ndcg@4: 0.727182	valid_0's ndcg@5: 0.730065
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 7
[2]	valid_0's ndcg@1: 0.725546	valid_0's ndcg@2: 0.722643	valid_0's ndcg@3: 0.725618	valid_0's ndcg@4: 0.72958	valid_0's ndcg@5: 0.732482
[LightGBM] [Debug] Trained a tree with

[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 9
[38]	valid_0's ndcg@1: 0.727445	valid_0's ndcg@2: 0.725741	valid_0's ndcg@3: 0.7285	valid_0's ndcg@4: 0.732958	valid_0's ndcg@5: 0.7358
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 10
[39]	valid_0's ndcg@1: 0.727129	valid_0's ndcg@2: 0.726391	valid_0's ndcg@3: 0.728551	valid_0's ndcg@4: 0.733271	valid_0's ndcg@5: 0.736061
[LightGBM] [Debug] Trained a tree with leaves = 40 and depth = 10
[40]	valid_0's ndcg@1: 0.727129	valid_0's ndcg@2: 0.726191	valid_0's ndcg@3: 0.728729	valid_0's ndcg@4: 0.733313	valid_0's ndcg@5: 0.736207


In [92]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

recommendation_strategy 0.4345256053843216
bestseller_rank 0.18644609741145515
buy_times 0.176881165270071
top_strategy 0.09924206324997073
week_gap 0.04608174861917835
score 0.034814855179167464
weekly_sell_counts 0.012046679363147645
rank 0.0062038566789821385
duplicate_hold_rate 0.001291180222235345
week 0.0012354380432170722
price 0.0010113328982032338
dup_rate 0.0002014427495379371
gender 1.8534930512341454e-05
duplicate_buy_strategy 0.0


In [93]:
preds = ranker.predict(val_X)
valid_data["preds"] = preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_data["preds"] = preds


In [94]:
c_id2predicted_article_ids = valid_data[["customer_id", "article_id", "preds"]] \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

In [95]:
tmp = valid_data[["customer_id", "article_id", "y"]]
tmp = tmp[tmp.y == 1]

In [96]:
tmp = pd.read_parquet('./data/sample/transactions_train_sample_0.05.parquet')

In [97]:
test_week_purchases_by_cust = tmp[tmp.week == spilt_week].groupby('customer_id').article_id.apply(list).to_dict()

In [98]:
apks = []
cold = 0
for c_id, gt in test_week_purchases_by_cust.items():
    temp = []
    if c_id in c_id2predicted_article_ids:
        temp = c_id2predicted_article_ids[c_id]
    else:
        cold += 1
    apks.append(apk(gt, temp, 12, False))
np.mean(apks)

0.04789038823871275

In [71]:
apks = []
cold = 0
for c_id, gt in test_week_purchases_by_cust.items():
    temp = []
    if c_id in c_id2predicted_article_ids:
        temp = c_id2predicted_article_ids[c_id]
    else:
        cold += 1
    apks.append(apk(gt, temp, 12, False))
np.mean(apks)

0.0356058206237468

In [37]:
apks = []
cold = 0
for c_id, gt in test_week_purchases_by_cust.items():
    temp = []
    if c_id in c_id2predicted_article_ids:
        temp = c_id2predicted_article_ids[c_id]
    else:
        cold += 1
    apks.append(apk(gt, temp, 12, False))
np.mean(apks)

0.03398928476970805

In [28]:
len(c_id2predicted_article_ids)

3283