In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from datetime import timedelta, datetime

from joblib import Parallel, delayed

from typing import List

In [2]:
train = pd.read_parquet("../data/working/transactions_train.parquet")
train["t_dat"] = pd.to_datetime(train["t_dat"])

In [3]:
train.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


## Split data

In [4]:
def split_data(data: pd.DataFrame, base_date: str, train_days: int = 7):
    eval_days = 7
    
    eval_end_date = datetime.strptime(base_date, '%Y-%m-%d')
    eval_start_date = eval_end_date - timedelta(days=7)
    
    train_end_date = eval_start_date
    train_start_date = train_end_date - timedelta(days=train_days)
    
    eval_data = data[(train["t_dat"] <= eval_end_date) & (data["t_dat"] > eval_start_date)]
    train_data = data[(train["t_dat"] <= train_end_date) & (data["t_dat"] > train_start_date)]
    
    return train_data, eval_data

In [5]:
train_data, eval_data = split_data(train, "2020-09-15", 7)

In [6]:
def create_target(data: pd.DataFrame):
    result = eval_data.groupby("customer_id")["article_id"].agg(list).to_frame().reset_index()
    return result

In [7]:
eval_target = create_target(eval_data)

## Make some rules

In [26]:
def most_popular_articles(data: pd.DataFrame) -> List:
    popular_articles = data["article_id"].value_counts().nlargest(12)
    popular_articles = popular_articles.index.tolist()
    popular_articles = ' '.join(['0' + str(r) for r in popular_articles])
    return popular_articles


def most_purchase_articles_each_customer(data: pd.DataFrame):
    def _func(c_id, gdf):
        res = most_popular_articles(gdf)
        return (c_id, res)
    result = Parallel(n_jobs=-1)(delayed(_func)(c_id, gdf) for c_id, gdf in tqdm(data.groupby("customer_id")))
    result = pd.DataFrame(columns=["customer_id", "top_purchased_articles"], data=result)
    return result

In [27]:
purchased_articles = most_purchase_articles_each_customer(train_data)

  0%|          | 0/75822 [00:00<?, ?it/s]

In [28]:
pop_result = most_popular_articles(train_data)

In [29]:
def ensemble(row: pd.DataFrame, weights: List):
    assert len(row) == len(weights)
    row = row.str.split(" ")
    res = {}
    for i, _r in enumerate(row):
        w = weights[i]
        for a_id in _r:
            if a_id in res.keys():
                res[a_id] += w * 1 / sum(weights)
            else:
                res[a_id] = w * 1 / sum(weights)
    res = list(dict(sorted(res.items(), key=lambda item: -item[1])).keys())
    return " ".join(res[:12])

In [30]:
result = purchased_articles[["customer_id"]].copy()

pred = pd.concat([purchased_articles["top_purchased_articles"], purchased_articles["top_purchased_articles"]], axis=1)
result["prediction"] = pred.apply(ensemble, weights=[1, 1], axis=1)

In [31]:
pred_data = eval_target[["customer_id"]].copy().reset_index(drop=True)
pred_data = pred_data.merge(result, how="left", on="customer_id").fillna("")

pred_data["prediction"] = pred_data["prediction"] + pop_result
pred_data["prediction"] = pred_data["prediction"].str.strip()
pred_data["prediction"] = pred_data["prediction"].str[:131]

In [32]:
pred_data.head()

Unnamed: 0,customer_id,prediction
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0915526001 0751471043 0751471001 0706016001 09...
1,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,0915526001 0751471043 0751471001 0706016001 09...
2,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,0572998013 0909869004 0923134003 0935858001 08...
3,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,0913367001 0904026001 09059140020915526001 075...
4,00040239317e877c77ac6e79df42eb2633ad38fcac09fc...,0915526001 0751471043 0751471001 0706016001 09...


## Evaluation

In [16]:
def average_precision(target, predict, k=12):
    len_target = min(len(target), k)

    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predict):
        if p in target and p not in predict[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len_target, k)


def mean_average_precision(targets, predicts, k=12):
    map_top_k = np.mean([average_precision(t, p) for t, p in zip(targets, predicts)])
    assert 0.0 <= map_top_k <= 1.0, "map_top_k must be 0.0 <= map_top_k <= 1.0"
    return map_top_k

In [17]:
target = eval_target["article_id"].tolist()

pred = pred_data["prediction"].str[1:]
pred = pred.str.split(" 0").tolist()
pred = [[int(_p) for _p in p] for p in pred]

mean_average_precision(target, pred)

0.012716760347330937

## Prediction

In [18]:
_, sub_data = split_data(train, "2020-09-22", 7)

In [19]:
purchased_articles = most_purchase_articles_each_customer(sub_data)

  0%|          | 0/68984 [00:00<?, ?it/s]

In [20]:
pop_result = most_popular_articles(sub_data)

In [21]:
resultlt = purchased_articles[["customer_id"]].copy()

pred = pd.concat([purchased_articles["top_purchased_articles"], purchased_articles["top_purchased_articles"]], axis=1)
result["prediction"] = pred.apply(ensemble, weights=[1, 1], axis=1)

In [22]:
result.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0624486001
1,000172a9c322560c849754ffbdfdb2180d408aa7176b94...,0827487003
2,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,0757926001 0788575004 0640021019
3,0001f8cef6b9702d54abf66fd89eb21014bf98567065a9...,0874110016
4,0002cca4cc68601e894ab62839428e5f0696417fe0f9e8...,0903762001 0879189005 0158340001 0867966009 09...


## Submission

In [23]:
sub = pd.read_csv("../data/raw/sample_submission.csv")

In [24]:
sub = sub[["customer_id"]].copy().reset_index(drop=True)
sub = sub.merge(result, how="left", on="customer_id").fillna("")

sub["prediction"] = sub["prediction"] + pop_result
sub["prediction"] = sub["prediction"].str.strip()
sub["prediction"] = sub["prediction"].str[:131]

In [25]:
sub.to_csv("../data/submit/00101.csv", index=False)
sub.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,06244860010924243001 0924243002 0918522001 092...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0924243001 0924243002 0918522001 0923758001 08...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0924243001 0924243002 0918522001 0923758001 08...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243001 0924243002 0918522001 0923758001 08...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0924243001 0924243002 0918522001 0923758001 08...
