In [None]:
import os
import sys
import gc

import random
from datetime import datetime
from tqdm import notebook
from pathlib import Path
from collections import defaultdict

import numpy as np
import pandas as pd

sys.path.append(str(Path().absolute().parent))

from src.map_at_k import mapk, apk
from src.negative_sampling import NegativeSampling
import lightgbm as lgb
# Installing lightgbm on macOS - https://www.geeksforgeeks.org/how-to-install-xgboost-and-lightgbm-on-macos/
# Fixing pip install problems on Mac M1 (this also fixes the lightgbm install) - https://stackoverflow.com/questions/68620927/installing-scipy-and-scikit-learn-on-apple-m1/70178471#70178471
# On Mac M1, pytorch has to be installed with conda - https://betterprogramming.pub/how-to-install-pytorch-on-apple-m1-series-512b3ad9bc6
# Early scipy versions (e.g., scipy 1.6.0) cannot be installed on Mac M1

In [None]:
"""Notes

* articles.csv, transactions_train.csv, customers.csv, sample_submission.csv are downloaded from Kaggle \
    https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/data
* user_features.parquet, item_features.parquet are created by the notebooks \
    hm_generate_advance_article_features.ipynb, hm_generate_advance_user_features.ipynb
"""

# Project root: the parent of the directory the notebook is started from.
base_pth = Path.cwd().parent
datasets_dir = base_pth / 'datasets'

# Raw competition files.
article_pth = datasets_dir / 'articles.csv'
transaction_pth = datasets_dir / 'transactions_train.csv'
customer_pth = datasets_dir / 'customers.csv'
submission_pth = datasets_dir / 'sample_submission.csv'

# Pre-computed feature tables.
adv_user_feature_pth = datasets_dir / 'user_features.parquet'
adv_item_feature_pth = datasets_dir / 'item_features.parquet'

# Destination for the generated submission file.
output_pth = base_pth / 'output_data' / 'submission.csv'

## Load Data 

In [None]:
def _prefix_zero(ids):
    """Cast the ids to str and prepend a single '0' character to each one."""
    return '0' + ids.astype(str)


# ---- user features ----
user_features = pd.read_parquet(adv_user_feature_pth)
# Replace the two string columns with int8 codes from pd.factorize
# (factorize encodes missing values as -1).
for _col in ('club_member_status', 'fashion_news_frequency'):
    user_features[_col] = pd.factorize(user_features[_col])[0].astype('int8')
user_features = user_features.reset_index()


# ---- article features ----
article_df = pd.read_csv(article_pth)
article_df['article_id'] = _prefix_zero(article_df['article_id'])


# ---- item features ----
item_features = pd.read_parquet(adv_item_feature_pth).reset_index()
item_features['article_id'] = _prefix_zero(item_features['article_id'])


# ---- customer features ----
customer_df = pd.read_csv(customer_pth)
# customer_df['age_bins'] = pd.cut(customer_df['age'], [-1, 19, 29, 39, 49, 69, 119])
# customer_df['age_bins'] = customer_df['age_bins'].astype(str)


# ---- transactions ----
transaction_df = pd.read_csv(transaction_pth)
transaction_df['t_dat'] = pd.to_datetime(transaction_df['t_dat'])
transaction_df['article_id'] = _prefix_zero(transaction_df['article_id'])
# Week index counted back from the latest transaction date:
# 104 is the most recent 7-day window, smaller values are further in the past.
days_before_latest = (transaction_df['t_dat'].max() - transaction_df['t_dat']).dt.days
transaction_df['week'] = 104 - days_before_latest // 7


# ---- submission template (only the customer ids are needed) ----
sub = pd.read_csv(
    submission_pth,
    usecols=['customer_id'],
    dtype={'customer_id': 'string'},
)

In [None]:
# Preprocessing: build the modelling table through the project's NegativeSampling
# helper, attaching the extra user and item feature tables.
# NOTE(review): `train_inteval` is the keyword as spelled by NegativeSampling;
# presumably the number of training weeks -- confirm in src/negative_sampling.
neg_sampling = NegativeSampling(transaction_df=transaction_df, train_inteval=10)

data = neg_sampling.create_data_with_neg_sample(
    extra_item_features=item_features,
    extra_user_features=user_features,
)

In [None]:
# Week held out for validation, as chosen by the negative-sampling helper.
valid_week = neg_sampling.valid_week

# Columns that are labels, identifiers or raw transaction fields rather than model features.
non_feature_cols = ['purchased', 't_dat', 'price', 'sales_channel_id', 'customer_id', 'article_id', 'week']

# The ranker's group sizes are later computed with
# train.groupby(['week', 'customer_id']), which yields sizes in sorted key order.
# Sorting the rows by the same keys here guarantees that each group size
# describes the matching consecutive run of rows in train_X / train_y.
train = data[data.week != valid_week].sort_values(['week', 'customer_id'])
valid = (
    data[data.week == valid_week]
    .drop_duplicates(['customer_id', 'article_id', 'sales_channel_id'])
    .copy()
)

train_X = train.drop(columns=non_feature_cols)
train_y = train['purchased']
valid_X = valid.drop(columns=non_feature_cols)

In [None]:
# Check the balance between positive and negative samples:
# non-null counts of every column, split by the `purchased` label.
(
    train
    .groupby(by='purchased')
    .count()
)

## Modeling

In [None]:
# training

# NOTE: LightGBM interprets `group` as the sizes of consecutive runs of rows,
# so the rows of train_X must be laid out one (week, customer_id) query after
# another. sort=False makes the sizes follow the actual row order instead of
# sorted key order, and the check below fails loudly if a query's rows are
# scattered rather than silently training on misaligned groups.
train_groups = train.groupby(['week', 'customer_id'], sort=False)
if not train_groups.ngroup().is_monotonic_increasing:
    raise ValueError(
        "Rows of `train` are not contiguous per (week, customer_id); "
        "sort `train` by ['week', 'customer_id'] before building train_X / train_y."
    )
# size() counts rows (not non-null values), so the sizes always sum to len(train_X).
train_baskets = train_groups.size().values

ranker = lgb.LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=100,
    importance_type='gain',
    verbose=10
)
ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

## Prediction & Measurement

In [None]:
# Prepare the recommended articles: score every validation candidate with the ranker.
valid['preds'] = ranker.predict(valid_X)

# customer_id -> candidate article ids, highest predicted score first.
c_id2predicted_article_ids = (
    valid
    .sort_values(['customer_id', 'preds'], ascending=False)
    .groupby('customer_id')['article_id'].apply(list).to_dict()
)

# Fallback recommendations: the bestseller rows of the latest week available.
bestsellers_last_week = \
    neg_sampling.bestsellers_previous_week[
        neg_sampling.bestsellers_previous_week.week == neg_sampling.bestsellers_previous_week.week.max()
    ]['article_id'].tolist()


# create submission (valid prediction)
preds = []
for c_id in sub.customer_id:
    candidates = c_id2predicted_article_ids.get(c_id, []) + bestsellers_last_week
    # The same article can show up more than once (e.g. through two sales
    # channels, or as both a model candidate and a bestseller). Repeats would
    # waste some of the 12 slots, so keep only the first (best-ranked)
    # occurrence of each article before truncating.
    unique_candidates = list(dict.fromkeys(candidates))
    preds.append(' '.join(str(article_id) for article_id in unique_candidates[:12]))

sub['prediction'] = preds

In [None]:
## measure map@12 at valid
# One row per customer holding the list of article ids found in the validation transactions.
valid_ground_true = (
    neg_sampling.valid_trans
    .groupby('customer_id', as_index=False)
    .agg(ground_true=('article_id', list))
)

# Keep only customers that have both a prediction and a ground truth (inner join).
measure_df = sub[['customer_id', 'prediction']].merge(
    valid_ground_true, how='inner', on='customer_id'
)
# Turn the space-separated prediction string back into a list of article ids.
measure_df['prediction'] = measure_df['prediction'].map(lambda joined: joined.split(' '))

mapk_value = mapk(measure_df, pred_col='prediction', ground_true_col='ground_true', k=12)

In [None]:
# Display the MAP@12 value stored in `mapk_value`.
mapk_value