In [679]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from functools import partial
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from collections import Counter
from datasplit import DataSplit
from preprocess import DataPreprocessor
from candidate_model import CandidateModel
from metrics import precision_at_k, recall_at_k, ap_k, calc_mean_metric

## prepare data & fit candidate model

In [680]:
purchases = pd.read_csv('retail_train.csv')

# train/valid split
splitter = DataSplit(purchases, 'week_no', [6, 4])

train_lv1 = purchases[splitter.part0].copy()
valid_lv1 = purchases[splitter.part1].copy()
valid_lv2 = purchases[splitter.part2].copy()

# prepare lv1 validation true values
true_train_lv1 = train_lv1.groupby('user_id')['item_id'].unique().reset_index()
true_train_lv1.columns=['user_id', 'actual']
true_valid_lv1 = valid_lv1.groupby('user_id')['item_id'].unique().reset_index()
true_valid_lv1.columns=['user_id', 'actual']

In [681]:
mix_feat_params = {
    'top_config': {'fields': ['quantity', 'sales_value'],
                   'beta': [1., 1.],
                   'k': 5000,
                   'scaler': StandardScaler
                    },
    'uim_config': {'aggfunc': 'sum',
                   # 'weights': tfidf_weight
                   },
}

pre = DataPreprocessor(train_lv1, valid_lv1, **mix_feat_params)
pre.fit()

In [682]:
candidate_params = {
    'train': pre.train_uim_sparse,
    'weighted': pre.train_uim_weighted,
    'top_items': pre.top_k_items,
    'placeholder_id': pre.placeholder_id,
    'idx_to_item': pre.idx_to_item,
    'item_to_idx': pre.item_to_idx,
    'user_to_idx': pre.user_to_idx
}
n_candidates = 70
cm = CandidateModel('BM25', **candidate_params)
cm.fit(K=1)
train_candidates = cm.predict(true_train_lv1['user_id'], N=n_candidates)
valid_candidates = cm.predict(true_valid_lv1['user_id'], N=n_candidates)

recall_lv1_train = calc_mean_metric(recall_at_k, true_train_lv1['actual'], train_candidates, k=n_candidates)
recall_lv1_valid = calc_mean_metric(recall_at_k, true_valid_lv1['actual'], valid_candidates, k=n_candidates)
recall_lv1_train, recall_lv1_valid

(0.24309811427640554, 0.18430627660815352)

In [683]:
# unstack candidates
candidates = pd.DataFrame.from_dict(valid_candidates.to_dict(), orient='index').set_index(true_valid_lv1['user_id'])
candidates = candidates.stack().reset_index(level=1, drop=True).rename('item_id').reset_index()
candidates.shape

(150780, 2)

## prepare data for lv2 model

In [684]:
# leave only warm users in dataset
train_users = train_lv1['user_id'].unique()
warm_users = valid_lv1['user_id'].isin(train_users)
train_target_lv2 = valid_lv1.loc[warm_users, ['user_id', 'item_id']].copy()
# train_target_lv2 = valid_lv1.copy()       # or both warm & cold

# markup candidates and merge with real items
train_target_lv2['target'] = 1      # flag means this item was really bought
train_data_lv2 = candidates.merge(train_target_lv2, on=['user_id', 'item_id'], how='left').fillna(0)
train_lv2_empty = train_data_lv2.drop(columns='target')
train_lv2_empty.shape

(164810, 2)

In [685]:
# the same for validation data
valid_target_lv2 = valid_lv2[['user_id', 'item_id']].copy()
valid_target_lv2['target'] = 1
valid_data_lv2 = candidates.merge(valid_target_lv2, on=['user_id', 'item_id'], how='left').fillna(0)
valid_lv2_empty = valid_data_lv2.drop(columns='target')
valid_lv2_empty.shape

(157666, 2)

In [686]:
# доля релевантных товаров среди кандидатов
zeros, ones = train_data_lv2['target'].value_counts()
ones / (ones + zeros)

0.21866998361749893

### feature engineering

In [712]:
# baseline item features
cat_items = []
item_data = pd.read_csv('product.csv')
item_data.columns = item_data.columns.str.lower()
item_data.rename(columns={'product_id': 'item_id'}, inplace=True)
# item_data.drop(columns=['curr_size_of_product'], inplace=True)
keep_cols = ['item_id', 'department',]
item_data = item_data[keep_cols]

# baseline user features
user_data = pd.read_csv('../hw5/user_features_corrected.csv')

# merge dummies
train_lv2 = train_lv2_empty.copy()
train_lv2 = train_lv2.merge(pd.get_dummies(item_data), on='item_id', how='left').fillna(0)
train_lv2 = train_lv2.merge(user_data, on='user_id', how='left').fillna(0)

valid_lv2 = valid_lv2_empty.copy()
valid_lv2 = valid_lv2.merge(pd.get_dummies(item_data), on='item_id', how='left').fillna(0)
valid_lv2 = valid_lv2.merge(user_data, on='user_id', how='left').fillna(0)

In [692]:
# add some features
# ...
# make pipeline for valid lv2 preparation

In [693]:
# # load items data
# item_data = pd.read_csv('product.csv')
# item_data.columns = item_data.columns.str.lower()
# item_data.rename(columns={'product_id': 'item_id'}, inplace=True)

In [694]:
# # day rate: % дней в которые товар был продан
# day_rate = (purchases.groupby('item_id')['day'].nunique() / purchases['day'].max()).rename('day_rate')
# item_data = item_data.merge(day_rate, on='item_id', how='left').fillna(0)

In [695]:
# # basket rate: % уникальных чеков в которых присутствовал товар
# basket_rate = (purchases.groupby('item_id')['basket_id'].nunique() / purchases['basket_id'].nunique()).rename('basket_rate')
# item_data = item_data.merge(basket_rate, on='item_id', how='left').fillna(0)

In [696]:
# # load users data
# user_data = pd.read_csv('hh_demographic.csv')
# user_data.columns = user_data.columns.str.lower()
# user_data.rename(columns={'household_key': 'user_id'}, inplace=True)

In [697]:
# # featuring train dataset
# train_lv2 = train_lv2_empty.copy()

# train_lv2 = train_lv2.merge(day_rate, on='item_id', how='left').fillna(0)
# train_lv2 = train_lv2.merge(basket_rate, on='item_id', how='left').fillna(0)
# train_lv2 = train_lv2.merge(item_data, on='user_id', how='left').fillna(0)

# train_lv2 = train_lv2.merge(user_data, on='user_id', how='left').fillna(0)

# categorical_feats = []
# train_lv2[categorical_feats] = train_lv2[categorical_feats].astype('category')
# train_lv2.shape

In [714]:
train_lv2.head(2)

Unnamed: 0,user_id,item_id,department_,department_AUTOMOTIVE,department_CHARITABLE CONT,department_CHEF SHOPPE,department_CNTRL/STORE SUP,department_COSMETICS,department_COUP/STR & MFG,department_DAIRY DELI,...,income_15-24K,income_150-174K,income_175-199K,income_200-249K,income_25-34K,income_250K+,income_35-49K,income_50-74K,income_75-99K,income_Under 15K
0,1,1082185,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1,1082185,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [715]:
# prepare lv2 validation true values
true_train_lv2 = train_lv2.groupby('user_id')['item_id'].unique().reset_index()
true_train_lv2.columns=['user_id', 'actual']
true_valid_lv2 = valid_lv2.groupby('user_id')['item_id'].unique().reset_index()
true_valid_lv2.columns=['user_id', 'actual']
true_train_lv2.shape, true_valid_lv2.shape

((2154, 2), (2154, 2))

## fit lv2 LGBM

In [716]:
# model = LGBMClassifier(objective='binary', max_depth=6, categorical_column=categorical_feats)
# model = LGBMClassifier(max_depth=5, learning_rate=0.01, categorical_column=cat_items)
model = LGBMClassifier(max_depth=5, learning_rate=0.01,)
model.fit(train_lv2, train_data_lv2['target'])

train_preds = model.predict(train_lv2)
# valid_preds = model.predict(valid_lv2)
lgb_pr_train = precision_score(train_data_lv2['target'], train_preds)
# lgb_pr_valid = precision_score(data_lv2['target'], valid_preds)

Counter(train_preds), # Counter(valid_preds)

(Counter({0.0: 163775, 1.0: 1035}),)

In [717]:
# train proba
proba = pd.Series(model.predict_proba(train_lv2).T[1], name='proba')
ranked_predicts = pd.concat([train_lv2[['user_id', 'item_id']], proba], axis=1)
ranked_candidates = candidates.merge(ranked_predicts, on=['user_id', 'item_id'], how='left')

# collect recommends
k = 5
sorted_candidates = ranked_candidates.sort_values(by=['user_id', 'proba'], ascending=[True, False]).groupby('user_id').head(k)

# this is for train
recommends = sorted_candidates.groupby('user_id')['item_id'].unique()
precision_train_lv2 = calc_mean_metric(precision_at_k, true_train_lv2['actual'], recommends.reset_index(drop=True), k=k)

In [718]:
# valid proba
proba = pd.Series(model.predict_proba(valid_lv2).T[1], name='proba')
ranked_predicts = pd.concat([valid_lv2[['user_id', 'item_id']], proba], axis=1)
ranked_candidates = candidates.merge(ranked_predicts, on=['user_id', 'item_id'], how='left')

# collect recommends
k = 5
sorted_candidates = ranked_candidates.sort_values(by=['user_id', 'proba'], ascending=[True, False]).groupby('user_id').head(k)

# this is for train
recommends = sorted_candidates.groupby('user_id')['item_id'].unique()
precision_valid_lv2 = calc_mean_metric(precision_at_k, true_valid_lv2['actual'], recommends.reset_index(drop=True), k=k)

In [724]:
precision_train_lv2, precision_valid_lv2

(0.82330547818013, 0.8885793871866295)

## fit lv2 XGBoost

In [721]:
# model = XGBClassifier(max_depth=4, subsample=0.5, eta=0.1)
# model.fit(train_lv2, data_lv2['target'])
#
# precision_score(data_lv2['target'], model.predict(train_lv2))

In [722]:
# # train proba
# proba = pd.Series(model.predict_proba(train_lv2).T[1], name='proba')
# ranked_predicts = pd.concat([train_lv2[['user_id', 'item_id']], proba], axis=1)
# ranked_candidates = candidates.merge(ranked_predicts, on=['user_id', 'item_id'], how='left')
#
# # collect recommends
# k = 5
# sorted_candidates = ranked_candidates.sort_values(by=['user_id', 'proba'], ascending=[True, False]).groupby('user_id').head(k)
#
# # this is for train
# recommends = sorted_candidates.groupby('user_id')['item_id'].unique()
# precision_train_lv2 = calc_mean_metric(precision_at_k, true_train_lv2['actual'], recommends.reset_index(drop=True), k=k)
# # precision_valid_lv2 = calc_mean_metric(precision_at_k, true_valid_lv2['actual'], recommends.reset_index(drop=True), k=k)
#
# precision_train_lv2,

In [723]:
#