In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from featuring import Merger, ColumnsCorrector, LastDaysRate, BasketRate, DepartmentSellRate, SameDepartmentPurchases, \
    MeanDepartmentExpenses

from collections import Counter
from datasplit import DataSplit
from preprocess import DataPreprocessor
from candidate_model import CandidateModel
from metrics import precision_at_k, recall_at_k, calc_mean_metric

## prepare data & fit candidate model

In [2]:
def prepare_true_values(df):
    true_values =  df.groupby('user_id')['item_id'].unique().reset_index()
    true_values.columns = ['user_id', 'actual']
    return true_values

In [3]:
purchases = pd.read_csv('retail_train.csv')

# train/valid split
splitter = DataSplit(purchases, 'week_no', [6, 4])

train_lv1 = purchases[splitter.part0].copy()
valid_lv1 = purchases[splitter.part1].copy()
valid_lv2 = purchases[splitter.part2].copy()

# prepare lv1 validation true values
true_train_lv1 = prepare_true_values(train_lv1)
true_valid_lv1 = prepare_true_values(valid_lv1)

In [4]:
mix_feat_params = {
    'top_config': {'fields': ['quantity', 'sales_value'],
                   'beta': [1., 1.],
                   'k': 5000,
                   'scaler': StandardScaler
                    },
    'uim_config': {'aggfunc': 'sum',
                   # 'weights': tfidf_weight
                   },
}

pre = DataPreprocessor(train_lv1, valid_lv1, **mix_feat_params)
pre.fit()

In [5]:
candidate_params = {
    'train': pre.train_uim_sparse,
    'weighted': pre.train_uim_weighted,
    'top_items': pre.top_k_items,
    'placeholder_id': pre.placeholder_id,
    'idx_to_item': pre.idx_to_item,
    'item_to_idx': pre.item_to_idx,
    'user_to_idx': pre.user_to_idx
}
n_candidates = 100
cm = CandidateModel('BM25', **candidate_params)
cm.fit(K=1)
train_candidates = cm.predict(true_train_lv1['user_id'], N=n_candidates)
valid_candidates = cm.predict(true_valid_lv1['user_id'], N=n_candidates)

recall_lv1_train = calc_mean_metric(recall_at_k, true_train_lv1['actual'], train_candidates, k=n_candidates)
recall_lv1_valid = calc_mean_metric(recall_at_k, true_valid_lv1['actual'], valid_candidates, k=n_candidates)
recall_lv1_train, recall_lv1_valid

(0.30147727453621564, 0.2123693837009425)

In [6]:
# stack candidates
candidates = pd.DataFrame.from_dict(valid_candidates.to_dict(), orient='index').set_index(true_valid_lv1['user_id'])
candidates = candidates.stack().reset_index(level=1, drop=True).rename('item_id').reset_index()
candidates.shape

(215400, 2)

In [7]:
# общая доля релевантных товаров в подборке кандидатов
relevant = valid_lv1[['user_id', 'item_id']].copy()
relevant['target'] = 1
zeros, ones = candidates.merge(relevant, on=['user_id', 'item_id'], how='left').fillna(0)['target'].value_counts()
ones / (zeros + ones)

0.18031961964914717

## prepare data for lv2 model

In [8]:
def merge_candidates(df, cand, warm_users=None):
    """ Prepare dataset lv2 for featuring
    :param df: required data to be prepared
    :param cand: dataset with stacked candidates
    :param warm_users: leave only given users (array-like)
    """
    if warm_users is not None:
        warm = df['user_id'].isin(warm_users)
        target = df[warm].copy()
    else:
        target = df.copy()
    target['target'] = 1      # flag means this item was really bought
    target = cand.merge(target, on=['user_id', 'item_id'], how='left').fillna(0)
    return target.drop(columns='target'), target['target']

In [9]:
# train lv2: prepare for featuring (markup & merge candidates)
train_lv2, train_lv2_target = merge_candidates(valid_lv1, candidates, warm_users=train_lv1['user_id'].unique())
# train_lv2_empty = merge_candidates(valid_lv1, candidates)     # both warn & cold

# valid lv2: prepare for featuring (markup & merge candidates)
valid_lv2, valid_lv2_target = merge_candidates(valid_lv2, candidates)
train_lv2.shape, valid_lv2.shape

((230524, 12), (222826, 12))

### feature engineering

In [10]:
# # baseline item features
# drop_columns = ['basket_id', 'day', 'quantity', 'sales_value', 'store_id', 'retail_disc', 'trans_time', 'week_no', 'coupon_disc', 'coupon_match_disc']
# item_data = pd.read_csv('product.csv')
# item_data.columns = item_data.columns.str.lower()
# item_data.rename(columns={'product_id': 'item_id'}, inplace=True)
# # item_data.drop(columns=['curr_size_of_product'], inplace=True)
# keep_cols = ['item_id', 'department',]
# item_data = item_data[keep_cols]
#
# # baseline user features
# user_data = pd.read_csv('../hw5/user_features_corrected.csv')
#
# # merge dummies
# featured_train_lv2 = train_lv2.copy()
# featured_train_lv2 = featured_train_lv2.merge(pd.get_dummies(item_data), on='item_id', how='left').fillna(0)
# featured_train_lv2 = featured_train_lv2.merge(user_data, on='user_id', how='left').fillna(0)
# featured_train_lv2.drop(columns=drop_columns, inplace=True)
#
# featured_valid_lv2 = valid_lv2.copy()
# featured_valid_lv2 = featured_valid_lv2.merge(pd.get_dummies(item_data), on='item_id', how='left').fillna(0)
# featured_valid_lv2 = featured_valid_lv2.merge(user_data, on='user_id', how='left').fillna(0)
# featured_valid_lv2.drop(columns=drop_columns, inplace=True)

In [11]:
# load items data
item_data = pd.read_csv('product.csv')
item_data.columns = item_data.columns.str.lower()
item_data.rename(columns={'product_id': 'item_id'}, inplace=True)

# load users data
user_data = pd.read_csv('hh_demographic.csv')
user_data.columns = user_data.columns.str.lower()
user_data.rename(columns={'household_key': 'user_id'}, inplace=True)

# load prepared user features
user_data_corrected = pd.read_csv('../hw5/user_features_corrected.csv')

In [12]:
drop_columns = ['basket_id', 'day', 'quantity', 'sales_value', 'store_id', 'retail_disc', 'trans_time', 'week_no',
                'coupon_disc', 'coupon_match_disc']
keep_user_cols = ['hh_comp', 'hh_size', 'kids', 'single_female', 'single_male']

featuring = Pipeline([('UserFeaturesMerger', Merger(user_data_corrected, on='user_id', cols=keep_user_cols)),
                      # ('LastDaysRate', LastDaysRate(n_days=25)),
                      # ('BasketRate', BasketRate(n_days=14)),
                      ('MeanDepartmentExpenses', MeanDepartmentExpenses(item_data, n_days=25)),
                      ('DepartmentSellRate', DepartmentSellRate(item_data, n_days=7)),
                      ('SameDepartmentPurchases', SameDepartmentPurchases(item_data, n_days=3)),
                      # ('', ),
                      # ('', ),
                      ('drop', ColumnsCorrector(drop_columns, mode='drop')),
                      ])

featuring.fit(train_lv2)
featured_train_lv2 = featuring.transform(train_lv2)
featured_valid_lv2 = featuring.transform(valid_lv2)

In [13]:
featured_train_lv2.head(2)

Unnamed: 0,user_id,item_id,hh_comp,hh_size,kids,single_female,single_male,mean_department_expenses_for_25_days,department_sell_rate_for_7_days,same_department_purchases_for_3_days
0,1,1082185,2.0,2.0,0.0,0.0,0.0,2.15,0.163369,0.0
1,1,1082185,2.0,2.0,0.0,0.0,0.0,2.15,0.163369,0.0


## fit lv2 LGBM

In [14]:
model = LGBMClassifier(max_depth=5, learning_rate=0.01, random_state=193)
model.fit(featured_train_lv2, train_lv2_target)

train_pred = model.predict(featured_train_lv2)
valid_pred = model.predict(featured_valid_lv2)
lgb_pr_train = precision_score(train_lv2_target, train_pred)
lgb_pr_valid = precision_score(valid_lv2_target, valid_pred)
lgb_rc_train = recall_score(train_lv2_target, train_pred)
lgb_rc_valid = recall_score(valid_lv2_target, valid_pred)

lgb_pr_train, lgb_pr_valid, lgb_rc_train, lgb_rc_valid
# Counter(train_preds), Counter(valid_preds)

(0.9649965682910089,
 0.9037698412698413,
 0.03382490918276517,
 0.03370828091467476)

In [15]:
def predict_recommends(data, *, k):
    """ Get k recommends based on given data """
    proba = pd.Series(model.predict_proba(data).T[1], name='proba')
    ranked_pred = pd.concat([data[['user_id', 'item_id']], proba], axis=1)
    ranked_cand = candidates.merge(ranked_pred, on=['user_id', 'item_id'], how='left')
    # collect recommends
    sorted_cand = ranked_cand.sort_values(by=['user_id', 'proba'], ascending=[True, False]).groupby('user_id').head(k)
    return sorted_cand.groupby('user_id')['item_id'].unique()

In [16]:
# prepare lv2 validation true values
true_train_lv2 = prepare_true_values(train_lv2)
true_valid_lv2 = prepare_true_values(valid_lv2)
true_train_lv2.shape, true_valid_lv2.shape

((2154, 2), (2154, 2))

In [17]:
# predictions and metrics
k = 5
train_rec = predict_recommends(featured_train_lv2, k=k)
valid_rec = predict_recommends(featured_valid_lv2, k=k)
precision_train_lv2 = calc_mean_metric(precision_at_k, true_train_lv2['actual'], train_rec.reset_index(drop=True), k=k)
precision_valid_lv2 = calc_mean_metric(precision_at_k, true_valid_lv2['actual'], valid_rec.reset_index(drop=True), k=k)
precision_train_lv2, precision_valid_lv2

(0.8326833797585886, 0.9064995357474466)

In [18]:
# 0.8326833797585886, 0.9064995357474466