In [111]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from functools import partial
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score
from lightgbm import LGBMClassifier

from datasplit import DataSplit
from preprocess import DataPreprocessor
from candidate_model import CandidateModel
from metrics import precision_at_k, recall_at_k, ap_k, calc_mean_metric

## load & split

In [186]:
# Raw purchase log; every split below is carved out of this frame.
purchases = pd.read_csv('retail_train.csv')

# Split on the 'week_no' column. The [6, 3] argument is forwarded to DataSplit as-is
# (presumably the sizes, in weeks, of the trailing parts -- TODO confirm in datasplit.py).
splitter = DataSplit(purchases, 'week_no', [6, 3])

# part0 -> level-1 train, part1 -> level-1 validation, part2 -> level-2 validation
train_lv1, valid_lv1, valid_lv2 = (
    purchases[part].copy()
    for part in (splitter.part0, splitter.part1, splitter.part2)
)

# Level-2 training base: the whole level-1 validation frame (warm & cold users).
# Alternative kept for reference -- keep only users already present in train_lv1:
# train_users = train_lv1['user_id'].unique()
# warm_users = valid_lv1['user_id'].isin(train_users)
# train_lv2_base = valid_lv1.loc[warm_users, ['user_id', 'item_id']].copy()
train_lv2_base = valid_lv1.copy()

# Ground truth for level 1: the unique items each user bought in valid_lv1,
# as a two-column frame ['user_id', 'actual'].
true_lv1 = (
    valid_lv1.groupby('user_id')['item_id']
    .unique()
    .rename('actual')
    .reset_index()
)

In [187]:
# Product catalogue: lower-case the column names and expose the product key as 'item_id'.
# NOTE: `item_features` is reassigned in a later cell, which reads product.csv again.
item_features = (
    pd.read_csv('product.csv')
    .rename(columns=str.lower)
    .rename(columns={'product_id': 'item_id'})
)

# Household demographics are currently not loaded (disabled on purpose):
# user_features = pd.read_csv('hh_demographic.csv')
# user_features.columns = user_features.columns.str.lower()
# user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

## prepare data & fit candidate model

In [737]:
# Preprocessor settings. The optional knobs stay commented out: the custom weighting
# scheme was dropped at this point so as not to overload the model at an early stage.
top_config = {
    # 'fields': ['quantity', 'sales_value'],
    # 'beta': [1., 1.],
    'k': 5000,
    # 'scaler': StandardScaler
}
uim_config = {
    # 'aggfunc': 'sum',
    # 'weights': tfidf_weight
}
mix_feat_params = {'top_config': top_config, 'uim_config': uim_config}

# Fit the preprocessor on the level-1 train / validation frames.
pre = DataPreprocessor(train_lv1, valid_lv1, **mix_feat_params)
pre.fit()

In [773]:
# Number of candidates requested per user; also the k used for recall below.
n_candidates = 50

# Everything the candidate model receives is taken from the fitted preprocessor.
candidate_params = dict(
    train=pre.train_uim_sparse,
    weighted=pre.train_uim_weighted,
    top_items=pre.top_k_items,
    placeholder_id=pre.placeholder_id,
    idx_to_item=pre.idx_to_item,
    item_to_idx=pre.item_to_idx,
    user_to_idx=pre.user_to_idx,
)

cm = CandidateModel('TFIDF', **candidate_params)
cm.fit(K=1)

# Candidates for every user that appears in the level-1 ground truth,
# scored with recall@n_candidates against that ground truth.
cand_predicts = cm.predict(true_lv1['user_id'], N=n_candidates)
calc_mean_metric(recall_at_k, true_lv1['actual'], cand_predicts, k=n_candidates)

0.1654474764453391

## prepare data for lv2 model

In [774]:
# make pipeline

In [775]:
# Reshape the per-user candidates into long format: one (user_id, item_id) row per
# candidate. Rows are labelled positionally with true_lv1['user_id'], i.e. this assumes
# cand_predicts is in the same order as true_lv1 (it was predicted from that column).
candidates = (
    pd.DataFrame.from_dict(cand_predicts.to_dict(), orient='index')
    .set_index(true_lv1['user_id'])
    .stack()
    .reset_index(level=1, drop=True)
    .rename('item_id')
    .reset_index()
)

# Positive labels: every distinct (user, item) pair actually bought in train_lv2_base.
target_lv2 = (
    train_lv2_base[['user_id', 'item_id']]
    .drop_duplicates()
    .assign(target=1)
)

# Left-join the labels onto the candidates; candidates never bought get target 0.
data_lv2 = candidates.merge(target_lv2, on=['user_id', 'item_id'], how='left').fillna(0)

# Split into the (still feature-less) design frame and the label vector.
true_lv2 = data_lv2['target']
train_lv2_empty = data_lv2.drop(columns='target')

In [776]:
# Share of relevant items (target == 1) among the candidates.
# Each class is counted explicitly rather than by unpacking value_counts(): that result
# is ordered by frequency (so the two names would swap if positives became the majority)
# and holds a single entry when only one class is present.
is_relevant = data_lv2['target'].eq(1)
ones = int(is_relevant.sum())
zeros = int((~is_relevant).sum())
ones / (ones + zeros)

0.19384900416859657

In [777]:
# add some features
# ...

In [792]:
# Item attributes used as level-2 features (product.csv is read again here).
# keep_cols = ['item_id', 'manufacturer', 'sub_commodity_desc']
keep_cols = ['item_id', 'department', 'manufacturer', 'brand']
item_data = (
    pd.read_csv('product.csv')
    .rename(columns=str.lower)
    .rename(columns={'product_id': 'item_id'})
    [keep_cols]
)

# Start from the columns of pre.train_uim (assumed to surface as an 'item_id' column,
# since that is the merge key), left-join the attributes and index the result by item_id.
item_features = (
    pd.DataFrame(pre.train_uim.columns)
    .merge(item_data, on='item_id', how='left')
    .set_index('item_id')
)

# One-hot encode every remaining attribute column.
item_features = pd.get_dummies(item_features, columns=list(item_features.columns))
# del item_data

In [793]:
# Precomputed user features, read from the ../hw5 directory.
user_features = pd.read_csv('../hw5/user_features_corrected.csv')

# Attach user features first, then item features; anything left unmatched
# by either left join is filled with 0.
with_user_features = train_lv2_empty.merge(user_features, on='user_id', how='left').fillna(0)
train_lv2 = with_user_features.merge(item_features, on='item_id', how='left').fillna(0)

In [794]:
# Preview the first rows of the assembled level-2 feature frame.
train_lv2.head(n=5)

Unnamed: 0,user_id,item_id,hh_comp,hh_size,kids,single_female,single_male,age_19-24,age_25-34,age_35-44,...,manufacturer_5942.0,manufacturer_5972.0,manufacturer_6009.0,manufacturer_6032.0,manufacturer_6046.0,manufacturer_6047.0,manufacturer_6082.0,manufacturer_6331.0,brand_National,brand_Private
0,1,856942,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
1,1,1082185,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
2,1,995242,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
3,1,9527290,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
4,1,940947,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0


## fit lv2 model

In [795]:
# Level-2 model: LightGBM classifier with default settings over the candidate rows.
# NOTE(review): train_lv2 still contains the raw user_id / item_id columns, so they are
# passed to the model as ordinary features -- confirm this is intended.
model = LGBMClassifier()
model.fit(train_lv2, true_lv2)

# In-sample precision: the predictions are made on the same rows the model was fitted on.
train_predictions = model.predict(train_lv2)
precision_score(true_lv2, train_predictions)

0.7385534173855341

In [796]:
# Positive-class probability for every candidate row. These are in-sample scores:
# train_lv2 is the same frame the model was fitted on.
proba = pd.Series(model.predict_proba(train_lv2).T[1], name='proba')

# Attach the scores positionally to the (user_id, item_id) keys of the scored rows.
ranked_predicts = train_lv2[['user_id', 'item_id']].assign(proba=proba.to_numpy())

# Keep exactly one row per (user, item) pair. Joining the scores back onto `candidates`
# would pair every repeated candidate with each of its copies, letting one item occupy
# several of a user's top-k slots and leaving fewer than k distinct recommendations.
ranked_candidates = ranked_predicts.drop_duplicates(subset=['user_id', 'item_id'])

# collect recommends: the k highest-scored candidates per user
k = 10
sorted_candidates = (
    ranked_candidates
    .sort_values(by=['user_id', 'proba'], ascending=[True, False])
    .groupby('user_id')
    .head(k)
)
recommends = sorted_candidates.groupby('user_id')['item_id'].unique()

In [797]:
# Pair each user's ground truth with that same user's recommendations by user_id instead
# of by row position, and fail loudly if an evaluated user has no recommendations at all
# (a missing user would otherwise silently shift every following comparison).
# NOTE: true_lv1 is derived from valid_lv1, the same frame the level-2 targets were built
# from, so this precision@k is measured on the model's own training period;
# valid_lv2 is not used here.
recs_by_user = recommends.reindex(true_lv1['user_id'])
assert recs_by_user.notna().all(), 'some users in true_lv1 have no recommendations'
calc_mean_metric(precision_at_k, true_lv1['actual'], recs_by_user.reset_index(drop=True), k=k)

0.33376563223714684

In [798]:
#