In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pickle

In [None]:
# True when the running IPython shell is provided by Google Colab.
colab_session = 'google.colab' in str(get_ipython())

In [None]:
if colab_session:
    from google.colab import drive
    drive.mount('/content/drive')

    DATADIR = "/content/drive/My Drive/sberhack" # change if necessary
else:
    # Outside Colab nothing defined DATADIR, so every later cell that builds a
    # path from it failed with NameError. Default to the working directory.
    DATADIR = "." # change if necessary

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
colab_install_dependencies = False

In [None]:
# Optional environment setup: runs only when colab_install_dependencies is True.
# NOTE(review): none of the installs pin a version, so results may differ between runs.
if colab_install_dependencies:
    !pip install implicit
    !pip install -U lightgbm
    !pip install -U pyarrow

# implicit provides the BM25 nearest-neighbour recommender fitted further down.
import implicit
# lightgbm is imported later in the notebook, after it is reinstalled.
#import lightgbm as lgb

In [None]:
from pandas.api.types import CategoricalDtype
from scipy.sparse import csr_matrix

def load_i2i_data(name, feature_name):
    """Load ``{DATADIR}/{name}.csv`` and build a sparse item-by-user matrix.

    The CSV must contain ``user_id``, ``product_id`` and ``feature_name`` columns.

    Args:
        name: CSV file name without the ``.csv`` extension, relative to ``DATADIR``.
        feature_name: Column whose values fill the matrix cells.

    Returns:
        A scipy sparse matrix of shape ``(n_products, n_users)``. Row ``i`` /
        column ``j`` correspond to the i-th smallest ``product_id`` and the
        j-th smallest ``user_id`` found in the file.
    """
    df = pd.read_csv(f"{DATADIR}/{name}.csv")

    # Presumably a leftover index column from an earlier to_csv call; it carries no data.
    if "Unnamed: 0" in df.columns:
        df = df.drop(columns=['Unnamed: 0'])

    # factorize(sort=True) returns dense 0..n-1 codes that follow the sorted
    # unique ids, replacing the Python-level sort plus categorical cast.
    user_codes, users = pd.factorize(df['user_id'], sort=True)
    product_codes, products = pd.factorize(df['product_id'], sort=True)

    print("Data size:", len(df))
    print("Users:", len(users))
    print("Products:", len(products))

    # Pass the values as a NumPy array; .tolist() boxed every value into a
    # Python object first, which is costly for a multi-million-row column.
    data = df[feature_name].to_numpy()
    user_item_matrix = csr_matrix((data, (user_codes, product_codes)), shape=(len(users), len(products)))

    # Callers expect items on the rows and users on the columns.
    return user_item_matrix.T

In [None]:
def save_obj(obj, name):
    """Pickle ``obj`` to ``{DATADIR}/{name}.pkl``, overwriting any existing file.

    Args:
        obj: Any picklable object.
        name: File name without the ``.pkl`` extension, relative to ``DATADIR``.
    """
    # The context manager closes (and so flushes) the handle even if pickling
    # raises; the bare open() used before never closed it.
    with open(f"{DATADIR}/{name}.pkl", "wb") as fh:
        pickle.dump(obj, fh)
  
def load_obj(name):
    """Load and return the object pickled at ``{DATADIR}/{name}.pkl``.

    Args:
        name: File name without the ``.pkl`` extension, relative to ``DATADIR``.

    Returns:
        The unpickled object.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # this project produced itself.
    with open(f"{DATADIR}/{name}.pkl", "rb") as fh:
        return pickle.load(fh)

In [None]:
# instantiate and fit the model
# load_i2i_data returns the transposed user-item matrix, i.e. items on the rows.
item_user_matrix = load_i2i_data('quantities_norm', 'quantity_norm')

# NOTE(review): K=1 keeps a single neighbour per item - presumably so the model
# mostly re-scores products the user already bought; confirm this is intended.
i2i_model = implicit.nearest_neighbours.BM25Recommender(K=1, num_threads=4)
i2i_model.fit(item_user_matrix)

# save the model
save_obj(i2i_model, "i2i_model_bm25")

Data size: 46194236
Users: 657502
Products: 95002


HBox(children=(FloatProgress(value=0.0, max=95002.0), HTML(value='')))




In [None]:
# load mappings

mapping_users = load_obj("mapping_users") # encoded_id -> real_id
mapping_products = load_obj("mapping_products")

inverse_mapping_users = load_obj("inverse_mapping_users") # real_id -> encoded_id
inverse_mapping_products = load_obj("inverse_mapping_products")

In [None]:
submission = pd.read_csv(f"{DATADIR}/sample_submission.csv")
test_ids = submission['Id'].tolist()

In [None]:
from tqdm import tqdm

# One [user, product, rank, score] row per recommendation returned by the model.
test_list = []

for test_id in tqdm(test_ids):
    matrix_user_id = inverse_mapping_users[test_id]
    # Column of the item-user matrix for this user, transposed into a single row.
    user_row = item_user_matrix[:, matrix_user_id].T.astype('float32')
    recs = i2i_model.recommend(
        userid=0,
        user_items=user_row,
        N=50,
        filter_already_liked_items=False,
        recalculate_user=True,
    )

    # The first returned item gets rank 49, the next 48, and so on.
    test_list.extend(
        [test_id, mapping_products[rec[0]], 49-i, rec[1]]
        for i, rec in enumerate(recs)
    )

100%|██████████| 107068/107068 [00:47<00:00, 2230.67it/s]


In [None]:
new_df = pd.DataFrame(test_list)
new_df.head()

Unnamed: 0,0,1,2,3
0,51,14863,49,13.176471
1,51,63057,48,3.0
2,51,3562687,47,2.705882
3,51,9979,46,2.117647
4,51,2558,45,2.117647


In [None]:
new_df.columns = ['user_id', 'product_id', 'rec_rank', 'rec_score']
new_df.head()

Unnamed: 0,user_id,product_id,rec_rank,rec_score
0,51,14863,49,13.176471
1,51,63057,48,3.0
2,51,3562687,47,2.705882
3,51,9979,46,2.117647
4,51,2558,45,2.117647


In [None]:
new_df.head()

Unnamed: 0,user_id,product_id,rec_rank,rec_score
0,51,14863,49,13.176471
1,51,63057,48,3.0
2,51,3562687,47,2.705882
3,51,9979,46,2.117647
4,51,2558,45,2.117647


In [None]:
new_df.to_csv(f"{DATADIR}/roflan.csv", index=False)

In [None]:
new_df.groupby('user_id')['product_id'].nunique()

user_id
51         50
65         50
187         2
400        50
576        50
           ..
3142760    50
3142766    25
3142774    12
3142820    21
3142875    11
Name: product_id, Length: 107068, dtype: int64

In [None]:
# Re-read the normalised quantities; errors='ignore' makes the drop a no-op
# when the stray index column is absent.
df = pd.read_csv(f"{DATADIR}/quantities_norm.csv").drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
df.head()

Unnamed: 0,user_id,product_id,quantity_norm
0,51,0,0.529412
1,51,159,0.058824
2,51,397,0.117647
3,51,407,0.941176
4,51,456,1.176471


In [None]:
most_popular_items_df = df.groupby('product_id', as_index=False)['quantity_norm'].sum()
most_popular_items_df.sort_values(by=['quantity_norm'], ascending=False, inplace=True)
most_popular_items_df.head()

Unnamed: 0,product_id,quantity_norm
51839,5481337,338364.838957
0,0,311158.092267
195,709,101947.840394
365,1300,75669.164817
69,158,71976.451851


In [None]:
most_popular_items_df.reset_index(inplace=True)
most_popular_items_df.drop(columns=['index'], inplace=True)
most_popular_items_df.head()

Unnamed: 0,product_id,quantity_norm
0,5481337,338364.838957
1,0,311158.092267
2,709,101947.840394
3,1300,75669.164817
4,158,71976.451851


In [None]:
most_popular_items_list = most_popular_items_df.iloc[0:100]['product_id'].tolist()

In [None]:
def generate_submission_for_recommender_only(model, sparse_matrix, sub_num, n_recs=50):
    """Build and save a submission straight from a recommender's top-N output.

    Reads the notebook globals ``test_ids``, ``inverse_mapping_users``,
    ``mapping_products``, ``tqdm`` and ``DATADIR``.

    Args:
        model: Object whose ``recommend(userid, user_items, N,
            filter_already_liked_items)`` returns a sequence of records with
            the encoded item id at position 0.
        sparse_matrix: Sparse matrix in which column ``k`` belongs to the user
            with encoded id ``k``.
        sub_num: Suffix of the output file ``submission_{sub_num}.csv``.
        n_recs: Maximum number of products requested per user. Defaults to 50,
            the value that used to be hard-coded.

    Returns:
        DataFrame with columns ``Id`` (the test id) and ``Predicted`` (real
        product ids joined by single spaces, in the order the model returned them).
    """
    predictions = {}

    for test_id in tqdm(test_ids):
        matrix_user_id = inverse_mapping_users[test_id]
        # Single-user column, transposed into the 1 x n_items row passed to recommend().
        user_items = sparse_matrix[:, matrix_user_id].T.astype('float32')
        recs = model.recommend(userid=0, user_items=user_items, N=n_recs, filter_already_liked_items=False)

        # rec[0] is the encoded product id; translate it back to the real id.
        predictions[test_id] = " ".join(str(mapping_products[rec[0]]) for rec in recs)

    # pack into dataframe
    sub = pd.DataFrame({'Id': list(predictions.keys()), 'Predicted': list(predictions.values())})

    # save to csv
    sub.to_csv(f"{DATADIR}/submission_{sub_num}.csv", index=False)

    return sub

In [None]:
sub = generate_submission_for_recommender_only(i2i_model, item_user_matrix, 3)
sub.head()

100%|██████████| 107068/107068 [00:41<00:00, 2563.52it/s]


Unnamed: 0,Id,Predicted
0,51,14863 2558 63057 3562687 2545 2546 2150 84364 ...
1,65,18450 26062 18439 4141822 10019 5639 55049 138...
2,766,3043457 3041849 17228 50462 3069043 74609 9404...
3,1132,20499 2592 91008 66965 2629 2165 2146 6004065 ...
4,1578,19924 158 8545 285 96323 6003928 5629938 65526...


# LGB

In [None]:
dataset = pd.read_parquet(f"{DATADIR}/final_features.parquet", columns=['user_id', 'product_id', 'quantity', 'transaction_value', 'parent_category_id', 'target', 'user_order_num', 'mean_items_in_trans',\
                                                                        'mean_items_in_trans_log_denom', 'mean_items_in_trans_log', 'max', 'median', 'mean', 'max_normed', 'mean_normed', 'median_normed'])
dataset.head()

Unnamed: 0,user_id,product_id,quantity,transaction_value,parent_category_id,target,user_order_num,mean_items_in_trans,mean_items_in_trans_log_denom,mean_items_in_trans_log,max,median,mean,max_normed,mean_normed,median_normed
0,51,0,9,1055.630005,0,0,17,0.529412,2.3479,-0.634102,255,37.0,65.982821,1.294416,0.334938,0.187817
1,51,159,1,65.900002,20,0,17,0.058824,0.260878,-2.816356,255,4.0,23.091493,1.564417,0.141666,0.02454
2,51,397,2,778.0,35,0,17,0.117647,0.521755,-2.131602,255,0.0,4.822646,2.40566,0.045497,0.0
3,51,407,16,2274.049973,35,0,17,0.941176,4.174044,-0.059563,255,4.0,22.039294,14.166667,1.224405,0.222222
4,51,456,20,2162.779984,35,1,17,1.176471,5.217555,0.163369,255,4.0,22.569472,2.865169,0.25359,0.044944


In [None]:
dataset['user_id'] = dataset['user_id'].map(inverse_mapping_users)
dataset['product_id'] = dataset['product_id'].map(inverse_mapping_products)

In [None]:
dataset.head()

Unnamed: 0,user_id,product_id,quantity,transaction_value,parent_category_id,target,user_order_num,mean_items_in_trans,mean_items_in_trans_log_denom,mean_items_in_trans_log,max,median,mean,max_normed,mean_normed,median_normed
0,0,0,9,1055.630005,0,0,17,0.529412,2.3479,-0.634102,255,37.0,65.982821,1.294416,0.334938,0.187817
1,0,70,1,65.900002,20,0,17,0.058824,0.260878,-2.816356,255,4.0,23.091493,1.564417,0.141666,0.02454
2,0,132,2,778.0,35,0,17,0.117647,0.521755,-2.131602,255,0.0,4.822646,2.40566,0.045497,0.0
3,0,134,16,2274.049973,35,0,17,0.941176,4.174044,-0.059563,255,4.0,22.039294,14.166667,1.224405,0.222222
4,0,141,20,2162.779984,35,1,17,1.176471,5.217555,0.163369,255,4.0,22.569472,2.865169,0.25359,0.044944


In [None]:
def get_query_id(df):
    """Return a per-row query id derived from ``df['user_id']``.

    Users are numbered 0, 1, 2, ... in order of first appearance, so all rows
    of one user share an id. The result is a Series aligned with ``df``'s index.
    """
    first_seen_rank = {user: rank for rank, user in enumerate(df['user_id'].unique())}
    return df['user_id'].map(first_seen_rank)

dataset['query_id'] = get_query_id(dataset)
dataset.head()

In [None]:
!nvidia-smi

Sat Oct 10 22:18:37 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.23.05    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    23W / 300W |      0MiB / 16130MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip uninstall lightgbm

Uninstalling lightgbm-3.0.0:
  Would remove:
    /usr/local/lib/python3.6/dist-packages/lightgbm-3.0.0.dist-info/*
Proceed (y/n)? y
  Successfully uninstalled lightgbm-3.0.0


In [None]:
!pip install lightgbm --install-option=--gpu

  cmdoptions.check_install_build_global(options)
Collecting lightgbm
[?25l  Downloading https://files.pythonhosted.org/packages/0f/fe/3032d68f7bf3e3b65aafecc628cde020759eb23bde406855860f664cdf7d/lightgbm-3.0.0.tar.gz (711kB)
[K     |████████████████████████████████| 716kB 8.2MB/s 
Skipping wheel build for lightgbm, due to binaries being disabled for it.
Installing collected packages: lightgbm
    Running setup.py install for lightgbm ... [?25l[?25hdone
Successfully installed lightgbm-3.0.0


In [None]:
import lightgbm as lgb

In [None]:
dataset.reset_index(level=0, inplace=True)
dataset.head()

Unnamed: 0,user_id,product_id,quantity,transaction_value,parent_category_id,target,user_order_num,mean_items_in_trans,mean_items_in_trans_log_denom,mean_items_in_trans_log,max,median,mean,max_normed,mean_normed,median_normed
0,0,0,9,1055.630005,0,0,17,0.529412,2.3479,-0.634102,255,37.0,65.982821,1.294416,0.334938,0.187817
1,0,70,1,65.900002,20,0,17,0.058824,0.260878,-2.816356,255,4.0,23.091493,1.564417,0.141666,0.02454
2,0,132,2,778.0,35,0,17,0.117647,0.521755,-2.131602,255,0.0,4.822646,2.40566,0.045497,0.0
3,0,134,16,2274.049973,35,0,17,0.941176,4.174044,-0.059563,255,4.0,22.039294,14.166667,1.224405,0.222222
4,0,141,20,2162.779984,35,1,17,1.176471,5.217555,0.163369,255,4.0,22.569472,2.865169,0.25359,0.044944


In [None]:
dropped_columns = dataset[['user_id', 'target', 'query_id']].copy()
dropped_columns.head()

Unnamed: 0,user_id,target,query_id
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,1,0


In [None]:
dataset.drop(columns=['user_id', 'target', 'query_id'], inplace=True)

In [None]:
# Features come from `dataset`, whose user_id/target/query_id columns were moved
# into `dropped_columns`. The `new_df` passed here before is the per-user
# recommendation table and does not line up with these labels.
# group = number of rows per query, listed in ascending query_id order.
train_set = lgb.Dataset(
    dataset,
    label=dropped_columns['target'],
    group=dropped_columns['query_id'].value_counts().sort_index().values,
)

In [None]:
# LightGBM training configuration passed to lgb.train.
params = dict(
    objective='binary',
    max_depth=12,
    random_state=42,
    device_type='gpu',
    learning_rate=0.1,
    metric=['binary', 'map'],
    eval_at=50,
    max_bin=63,
    first_metric_only=True,
)

In [None]:
# NOTE(review): the only validation set is the training set itself, so the
# early-stopping criterion is evaluated on training data - confirm this is intended.
gbm = lgb.train(params, train_set, valid_sets=[train_set], num_boost_round=300, verbose_eval=10, early_stopping_rounds=50)

In [None]:
gbm.save_model(f'{DATADIR}/model.txt', num_iteration=gbm.best_iteration)

<lightgbm.basic.Booster at 0x7fdeb747c588>

In [None]:
gbm = lgb.Booster(model_file=f'{DATADIR}/model.txt')

In [None]:
submission = pd.read_csv(f"{DATADIR}/sample_submission.csv")
test_ids = submission['Id'].tolist()

In [None]:
from tqdm import tqdm

In [None]:
dataset.set_index('user_id', inplace=True)
dataset.head()

Unnamed: 0_level_0,product_id,quantity,transaction_value,parent_category_id,target,user_order_num,mean_items_in_trans,mean_items_in_trans_log_denom,mean_items_in_trans_log,max,median,mean,max_normed,mean_normed,median_normed
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,0,9,1055.630005,0,0,17,0.529412,2.3479,-0.634102,255,37.0,65.982821,1.294416,0.334938,0.187817
0,70,1,65.900002,20,0,17,0.058824,0.260878,-2.816356,255,4.0,23.091493,1.564417,0.141666,0.02454
0,132,2,778.0,35,0,17,0.117647,0.521755,-2.131602,255,0.0,4.822646,2.40566,0.045497,0.0
0,134,16,2274.049973,35,0,17,0.941176,4.174044,-0.059563,255,4.0,22.039294,14.166667,1.224405,0.222222
0,141,20,2162.779984,35,1,17,1.176471,5.217555,0.163369,255,4.0,22.569472,2.865169,0.25359,0.044944


In [None]:
test_dict = {}

for test_id in test_ids:
    user_id = inverse_mapping_users[test_id]
    # Selecting with a list always returns a DataFrame, including for users
    # with a single row (a scalar label returns a Series in that case).
    data = dataset.loc[[user_id]].drop(columns=['target'])
    y_pred_probas = gbm.predict(data)

    # argsort yields row positions inside this user's slice. Look up the encoded
    # product ids stored in those rows before decoding; decoding the positions
    # themselves (as was done before) returns unrelated products.
    top_rows = np.argsort(-y_pred_probas, axis=0)[:50]
    top_product_codes = data['product_id'].to_numpy()[top_rows].tolist()

    test_dict[test_id] = " ".join(str(mapping_products[code]) for code in top_product_codes)

# pack into dataframe
sub = pd.DataFrame.from_dict(test_dict, orient='index')
sub.index.name = 'Id'
sub.rename(columns={0 : 'Predicted'}, inplace=True)
sub.reset_index(level=0, inplace=True)



In [None]:
sub.to_csv(f"{DATADIR}/submission_4.csv", index=False)

# PARQUET

In [None]:
full_data = pd.read_parquet(f"{DATADIR}/all_data.parquet")
full_data.drop(columns=['product_name', 'brand_name'], inplace=True)
full_data.head()

Unnamed: 0,user_id,order_id,line_item_id,price,quantity,discount,product_id,master_category_id,parent_category_id
0,525,5354800,28381452,116.0,4,0.0,37548,101.0,99
1,525,5354800,29242911,56.990002,2,13.08,5636,596.0,51
2,525,5354800,29242919,64.989998,2,40.02,22107,596.0,51
3,525,5354800,29243785,44.990002,6,0.0,2530,77.0,74
4,525,5354800,29244246,65.900002,6,30.99,3818486,76.0,74


In [None]:
full_data['target'] = 0
full_data.head()

Unnamed: 0,user_id,order_id,line_item_id,price,quantity,discount,product_id,master_category_id,parent_category_id,target
0,525,5354800,28381452,116.0,4,0.0,37548,101.0,99,0
1,525,5354800,29242911,56.990002,2,13.08,5636,596.0,51,0
2,525,5354800,29242919,64.989998,2,40.02,22107,596.0,51,0
3,525,5354800,29243785,44.990002,6,0.0,2530,77.0,74,0
4,525,5354800,29244246,65.900002,6,30.99,3818486,76.0,74,0


In [None]:
last_orders = full_data.groupby('user_id', as_index=False)['order_id'].max()['order_id'].tolist()
last_orders

In [None]:
last_order_ids = full_data.index[full_data['order_id'].isin(last_orders)].tolist()
last_order_ids

In [None]:
full_data.loc[last_order_ids, 'target'] = 1
full_data.head()

Unnamed: 0,user_id,order_id,line_item_id,price,quantity,discount,product_id,master_category_id,parent_category_id,target
0,525,5354800,28381452,116.0,4,0.0,37548,101.0,99,0
1,525,5354800,29242911,56.990002,2,13.08,5636,596.0,51,0
2,525,5354800,29242919,64.989998,2,40.02,22107,596.0,51,0
3,525,5354800,29243785,44.990002,6,0.0,2530,77.0,74,0
4,525,5354800,29244246,65.900002,6,30.99,3818486,76.0,74,0


In [None]:
full_data.drop(columns=['line_item_id'], inplace=True)
full_data.head()

Unnamed: 0,user_id,order_id,price,quantity,discount,product_id,master_category_id,parent_category_id,target
0,525,5354800,116.0,4,0.0,37548,101.0,99,0
1,525,5354800,56.990002,2,13.08,5636,596.0,51,0
2,525,5354800,64.989998,2,40.02,22107,596.0,51,0
3,525,5354800,44.990002,6,0.0,2530,77.0,74,0
4,525,5354800,65.900002,6,30.99,3818486,76.0,74,0


In [None]:
full_data['transaction_value'] = (full_data['price'] + full_data['discount']) * full_data['quantity']
full_data.head()

Unnamed: 0,user_id,order_id,price,quantity,discount,product_id,master_category_id,parent_category_id,target,transaction_value
0,525,5354800,116.0,4,0.0,37548,101.0,99,0,464.0
1,525,5354800,56.990002,2,13.08,5636,596.0,51,0,140.140003
2,525,5354800,64.989998,2,40.02,22107,596.0,51,0,210.019997
3,525,5354800,44.990002,6,0.0,2530,77.0,74,0,269.94001
4,525,5354800,65.900002,6,30.99,3818486,76.0,74,0,581.340008


In [None]:
# `full_data` is intentionally kept alive here: the cells that follow
# (simple_set, targets, validation_set, quantities_df) all read from it, and
# deleting it at this point makes them fail with NameError on a top-to-bottom run.

In [None]:
# Plain function names (not one-element lists) keep the result columns flat.
# List-valued specs produce two-level labels such as ('quantity', 'sum'), which
# the later per-column fillna/astype cells are not written for.
simple_set = full_data.groupby(['user_id', 'product_id'], as_index=False).agg({
    'quantity': 'sum',
    'transaction_value': 'sum',
    'master_category_id': 'mean',
    'parent_category_id': 'mean',
    'target': 'sum',
})
simple_set.head()

Unnamed: 0_level_0,user_id,product_id,quantity,transaction_value,master_category_id,parent_category_id,target
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum,sum,mean,mean,sum
0,51,0,9,1055.630005,,0,0
1,51,159,1,65.900002,26.0,20,0
2,51,397,2,778.0,38.0,35,0
3,51,407,16,2274.049973,38.0,35,0
4,51,456,20,2162.779984,39.0,35,1


In [None]:
# Assign the filled column back: fillna(inplace=True) on the result of
# simple_set[...] acts on an intermediate object and is not guaranteed to
# update simple_set itself.
simple_set['master_category_id'] = simple_set['master_category_id'].fillna(value=0)
simple_set.head()

Unnamed: 0,user_id,product_id,quantity,transaction_value,master_category_id,parent_category_id,target
0,51,0,9,1055.630005,0.0,0,0
1,51,159,1,65.900002,26.0,20,0
2,51,397,2,778.0,38.0,35,0
3,51,407,16,2274.049973,38.0,35,0
4,51,456,20,2162.779984,39.0,35,1


In [None]:
simple_set['master_category_id'] = simple_set['master_category_id'].astype('int32')
simple_set.head()

Unnamed: 0,user_id,product_id,quantity,transaction_value,master_category_id,parent_category_id,target
0,51,0,9,1055.630005,0,0,0
1,51,159,1,65.900002,26,20,0
2,51,397,2,778.0,38,35,0
3,51,407,16,2274.049973,38,35,0
4,51,456,20,2162.779984,39,35,1


In [None]:
simple_set['target'] = simple_set['target'].astype('bool').astype('int')

1

In [None]:
simple_set.to_parquet(f"{DATADIR}/simple_dataset.parquet")

In [None]:
targets = full_data[['user_id', 'product_id']].copy()
targets.head()

Unnamed: 0,user_id,product_id
0,525,37548
1,525,5636
2,525,22107
3,525,2530
4,525,3818486


In [None]:
targets.drop_duplicates(inplace=True)
targets.head()

Unnamed: 0,user_id,product_id
0,525,37548
1,525,5636
2,525,22107
3,525,2530
4,525,3818486


In [None]:
len(targets)

46194236

In [None]:
targets.to_csv(f"{DATADIR}/targets.csv", index=False)

In [None]:
validation_set = full_data[full_data['order_id'].isin(last_orders)]
validation_set.head()

Unnamed: 0,user_id,order_id,line_item_id,price,quantity,discount,product_id,master_category_id,parent_category_id
362,3241,5457903,28180918,261.0,2,0.0,36586,593.0,42
363,3241,5457903,28590941,85.0,1,0.0,69372,34.0,20
364,3241,5457903,28591096,409.0,1,0.0,4141785,116.0,112
365,3241,5457903,28591333,194.080002,2,0.0,5428,115.0,112
366,3241,5457903,28592887,60.759998,1,17.23,21446,580.0,9


# QUANTITY_NORM

In [None]:
# Total quantity bought per (user, product) pair, as ordinary columns.
pair_quantity = full_data.groupby(['user_id', 'product_id'])['quantity'].sum().reset_index()
# Number of distinct orders placed by each user.
orders_per_user = full_data.groupby(['user_id'])['order_id'].nunique()

# quantity_norm = total quantity of the pair divided by the user's order count.
quantities_df = pair_quantity[['user_id', 'product_id']].copy()
quantities_df['quantity_norm'] = pair_quantity['quantity'] / pair_quantity['user_id'].map(orders_per_user)
quantities_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,quantity
user_id,product_id,Unnamed: 2_level_1
51,0,9
51,159,1
51,397,2
51,407,16
51,456,20


In [None]:
quantities_df.to_csv(f"{DATADIR}/quantities_norm.csv", index=False)