# VALIDATION & PREDICTION - Search

This notebook produces recommendations based on similarity of search texts (queries) and the description of the item (embeddings for the items is generated in `EDA_dataprep.ipynb`). These recommendations were used for seq's that showed zero views.

## autoreload

In [1]:
%load_ext autoreload
%autoreload 2

## lang and w2v_mode

In [2]:
lang = 'pt'

In [3]:
w2v_mode = 'cbow'

## dir setup

In [4]:
from dotenv import load_dotenv, find_dotenv
import os

this_file_path = os.path.dirname(os.path.abspath("__file__"))
# if script (not notebook)...
# project_dir = os.path.join(os.path.dirname(__file__), os.pardir)

# project directory
project_dir = os.path.join(this_file_path, os.pardir)

# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)

True

In [5]:
raw_data_dir = os.path.join(project_dir, os.environ.get("RAW_DATA_DIR"))
processed_data_dir = os.path.join(project_dir, os.environ.get("PROCESSED_DATA_DIR"))
interim_data_dir = os.path.join(project_dir, os.environ.get("INTERIM_DATA_DIR"))
wordvecs_data_dir = os.path.join(project_dir, os.environ.get("WORDVECS_DATA_DIR"))
figures_dir = os.path.join(project_dir, os.environ.get("FIGURES_DIR"))
reports_dir = os.path.join(project_dir, os.environ.get("REPORTS_DIR"))
cv_dir = os.path.join(project_dir, os.environ.get("CV_DIR"))
models_dir = os.path.join(project_dir, os.environ.get("MODELS_DIR"))

In [6]:
TEST_OFFSET = int(os.environ.get("TEST_OFFSET"))
TOTAL_TEST_SEQS = int(os.environ.get("TOTAL_TEST_SEQS"))

## imports

In [7]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [8]:
import numpy as np
from datetime import datetime

In [9]:
import pandas as pd
pd.set_option('max_rows', 100)
pd.set_option('max_columns', None)
# pd.options.mode.chained_assignment = None  # default='warn'

In [10]:
from tqdm import tqdm
tqdm.pandas()

In [11]:
import tensorflow as tf

In [14]:
from sklearn.metrics import ndcg_score

## project imports

In [15]:
import sys
sys.path.insert(0, '..')

In [16]:
from src.features import *
from src.models import *

# validation

##### item_data

In [17]:
# item_data_fn = 'item_data.pkl'
# item_data_fp = os.path.join(processed_data_dir, item_data_fn)
# item_data = pd.read_pickle(item_data_fp)

#### item title embeddings
use only item_id and item_embs cols from item_data 

In [18]:
item_data_embs_only_fn = 'item_data_embs_only_embs_custom_' + w2v_mode + '.pkl'
item_data_embs_only_fp = os.path.join(processed_data_dir, item_data_embs_only_fn)
item_data = pd.read_pickle(item_data_embs_only_fp)

item_data = item_data[item_data.lang_category == lang].copy()
item_data.drop('lang_category', axis = 1, inplace = True)
item_data.set_index('item_id', inplace = True)

In [19]:
item_data = item_data.title_embs.progress_apply(pd.Series)

100%|██████████| 1723826/1723826 [05:12<00:00, 5509.46it/s] 


#### search embeddings
train_test with embeddings

In [20]:
train_test_embs_fn = 'train_test_embs_custom_' +  w2v_mode + '.pkl'
train_test_embs_fp = os.path.join(interim_data_dir, train_test_embs_fn)
train_test = pd.read_pickle(train_test_embs_fp)

# train_test['seq'] = train_test.seq - TEST_OFFSET
train_test = train_test[train_test.seq < TEST_OFFSET]  # validation

In [21]:
queries = train_test[(train_test.lang_seq == lang) & (train_test.event_type == 'search')]

#### selecting train seq's for validation

In [22]:
validation = 0.01
unique_seqs = queries.seq.unique()
seq_sample_size = int(len(unique_seqs) * validation)
seq_sel = np.random.choice(unique_seqs, size = seq_sample_size, replace = False)

##### selecting queries only in scope

In [23]:
queries = queries[queries.seq.isin(seq_sel)]

In [24]:
unique_seqs = queries.seq.unique()
num_batches = len(unique_seqs) // 100
seqs = np.array_split(unique_seqs, num_batches)
num_batches

29

In [25]:
seq_batch = seqs[0]
len(seq_batch)

102

In [26]:
item_data_tensor = tf.constant(item_data)

In [27]:
recs = dict()

for seq_batch in tqdm(seqs):
    query_embs = queries[queries.seq.isin(seq_batch)]
    query_embs = query_embs.groupby('seq').query_embs.apply(lambda x: x.mean())
    query_embs = query_embs.apply(pd.Series)
    
    # scores = item_data @ query_embs.T
    scores = tf.matmul(item_data_tensor, query_embs.T).numpy()
    scores = pd.DataFrame(scores, index = item_data.index, columns = query_embs.index)
    
    for s in scores.columns:
        recs[s] = list(scores[s].nlargest(10).items())

 52%|█████▏    | 15/29 [16:33<15:27, 66.22s/it]


KeyboardInterrupt: 

In [None]:
recs_df = pd.DataFrame(recs)

In [None]:
recs_df.shape

In [None]:
lang

In [None]:
recs_df = recs_df.applymap(lambda x: x[0]).T

In [None]:
recs_df

In [None]:
true_fn = 'true.pkl'
true_fp = os.path.join(processed_data_dir, true_fn)
true_df = pd.read_pickle(true_fp)

In [None]:
item_domain_fn = 'item_domain.pkl'
item_domain_fp = os.path.join(processed_data_dir, item_domain_fn)
item_domain = pd.read_pickle(item_domain_fp)

In [None]:
recs_df_scored = score_pred(recs_df, true_df, item_domain)

In [None]:
recs_df_scored.ndcg.mean()

In [None]:
recs_df_scored.sample()

# predictions

#### loading target seqs

In [49]:
ndcg_zero_pred_fn = 'ndcg_zero_pred.pkl'
ndcg_zero_pred_fp = os.path.join(interim_data_dir, ndcg_zero_pred_fn)
ndcg_zero_pred = pd.read_pickle(ndcg_zero_pred_fp)

In [50]:
ndcg_zero_pred

event_type,search,view,ndcg_zero_pred
seq,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,22,11,0.470137
1,8,16,0.354606
2,4,10,0.348669
3,1,0,0.715974
4,62,5,0.541159
...,...,...,...
177065,3,1,0.764295
177066,13,9,0.490572
177067,15,10,0.461504
177068,0,2,0.615265


In [14]:
# nine_decile = ndcg_zero_pred.ndcg_zero_pred.quantile(0.9)
# ninetyfive_percentile = ndcg_zero_pred.ndcg_zero_pred.quantile(0.95)

# print(nine_decile, ninetyfive_percentile)

# seq_sel = ndcg_zero_pred.query("ndcg_zero_pred > @nine_decile")

#### using view=0 criteria first

In [52]:
# seq_sel = ndcg_zero_pred.query("view == 0")
# seq_sel = seq_sel.index
# seq_sel.shape

(11370,)

#### using view in `[0,1,2,3]` criteria

In [60]:
seq_sel = ndcg_zero_pred.query("view < 4")
seq_sel = seq_sel.index
seq_sel.shape

(56602,)

##### item_data

In [53]:
# item_data_fn = 'item_data.pkl'
# item_data_fp = os.path.join(processed_data_dir, item_data_fn)
# item_data = pd.read_pickle(item_data_fp)
# item_data = item_data_desc[['item_id', 'title', 'domain_id']]

##### use only item_id and item_embs cols from item_data 

In [56]:
lang = 'pt'

In [57]:
item_data_embs_only_fn = 'item_data_embs_only.pkl'
item_data_embs_only_fp = os.path.join(processed_data_dir, item_data_embs_only_fn)
item_data = pd.read_pickle(item_data_embs_only_fp)

item_data = item_data[item_data.lang_domain == lang].copy()
item_data.drop('lang_domain', axis = 1, inplace = True)
item_data.set_index('item_id', inplace = True)

In [58]:
item_data = item_data.title_embs.progress_apply(pd.Series)

100%|██████████| 1723216/1723216 [05:07<00:00, 5606.30it/s] 


##### read train_test with embeddings

In [63]:
train_test_embs_fn = 'train_test_embs.pkl'
train_test_embs_fp = os.path.join(interim_data_dir, train_test_embs_fn)
train_test = pd.read_pickle(train_test_embs_fp)

train_test['seq'] = train_test.seq - TEST_OFFSET
train_test = train_test[train_test.seq >= 0]

In [64]:
queries = train_test[(train_test.lang_seq == lang) & (train_test.event_type == 'search')]

##### selecting queries only in scope

In [65]:
queries = queries[queries.seq.isin(seq_sel)]

In [66]:
unique_seqs = queries.seq.unique()
num_batches = len(unique_seqs) // 100
seqs = np.array_split(unique_seqs, num_batches)
num_batches

348

In [67]:
seq_batch = seqs[0]
len(seq_batch)

101

In [68]:
recs = dict()

for seq_batch in tqdm(seqs):
    query_embs = queries[queries.seq.isin(seq_batch)]
    query_embs = query_embs.groupby('seq').query_embs.apply(lambda x: x.mean())
    query_embs = query_embs.apply(pd.Series)
    
    scores = item_data @ query_embs.T
    
    for s in scores.columns:
        recs[s] = list(scores[s].nlargest(10).items())

  3%|▎         | 12/348 [09:49<4:35:04, 49.12s/it]


KeyboardInterrupt: 

In [35]:
recs_df = pd.DataFrame(recs)

In [36]:
recs_df.shape

(10, 11370)

In [37]:
lang

'pt'

In [38]:
search_recs_fn = lang + '_search_recs.pkl'
search_recs_fp = os.path.join(interim_data_dir, search_recs_fn)
recs_df.to_pickle(search_recs_fp)

In [43]:
recs_df

Unnamed: 0,442,1139,1329,2039,2086,2274,2457,2673,3009,3242,...,174383,174527,174957,175052,175242,175582,175644,176548,176618,176752
0,"(808165, 0.8779296875)","(1353412, 0.7939453125)","(603897, 0.93896484375)","(1337084, 0.912109375)","(503803, 0.9365234375)","(285890, 0.88525390625)","(18304, 0.83154296875)","(1290094, 1.0)","(655531, 0.95654296875)","(240640, 1.0)",...,"(329360, 0.67333984375)","(398243, 0.95166015625)","(11755, 0.9365234375)","(1013393, 0.84375)","(216976, 0.94384765625)","(866618, 0.94482421875)","(1377687, 0.93505859375)","(274039, 0.896484375)","(534772, 0.908203125)","(1553526, 1.0)"
1,"(897227, 0.8671875)","(555036, 0.783203125)","(1211874, 0.93408203125)","(538268, 0.912109375)","(137932, 0.931640625)","(1588816, 0.8828125)","(1493519, 0.8271484375)","(14761, 0.9150390625)","(1194003, 0.94140625)","(1042941, 0.96044921875)",...,"(1377488, 0.66796875)","(1925544, 0.951171875)","(1280707, 0.9306640625)","(735229, 0.84326171875)","(1400461, 0.94189453125)","(1109754, 0.94287109375)","(879544, 0.93359375)","(1361152, 0.89306640625)","(1575342, 0.90380859375)","(237740, 1.0)"
2,"(376345, 0.8564453125)","(1559830, 0.76806640625)","(1365389, 0.9306640625)","(1486750, 0.90576171875)","(754158, 0.9306640625)","(637173, 0.8779296875)","(121927, 0.81982421875)","(454590, 0.912109375)","(857895, 0.94140625)","(13616, 0.96044921875)",...,"(1703759, 0.66748046875)","(74631, 0.94921875)","(1944934, 0.927734375)","(584341, 0.84326171875)","(268182, 0.93408203125)","(447885, 0.9423828125)","(962803, 0.9326171875)","(1187360, 0.892578125)","(1056699, 0.90087890625)","(441391, 0.94580078125)"
3,"(1986747, 0.85546875)","(1446687, 0.759765625)","(1197885, 0.9296875)","(1184330, 0.904296875)","(1075502, 0.9287109375)","(651329, 0.8759765625)","(162025, 0.81884765625)","(1502213, 0.8984375)","(1907547, 0.9384765625)","(63481, 0.96044921875)",...,"(696313, 0.6650390625)","(1112436, 0.9453125)","(1308657, 0.92626953125)","(733210, 0.84228515625)","(256591, 0.9326171875)","(328007, 0.9423828125)","(1432504, 0.9326171875)","(851128, 0.89013671875)","(1514380, 0.89794921875)","(1743815, 0.9404296875)"
4,"(18380, 0.8544921875)","(1696599, 0.75341796875)","(1319751, 0.9296875)","(438114, 0.904296875)","(2018908, 0.92529296875)","(992302, 0.8759765625)","(1432690, 0.8173828125)","(2066659, 0.89794921875)","(1895562, 0.93798828125)","(1745078, 0.96044921875)",...,"(635636, 0.6630859375)","(344112, 0.94482421875)","(1591538, 0.92529296875)","(1877862, 0.841796875)","(1922505, 0.93212890625)","(421057, 0.9423828125)","(1594275, 0.931640625)","(315013, 0.88916015625)","(1077409, 0.89599609375)","(1535822, 0.939453125)"
5,"(2040458, 0.849609375)","(887274, 0.7529296875)","(2042574, 0.9296875)","(1059822, 0.90380859375)","(1356744, 0.9248046875)","(1976671, 0.8740234375)","(891824, 0.81689453125)","(64083, 0.8974609375)","(1676302, 0.93798828125)","(1643866, 0.96044921875)",...,"(692774, 0.662109375)","(1343591, 0.94189453125)","(738107, 0.92431640625)","(1237401, 0.84130859375)","(938911, 0.93212890625)","(626743, 0.94091796875)","(1462074, 0.93115234375)","(765244, 0.888671875)","(1427755, 0.89501953125)","(1153811, 0.9375)"
6,"(1364810, 0.8486328125)","(70120, 0.7529296875)","(1110908, 0.92822265625)","(150238, 0.90380859375)","(618607, 0.91845703125)","(1891662, 0.8740234375)","(1097105, 0.81591796875)","(1903004, 0.89697265625)","(304904, 0.9375)","(1510610, 0.96044921875)",...,"(994218, 0.6611328125)","(564467, 0.94140625)","(365595, 0.92431640625)","(209386, 0.83935546875)","(1913824, 0.93115234375)","(633925, 0.93994140625)","(2075749, 0.92919921875)","(1807257, 0.88818359375)","(872141, 0.89453125)","(37874, 0.9365234375)"
7,"(812912, 0.84814453125)","(411676, 0.7509765625)","(708512, 0.92626953125)","(446059, 0.90380859375)","(643397, 0.91796875)","(1484456, 0.8740234375)","(151298, 0.81591796875)","(448821, 0.8935546875)","(1451934, 0.93603515625)","(1020488, 0.94921875)",...,"(780409, 0.66015625)","(82670, 0.94091796875)","(1905509, 0.92431640625)","(1391307, 0.8388671875)","(268826, 0.9306640625)","(1618846, 0.939453125)","(1205394, 0.92919921875)","(133794, 0.8876953125)","(724165, 0.89453125)","(1386973, 0.9365234375)"
8,"(1122057, 0.84765625)","(2096506, 0.75)","(1775486, 0.92578125)","(1541250, 0.90283203125)","(1327659, 0.9169921875)","(313875, 0.87353515625)","(599929, 0.81591796875)","(203574, 0.89013671875)","(2025944, 0.93603515625)","(110903, 0.94921875)",...,"(1906386, 0.66015625)","(1505355, 0.9404296875)","(754959, 0.9228515625)","(764028, 0.83837890625)","(1962131, 0.9306640625)","(982661, 0.939453125)","(301064, 0.92919921875)","(1499488, 0.8876953125)","(1287672, 0.89404296875)","(911142, 0.93603515625)"
9,"(1793735, 0.845703125)","(1259538, 0.74951171875)","(228252, 0.92529296875)","(1232165, 0.9013671875)","(1619460, 0.9169921875)","(1764838, 0.87353515625)","(701585, 0.8154296875)","(366003, 0.8896484375)","(1298159, 0.9345703125)","(737239, 0.94921875)",...,"(1037362, 0.65966796875)","(289183, 0.93994140625)","(848954, 0.92236328125)","(499285, 0.83740234375)","(1549555, 0.9296875)","(703218, 0.93896484375)","(1107505, 0.9267578125)","(426694, 0.88623046875)","(1641755, 0.8935546875)","(385773, 0.93310546875)"


____
____

In [140]:
query_embs = queries[queries.seq.isin(seq_batch)]
query_embs = query_embs.groupby('seq').query_embs.apply(lambda x: x.mean())
query_embs = query_embs.progress_apply(pd.Series)

for len(seq_batch) = 1000, dot product below takes about 6 mins:
* CPU times: user 6min 11s, sys: 1.93 s, total: 6min 13s
* Wall time: 6min 14s
> **take only seqs that had low number of views (low top score from MF)** <br>
> or to restrict columns by selecting items from the viewed groups only

In [141]:
%%time
scores = item_data @ query_embs.T

CPU times: user 6min 11s, sys: 1.93 s, total: 6min 13s
Wall time: 6min 14s


In [146]:
recs = dict()
for s in tqdm(scores.columns):
    recs[s] = list(scores[s].nlargest(10).items())

100%|██████████| 1003/1003 [01:34<00:00, 10.61it/s]


In [147]:
# recs = pd.DataFrame(recs)

____

#### checking random seq's...
* seq 483: search for MESA EXPERT is giving rec "Mesa Digitalizadora Led Profissional Parblo C", because of search words treated in isolation
* **Estimate NDCG for a small number of random users in train**


##### item_data descriptions

In [19]:
item_data_fn = 'item_data.jl.gz'
item_data_desc = pd.read_json(os.path.join(raw_data_dir, item_data_fn), lines=True)
item_data_desc = item_data_desc[['item_id', 'title', 'domain_id']]

In [59]:
a_seq = np.random.choice(list(recs.keys()))
a_seq = 177058
print('a random seq:', a_seq)
one_seq_rec = pd.DataFrame(recs[a_seq])
one_seq_rec.columns = ['item_id', 'score']

a random seq: 177058


KeyError: 177058

In [60]:
pd.merge(one_seq_rec, item_data_desc, how = 'left')

Unnamed: 0,item_id,score,title,domain_id
0,1135878,0.838379,Tenis adidas Pro Next 2019 Jr Ef0855 Rojo Básq...,MLM-SNEAKERS
1,138426,0.837402,Tenis Para Deporte Marca Charly 1021777 Dog,MLM-SNEAKERS
2,1600907,0.837402,Tenis Para Deporte Marca Charly 1029137 Dog,MLM-SNEAKERS
3,114985,0.834473,Tenis Puma Vikky Ribbon Mujer Nike adidas Gym ...,MLM-SNEAKERS
4,864126,0.834473,Tenis De Basquetbol Nike Air Max Wavy Original.,MLM-SNEAKERS
5,326712,0.833984,Tenis Nike Air Max Infuriate Ii Mujer Basquetb...,MLM-SNEAKERS
6,68056,0.83252,Tenis Nuevo adidas Cosmic Fashion Negro,MLM-SNEAKERS
7,407633,0.832031,Tenis adidas Pace,MLM-SNEAKERS
8,1658750,0.831543,Tenis 8mx Coach Negro Monogram,MLM-SNEAKERS
9,1703189,0.831543,Tenis Futbol Para Hombre adidas Aq4289 Black W...,MLM-SNEAKERS


In [61]:
train_test[train_test.seq == a_seq]

Unnamed: 0,seq,event_info,views,event_type,lang_seq,query_embs
13914900,177058,ONEPLUS 5,,search,pt,"[0.03854, -0.2195, 0.3315, 0.0575, -0.0181, 0...."
13914901,177058,ONEPLUS 5,,search,pt,"[0.03854, -0.2195, 0.3315, 0.0575, -0.0181, 0...."
13914902,177058,ONE VISION,,search,pt,"[0.03427, -0.1542, 0.2006, -0.1891, -0.1799, 0..."


In [58]:
pd.merge(train_test[(train_test.seq == a_seq) & ~(train_test.event_type == 'search')], item_data_desc, how = 'left', left_on = 'event_info', right_on = 'item_id')

Unnamed: 0,seq,event_info,views,event_type,lang_seq,query_embs,item_id,title,domain_id


_____

## dataprep for search rows - spacy tests

#### error with spaCy & Dask
```OSError: [E050] Can't find model 'pt_core_news_md.vectors'.
It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.```

In [30]:
# import dask.dataframe as dd
# from dask.distributed import Client
# client = Client()
# client

# item_data_unique_dd = dd.from_pandas(item_data_unique, npartitions = 4)

# @delayed
# def add_spacy(df_p):
#     df = df_p.copy()
#     # df['port_nlp'] = df.title_lower.apply(lambda x: port_nlp(x))
#     df['port_nlp'] = df.title_lower.apply(port_nlp)
#     return df

# item_data_unique_nlp = list()
# for df_p in np.array_split(item_data_unique, n_workers):
#     item_data_unique_nlp.append(add_spacy(df_p))

# item_data_unique_nlp = compute(*item_data_unique_nlp)
# # OSError: [E050] Can't find model 'pt_core_news_md.vectors'. 
# #     It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [1]:
import spacy

In [2]:
port = spacy.load('pt_core_news_md')

In [3]:
query_original = port('PNEU LEVORIN AZONIC ARO 18 275 DIANTEIRO')
query_lower    = port('PNEU LEVORIN AZONIC ARO 18 275 DIANTEIRO'.lower())

In [4]:
item_original = port('Pneu Para Moto Levorin Azonic Dianteiro Tl 2,75 18')
item_lower    = port('Pneu Para Moto Levorin Azonic Dianteiro Tl 2,75 18'.lower())

In [5]:
print(query_original.similarity(item_original))
print(query_original.similarity(item_lower))
print(query_lower.similarity(item_original))
print(query_lower.similarity(item_lower))  # <--- highest similarity

0.623769216929041
0.55457467490276
0.7138354773981356
0.7602650326179264


In [130]:
type(query_lower.text)

str

In [135]:
for t in query_lower:
    print(t, type(t), t.text, type(t.text))

pneu <class 'spacy.tokens.token.Token'> pneu <class 'str'>
levorin <class 'spacy.tokens.token.Token'> levorin <class 'str'>
azonic <class 'spacy.tokens.token.Token'> azonic <class 'str'>
aro <class 'spacy.tokens.token.Token'> aro <class 'str'>
18 <class 'spacy.tokens.token.Token'> 18 <class 'str'>
275 <class 'spacy.tokens.token.Token'> 275 <class 'str'>
dianteiro <class 'spacy.tokens.token.Token'> dianteiro <class 'str'>


In [136]:
train['query'] = train.event_info
train.loc[~(train.event_type == 'search'), 'query'] = np.nan
train['query'] = train['query'].fillna('').str.lower()

In [137]:
def find_oov(s):
    s_spacy = port(s)
    return [t.text for t in s_spacy if t.is_oov]

In [139]:
train_search = train.loc[train.event_type == 'search', ['seq', 'query']].copy()
train_search_drop_dup = train_search.drop_duplicates(subset = 'query')

In [140]:
train_search_drop_dup['entities'] = train_search_drop_dup['query'].progress_apply(find_oov)

100%|██████████| 847496/847496 [1:37:52<00:00, 144.33it/s] 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [141]:
train = pd.merge(train, train_search_drop_dup, on = 'query', how = 'left')

In [142]:
train.rename(columns = {'seq_x': 'seq'}, inplace = True)
train.drop(columns = 'seq_y', inplace = True)

In [143]:
train.sort_values(['seq', 'event_timestamp'], inplace = True)

In [164]:
train.sample(10)

Unnamed: 0,seq,item_bought,event_info,event_timestamp,event_type,time_diff,query,entities
6839163,6994,,984787,2019-10-12 18:54:21,view,25.0,,
11963835,98101,,364310,2019-09-28 21:16:31,view,10.0,,
1424896,138466,,LAMPADA LED HONDA CIVIC 99,2019-10-21 10:18:59,search,288.0,lampada led honda civic 99,[]
1828935,387228,,1799758,2019-10-03 01:27:39,view,49.0,,
2032760,154670,,243848,2019-10-15 18:20:03,view,14.0,,
6697734,257356,,CORTA VENTO MERCEDES,2019-10-03 20:25:03,search,6.0,corta vento mercedes,[]
244324,155814,1278790.0,,NaT,,,,
4728092,213232,,PULA PIRATA,2019-10-03 15:53:31,search,7.0,pula pirata,[]
3481274,152042,,CHUTEIRA CRIANCA 27 28,2019-10-03 16:30:14,search,7.0,chuteira crianca 27 28,[]
158968,238608,,RELOJ CASIO,2019-10-18 15:46:15,search,,reloj casio,[]


In [156]:
# train_fn = 'train_for_nlp.pkl'
# train_fp = os.path.join(interim_data_dir, train_fn)
# train.to_pickle(train_fp)

___

In [172]:
# train = pd.read_pickle(train_fp)

In [168]:
item_data.head()

Unnamed: 0,item_id,title,domain_id,product_id,price,category_id,condition
0,111260,Casa Sola En Venta Con Gran Patio Solo Pago De...,MLM-INDIVIDUAL_HOUSES_FOR_SALE,,1150000.0,MLM170527,new
1,871377,Resident Evil Origins Collection Nintendo Swit...,MLM-VIDEO_GAMES,15270800.0,1392.83,MLM151595,new
2,490232,Falda De Imitación Piel Negra,MLM-SKIRTS,,350.0,MLM7697,new
3,1150706,Powercolor Red Devil Radeon Rx 580 8gb Gddr5,MLM-GRAPHICS_CARDS,,3200.0,MLM9761,used
4,934912,Laptop Hp Nx6320 Core Duo Con Puerto Db9 Windo...,MLM-NOTEBOOKS,,1599.0,MLM1652,used


In [173]:
train = pd.merge(train, item_data[['item_id', 'domain_id']], left_on = 'item_bought', right_on = 'item_id', how = 'left')

In [175]:
train['country'] = train.domain_id.str[:3]

In [177]:
views_sel = train.event_type == 'view'

In [176]:
train = train[~views_sel].copy() # searches & buys only

Unnamed: 0,seq,item_bought,event_info,event_timestamp,event_type,time_diff,query,entities,item_id,domain_id,country
0,0,,1786148,2019-10-19 11:25:42,view,,,,,,
1,0,,1786148,2019-10-19 11:25:57,view,15.0,,,,,
2,0,,RELOGIO SMARTWATCH,2019-10-19 11:26:07,search,10.0,relogio smartwatch,[],,,
3,0,,1615991,2019-10-19 11:27:26,view,79.0,,,,,
4,0,,1615991,2019-10-19 11:28:36,view,70.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
12412324,413162,,792798,2019-10-07 10:49:32,view,8076.0,,,,,
12412325,413162,,258196,2019-10-07 10:52:21,view,169.0,,,,,
12412326,413162,,12716,2019-10-07 10:53:07,view,46.0,,,,,
12412327,413162,,258196,2019-10-07 10:55:32,view,145.0,,,,,
