# EDA - Search

## autoreload

In [1]:
%load_ext autoreload
%autoreload 2

## dir setup

In [2]:
from dotenv import load_dotenv, find_dotenv
import os

# NOTE: "__file__" is a quoted string literal here, not the __file__ variable,
# so abspath() resolves it against the current working directory and dirname()
# then yields that working directory.
this_file_path = os.path.dirname(os.path.abspath("__file__"))
# if script (not notebook)...
# project_dir = os.path.join(os.path.dirname(__file__), os.pardir)

# project directory: the parent of this_file_path (the path is not normalised,
# so it still ends in os.pardir)
project_dir = os.path.join(this_file_path, os.pardir)

# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
load_dotenv(dotenv_path)

True

In [3]:
def _project_subdir(env_var):
    """Return the project sub-directory whose relative path is stored in `env_var`.

    Parameters
    ----------
    env_var : str
        Name of the environment variable (loaded from .env) holding the path
        relative to ``project_dir``.

    Returns
    -------
    str
        ``project_dir`` joined with the value of the variable.

    Raises
    ------
    KeyError
        If the variable is not set. Without this check ``os.environ.get``
        returns None and ``os.path.join`` fails with a TypeError that does not
        say which variable is missing.
    """
    relative_path = os.environ.get(env_var)
    if relative_path is None:
        raise KeyError(
            "environment variable {!r} is not set; check the .env file".format(env_var)
        )
    return os.path.join(project_dir, relative_path)


raw_data_dir = _project_subdir("RAW_DATA_DIR")
processed_data_dir = _project_subdir("PROCESSED_DATA_DIR")
interim_data_dir = _project_subdir("INTERIM_DATA_DIR")
wordvecs_data_dir = _project_subdir("WORDVECS_DATA_DIR")
figures_dir = _project_subdir("FIGURES_DIR")
reports_dir = _project_subdir("REPORTS_DIR")
cv_dir = _project_subdir("CV_DIR")
models_dir = _project_subdir("MODELS_DIR")

## imports

In [4]:
import numpy as np
from datetime import datetime

In [5]:
import pandas as pd
# Use the fully qualified option name: the bare 'max_rows' pattern is matched
# against every registered option and raises OptionError on pandas versions
# where more than one key contains it (e.g. 'styler.render.max_rows').
pd.set_option('display.max_rows', 100)
# pd.set_option('max_columns', None)
# pd.options.mode.chained_assignment = None  # default='warn'

In [6]:
from tqdm import tqdm
# register tqdm with pandas; later cells call .progress_apply on Series
tqdm.pandas()

  from pandas import Panel


In [7]:
# import scipy.sparse as sparse
# import implicit

In [8]:
# from sklearn.metrics import ndcg_score

In [9]:
# from sklearn.model_selection import ParameterGrid

In [10]:
# import mlflow

In [11]:
# import pickle

In [12]:
import spacy

# Directory names of the two spaCy vector models stored under wordvecs_data_dir
# (presumably Portuguese and Spanish word vectors, going by the variable names).
port_nlp_fn = 'nilc50skip'
espa_nlp_fn = 'suc30fast'

# Resolve both model paths, then load the pipelines in the same order as before.
port_nlp_fp, espa_nlp_fp = (
    os.path.join(wordvecs_data_dir, model_fn)
    for model_fn in (port_nlp_fn, espa_nlp_fn)
)
port_nlp = spacy.load(port_nlp_fp)
espa_nlp = spacy.load(espa_nlp_fp)

In [13]:
# nlp = spacy.load('en_core_web_sm')

# from spacy_langdetect import LanguageDetector
# nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

In [14]:
from langdetect import detect

## project imports

In [15]:
import sys
# put the parent directory first on the import path so `src` can be imported
# (relative to the working directory the kernel was started in)
sys.path.insert(0, '..')

In [16]:
# NOTE(review): star imports hide where names come from. read_processed,
# shrink_and_split and join_prepare_train_test used further down are presumably
# provided by these two modules -- confirm and import them explicitly.
from src.features import *
from src.models import *

* remove buy events: no additional relevance attributed to buy
* remove event timestamp: no additional relevance attributed to recency
* remove search events: no additional relevance attributed to search
* aggregate view counts: higher relevance attributed to items with more views

# exploration

In [17]:
# file names handed to the project helper read_processed, which returns the
# train and test frames
train_fn = 'train_dataset.pkl'
test_fn = 'test_dataset.pkl'
train, test = read_processed(train_fn, test_fn)

In [18]:
# pickled frame stored in the processed-data directory
# (presumably the ground-truth purchases -- confirm)
true_fn = 'true.pkl'
true_fp = os.path.join(processed_data_dir, true_fn)
true_df = pd.read_pickle(true_fp)

# item catalogue: gzipped JSON-lines file in the raw-data directory
item_data_fn = 'item_data.jl.gz'
item_data = pd.read_json(os.path.join(raw_data_dir, item_data_fn), lines=True)
# item_id -> domain_id lookup columns
item_domain = item_data[['item_id', 'domain_id']]

#### create nlp objects for titles in item_data

In [17]:
# lower-case the titles (the spaCy test further down found lower-cased text
# gave the highest similarity)
item_data['title_lower'] = item_data.title.str.lower()
# one row per distinct lower-cased title; the original item_data index is kept
item_data_unique = item_data.title_lower.drop_duplicates().to_frame()

In [21]:
def emb_vectors_port(x):
    """Embed `x` with the `port_nlp` pipeline and L2-normalise the result.

    Parameters
    ----------
    x : str
        Text passed to ``port_nlp``.

    Returns
    -------
    numpy.ndarray
        ``float16`` array holding the document vector divided by its norm.
        When the norm is zero an all-zero vector of the same shape is
        returned instead of dividing.
    """
    doc = port_nlp(x)
    norm = doc.vector_norm
    if not norm:
        # A zero norm means the document vector is all zeros (e.g. empty text
        # or no token with a vector); dividing would fill the embedding with NaN.
        return np.zeros_like(doc.vector, dtype=np.float16)
    return (doc.vector / norm).astype(np.float16)

# one normalised float16 vector per unique title (with a tqdm progress bar)
item_data_unique['port'] = item_data_unique.title_lower.progress_apply(emb_vectors_port)

100%|██████████| 1925104/1925104 [03:31<00:00, 9109.84it/s] 


In [24]:
def emb_vectors_espa(x):
    """Embed `x` with the `espa_nlp` pipeline and L2-normalise the result.

    Parameters
    ----------
    x : str
        Text passed to ``espa_nlp``.

    Returns
    -------
    numpy.ndarray
        ``float16`` array holding the document vector divided by its norm.
        When the norm is zero an all-zero vector of the same shape is
        returned instead of dividing.
    """
    doc = espa_nlp(x)
    norm = doc.vector_norm
    if not norm:
        # A zero norm means the document vector is all zeros (e.g. empty text
        # or no token with a vector); dividing would fill the embedding with NaN.
        return np.zeros_like(doc.vector, dtype=np.float16)
    return (doc.vector / norm).astype(np.float16)

# one normalised float16 vector per unique title (with a tqdm progress bar)
item_data_unique['espa'] = item_data_unique.title_lower.progress_apply(emb_vectors_espa)

100%|██████████| 1925104/1925104 [03:51<00:00, 8313.94it/s]


In [77]:
item_data_unique

Unnamed: 0,title_lower,port,espa
0,casa sola en venta con gran patio solo pago de...,"[0.1279, 0.02943, 0.10016, 0.0578, -0.1276, -0...","[-0.1664, -0.03888, 0.03452, 0.0887, 0.0623, 0..."
1,resident evil origins collection nintendo swit...,"[0.0374, -0.2139, 0.2559, -0.09314, -0.1992, 0...","[0.0116, -0.02869, -0.0703, 0.3406, 0.1633, -0..."
2,falda de imitación piel negra,"[-0.02623, 0.03317, -0.0725, -0.06097, -0.031,...","[-0.1075, 0.2355, 0.01142, 0.0687, 0.09735, 0...."
3,powercolor red devil radeon rx 580 8gb gddr5,"[0.129, -0.07404, 0.2448, -0.06235, -0.06555, ...","[0.1323, 0.001538, -0.2064, 0.4216, 0.1107, -0..."
4,laptop hp nx6320 core duo con puerto db9 windo...,"[0.1315, -0.1361, 0.2778, -0.0952, -0.1708, 0....","[0.185, -0.008896, -0.2537, 0.391, 0.1273, -0...."
...,...,...,...
2102271,aparador elétrico barba perna nova bivolt reca...,"[-0.1031, 0.04428, -0.09937, -0.0968, -0.11395...","[-0.07697, 0.1261, -0.067, 0.236, 0.006966, -0..."
2102272,carrinho de bebê stoke,"[0.2024, 0.2474, 0.10767, -0.217, -0.2332, -0....","[-0.1051, -0.1318, 0.2052, 0.03458, -0.1254, -..."
2102273,grelha para hambúrguer preta com cabo em madei...,"[-0.215, -0.01633, 0.10004, -0.205, 0.0761, -0...","[-0.035, 0.193, 0.1982, 0.2708, 0.013626, -0.1..."
2102274,meia tam 7/8 anti embolia trombose antitrombo,"[-0.0987, 0.0804, -0.1415, -0.1965, -0.258, -0...","[-0.065, 0.301, 0.1938, 0.353, -0.02925, 0.007..."


In [78]:
# saving interim item_data_unique

# item_data_unique_fn = 'item_data_unique.pkl'
# item_data_unique_fp = os.path.join(interim_data_dir, item_data_unique_fn)
# item_data_unique.to_pickle(item_data_unique_fp)

In [21]:
# reload the interim frame from the pickle path used by the (commented-out)
# save cell, instead of recomputing the embeddings
item_data_unique_fn = 'item_data_unique.pkl'
item_data_unique_fp = os.path.join(interim_data_dir, item_data_unique_fn)
item_data_unique = pd.read_pickle(item_data_unique_fp)

In [15]:
# item_data_unique['lang'] = item_data_unique.title_lower.progress_apply(lambda x: nlp(' '.join(x.split()[:3]))._.language)
# item_data_unique['lang'] = item_data_unique.title_lower.progress_apply(lambda x: nlp(x)._.language)

In [27]:
def safe_detect(txt):
    """Best-effort wrapper around `detect` that never raises on bad input.

    Parameters
    ----------
    txt : object
        Value passed to ``detect``.

    Returns
    -------
    str
        Whatever ``detect(txt)`` returns, or the sentinel ``'detect error'``
        when ``detect`` raises an ordinary exception.
    """
    try:
        return detect(txt)
    except Exception:
        # `except Exception` instead of a bare `except:` so KeyboardInterrupt
        # and SystemExit still stop a long-running apply.
        return 'detect error'

In [28]:
import dask.dataframe as dd
from dask.distributed import Client
# start a dask.distributed client with default arguments; the bare `client`
# expression renders its summary as the cell output
client = Client()
client

0,1
Client  Scheduler: tcp://127.0.0.1:57861  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 8  Memory: 17.18 GB


In [29]:
# only the title column is converted to a dask frame; the embedding columns are left out
item_data_unique_dd = dd.from_pandas(item_data_unique[['title_lower']], npartitions = 8)
# lazy per-title language detection; `meta` declares an object-dtype result
# (NOTE(review): the meta name is 'title_lower' although the column is stored as 'lang')
item_data_unique_dd['lang'] = item_data_unique_dd.title_lower.apply(safe_detect, meta=('title_lower', 'object'))

In [None]:
%%time
# trigger the lazy dask computation and collect the result as a pandas frame
item_data_unique_lang = item_data_unique_dd.compute()

In [65]:
item_data_unique_lang.lang.value_counts().head()

es    10312
en     2306
it     1764
pt     1553
ca     1271
Name: lang, dtype: int64

______

In [30]:
# import dask.dataframe as dd
# from dask.distributed import Client
# client = Client()
# client

# item_data_unique_dd = dd.from_pandas(item_data_unique, npartitions = 4)

# @delayed
# def add_spacy(df_p):
#     df = df_p.copy()
#     # df['port_nlp'] = df.title_lower.apply(lambda x: port_nlp(x))
#     df['port_nlp'] = df.title_lower.apply(port_nlp)
#     return df

# item_data_unique_nlp = list()
# for df_p in np.array_split(item_data_unique, n_workers):
#     item_data_unique_nlp.append(add_spacy(df_p))

# item_data_unique_nlp = compute(*item_data_unique_nlp)
# # OSError: [E050] Can't find model 'pt_core_news_md.vectors'. 
# #     It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [16]:
# rows with a missing event_info take the value of item_bought (index-aligned fill)
train['event_info'] = train.event_info.fillna(train.item_bought)

In [18]:
# earlier settings tried, kept for reference:
# validation = 0.2 # 0.2: 20% of original train will be test
# keep_train = 0.6  # 0.6: 60% of train for validation

# validation = 0.2 
# keep_train = 0.1  

# current setting
validation = 0.2 
keep_train = 0.4 

# NOTE(review): when `validation` is falsy, train_red / test_red are not
# defined by this cell.
if validation:
    train_red, test_red = shrink_and_split(train, keep_train = keep_train, validation = validation)

train shape: (12412329, 6)
train/test shapes: (3976666, 6) (970217, 5)


In [19]:
# param = {'buy_weight': 200,
#  'regularization': 0.1,
#  'iterations': 10,
#  'factors': 128,
#  'alpha_val': 200}

In [20]:
# Build the combined train/test interaction table with the project helper,
# keeping search rows (return_search=True) and passing buy_weight=-1.
test_offset, test_shifted_seq_vals, train_test = join_prepare_train_test(
    train_red,
    test_red,
    buy_weight=-1,
    return_search=True,
)
# model, seq_map, event_info_map, sparse_user_item = fit_implicit_model(train_test, **param)
# pred = predict_implicit_model(model, sparse_user_item, 
#                               seq_map, event_info_map, test_shifted_seq_vals, 
#                               10, test_offset, 
#                               validation = True, true_df = true_df, item_domain = item_domain)

In [90]:
# there are titles with identical text but different item_ids
# item_data[item_data.duplicated(subset = ['title'], keep = False)].sort_values('title')

In [82]:
item_data.title

0          Casa Sola En Venta Con Gran Patio Solo Pago De...
1          Resident Evil Origins Collection Nintendo Swit...
2                              Falda De Imitación Piel Negra
3               Powercolor Red Devil Radeon Rx 580 8gb Gddr5
4          Laptop Hp Nx6320 Core Duo Con Puerto Db9 Windo...
                                 ...                        
2102272                               Carrinho De Bebê Stoke
2102273    Grelha Para Hambúrguer Preta Com Cabo Em Madei...
2102274        Meia Tam 7/8 Anti Embolia Trombose Antitrombo
2102275      Pano De Boca Cremer Menina Luxo Bordado C/3 Und
2102276    Kit Turbo Turbina Virtual Simulador Som Apito ...
Name: title, Length: 2102277, dtype: object

In [81]:
pd.merge(train_test, item_data[['item_id', 'category_id']], left_on = 'event_info', right_on = 'item_id', how = 'left')

Unnamed: 0,seq,event_info,views,event_type,item_id,category_id
0,0,1615991,16.0,view,1615991.0,MLB135384
1,0,1786148,2.0,view,1786148.0,MLB135384
2,0,RELOGIO SMARTWATCH,,search,,
3,0,1748830,-1.0,buy,1748830.0,MLB135384
4,3,505541,5.0,view,505541.0,MLB1730
...,...,...,...,...,...,...
3899370,826311,CAMA BOX BAU CASAL,,search,,
3899371,826311,CAMA BOX BAU CASAL,,search,,
3899372,826311,CAMA BOX BAU CASAL,,search,,
3899373,826311,CAMA BOX BAU CASAL,,search,,


In [77]:
train_test[train_test.event_type == 'search'].event_info.unique()

array(['RELOGIO SMARTWATCH', 'RADIOBOSS', 'SOUND FORGE', ...,
       'CALCA JARDINEIRA FEMININA SUSPENSORIO', 'CALCA RAIOM',
       'CALCA RAIOM CINTURA ALTA'], dtype=object)

_____

## dataprep for search rows - spacy tests

In [1]:
import spacy

In [2]:
# spaCy pipeline used for the similarity and out-of-vocabulary experiments below
port = spacy.load('pt_core_news_md')

In [3]:
# the same search query parsed as typed (upper case) and lower-cased
query_original = port('PNEU LEVORIN AZONIC ARO 18 275 DIANTEIRO')
query_lower    = port('PNEU LEVORIN AZONIC ARO 18 275 DIANTEIRO'.lower())

In [4]:
# an item title parsed in its original title case and lower-cased
item_original = port('Pneu Para Moto Levorin Azonic Dianteiro Tl 2,75 18')
item_lower    = port('Pneu Para Moto Levorin Azonic Dianteiro Tl 2,75 18'.lower())

In [5]:
# Print the similarity of every query/item casing combination, in the order
# original/original, original/lower, lower/original, lower/lower.
# The last line printed (lower/lower) is the highest similarity.
for query_doc in (query_original, query_lower):
    for item_doc in (item_original, item_lower):
        print(query_doc.similarity(item_doc))

0.623769216929041
0.55457467490276
0.7138354773981356
0.7602650326179264


In [130]:
type(query_lower.text)

str

In [135]:
for t in query_lower:
    print(t, type(t), t.text, type(t.text))

pneu <class 'spacy.tokens.token.Token'> pneu <class 'str'>
levorin <class 'spacy.tokens.token.Token'> levorin <class 'str'>
azonic <class 'spacy.tokens.token.Token'> azonic <class 'str'>
aro <class 'spacy.tokens.token.Token'> aro <class 'str'>
18 <class 'spacy.tokens.token.Token'> 18 <class 'str'>
275 <class 'spacy.tokens.token.Token'> 275 <class 'str'>
dianteiro <class 'spacy.tokens.token.Token'> dianteiro <class 'str'>


In [136]:
# 'query' holds the lower-cased search text for search events and an empty
# string for every other row.
train['query'] = (
    train.event_info
    .where(train.event_type == 'search')
    .fillna('')
    .str.lower()
)

In [137]:
def find_oov(s):
    """Return the texts of the tokens of `s` that `port` flags as out-of-vocabulary.

    Parameters
    ----------
    s : str
        Text parsed with the `port` pipeline.

    Returns
    -------
    list of str
        Token texts whose ``is_oov`` flag is set, in document order.
    """
    oov_texts = []
    for token in port(s):
        if token.is_oov:
            oov_texts.append(token.text)
    return oov_texts

In [139]:
# search rows only, keeping the session id and the lower-cased query
train_search = train.loc[train.event_type == 'search', ['seq', 'query']].copy()
# One row per distinct query. The explicit .copy() makes this an independent
# frame, so adding a column to it later does not raise SettingWithCopyWarning.
train_search_drop_dup = train_search.drop_duplicates(subset = 'query').copy()

In [140]:
# despite its name, 'entities' holds the list of out-of-vocabulary tokens
# returned by find_oov for each distinct query
train_search_drop_dup['entities'] = train_search_drop_dup['query'].progress_apply(find_oov)

100%|██████████| 847496/847496 [1:37:52<00:00, 144.33it/s] 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [141]:
# attach the per-query 'entities' to every train row with that query; both
# frames carry 'seq', so the merge yields seq_x / seq_y (tidied in the next cell)
train = pd.merge(train, train_search_drop_dup, on = 'query', how = 'left')

In [142]:
# keep train's own seq (seq_x) and discard the one brought in from the
# de-duplicated query frame (seq_y)
train.rename(columns = {'seq_x': 'seq'}, inplace = True)
train.drop(columns = 'seq_y', inplace = True)

In [143]:
# order rows by seq, then by event time, in place
train.sort_values(['seq', 'event_timestamp'], inplace = True)

In [164]:
train.sample(10)

Unnamed: 0,seq,item_bought,event_info,event_timestamp,event_type,time_diff,query,entities
6839163,6994,,984787,2019-10-12 18:54:21,view,25.0,,
11963835,98101,,364310,2019-09-28 21:16:31,view,10.0,,
1424896,138466,,LAMPADA LED HONDA CIVIC 99,2019-10-21 10:18:59,search,288.0,lampada led honda civic 99,[]
1828935,387228,,1799758,2019-10-03 01:27:39,view,49.0,,
2032760,154670,,243848,2019-10-15 18:20:03,view,14.0,,
6697734,257356,,CORTA VENTO MERCEDES,2019-10-03 20:25:03,search,6.0,corta vento mercedes,[]
244324,155814,1278790.0,,NaT,,,,
4728092,213232,,PULA PIRATA,2019-10-03 15:53:31,search,7.0,pula pirata,[]
3481274,152042,,CHUTEIRA CRIANCA 27 28,2019-10-03 16:30:14,search,7.0,chuteira crianca 27 28,[]
158968,238608,,RELOJ CASIO,2019-10-18 15:46:15,search,,reloj casio,[]


In [156]:
# persist the enriched train frame to the interim directory
# (NOTE: this rebinds train_fn, which earlier held 'train_dataset.pkl')
train_fn = 'train_for_nlp.pkl'
train_fp = os.path.join(interim_data_dir, train_fn)
train.to_pickle(train_fp)

___

In [172]:
# train = pd.read_pickle(train_fp)

In [168]:
item_data.head()

Unnamed: 0,item_id,title,domain_id,product_id,price,category_id,condition
0,111260,Casa Sola En Venta Con Gran Patio Solo Pago De...,MLM-INDIVIDUAL_HOUSES_FOR_SALE,,1150000.0,MLM170527,new
1,871377,Resident Evil Origins Collection Nintendo Swit...,MLM-VIDEO_GAMES,15270800.0,1392.83,MLM151595,new
2,490232,Falda De Imitación Piel Negra,MLM-SKIRTS,,350.0,MLM7697,new
3,1150706,Powercolor Red Devil Radeon Rx 580 8gb Gddr5,MLM-GRAPHICS_CARDS,,3200.0,MLM9761,used
4,934912,Laptop Hp Nx6320 Core Duo Con Puerto Db9 Windo...,MLM-NOTEBOOKS,,1599.0,MLM1652,used


In [173]:
# left join on the purchased item: rows without item_bought get NaN item_id / domain_id
train = pd.merge(train, item_data[['item_id', 'domain_id']], left_on = 'item_bought', right_on = 'item_id', how = 'left')

In [175]:
# first three characters of domain_id (e.g. 'MLM' from 'MLM-VIDEO_GAMES');
# presumably the site/country prefix -- confirm
train['country'] = train.domain_id.str[:3]

In [177]:
# boolean mask marking the 'view' events
views_sel = train.event_type == 'view'

In [176]:
# NOTE(review): the table rendered below this cell still lists view rows, so
# that output appears to predate this filter -- re-run to refresh it.
train = train[~views_sel].copy() # searches & buys only

Unnamed: 0,seq,item_bought,event_info,event_timestamp,event_type,time_diff,query,entities,item_id,domain_id,country
0,0,,1786148,2019-10-19 11:25:42,view,,,,,,
1,0,,1786148,2019-10-19 11:25:57,view,15.0,,,,,
2,0,,RELOGIO SMARTWATCH,2019-10-19 11:26:07,search,10.0,relogio smartwatch,[],,,
3,0,,1615991,2019-10-19 11:27:26,view,79.0,,,,,
4,0,,1615991,2019-10-19 11:28:36,view,70.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
12412324,413162,,792798,2019-10-07 10:49:32,view,8076.0,,,,,
12412325,413162,,258196,2019-10-07 10:52:21,view,169.0,,,,,
12412326,413162,,12716,2019-10-07 10:53:07,view,46.0,,,,,
12412327,413162,,258196,2019-10-07 10:55:32,view,145.0,,,,,
