# EDA - DATAPREP - Search

## autoreload

In [1]:
# reload edited modules automatically before each cell execution (mode 2 = all modules)
%load_ext autoreload
%autoreload 2

## dir setup

In [2]:
from dotenv import load_dotenv, find_dotenv
import os

# Directory the notebook is executed from. `__file__` is not defined in a notebook
# kernel, so the working directory stands in for "this file's directory"
# (this is what the previous abspath("__file__") string trick evaluated to).
this_file_path = os.getcwd()
# if script (not notebook)...
# project_dir = os.path.join(os.path.dirname(__file__), os.pardir)

# project directory: one level above the notebook directory
project_dir = os.path.join(this_file_path, os.pardir)

# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
# (kept as the cell's last expression so its boolean result is displayed)
load_dotenv(dotenv_path)

True

In [3]:
def _env_dir(var_name):
    """Return project_dir joined with the directory stored in env variable `var_name`.

    Raises:
        KeyError: naming the variable when it is not set, instead of the opaque
            TypeError that os.path.join(project_dir, None) would produce.
    """
    return os.path.join(project_dir, os.environ[var_name])


# data / artefact directories, each read from the environment and joined onto project_dir
raw_data_dir = _env_dir("RAW_DATA_DIR")
processed_data_dir = _env_dir("PROCESSED_DATA_DIR")
interim_data_dir = _env_dir("INTERIM_DATA_DIR")
wordvecs_data_dir = _env_dir("WORDVECS_DATA_DIR")
figures_dir = _env_dir("FIGURES_DIR")
reports_dir = _env_dir("REPORTS_DIR")
cv_dir = _env_dir("CV_DIR")
models_dir = _env_dir("MODELS_DIR")

In [4]:
# os.environ[...] raises a KeyError naming the missing variable; .get() would
# hand None to int() and fail with an unrelated-looking TypeError.
TEST_OFFSET = int(os.environ["TEST_OFFSET"])
TOTAL_TEST_SEQS = int(os.environ["TOTAL_TEST_SEQS"])

## imports

In [5]:
import numpy as np
from datetime import datetime

In [6]:
import pandas as pd
# Use the fully qualified option name: the bare 'max_rows' pattern can match more
# than one registered option (e.g. styler.render.max_rows in newer pandas), in
# which case set_option raises OptionError.
pd.set_option('display.max_rows', 100)
# pd.set_option('max_columns', None)
# pd.options.mode.chained_assignment = None  # default='warn'

In [7]:
from tqdm import tqdm
# register the progress_* variants (e.g. progress_apply, used further down) on pandas objects
tqdm.pandas()

  from pandas import Panel


In [8]:
# from sklearn.model_selection import ParameterGrid

# import mlflow

# import pickle

# import seaborn as sns
# import matplotlib.pyplot as plt

In [9]:
import spacy

# suffix shared by both custom vector sets; also reused in the output file name
w2v_mode = 'cbow'


def _load_vectors(model_name):
    """Resolve `model_name` inside wordvecs_data_dir and load it with spaCy.

    Returns the (path, pipeline) pair so both stay available as notebook variables.
    """
    model_path = os.path.join(wordvecs_data_dir, model_name)
    return model_path, spacy.load(model_path)


# vectors used for 'pt' queries (alternative kept for reference: 'nilc50skip')
port_nlp_fn = 'ptcustom100' + w2v_mode
port_nlp_fp, port_nlp = _load_vectors(port_nlp_fn)

# vectors used for all other queries (alternative kept for reference: 'suc30fast')
espa_nlp_fn = 'escustom100' + w2v_mode
espa_nlp_fp, espa_nlp = _load_vectors(espa_nlp_fn)

## project imports

In [10]:
import sys
# put the parent directory first on the import path so the `src` package imported
# in the next cell can be found — assumes the notebook runs one level below the project root
sys.path.insert(0, '..')

In [11]:
from src.features import *
from src.models import *

# train/test search data dataprep for search info

In [12]:
# keyword options forwarded unchanged to join_prepare_train_test;
# drop_lang stays False so the per-sequence language column is kept
_prepare_kwargs = dict(
    buy_weight = -1,
    return_search = True,
    drop_timezone = True,
    drop_lang = False,
    lang = 'both',
)

test_offset, test_shifted_seq_vals, train_test = join_prepare_train_test(
    'train_dataset.pkl', 'test_dataset.pkl', **_prepare_kwargs)

lang both


#### identifying language for each seq
The language of each seq is inferred from the domain (MLB or MLM) of the items browsed in it. <br>
This step now lives inside `join_prepare_train_test` and is controlled by the `lang` argument (`drop_lang` must be `False`).

In [13]:
# preview of the joined train/test event frame
train_test

Unnamed: 0,seq,event_info,views,event_type,lang_seq,in_nav,in_nav_pred
0,0,1.61599e+06,16.0,view,,,
1,0,1.78615e+06,2.0,view,,,
1924622,0,RELOGIO SMARTWATCH,,search,pt,False,0.655061
1924639,0,1.74883e+06,-1.0,buy,pt,False,0.655061
2,1,206667,1.0,view,,,
...,...,...,...,...,...,...,...
4603664,590232,NUTELLA 650,,search,pt,,
4603666,590232,NUTELLA 650,,search,pt,,
4603668,590232,NUTELLA 650,,search,pt,,
4603669,590232,XIAOMI MI 9 128GB PRETO,,search,pt,,


#### a few seqs have no views: imputing their language with 'pt' (langdetect could be run instead)
Approximately 6.4% of the seqs have no views. Check with: <br>
`train_test.groupby('seq').event_type.value_counts().unstack()['view'].isnull().value_counts(normalize = True)`

In [14]:
# every row still lacking a sequence language defaults to 'pt'
train_test['lang_seq'] = train_test['lang_seq'].fillna('pt')

#### embeddings for searches

In [15]:
# One row per distinct (query text, language) pair among the search events.
# Built as a single chain ending in .assign(), so the lower-cased column is added
# to a fresh frame rather than to a slice (avoids SettingWithCopyWarning).
searches = (
    train_test.loc[train_test.event_type == 'search', ['seq', 'event_info', 'lang_seq']]
    .drop_duplicates(subset = ['event_info', 'lang_seq'])
    .assign(query_lower = lambda df: df.event_info.str.lower())
)

In [16]:
def emb_vectors(x):
    """Return the unit-length float16 embedding of one search query.

    Args:
        x: row-like object exposing `lang_seq` and `query_lower` attributes.

    Returns:
        The document vector divided by its norm, cast to float16. Rows with
        lang_seq == 'pt' go through `port_nlp`, every other value through
        `espa_nlp`. A zero-norm vector yields an all-zero float16 array of
        the same shape.
    """
    nlp = port_nlp if x.lang_seq == 'pt' else espa_nlp
    doc = nlp(x.query_lower)

    raw = doc.vector
    norm = doc.vector_norm
    # guard against dividing by zero when the document has no usable vector
    if norm == 0:
        return np.zeros(raw.shape, dtype = np.float16)
    return (raw / norm).astype(np.float16)

In [17]:
# row-wise embedding of every distinct query, with a tqdm progress bar
searches['query_embs'] = searches[['query_lower', 'lang_seq']].progress_apply(emb_vectors, axis = 1)

100%|██████████| 1172171/1172171 [02:02<00:00, 9564.70it/s] 


In [18]:
# spot-check a random sample of 20 queries next to their embeddings (unseeded, so it varies per run)
searches[['event_info', 'lang_seq', 'query_embs']].sample(20)

Unnamed: 0,event_info,lang_seq,query_embs
11810800,ADESIVO WAY UNO 2008,pt,"[-0.05783, 0.05054, -0.1365, 0.08826, -0.272, ..."
4260619,PRA FAZER MOTO GOL G3 16V,pt,"[0.0738, -0.03876, -0.007526, 0.226, -0.2112, ..."
898138,MINHAS COMPRAS EFETUADAS,pt,"[-0.0909, -0.00916, 0.03128, -0.08417, 0.1196,..."
6574283,TOALHA INFANTIL BANHO,pt,"[-0.005383, -0.06354, 0.1407, 0.0726, -0.044, ..."
2962493,TABLETA GRAFICA GUION 13,es,"[0.007904, 0.0036, 0.03656, 0.1051, -0.1426, -..."
320990,BLOCO VIDRO AZUL,pt,"[0.11487, -0.1158, -0.12317, -0.0768, -0.1707,..."
2734895,MOTO G7 PLAY 2019 DISPLAY,pt,"[0.04657, 0.0658, -0.06335, 0.2058, 0.04657, -..."
3804925,PROLONG,pt,"[-0.1428, -0.01182, -0.03403, 0.121, 0.11395, ..."
10354587,JOGO RODAS FERRO GOL 13 POLEGADAS,pt,"[0.0963, -0.1302, 0.06744, 0.1726, -0.2644, 0...."
9998875,APARELHO DVD 220V,pt,"[0.1202, -0.1287, -0.09174, -0.05002, -0.00309..."


In [19]:
# Attach each search's embedding to the full event frame; rows without a matching
# (event_info, lang_seq) pair get a missing query_embs.
# Join keys are spelled out instead of relying on the implicit column intersection,
# and validate guards the assumption that `searches` holds one row per key pair.
train_test = pd.merge(
    train_test,
    searches[['event_info', 'lang_seq', 'query_embs']],
    how = 'left',
    on = ['event_info', 'lang_seq'],
    validate = 'many_to_one',
)

#### saving augmented train_test with search embeddings

In [20]:
# persist the embedding-augmented frame under the interim data directory
train_test_embs_fn = f'train_test_embs_custom_{w2v_mode}.pkl'
train_test_embs_fp = os.path.join(interim_data_dir, train_test_embs_fn)
train_test.to_pickle(train_test_embs_fp)

## 👆 search dataprep above. finishes here