# EDA - DATAPREP - Search

## autoreload

In [1]:
%load_ext autoreload
%autoreload 2

## dir setup

In [2]:
from dotenv import load_dotenv, find_dotenv
import os

# Directory the notebook runs from. Note that "__file__" is a string literal here:
# abspath() resolves it against the current working directory and dirname() strips
# it off again, so the result is the working directory itself
# (presumably done because __file__ is not defined inside a notebook).
this_file_path = os.path.dirname(os.path.abspath("__file__"))
# if script (not notebook)...
# project_dir = os.path.join(os.path.dirname(__file__), os.pardir)

# project directory: parent of the working directory
# (the path is not normalised, so it ends in os.pardir)
project_dir = os.path.join(this_file_path, os.pardir)

# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables; being the last expression of the
# cell, the return value of load_dotenv is what the cell displays
load_dotenv(dotenv_path)

True

In [3]:
# Resolve each data/output directory relative to project_dir, using the directory
# names stored in environment variables (populated by load_dotenv earlier in the notebook).
# os.environ[...] is used instead of os.environ.get(...): a missing variable now fails
# with a KeyError naming the variable, instead of an opaque TypeError raised by
# os.path.join(project_dir, None).
raw_data_dir = os.path.join(project_dir, os.environ["RAW_DATA_DIR"])
processed_data_dir = os.path.join(project_dir, os.environ["PROCESSED_DATA_DIR"])
interim_data_dir = os.path.join(project_dir, os.environ["INTERIM_DATA_DIR"])
wordvecs_data_dir = os.path.join(project_dir, os.environ["WORDVECS_DATA_DIR"])
figures_dir = os.path.join(project_dir, os.environ["FIGURES_DIR"])
reports_dir = os.path.join(project_dir, os.environ["REPORTS_DIR"])
cv_dir = os.path.join(project_dir, os.environ["CV_DIR"])
models_dir = os.path.join(project_dir, os.environ["MODELS_DIR"])

In [4]:
# Integer settings read from the environment.
# os.environ[...] (rather than .get) makes a missing variable fail with a KeyError
# that names it, instead of "int() argument must be ... not 'NoneType'".
TEST_OFFSET = int(os.environ["TEST_OFFSET"])
TOTAL_TEST_SEQS = int(os.environ["TOTAL_TEST_SEQS"])

## imports

In [5]:
import numpy as np
from datetime import datetime

In [6]:
import pandas as pd
# Use the fully qualified option name. A short pattern such as 'max_rows' is matched
# as a regex against every registered option and raises OptionError as soon as it
# matches more than one key (e.g. 'display.max_rows' and 'styler.render.max_rows').
pd.set_option('display.max_rows', 100)
# pd.set_option('display.max_columns', None)
# pd.options.mode.chained_assignment = None  # default='warn'

In [7]:
from tqdm import tqdm
# Register tqdm with pandas: this is what makes `progress_apply` available,
# which the embeddings cell further down relies on.
tqdm.pandas()

  from pandas import Panel


In [8]:
# from sklearn.model_selection import ParameterGrid

# import mlflow

# import pickle

# import seaborn as sns
# import matplotlib.pyplot as plt

In [9]:
import spacy

# spaCy pipeline applied to queries of sequences tagged 'pt' (see emb_vectors).
# Loaded from a model directory inside wordvecs_data_dir.
# NOTE(review): 'nilc50skip' is presumably a Portuguese word-vector model - confirm its origin.
port_nlp_fn = 'nilc50skip'
port_nlp_fp = os.path.join(wordvecs_data_dir, port_nlp_fn) 
port_nlp = spacy.load(port_nlp_fp)

# spaCy pipeline applied to every query whose sequence is not tagged 'pt' (see emb_vectors).
# NOTE(review): 'suc30fast' is presumably a Spanish word-vector model - confirm its origin.
espa_nlp_fn = 'suc30fast'
espa_nlp_fp = os.path.join(wordvecs_data_dir, espa_nlp_fn) 
espa_nlp = spacy.load(espa_nlp_fp)

## project imports

In [11]:
import sys
# Put the parent of the working directory first on the import path so that the
# `src` package can be imported below. '..' is relative, so this only works
# when the kernel's working directory is the notebook's own folder.
sys.path.insert(0, '..')

In [12]:
from src.features import *
from src.models import *

# train/test dataprep for search info

In [22]:
# Build the combined train/test frame from the two pickled datasets.
# lang='both' together with drop_lang=False is what makes the helper attach the
# per-sequence language column (lang_seq) used by the cells below.
test_offset, test_shifted_seq_vals, train_test = join_prepare_train_test(
    'train_dataset.pkl',
    'test_dataset.pkl',
    buy_weight=-1,
    return_search=True,
    drop_timezone=True,
    drop_lang=False,
    lang='both',
)

lang both


#### identifying language for each seq
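The language of each seq is inferred from the domain (MLB or MLM) of the items browsed in it. <br>
This step now runs inside `join_prepare_train_test` via the `lang` argument (`drop_lang` must be `False`); the commented-out cell below is the original implementation, kept for reference.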

In [18]:
##### item_data

# item_data_fn = 'item_data.pkl'
# item_data_fp = os.path.join(processed_data_dir, item_data_fn)
# item_data = pd.read_pickle(item_data_fp)
# item_data = item_data_desc[['item_id', 'title', 'domain_id']]

# lang = train_test[train_test.event_type == 'view'].copy()
# lang['event_info'] = lang.event_info.astype(int)
# lang = lang[['seq', 'event_info']]
# lang = pd.merge(lang, item_data[['item_id', 'lang_domain']], how = 'left', 
#                  left_on = 'event_info', right_on = 'item_id')
# lang = lang.groupby('seq').lang_domain.value_counts()
# lang = lang.unstack().fillna(0).idxmax(axis = 1)
# lang = lang.reset_index().rename(columns = {0: 'lang_seq'})

# train_test = pd.merge(train_test, lang, how = 'left')

In [20]:
# Display the joined frame (pandas truncates the rendering to the head and tail rows).
train_test

Unnamed: 0,seq,event_info,views,event_type,lang_seq,in_nav,in_nav_pred
0,0,1.61599e+06,16.0,view,,,
1,0,1.78615e+06,2.0,view,,,
1924622,0,RELOGIO SMARTWATCH,,search,pt,False,0.655061
1924639,0,1.74883e+06,-1.0,buy,pt,False,0.655061
2,1,206667,1.0,view,,,
...,...,...,...,...,...,...,...
4603664,590232,NUTELLA 650,,search,pt,,
4603666,590232,NUTELLA 650,,search,pt,,
4603668,590232,NUTELLA 650,,search,pt,,
4603669,590232,XIAOMI MI 9 128GB PRETO,,search,pt,,


#### a few seqs have no views: imputing their language with 'pt' (could run langdetect instead)
Approximately 6.4% of the seqs have no views, so no language could be inferred for them. Check with <br>
`train_test.groupby('seq').event_type.value_counts().unstack()['view'].isnull().value_counts(normalize = True)`

In [36]:
# Sequences without an inferred language default to Portuguese ('pt').
train_test.loc[train_test['lang_seq'].isna(), 'lang_seq'] = 'pt'

#### embeddings for searches

In [38]:
# One row per distinct (query text, sequence language) pair, so each query is embedded
# only once per language. Built as a single chain with .loc/.assign so the new column is
# never assigned into a frame derived by slicing (the SettingWithCopyWarning pattern).
# Rows, column order (seq, event_info, lang_seq, query_lower) and index are the same as
# filtering, de-duplicating, selecting the columns and then adding query_lower.
searches = (
    train_test
    .loc[train_test.event_type == 'search', ['seq', 'event_info', 'lang_seq']]
    .drop_duplicates(subset=['event_info', 'lang_seq'])
    .assign(query_lower=lambda frame: frame['event_info'].str.lower())
)

In [39]:
def emb_vectors(x):
    """Embed one search query as a float16 vector scaled by its norm.

    Parameters
    ----------
    x : row-like object exposing `lang_seq` and `query_lower` attributes.

    Returns
    -------
    numpy.ndarray (float16)
        The document vector divided by its `vector_norm`. When the norm is 0
        an all-zero vector of the same shape is returned instead, which
        avoids dividing by zero.

    The query is processed by `port_nlp` when `lang_seq` is 'pt' and by
    `espa_nlp` for any other value.
    """
    nlp = port_nlp if x.lang_seq == 'pt' else espa_nlp
    doc = nlp(x.query_lower)

    raw = doc.vector
    norm = doc.vector_norm

    # guard clause: nothing to normalise when the norm is zero
    if norm == 0:
        return np.zeros(raw.shape, dtype=np.float16)

    return (raw / norm).astype(np.float16)

In [40]:
# Row-wise embedding of every distinct query (axis=1 passes each row to emb_vectors).
# progress_apply is the tqdm-enabled variant of DataFrame.apply; the recorded run
# took about two minutes for ~1.17M rows.
searches['query_embs'] = searches[['query_lower', 'lang_seq']].progress_apply(emb_vectors, axis = 1)

100%|██████████| 1172171/1172171 [02:07<00:00, 9195.53it/s] 


In [41]:
# Spot-check ten random rows. No random_state is passed, so the rows shown change on every run.
searches[['event_info', 'lang_seq', 'query_embs']].sample(10)

Unnamed: 0,event_info,lang_seq,query_embs
12126128,NS CONJUNTO,pt,"[0.11017, -0.0712, 0.1914, -0.08746, -0.1422, ..."
3991506,PO TRANSPARENTE UNHA PORCELANA,pt,"[-0.1888, -0.03647, 0.08826, -0.1487, 0.1819, ..."
3547447,PATINS ROLLER 4 FOR YOU NUMERO 34,pt,"[0.1766, 0.0355, 0.1692, -0.1627, -0.05573, -0..."
3828423,DISTINTIVO POLICIA CIVIL MT,pt,"[-0.11237, -0.02359, -0.1588, -0.1804, -0.2908..."
3934363,CFW500 38A,pt,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1159159,SUPORTES PRENDER JANELAS,pt,"[0.012764, -0.001668, 0.0832, -0.01209, 0.0247..."
7975586,NUMARK TT 250 USB,pt,"[0.0489, -0.1137, 0.2847, -0.0929, -0.0638, -0..."
5033279,JOGO PARAFUSO TOYOTA COROLLA,pt,"[0.1178, -0.0864, 0.2573, -0.1937, -0.1727, 0...."
1227375,MESA GUITARRA,es,"[-0.1285, 0.1699, -0.09875, 0.0996, 0.1921, -0..."
1000707,DESS PUNTO,es,"[0.03647, 0.02017, 0.1942, 0.1488, -0.001874, ..."


In [44]:
# Attach the embedding of each search query back onto the full frame.
# The join keys are stated explicitly: without `on`, pandas joins on every column the
# two frames share, so re-running this cell (when train_test already carries
# query_embs) would silently add query_embs to the join keys.
# Rows that are not searches have no match and keep a missing query_embs (left join).
train_test = pd.merge(
    train_test,
    searches[['event_info', 'lang_seq', 'query_embs']],
    how='left',
    on=['event_info', 'lang_seq'],
)

#### saving augmented train_test with search embeddings

In [45]:
# Persist the augmented frame (including the query_embs column) as a pickle
# in the interim data directory.
train_test_embs_fn = 'train_test_embs.pkl'
train_test_embs_fp = os.path.join(interim_data_dir, train_test_embs_fn)
train_test.to_pickle(train_test_embs_fp)

## 👆 search dataprep above. finishes here