# EDA - VALIDATION & PREDICTION - Search

## autoreload

In [1]:
%load_ext autoreload
%autoreload 2

## dir setup

In [2]:
from dotenv import load_dotenv, find_dotenv
import os

# __file__ is not defined inside a notebook, so the *string* "__file__" is
# resolved against the current working directory; stripping the last path
# component leaves the working directory itself.
# (in a plain script one would use instead:
#  project_dir = os.path.join(os.path.dirname(__file__), os.pardir))
this_file_path = os.path.split(os.path.abspath("__file__"))[0]

# the project root is the parent of the working directory
project_dir = os.path.join(this_file_path, os.pardir)

# locate the .env file by walking up the directory tree until it is found
dotenv_path = find_dotenv()

# export the .env entries as environment variables
# (kept as the last expression so its return value is shown as the cell output)
load_dotenv(dotenv_path)

True

In [3]:
# Resolve every data / output directory relative to the project root.
# The sub-paths are read from environment variables. Indexing os.environ
# (rather than .get) makes a missing variable fail right here with a KeyError
# that names it, instead of an opaque "expected str ... not NoneType"
# TypeError raised from inside os.path.join.
raw_data_dir = os.path.join(project_dir, os.environ["RAW_DATA_DIR"])
processed_data_dir = os.path.join(project_dir, os.environ["PROCESSED_DATA_DIR"])
interim_data_dir = os.path.join(project_dir, os.environ["INTERIM_DATA_DIR"])
wordvecs_data_dir = os.path.join(project_dir, os.environ["WORDVECS_DATA_DIR"])
figures_dir = os.path.join(project_dir, os.environ["FIGURES_DIR"])
reports_dir = os.path.join(project_dir, os.environ["REPORTS_DIR"])
cv_dir = os.path.join(project_dir, os.environ["CV_DIR"])
models_dir = os.path.join(project_dir, os.environ["MODELS_DIR"])

In [4]:
# Integer settings read from the environment. Indexing os.environ raises a
# KeyError naming the missing variable; the previous .get() returned None and
# int(None) failed with a TypeError that did not say which setting was absent.
TEST_OFFSET = int(os.environ["TEST_OFFSET"])
TOTAL_TEST_SEQS = int(os.environ["TOTAL_TEST_SEQS"])

## imports

In [5]:
import numpy as np
from datetime import datetime

In [41]:
import pandas as pd
# pd.set_option('max_rows', 200)
pd.options.display.max_rows = 100
# pd.set_option('max_columns', None)
# pd.options.mode.chained_assignment = None  # default='warn'

In [7]:
from tqdm import tqdm
tqdm.pandas()

  from pandas import Panel


In [8]:
# from sklearn.model_selection import ParameterGrid

# import mlflow

# import pickle

# import seaborn as sns
# import matplotlib.pyplot as plt

In [9]:
# import spacy

# port_nlp_fn = 'nilc50skip'
# port_nlp_fp = os.path.join(wordvecs_data_dir, port_nlp_fn) 
# port_nlp = spacy.load(port_nlp_fp)

# espa_nlp_fn = 'suc30fast'
# espa_nlp_fp = os.path.join(wordvecs_data_dir, espa_nlp_fn) 
# espa_nlp = spacy.load(espa_nlp_fp)

In [10]:
from sklearn.metrics import ndcg_score

## project imports

In [11]:
import sys
sys.path.insert(0, '..')

In [12]:
from src.features import *
from src.models import *

# train/test data prep for search info

In [17]:
# raw item catalogue: a JSON-lines file (one record per line, hence lines=True)
item_data_fn = 'item_data.jl.gz'

# read it and keep only the item_id -> title columns
item_data = pd.read_json(
    os.path.join(raw_data_dir, item_data_fn),
    lines=True,
)[['item_id', 'title']]

In [50]:
# file names of the train / test datasets handed to the project helper
# read_processed (imported via the star imports; presumably it resolves them
# against the processed data directory -- confirm in src)
train_fn, test_fn = 'train_dataset.pkl', 'test_dataset.pkl'
train, test = read_processed(train_fn, test_fn)

In [51]:
# Combine train and test through the project helper. The meaning of
# buy_weight / just_concat is defined in join_prepare_train_test itself
# (not visible in this notebook).
test_offset, test_shifted_seq_vals, train_test = join_prepare_train_test(
    train,
    test,
    buy_weight=-1,
    just_concat=True,
)

In [52]:
# item_id: the bought item where present, otherwise whatever the event carried
train_test['item_id'] = train_test['item_bought'].fillna(train_test['event_info'])

# query_str: keep only the values that are text (search strings); everything
# else becomes NaN
train_test['query_str'] = train_test['item_id'].where(
    train_test['item_id'].map(lambda value: isinstance(value, str)),
    np.nan,
)

In [53]:
# attach item titles; with no `on=` the join uses every column name shared by
# the two frames
train_test = pd.merge(train_test, item_data, how='left')

# rows without a catalogue title fall back to their search string
train_test['title'] = train_test['title'].fillna(train_test['query_str'])

# keep the sequence columns plus the lower-cased text; the original-case
# title is dropped once texts_lower has been derived from it
train_test = (
    train_test[['seq', 'lang_seq', 'event_type', 'title']]
    .assign(texts_lower=lambda frame: frame['title'].str.lower())
    .drop(columns='title')
)

In [54]:
# spot-check: every event belonging to sequence 57
train_test.loc[train_test['seq'] == 57]

Unnamed: 0,seq,lang_seq,event_type,texts_lower
466061,57,pt,view,3 luminária led tubular sobrepor slim bivolt 1...
466062,57,pt,view,3 luminária led tubular sobrepor slim bivolt 1...
466063,57,pt,view,3 luminária led tubular sobrepor slim bivolt 1...
466064,57,pt,view,3 luminária led tubular sobrepor slim bivolt 1...
466065,57,pt,search,relogio ferro fluido
466066,57,pt,search,relogio ferro fluido
466067,57,pt,buy,3 luminária led tubular sobrepor slim bivolt 1...


In [14]:
# validation = 0.2 
# keep_train = 0.4 

# if validation:
#     train_red, test_red = shrink_and_split(train, keep_train = keep_train, validation = validation)

In [12]:
# Rebuild train_test through the project helper, this time with
# return_search / drop_timezone switched on (their exact effect is defined in
# join_prepare_train_test, not visible here). This rebinds test_offset,
# test_shifted_seq_vals and train_test.
test_offset, test_shifted_seq_vals, train_test = join_prepare_train_test(
    train,
    test,
    buy_weight=-1,
    return_search=True,
    drop_timezone=True,
)

In [29]:
# bare last expression: renders train_test as the cell output (pandas
# truncates the middle rows according to display.max_rows)
train_test

Unnamed: 0,seq,lang_seq,event_type,texts_lower
0,99,pt,view,"mangueira inox gas cozinha botijao 0,8 mts kit..."
1,99,pt,buy,kit fusivel de lamina 05a 10a 15a 20a 25a 30a ...
2,34,pt,view,pasta modeladora para bigode mustache paste 20g
3,34,pt,buy,spiderman homem aranha mídia física lacrado
4,76,,search,fiat uno vivace
...,...,...,...,...
5609,127,pt,search,sandalia mississipi
5610,127,pt,view,sandalia mississipi x7151 chennai rose - produ...
5611,127,pt,view,sandália feminina mississipi rasteira x9271 | ...
5612,127,pt,view,tênis feminino casual sapatilha sapatenis lanç...


#### identifying language for each seq
based on the browsed items and their domain MLB or MLM

##### item_data

In [16]:
# Load the processed item catalogue from the processed data directory.
# NOTE: this rebinds `item_data` (and `item_data_fn`), which an earlier cell
# set from the raw 'item_data.jl.gz' file; the `lang_domain` column read by
# the language-detection cell comes from this pickled version.
item_data_fn = 'item_data.pkl'
item_data_fp = os.path.join(processed_data_dir, item_data_fn)
# read_pickle can execute arbitrary code on load: only use files produced by
# this project.
item_data = pd.read_pickle(item_data_fp)
# item_data = item_data_desc[['item_id', 'title', 'domain_id']]

In [17]:
# Language per sequence: look at the 'view' events, map each viewed item to
# its lang_domain, count the domains inside every seq and keep the most
# frequent one (idxmax over the per-seq count table).
lang = (
    train_test.loc[train_test['event_type'] == 'view', ['seq', 'event_info']]
    # for view events event_info is cast to int so it can match item_id
    .astype({'event_info': int})
    .merge(
        item_data[['item_id', 'lang_domain']],
        how='left',
        left_on='event_info',
        right_on='item_id',
    )
    .groupby('seq')['lang_domain']
    .value_counts()
    .unstack()
    .fillna(0)
    .idxmax(axis=1)
    .reset_index()
    # the unnamed idxmax result becomes column 0 after reset_index
    .rename(columns={0: 'lang_seq'})
)

In [18]:
# attach the per-seq language; with no `on=` the join uses every column name
# the two frames share
train_test = train_test.merge(lang, how='left')

#### a few seqs have no views: imputing their language with 'pt' (could run langdetect instead)
fewer than 1% of the seqs have no views. Check with `train_test[train_test.lang_seq.isnull()].sample(10)`

In [37]:
# sequences whose language could not be determined default to 'pt'
train_test['lang_seq'] = train_test['lang_seq'].fillna('pt')

#### embeddings for searches

In [38]:
# One row per distinct (search string, language) pair, so each query is
# embedded only once. The frame is built in a single chain ending in
# .assign(): adding `query_lower` by item assignment on a column-subset slice
# (as before) triggers pandas' SettingWithCopyWarning, whereas .assign()
# returns a fresh, independent frame.
searches = (
    train_test.loc[
        train_test['event_type'] == 'search',
        ['seq', 'event_info', 'lang_seq'],
    ]
    .drop_duplicates(subset=['event_info', 'lang_seq'])
    # lower-cased query text, the input of the embedding step
    .assign(query_lower=lambda frame: frame['event_info'].str.lower())
)

In [39]:
def emb_vectors(x):
    """Return the unit-length embedding of one search query as float16.

    Relies on two module-level callables, ``port_nlp`` and ``espa_nlp``, which
    must already exist in the kernel (in this notebook their spaCy loading
    cell is commented out). Calling one with a string must yield an object
    exposing ``.vector`` and ``.vector_norm``.

    Parameters
    ----------
    x : row-like object
        Must expose ``lang_seq`` and ``query_lower``. ``lang_seq == 'pt'``
        selects ``port_nlp``; every other value selects ``espa_nlp``.

    Returns
    -------
    numpy.ndarray of float16
        ``vector / vector_norm``, or an all-zero array of the same shape when
        the norm is 0.
    """
    # only the selected pipeline name is looked up, exactly like an if/else
    nlp = port_nlp if x.lang_seq == 'pt' else espa_nlp
    doc = nlp(x.query_lower)

    raw = doc.vector
    norm = doc.vector_norm

    if norm != 0:
        return (raw / norm).astype(np.float16)

    # zero norm: nothing to normalise, return zeros instead of dividing by zero
    return np.zeros(raw.shape, dtype=np.float16)

In [40]:
# embed every distinct query row by row; progress_apply is the tqdm-backed
# variant of DataFrame.apply and shows a progress bar
searches['query_embs'] = (
    searches.loc[:, ['query_lower', 'lang_seq']]
    .progress_apply(emb_vectors, axis=1)
)

100%|██████████| 1172170/1172170 [01:58<00:00, 9908.94it/s] 


In [45]:
# eyeball ten random queries next to their embeddings
searches.loc[:, ['event_info', 'lang_seq', 'query_embs']].sample(n=10)

Unnamed: 0,event_info,lang_seq,query_embs
2756509,FRONTAL J8,pt,"[0.010284, -0.08655, -0.0444, -0.1384, -0.1803..."
11914219,TONER SAMSUNG 2885FW,es,"[0.1293, 0.01061, -0.3735, 0.3396, 0.07104, -0..."
4759933,BOMBA AGUA FIORINO,pt,"[-0.0845, -0.0397, 0.09607, -0.1462, -0.2122, ..."
1490675,CARREGADOR PORTATIL 60000 MAH,pt,"[-0.01701, 0.0782, 0.2607, 0.009186, -0.11426,..."
13705062,MALETA MAQUIAGEM PROVISIONAL MUITA COISA,pt,"[-0.2898, -0.0623, -0.0649, -0.05875, -0.034, ..."
6041434,DIRECAO GOLF 05,pt,"[0.2307, -0.01178, 0.1909, -0.0468, -0.2339, 0..."
2489139,AMPLIFICADOR 600 WATTS RMS,es,"[0.05548, 0.04114, -0.2158, 0.3386, -0.01672, ..."
7421572,FABRICA ENERGETICO,pt,"[-0.1453, -0.07275, 0.1863, -0.0317, -0.0453, ..."
5410331,BRM39EBANA30,pt,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
858890,BATERIA 37V,pt,"[-0.2279, -0.07544, 0.05594, -0.06052, -0.1298..."


In [46]:
# bring the embeddings back onto the event rows; with no `on=` the join uses
# the columns both frames share
train_test = train_test.merge(
    searches[['event_info', 'lang_seq', 'query_embs']],
    how='left',
)

#### saving augmented train_test with search embeddings

In [47]:
# persist the augmented frame (now carrying query_embs) as a pickle in the
# interim data directory
train_test_embs_fn = 'train_test_embs.pkl'
train_test_embs_fp = os.path.join(interim_data_dir, train_test_embs_fn)
train_test.to_pickle(train_test_embs_fp)

## 👆 search dataprep above. finishes here