# Initial Setups

## autoreload

In [1]:
# Pick up edits to imported project modules live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

## dir setup

In [2]:
from dotenv import load_dotenv, find_dotenv
import os

# Notebooks do not define `__file__`. The previous code passed the *string*
# "__file__" to abspath(), which only ever resolved to the current working
# directory, so ask for the working directory explicitly.
this_file_path = os.getcwd()
# if script (not notebook)...
# project_dir = os.path.join(os.path.dirname(__file__), os.pardir)

# project directory: the parent of the directory the notebook runs from
project_dir = os.path.join(this_file_path, os.pardir)

# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# load up the entries as environment variables
# (kept as the last expression so the cell still displays its return value)
load_dotenv(dotenv_path)

True

In [3]:
def _data_dir(env_var):
    """Return the directory named by environment variable ``env_var``, joined onto ``project_dir``.

    Raises:
        KeyError: if ``env_var`` is not set. Without this check a missing
            variable reaches ``os.path.join`` as ``None`` and fails with a
            TypeError that does not say which variable is missing.
    """
    try:
        relative_dir = os.environ[env_var]
    except KeyError:
        raise KeyError(f"environment variable {env_var} is not set (expected in .env)") from None
    return os.path.join(project_dir, relative_dir)


# One module-level path per configured directory; the names are used by later cells.
raw_data_dir = _data_dir("RAW_DATA_DIR")
processed_data_dir = _data_dir("PROCESSED_DATA_DIR")
interim_data_dir = _data_dir("INTERIM_DATA_DIR")
wordvecs_data_dir = _data_dir("WORDVECS_DATA_DIR")
figures_dir = _data_dir("FIGURES_DIR")
reports_dir = _data_dir("REPORTS_DIR")
cv_dir = _data_dir("CV_DIR")
models_dir = _data_dir("MODELS_DIR")

## imports

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
# import sweetviz as sv
# import utils

In [5]:
# Use the fully qualified option name: the short pattern 'max_rows' matches more
# than one option on newer pandas (e.g. 'styler.render.max_rows') and is rejected.
pd.set_option('display.max_rows', 100)

In [6]:
from tqdm import tqdm
# Registers the `progress_apply` method that is called on a DataFrame further down.
tqdm.pandas()

  from pandas import Panel


## project imports

In [7]:
import sys
# Put the parent directory first on the import path so the `src` package resolves.
sys.path.insert(0, '..')

# NOTE(review): star imports hide where names come from. Helpers called later
# without a visible definition (e.g. `read_raw_save_processed`) presumably
# come from one of these modules — confirm and import them explicitly.
from src.features import *
from src.models import *

## spaCy imports

In [9]:
import spacy

# Directory names of the two spaCy models (used for 'pt' and non-'pt' titles below).
port_nlp_fn = 'nilc50skip'
espa_nlp_fn = 'suc30fast'

# Both models are stored under the word-vectors data directory.
port_nlp_fp = os.path.join(wordvecs_data_dir, port_nlp_fn)
espa_nlp_fp = os.path.join(wordvecs_data_dir, espa_nlp_fn)

# Load in the same order as before: port_nlp first, then espa_nlp.
port_nlp, espa_nlp = (spacy.load(model_fp) for model_fp in (port_nlp_fp, espa_nlp_fp))

### item_data.jl.gz
Note: some titles have identical text but different `item_id`s.<br>
Check with `item_data[item_data.duplicated(subset = ['title'], keep = False)].sort_values('title')`

Run this section to regenerate `item_data.pkl` in `processed_data_dir`, for example with new embeddings (pre-trained vectors with more dimensions, or custom embeddings).

In [10]:
# Raw item catalogue: gzip-compressed file read with one JSON record per line.
item_data_fn = 'item_data.jl.gz'
item_data = pd.read_json(
    os.path.join(raw_data_dir, item_data_fn),
    lines=True,
)

#### item_domain used in the scoring

In [None]:
# Item -> domain mapping, saved as its own pickle. It is built from `item_data`:
# the previous version sliced `item_domain` from itself, which is a NameError
# on a fresh kernel because `item_domain` is never defined beforehand.
item_domain = item_data[['item_id', 'domain_id']]
item_domain_fn = 'item_domain.pkl'
item_domain_fp = os.path.join(processed_data_dir, item_domain_fn)
item_domain.to_pickle(item_domain_fp)

* MLB: Brazil (titles treated as Portuguese, `pt`)
* MLM: other countries (titles treated as Spanish, `es`)

In [30]:
# Number of items per 3-character domain_id prefix.
item_data['domain_id'].str.slice(stop=3).value_counts()

MLB    1723216
MLM     378210
Name: domain_id, dtype: int64

In [40]:
# Lower-cased copy of the title text.
item_data['title_lower'] = item_data['title'].str.lower()
# Language code derived from the 3-character domain_id prefix.
prefix_to_lang = {'MLM': 'es', 'MLB': 'pt'}
item_data['lang_domain'] = item_data['domain_id'].str[:3].replace(prefix_to_lang)

In [41]:
# One row per distinct (title_lower, lang_domain) pair, so each text is embedded only once.
unique_key_cols = ['title_lower', 'lang_domain']
item_data_unique = item_data[unique_key_cols].drop_duplicates(keep='first')

In [42]:
def emb_vectors(x):
    """Embed one title and return it as a float16 vector scaled by its norm.

    Args:
        x: a row-like object exposing ``lang_domain`` and ``title_lower``.
            Rows with ``lang_domain == 'pt'`` go through ``port_nlp``; every
            other row goes through ``espa_nlp``.

    Returns:
        ``doc.vector / doc.vector_norm`` cast to ``np.float16``, or an all-zero
        float16 array of the same shape when the norm is exactly zero.
    """
    nlp = port_nlp if x.lang_domain == 'pt' else espa_nlp
    doc = nlp(x.title_lower)

    embedding = doc.vector
    norm = doc.vector_norm
    if norm != 0:
        return (embedding / norm).astype(np.float16)
    # Zero norm: return zeros instead of dividing by zero.
    return np.zeros(embedding.shape, dtype = np.float16)

In [44]:
# Row-wise embedding of every unique title; the recorded run below took about
# five minutes for ~1.9M rows.
item_data_unique['title_embs'] = item_data_unique.progress_apply(emb_vectors, axis = 1)

100%|██████████| 1925800/1925800 [05:17<00:00, 6071.83it/s]


In [49]:
# Attach the embeddings back to every item. The join keys are spelled out rather
# than left to "all shared columns", so the merge cannot silently change keys
# (or try to join on the embedding column) if either frame gains a column.
item_data = pd.merge(item_data, item_data_unique, how = 'left',
                     on = ['title_lower', 'lang_domain'])

In [51]:
# Persist the full item table (including embeddings) to the processed data directory.
item_data_fn = 'item_data.pkl'
item_data_fp = os.path.join(processed_data_dir, item_data_fn)
item_data.to_pickle(path=item_data_fp)

In [69]:
# Slim table with ids, language and title embeddings only. It gets its own name
# instead of overwriting `item_data`: later cells (domain counts, merges onto
# `train`) still read `item_data.domain_id`, which the overwrite used to drop.
item_data_embs_only = item_data[['item_id', 'lang_domain', 'title_embs']]

item_data_embs_only_fn = 'item_data_embs_only.pkl'
item_data_embs_only_fp = os.path.join(processed_data_dir, item_data_embs_only_fn)
item_data_embs_only.to_pickle(item_data_embs_only_fp)

#### Maybe cluster some of the smaller domains into larger ones?

In [48]:
# The 20 domains with the fewest items.
item_data.groupby('domain_id')['item_id'].count().sort_values().head(n=20)

domain_id
MLM-GAME_CONSOLE_CAMERA_MOUNTS         1
MLM-GOAL_NETS                          1
MLM-GOLF_BAGS                          1
MLM-GOLF_CLUBS_PUTTERS                 1
MLM-GONIOMETERS                        1
MLM-GPS_CASES_AND_COVERS               1
MLM-GRADUATED_CYLINDERS                1
MLM-GRANOLA                            1
MLM-GRASS_PAVERS                       1
MLM-GROOMING_TABLES                    1
MLM-GROUND_RESISTANCE_METERS           1
MLM-STACKABLE_BINS                     1
MLM-HABERDASHERY_FABRIC_FLOWERS        1
MLM-HABERDASHERY_FRINGES               1
MLB-INDUSTRIAL_SILOS                   1
MLM-ALL_TERRAIN_VEHICLE_DRIVE_BELTS    1
MLM-STOLES                             1
MLM-GLASS_SAFETY_FILMS                 1
MLM-AUTOMOTIVE_TRASH_BAGS              1
MLM-ALARM_REMOTE_CONTROLS              1
Name: item_id, dtype: int64

In [47]:
# The 20 domains with the most items.
item_data.groupby('domain_id')['item_id'].count().sort_values().tail(n=20)

domain_id
MLB-MOTORCYCLES                            10155
MLB-INDUSTRIAL_AND_COMMERCIAL_EQUIPMENT    10252
MLB-PANTS                                  10309
MLB-SOUVENIRS                              11017
MLB-CELLPHONE_COVERS                       11177
MLB-HEADPHONES                             11328
MLB-VIDEO_GAMES                            12742
MLB-WRISTWATCHES                           13136
MLB-HAIR_TREATMENTS                        13529
MLB-ACTION_FIGURES                         13597
MLM-CARS_AND_VANS                          14165
MLB-SANDALS_AND_FLIP_FLOPS                 15102
MLB-VEHICLE_PARTS                          18727
MLB-DRESSES                                21590
MLB-SUPPLEMENTS                            22351
MLB-T_SHIRTS                               23823
MLB-VEHICLE_ACCESSORIES                    28986
MLB-SNEAKERS                               32636
MLB-CELLPHONES                             38390
MLB-CARS_AND_VANS                          41420
Name: item

## train_dataset.jl.gz

In [8]:
%%time
# NOTE(review): `read_raw_save_processed` is not defined in this notebook; it
# presumably arrives through the star imports from `src` — confirm.
train = read_raw_save_processed('train_dataset.jl.gz', 'train_dataset.pkl')
# save_true_labels(train)

CPU times: user 3.89 s, sys: 1.44 s, total: 5.33 s
Wall time: 5.52 s


In [51]:
# Per-`seq` flag: True when any row of that seq has `in_nav` set.
t = train.groupby('seq')['in_nav'].any().reset_index()
# NOTE: this replaces the row-level `in_nav` column with the per-seq flag.
train = train.drop(columns = 'in_nav').merge(t, how = 'left')

In [53]:
# Save the updated training frame to the processed data directory.
processed_fn = 'train_dataset.pkl'
processed_fp = os.path.join(processed_data_dir, processed_fn)
train.to_pickle(path=processed_fp)

In [10]:
# train['time_diff_intra_session'] = [td if td < 1800 else 0 for td in train.time_diff]

In [55]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import cross_val_score

# lr = LogisticRegression()
# rf = RandomForestClassifier(max_depth = 7, min_samples_split = 0.001)

In [57]:
# t1 = train.groupby('seq').event_type.value_counts().unstack().fillna(0)
# t2 = train.groupby('seq').agg({'in_nav': np.any, 'lang_seq': 'first'})
# t4 = train[train.event_type == 'view'].groupby('seq').event_info.value_counts(normalize = True)
# t4.name = 'mode_freq'
# t4 = t4.reset_index()
# t4 = t4.groupby('seq').mode_freq.first()

# t = pd.concat([t1, t2, t4], axis = 1)

# t.drop(columns = 'buy', inplace = True)
# t['lang_seq'] = t.lang_seq.replace({'pt': 0, 'es': 1}).fillna(0)

# t = t.fillna(-100)

In [60]:
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import confusion_matrix, classification_report

# X, y = t[['search', 'view', 'lang_seq', 'mode_freq']], t.in_nav

In [79]:
# from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# params = {'n_estimators': [50, 100, 200], 'max_depth': [5, 8, 11, 14], 'min_samples_split': [0.01, 0.001, 0.0001]}

# gs = RandomizedSearchCV(RandomForestClassifier(), param_distributions = params, scoring = 'f1', verbose = 1, n_jobs = -1, cv = 3, n_iter = 6)

# gs.fit(X, y)

# rf_best = gs.best_estimator_

In [69]:
# X_train, X_test, y_train, y_test = train_test_split(X, y)

In [80]:
# rf_best
# RandomForestClassifier(max_depth=14, min_samples_split=0.0001, n_estimators=200)

RandomForestClassifier(max_depth=14, min_samples_split=0.0001, n_estimators=200)

In [84]:
# rf_best.fit(X_train, y_train)

# y_pred = rf_best.predict(X_test)

# confusion_matrix(y_test, y_pred)

# print(classification_report(y_test, y_pred))

In [89]:
# NOTE(review): `rf_best`, `X` and `y` are only defined in the commented-out
# cells above, so this cell raises NameError on a fresh kernel unless those
# cells are restored.
rf_best.fit(X, y)

RandomForestClassifier(max_depth=14, min_samples_split=0.0001, n_estimators=200)

In [92]:
# Keep column 1 of the predicted probabilities as the per-row score.
# NOTE(review): relies on `rf_best`, `X` and a `t` built in the commented-out cells above.
t['in_nav_pred'] = rf_best.predict_proba(X)[:, 1]

In [95]:
# Move the index back into a regular column so it can be used as a merge key.
t = t.reset_index()

In [101]:
# Bring the per-seq prediction onto every row of `train`.
train = train.merge(t[['seq', 'in_nav_pred']], how = 'left', on = 'seq')

In [103]:
# Overwrite the processed training pickle with the frame that now carries `in_nav_pred`.
processed_fn = 'train_dataset.pkl'
processed_fp = os.path.join(processed_data_dir, processed_fn)
train.to_pickle(path=processed_fp)

### 4% of the events correspond to the item bought

In [34]:
# Share of rows with / without the `in_nav` flag.
train['in_nav'].value_counts(normalize = True)

False    0.959279
True     0.040721
Name: in_nav, dtype: float64

### 30% of the items bought show up in the navigation

In [14]:
# Share of seqs in which at least one row has the `in_nav` flag.
train.groupby('seq')['in_nav'].any().value_counts(normalize = True)

False    0.706116
True     0.293884
Name: in_nav, dtype: float64

In [36]:
# Inspect every row of a single seq as an example.
train.query("seq == 121")#.time_diff

Unnamed: 0,seq,item_bought,event_info,event_timestamp,event_type,time_diff,lang_seq,in_nav
2392969,121,388604,MINOXIDIL,2019-10-13 15:33:41,search,,es,False
2392970,121,388604,MINOXIDIL,2019-10-13 17:08:36,search,5695.0,es,False
2392971,121,388604,429456,2019-10-13 17:09:04,view,28.0,es,False
2392972,121,388604,429456,2019-10-13 17:09:43,view,39.0,es,False
2392973,121,388604,429456,2019-10-13 17:10:37,view,54.0,es,False
2392974,121,388604,MINOXIDIL,2019-10-13 17:10:38,search,1.0,es,False
2392975,121,388604,625179,2019-10-13 17:10:46,view,8.0,es,False
2392976,121,388604,MINOXIDIL,2019-10-13 17:10:58,search,12.0,es,False
2392977,121,388604,410942,2019-10-13 17:11:45,view,47.0,es,False
2392978,121,388604,410942,2019-10-13 17:12:29,view,44.0,es,False


### In 49% of the sessions, the domain_id of the item bought shows up among the navigated domain_id's

In [18]:
# Look up the domain of the purchased item for every row.
item_domains = item_data[['item_id', 'domain_id']]
train = train.merge(item_domains, how = 'left',
                    left_on = 'item_bought', right_on = 'item_id')

In [19]:
# Free the `domain_id` name for the next merge.
train = train.rename(columns = {'domain_id': 'bought_domain_id'})

In [22]:
# Look up the domain of the item referenced by each event (matched on `event_info`).
item_domains = item_data[['item_id', 'domain_id']]
train = train.merge(item_domains, how = 'left',
                    left_on = 'event_info', right_on = 'item_id')

In [23]:
# The second lookup is the navigated item's domain.
train = train.rename(columns = {'domain_id': 'nav_domain_id'})

In [25]:
# The two merges each left an `item_id` key column behind; drop both.
train = train.drop(columns = ['item_id_x', 'item_id_y'])

In [27]:
# True on rows where the event's domain equals the purchased item's domain.
train['in_nav_domain'] = train['bought_domain_id'].eq(train['nav_domain_id'])

In [34]:
# Share of seqs in which at least one event is in the purchased item's domain.
train.groupby('seq')['in_nav_domain'].any().value_counts(normalize = True)

False    0.506594
True     0.493406
Name: in_nav_domain, dtype: float64

## test_dataset.jl.gz

In [25]:
%%time
# NOTE(review): same helper as for the train set; it is not defined in this
# notebook and presumably comes from the `src` star imports — confirm.
test = read_raw_save_processed('test_dataset.jl.gz', 'test_dataset.pkl')

CPU times: user 1.28 s, sys: 536 ms, total: 1.82 s
Wall time: 2.14 s


_____

* test_dataset.jl.gz
* sample_submission.csv