# Initial Setups

## autoreload

In [1]:
# %load_ext autoreload
# %autoreload 2

## dir setup

In [2]:
from dotenv import load_dotenv, find_dotenv
import os

# A notebook has no real __file__, so the current working directory stands in
# for "the directory of this file".
this_file_path = os.getcwd()
# if script (not notebook)...
# project_dir = os.path.join(os.path.dirname(__file__), os.pardir)

# the project root is the parent of the notebook's directory
project_dir = os.path.join(this_file_path, os.pardir)

# walk up the directory tree until a .env file is found
dotenv_path = find_dotenv()

# export the .env entries as environment variables (displays True on success)
load_dotenv(dotenv_path)

True

In [3]:
# Data locations are configured in .env as paths relative to the project root.
# Indexing os.environ (instead of .get) raises a KeyError that names the
# missing variable; with .get a missing entry becomes None and os.path.join
# fails with an unhelpful "expected str, bytes or os.PathLike" TypeError.
raw_data_dir = os.path.join(project_dir, os.environ["RAW_DATA_DIR"])
processed_data_dir = os.path.join(project_dir, os.environ["PROCESSED_DATA_DIR"])
figures_dir = os.path.join(project_dir, os.environ["FIGURES_DIR"])

## imports

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
import sweetviz as sv
# import utils

In [5]:
# pd.set_option('max_rows', 100)

In [6]:
from tqdm import tqdm

### item_data.jl.gz

In [7]:
# Load the item catalogue from the raw data directory.
# lines=True: the file is JSON Lines (one JSON record per line).
item_data_fn = 'item_data.jl.gz'
item_data = pd.read_json(os.path.join(raw_data_dir, item_data_fn), lines=True)

In [8]:
# Fixed seed so the displayed sample is the same on every Restart & Run All.
item_data.sample(10, random_state=42)

Unnamed: 0,item_id,title,domain_id,product_id,price,category_id,condition
1860260,1455691,Bíblia Sagrada Nvt Índice - Rosa Capa Dura - M...,MLB-BOOKS,,48.9,MLB437616,new
1053880,1604526,Sapato Masculino Drive Mocassim Dockside 958 C...,MLB-LOAFERS_AND_OXFORDS,12950993.0,58.26,MLB274752,new
569311,1615704,"Porta Sanfonada Esquadriplast 0,60 X 2,10 Bran...",MLB-DOORS,13750378.0,92.8,MLB179703,new
1582311,2002652,Fone De Ouvido Headset Bluetooth Dobrável,MLB-HEADPHONES,,65.9,MLB196208,new
805847,11473,Kit 2 Baterias E Carregador P/ Controle Xbox O...,MLB-GAME_CONSOLES_VIDEO_GAMES_AND_ARCADE_MACHINES,,23.99,MLB118890,new
19477,1424152,1 Ampolleta Hydraderm Acido Hialuronico 10ml D...,MLM-FACIAL_SKIN_CARE_PRODUCTS,,1900.0,MLM178705,new
1887639,556551,Jg Alargador Para Lama Jeep Willys Quadrado 3 ...,MLB-AUTOMOTIVE_FENDER_FLARES,,840.0,MLB430228,new
1899338,698552,Linda Esmeralda Bruta,MLB-PRECIOUS_AND_SEMI_PRECIOUS_STONES,,320.0,MLB1441,new
763221,906689,Cooler Térmico Tipo Imita Lata De Cerveja Nfl ...,MLB-SPORTS_AND_FITNESS,,290.0,MLB1306,used
788810,1276022,Conj 2 Frigideiras Ø24/26cm Indução Rev Cerâm ...,MLB-FRYING_PANS_WOKS_GRIDDLES_AND_GRILL_PANS,,186.0,MLB107482,new


* MLB: Brazil (Mercado Livre Brasil)
* MLM: Mexico (Mercado Libre México)

In [9]:
# Number of items per 3-character domain_id prefix (MLB / MLM).
item_data.domain_id.str[:3].value_counts()

MLB    1723216
MLM     378210
Name: domain_id, dtype: int64

### train_dataset.jl.gz

In [10]:
def proc_dataset(df, batch_size = 50):
    """Flatten raw sessions into one row per event.

    Every row of ``df`` is one session: ``user_history`` holds a list of event
    dicts (the code reads their ``event_timestamp`` and ``event_type`` keys)
    and, for the training set, ``item_bought`` holds the label. The sessions
    are expanded batch by batch into a long frame with one row per event; for
    the training set an extra row per session carries ``item_bought`` and gets
    ``event_type == 'buy'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with a ``user_history`` column and, optionally, an
        ``item_bought`` column. Its index becomes the ``seq`` column.
    batch_size : int, default 50
        Approximate number of sessions expanded at a time.

    Returns
    -------
    pandas.DataFrame
        The event dict keys as columns, plus:

        * ``seq`` - index label of the originating row of ``df``;
        * ``item_bought`` - only when ``df`` has it; set on the 'buy' row only;
        * ``timezone`` - last 4 characters of the raw timestamp string;
        * ``event_timestamp`` - the raw string minus its last 9 characters,
          parsed as a datetime;
        * ``time_diff`` - seconds elapsed since the previous event of the same
          ``seq`` (NaN for the first event and for the 'buy' row).

        The index is the row position inside each batch, so it is not unique.
        An empty ``df`` yields an empty frame.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')
    if len(df) == 0:
        return pd.DataFrame()

    train_dataset = 'item_bought' in df

    # Same partition np.array_split(df, len(df) // batch_size) would produce,
    # but (a) never fewer than one batch, so frames shorter than batch_size
    # work, and (b) sliced with iloc, because np.array_split on a DataFrame
    # goes through the deprecated DataFrame.swapaxes.
    n_batches = max(1, len(df) // batch_size)
    base, extra = divmod(len(df), n_batches)
    bounds = np.cumsum([0] + [base + 1] * extra + [base] * (n_batches - extra))

    proc_df = list()
    for start, stop in tqdm(list(zip(bounds[:-1], bounds[1:]))):
        batch = df.iloc[start:stop]

        # One column per event position; shorter histories are padded with NaN.
        wide = batch.user_history.apply(pd.Series)
        if train_dataset:
            # The label goes last so it ends up after the session's events.
            wide = pd.concat([wide, batch.item_bought], axis = 1)

        # Long format: one entry per (session, position). The classic stack()
        # already drops the NaN padding; the explicit dropna() keeps that
        # behaviour should a newer stack implementation preserve NaN.
        stacked = wide.stack().dropna()

        # Event dicts expand into their keys; the scalar label expands into a
        # single column named 0.
        df_p = stacked.apply(pd.Series).reset_index().drop(columns = 'level_1')

        if train_dataset:
            df_p = df_p.rename(columns = {0: 'item_bought', 'level_0': 'seq'})
            # Label rows have no event fields; mark them as the purchase.
            df_p['event_type'] = df_p.event_type.fillna('buy')
        else:
            df_p = df_p.rename(columns = {'level_0': 'seq'})

        raw_timestamp = df_p.event_timestamp
        # NOTE(review): if the raw suffix is a signed offset such as '-0400',
        # this keeps only the digits and loses the sign - confirm intent.
        df_p['timezone'] = raw_timestamp.str[-4:]
        df_p['event_timestamp'] = pd.to_datetime(raw_timestamp.str[:-9])
        # total_seconds() and not .dt.seconds: the latter is only the seconds
        # component (0-86399) and silently drops whole days.
        df_p['time_diff'] = df_p.groupby('seq').event_timestamp.diff().dt.total_seconds()

        proc_df.append(df_p)

    return pd.concat(proc_df)

In [11]:
def read_data(raw_fn = 'train_dataset.jl.gz', processed_fn = 'train_dataset.pkl',
              force_process = False, nrows = None):
    """Return the flattened event frame for a raw dataset, using a pickle cache.

    If ``processed_fn`` already exists in ``processed_data_dir`` (and
    ``force_process`` is false) it is loaded as-is. Otherwise ``raw_fn`` is
    read from ``raw_data_dir`` as JSON Lines, flattened with ``proc_dataset``
    and written to the cache.

    When the frame has an ``item_bought`` column, the label is back-filled
    onto every event row preceding its 'buy' row and cast to int, and
    ``in_nav`` flags the rows whose ``event_info`` equals that label.

    Parameters
    ----------
    raw_fn : str
        File name of the raw JSON Lines file inside ``raw_data_dir``.
    processed_fn : str
        File name of the pickle cache inside ``processed_data_dir``.
    force_process : bool, default False
        Re-process the raw file even if the cache exists.
    nrows : int or None, default None
        Read only the first ``nrows`` sessions of the raw file. Such partial
        results are NOT written to the cache, so a quick experiment cannot
        replace the full dataset. Ignored when an existing cache is loaded.

    Returns
    -------
    pandas.DataFrame
    """
    processed_fp = os.path.join(processed_data_dir, processed_fn)
    write_cache = nrows is None

    if os.path.exists(processed_fp) and not force_process:
        # NOTE: unpickling can execute arbitrary code; only load caches that
        # this notebook produced itself.
        processed = pd.read_pickle(processed_fp)
    else:
        if write_cache:
            # Create the target directory before the long processing step so a
            # missing folder cannot throw the result away at the very end.
            os.makedirs(processed_data_dir, exist_ok = True)

        raw = pd.read_json(os.path.join(raw_data_dir, raw_fn), lines = True, nrows = nrows)
        # Order sessions by history length - presumably so that each batch in
        # proc_dataset holds histories of similar length.
        raw = (raw.assign(len_events = raw.user_history.str.len())
                  .sort_values('len_events')
                  .drop(columns = 'len_events'))
        processed = proc_dataset(raw)

        if write_cache:
            processed.to_pickle(processed_fp)

    if 'item_bought' in processed:
        # bfill() replaces the deprecated fillna(method = 'backfill').
        processed['item_bought'] = processed.item_bought.bfill().astype(int)
        processed['in_nav'] = processed.item_bought == processed.event_info

    return processed

In [12]:
%%time
# Loads the pickle cache when it exists; otherwise processes the raw file first.
train = read_data('train_dataset.jl.gz', 'train_dataset.pkl')

100%|██████████| 8263/8263 [1:39:32<00:00,  1.38it/s]


CPU times: user 1h 40min 3s, sys: 55.6 s, total: 1h 40min 59s
Wall time: 1h 41min 14s


In [13]:
def save_true_labels(df, true_fn = 'true.pkl'):
    """Pickle the (seq, item_bought) pairs of the purchase rows.

    A purchase row is one whose ``event_type`` is missing or equal to 'buy'.
    The selection is written to ``true_fn`` inside ``processed_data_dir``;
    nothing is returned.
    """
    event_type = df.event_type
    is_purchase_row = event_type.isnull() | event_type.eq('buy')
    labels = df.loc[is_purchase_row, ['seq', 'item_bought']]
    labels.to_pickle(os.path.join(processed_data_dir, true_fn))

In [14]:
# Persist the (seq, item_bought) label rows of the training set to true.pkl.
save_true_labels(train)

In [29]:
# Free a leftover frame from an earlier session if it is still around.
# `old_true_df` is never created in this notebook, so a bare `del` raises
# NameError on Restart & Run All; pop() tolerates the missing name.
globals().pop('old_true_df', None)

### 4% dos eventos são iguais ao produto comprado

In [12]:
# Share of event rows whose event_info equals the purchased item.
train.in_nav.value_counts(normalize = True)

False    0.959279
True     0.040721
Name: in_nav, dtype: float64

### 30% dos produtos comprados foram navegados (nesse período de 1 semana)

In [13]:
# Share of sessions (seq) with at least one event matching the purchased item.
train.groupby('seq').in_nav.any().value_counts(normalize = True)

False    0.706116
True     0.293884
Name: in_nav, dtype: float64

In [14]:
# Inspect one session (seq 121) event by event.
train.query("seq == 121")

Unnamed: 0,seq,item_bought,event_info,event_timestamp,event_type,time_diff,in_nav
864,121,388604,MINOXIDIL,2019-10-13 15:33:41,search,,False
865,121,388604,MINOXIDIL,2019-10-13 17:08:36,search,5695.0,False
866,121,388604,429456,2019-10-13 17:09:04,view,28.0,False
867,121,388604,429456,2019-10-13 17:09:43,view,39.0,False
868,121,388604,429456,2019-10-13 17:10:37,view,54.0,False
869,121,388604,MINOXIDIL,2019-10-13 17:10:38,search,1.0,False
870,121,388604,625179,2019-10-13 17:10:46,view,8.0,False
871,121,388604,MINOXIDIL,2019-10-13 17:10:58,search,12.0,False
872,121,388604,410942,2019-10-13 17:11:45,view,47.0,False
873,121,388604,410942,2019-10-13 17:12:29,view,44.0,False


### test_dataset.jl.gz

In [36]:
%%time
# Loads the pickle cache when it exists; otherwise processes the raw file first.
test = read_data('test_dataset.jl.gz', 'test_dataset.pkl')

100%|██████████| 3541/3541 [34:21<00:00,  1.72it/s]


CPU times: user 34min 41s, sys: 17.4 s, total: 34min 58s
Wall time: 34min 59s


_____

* test_dataset.jl.gz
* sample_submission.csv