# Initial Setups

## autoreload

In [1]:
# %load_ext autoreload
# %autoreload 2

## dir setup

In [2]:
from dotenv import load_dotenv, find_dotenv
import os

# find .env automagically by walking up directories until it's found
dotenv_path = find_dotenv()

# A notebook has no `__file__` variable, so the *string* "__file__" is used
# instead: abspath() resolves it against the current working directory and
# dirname() then strips it again, leaving the working directory itself.
# In a plain script (not a notebook) the equivalent would be:
# project_dir = os.path.join(os.path.dirname(__file__), os.pardir)
this_file_path = os.path.dirname(
    os.path.abspath("__file__")
)

# project directory: one level above the directory resolved above
project_dir = os.path.join(this_file_path, os.pardir)

# load up the .env entries as environment variables
# (kept as the last expression so the cell displays the call's result)
load_dotenv(dotenv_path)

True

In [3]:
# Data locations are configured in .env as paths relative to the project root.
# The variables are indexed (not .get()-ed) on purpose: a variable missing from
# .env then fails right here with a KeyError naming it, instead of the opaque
# TypeError that os.path.join(project_dir, None) would raise.
raw_data_dir = os.path.join(project_dir, os.environ["RAW_DATA_DIR"])
processed_data_dir = os.path.join(project_dir, os.environ["PROCESSED_DATA_DIR"])
figures_data_dir = os.path.join(project_dir, os.environ["FIGURES_DATA_DIR"])

## imports

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
import sweetviz as sv
# import utils

In [5]:
# pd.set_option('max_rows', None)

In [6]:
from tqdm import tqdm

### item_data.jl.gz

In [7]:
# Item catalogue: a gzip-compressed JSON-lines file, one record per line.
item_data_fn = 'item_data.jl.gz'
item_data = pd.read_json(
    os.path.join(raw_data_dir, item_data_fn),
    lines=True,
)

In [8]:
item_data.sample(10)

Unnamed: 0,item_id,title,domain_id,product_id,price,category_id,condition
1122500,1795400,5 Pilhas Bateria 6v 4lr44 Eunicell Para Coleir...,MLB-CELL_BATTERIES,,49.49,MLB7060,new
1452933,910493,Filhote Pit Bull,MLB-PUREBRED_DOGS,6571822.0,500.0,MLB1073,new
1103934,535467,Escova Raquete Hair Style Cerdas Javali - Marc...,MLB-HAIR_BRUSHES,,46.9,MLB44081,new
756432,966811,Macaco Hidráulico Tipo Jacaré De 2 Toneladas C...,MLB-HYDRAULIC_VEHICLE_JACKS,,185.32,MLB116338,new
326021,1611281,Arnes Electrico Para Barra Led,MLM-CAR_LIGHT_BULBS,,269.0,MLM92625,new
211146,843273,Control Para Ps3 Generico Inalambrico Recargab...,MLM-GAMEPADS_AND_JOYSTICKS,,259.99,MLM58377,new
1515377,1997503,Módulo Amplificador Hertz Hdp 5 ( 5 Canais / C...,MLB-AUTOMOTIVE_AMPLIFIERS,,4170.0,MLB3385,new
1363344,1786217,Geladeira Duplex Bosch Dynamic Cooling,MLB-ANTIQUE_REFRIGERATORS,,500.0,MLB40272,used
485546,1826162,550g Mix Frozen Granillo Perla Nacarad Comesti...,MLM-KITCHEN_SUPPLIES,,135.0,MLM167551,new
144311,1739865,Funda Colchón Anti Chinche Y Ácaros Individual...,MLM-MATTRESS_COVERS,,419.0,MLM159222,new


* MLB: Brazil
* MLM: Mexico

In [9]:
item_data.domain_id.str[:3].value_counts()

MLB    1723216
MLM     378210
Name: domain_id, dtype: int64

### train_dataset.jl.gz

In [11]:
def proc_dataset(df, batch_size = 50):
    """Flatten raw sessions into a long frame with one row per event.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset. Must have a ``user_history`` column holding, per row, a
        list of event dicts (this function reads their ``event_timestamp`` and
        ``event_type`` entries). May have an ``item_bought`` column; when it
        does, one extra row per session is emitted for the purchase.
    batch_size : int, default 50
        Approximate number of raw rows reshaped at a time.

    Returns
    -------
    pandas.DataFrame
        The event fields plus ``seq`` (index label of the originating raw
        row), ``time_diff`` (seconds elapsed since the previous event of the
        same ``seq``; NaN for the first one) and, for labelled input,
        ``item_bought`` (filled only on the purchase row, whose
        ``event_type`` is set to ``'buy'``). The index restarts at 0 in every
        batch, so it is not unique across the result.

    Raises
    ------
    ValueError
        If ``df`` has no rows.
    """
    if len(df) == 0:
        raise ValueError("proc_dataset() needs at least one row to process")

    # Decide this once on the raw input: after the reshape below the label
    # column is called 0, so testing for 'item_bought' there is always False.
    has_label = 'item_bought' in df

    # max(1, ...) keeps inputs shorter than one batch from asking for 0 splits.
    number_of_batches = max(1, len(df) // batch_size)

    # Split row *positions* rather than the frame itself: same batch
    # boundaries, without numpy calling the deprecated DataFrame.swapaxes.
    row_positions = np.arange(len(df))

    proc_df = list()
    for positions in tqdm(np.array_split(row_positions, number_of_batches)):
        df_p = df.iloc[positions]

        # one column per position in the history (shorter histories are padded)
        events = df_p.user_history.apply(pd.Series)
        if has_label:
            events = pd.concat([events, df_p.item_bought], axis = 1)

        # Long format, one entry per (session, slot). The explicit dropna()
        # removes the padding even where stack() no longer does it itself.
        df_p = events.stack().dropna()

        # event dicts become columns; a bare item id lands in a column named 0
        df_p = df_p.apply(pd.Series)
        df_p = df_p.reset_index().drop(columns = 'level_1')

        new_columns = {'level_0': 'seq'}
        if has_label:
            new_columns[0] = 'item_bought'
        df_p = df_p.rename(columns = new_columns)

        # drop the 9 trailing characters (fractional seconds + UTC offset)
        df_p['event_timestamp'] = pd.to_datetime(df_p.event_timestamp.str[:-9])

        # total_seconds() keeps whole days; .dt.seconds would wrap gaps >= 24h
        df_p['time_diff'] = df_p.groupby('seq').event_timestamp.diff().dt.total_seconds()

        if has_label:
            # purchase rows carry no event fields of their own
            df_p['event_type'] = df_p.event_type.fillna('buy')

        proc_df.append(df_p)

    proc_df = pd.concat(proc_df)
    return proc_df

In [12]:
def read_data(raw_fn = 'train_dataset.jl.gz', processed_fn = 'train_dataset.pkl',
              force_process = False):
    """Return the event-level version of a raw dataset, caching it as a pickle.

    Parameters
    ----------
    raw_fn : str
        JSON-lines file name inside ``raw_data_dir``.
    processed_fn : str
        Pickle file name inside ``processed_data_dir`` used as the cache.
    force_process : bool, default False
        Rebuild from the raw file (and overwrite the cache) even when the
        cached pickle exists.

    Returns
    -------
    pandas.DataFrame
        The output of ``proc_dataset``. When it has an ``item_bought`` column,
        that column is back-filled and cast to int, and a boolean ``in_nav``
        column flags rows whose ``event_info`` equals the bought item.
    """
    processed_fp = os.path.join(processed_data_dir, processed_fn)

    if os.path.exists(processed_fp) and not force_process:
        # NOTE: unpickling runs arbitrary code; only load caches written here.
        processed = pd.read_pickle(processed_fp)
    else:
        raw = pd.read_json(os.path.join(raw_data_dir, raw_fn), lines = True)

        # Order sessions by history length via a throw-away helper column
        # (presumably so each batch in proc_dataset holds similar lengths).
        raw['len_events'] = raw.user_history.str.len()
        raw = raw.sort_values('len_events').drop(columns = 'len_events')

        processed = proc_dataset(raw)

        # create the cache folder on first use instead of failing in to_pickle
        os.makedirs(os.path.dirname(processed_fp) or os.curdir, exist_ok = True)
        processed.to_pickle(processed_fp)

    if 'item_bought' in processed:
        # Only purchase rows carry the item id; propagate it backwards over the
        # preceding rows. bfill() replaces the deprecated
        # fillna(method = 'backfill').
        processed['item_bought'] = processed['item_bought'].bfill().astype(int)
        processed['in_nav'] = processed.item_bought == processed.event_info

    return processed

In [13]:
%%time
train = read_data('train_dataset.jl.gz', 'train_dataset.pkl')

CPU times: user 3.4 s, sys: 901 ms, total: 4.31 s
Wall time: 4.49 s


In [29]:
def save_true_labels(df, true_fn = 'true.pkl'):
    """Pickle the ``seq`` / ``item_bought`` pairs of the label rows.

    A row is kept when its ``event_type`` is missing or equal to ``'buy'``.
    The two-column result is written to ``true_fn`` inside
    ``processed_data_dir``; nothing is returned.
    """
    event_type = df['event_type']
    is_label_row = event_type.isna() | event_type.eq('buy')

    labels = df.loc[is_label_row, ['seq', 'item_bought']]
    labels.to_pickle(os.path.join(processed_data_dir, true_fn))

In [30]:
save_true_labels(train)

### 4% dos eventos são iguais ao produto comprado

In [12]:
train.in_nav.value_counts(normalize = True)

False    0.959279
True     0.040721
Name: in_nav, dtype: float64

### 30% dos produtos comprados foram navegados (nesse período de 1 semana)

In [13]:
train.groupby('seq').in_nav.any().value_counts(normalize = True)

False    0.706116
True     0.293884
Name: in_nav, dtype: float64

In [14]:
train.query("seq == 121")#.time_diff

Unnamed: 0,seq,item_bought,event_info,event_timestamp,event_type,time_diff,in_nav
864,121,388604,MINOXIDIL,2019-10-13 15:33:41,search,,False
865,121,388604,MINOXIDIL,2019-10-13 17:08:36,search,5695.0,False
866,121,388604,429456,2019-10-13 17:09:04,view,28.0,False
867,121,388604,429456,2019-10-13 17:09:43,view,39.0,False
868,121,388604,429456,2019-10-13 17:10:37,view,54.0,False
869,121,388604,MINOXIDIL,2019-10-13 17:10:38,search,1.0,False
870,121,388604,625179,2019-10-13 17:10:46,view,8.0,False
871,121,388604,MINOXIDIL,2019-10-13 17:10:58,search,12.0,False
872,121,388604,410942,2019-10-13 17:11:45,view,47.0,False
873,121,388604,410942,2019-10-13 17:12:29,view,44.0,False


### test_dataset.jl.gz

In [17]:
%%time
test = read_data('test_dataset.jl.gz', 'test_dataset.pkl')

100%|██████████| 3541/3541 [42:38<00:00,  1.38it/s]


CPU times: user 41min 30s, sys: 48.6 s, total: 42min 18s
Wall time: 43min 2s


In [19]:
test.sample()

Unnamed: 0,seq,event_info,event_timestamp,event_type,time_diff
318,67440,FORMAS 3D PENTE,2019-10-01 04:42:14,search,15.0


In [20]:
test[test.seq == 67440]

Unnamed: 0,seq,event_info,event_timestamp,event_type,time_diff
270,67440,FORMA ABS,2019-09-30 08:18:08,search,
271,67440,1205559,2019-09-30 08:18:16,view,8.0
272,67440,FORMA ABS,2019-09-30 08:19:25,search,69.0
273,67440,1205559,2019-09-30 08:19:30,view,5.0
274,67440,1205559,2019-09-30 08:20:31,view,61.0
275,67440,FORMA ABS,2019-09-30 08:21:42,search,71.0
276,67440,1205559,2019-09-30 08:21:47,view,5.0
277,67440,1787446,2019-09-30 08:29:57,view,490.0
278,67440,1205559,2019-09-30 09:14:29,view,2672.0
279,67440,CARENAGEM MINI QUADRICICLO 49CC,2019-09-30 20:58:49,search,42260.0


_____

* test_dataset.jl.gz
* sample_submission.csv