In [1]:
import sys

import csv
import numpy as np
from tqdm import tqdm_notebook as tqdm

In [2]:
# Gather every item id that appears anywhere in the data: the metadata file,
# the reference column of item-related actions, and the impression lists.
item_ids = set()

with open('../Dataset/item_metadata.csv', 'r') as metadata_file:
    rows = csv.reader(metadata_file)
    next(rows)  # skip the header row
    item_ids.update(int(row[0]) for row in tqdm(rows))

# Action types whose reference column (index 5) is expected to hold an item id.
item_actions = ('clickout item', 'interaction item deals', 'interaction item image', 'interaction item info', 'interaction item rating', 'search for item')

# train.csv and test.csv are scanned with exactly the same rule.
for log_path in ('../Dataset/train.csv', '../Dataset/test.csv'):
    with open(log_path, 'r') as log_file:
        rows = csv.reader(log_file)
        next(rows)  # skip the header row

        for row in tqdm(rows):
            # Only purely numeric references are treated as item ids.
            if row[4] in item_actions and row[5].isdigit():
                item_ids.add(int(row[5]))
            # Column 10 is a '|'-separated list of item ids when non-empty.
            if row[10] != '':
                item_ids.update([int(token) for token in row[10].split('|')])

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [3]:
# Give every distinct item property a dense integer index, in first-seen order.
property_to_index = {}
with open('../Dataset/item_metadata.csv', 'r') as metadata_file:
    rows = csv.reader(metadata_file)
    next(rows)  # skip the header row

    for row in tqdm(rows):
        for name in row[1].split('|'):
            # A property seen before keeps its index; a new one receives the
            # next free index, which equals the current size of the mapping.
            property_to_index.setdefault(name, len(property_to_index))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [4]:
# Multi-hot property vector for every known item id. Items that never show up
# in item_metadata.csv keep an all-zero vector.
# The width 157 is hard-coded: an index >= 157 in property_to_index would raise
# IndexError in the assignment below.
item_to_property = {item_id: np.zeros(157) for item_id in item_ids}

with open('../Dataset/item_metadata.csv', 'r') as metadata_file:
    rows = csv.reader(metadata_file)
    next(rows)  # skip the header row

    for row in tqdm(rows):
        vector = item_to_property[int(row[0])]
        for name in row[1].split('|'):
            vector[property_to_index[name]] = 1

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [5]:
# Six independent, initially empty vocabularies; each is filled from one column
# of train.csv / test.csv by the following cell.
user, action_type, platform, city, device, current_filters = (set() for _ in range(6))

In [6]:
# Collect the distinct values of the categorical columns over both train.csv
# and test.csv, so that the index mappings built later cover both files.
for csv_path in ('../Dataset/train.csv', '../Dataset/test.csv'):
    with open(csv_path) as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row

        for line in tqdm(reader):
            user.add(line[0])
            action_type.add(line[4])
            platform.add(line[6])
            city.add(line[7])
            device.add(line[8])
            if line[9]:
                # Column 9 is a '|'-separated list of filter names. Adding the
                # pieces with update() avoids a loop variable named `filter`,
                # which would rebind the builtin for the rest of the kernel.
                current_filters.update(line[9].split('|'))

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [7]:
# Replace each vocabulary set with a sorted list, so that the position of a
# value in its list is stable and can serve as its integer index.
user, action_type, platform, city, device, current_filters = map(
    sorted,
    (user, action_type, platform, city, device, current_filters),
)

In [8]:
# value -> position lookup tables for each sorted vocabulary list.
user_to_ind = {value: position for position, value in enumerate(user)}
action_to_ind = {value: position for position, value in enumerate(action_type)}
platform_to_ind = {value: position for position, value in enumerate(platform)}
city_to_ind = {value: position for position, value in enumerate(city)}
device_to_ind = {value: position for position, value in enumerate(device)}
current_filters_to_ind = {value: position for position, value in enumerate(current_filters)}

In [9]:
# reference[k]: set of all reference strings (column 5) observed together with
# the action type whose index in action_to_ind is k. Ten slots are allocated.
reference = [set() for _ in range(10)]

# Both files are scanned the same way, train.csv first.
for log_path in ('../Dataset/train.csv', '../Dataset/test.csv'):
    with open(log_path) as log_file:
        rows = csv.reader(log_file)
        next(rows)  # skip the header row

        for row in tqdm(rows):
            reference[action_to_ind[row[4]]].add(row[5])

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [10]:
# reference_to_ind[i]: dictionary that maps each reference string seen with
# action_type[i] to an integer index.
#   * item actions: a numeric reference maps to the item id itself; every
#     non-numeric reference ('', 'unknown', a place name, ...) maps to 0.
#   * other actions: references are numbered 0..n-1 in sorted order, so the
#     numbering is the same on every kernel restart (iterating a set of strings
#     directly gives an order that can change between runs).

# Item actions are recognised by name rather than by their position in the
# sorted action_type list, so the rule does not silently break if the set of
# action types changes.
item_action_names = {'clickout item', 'interaction item deals', 'interaction item image', 'interaction item info', 'interaction item rating', 'search for item'}

reference_to_ind = []
for i, seen_references in enumerate(reference):
    # Slots beyond the known action types (if any) hold no references and are
    # treated as non-item slots, which yields an empty mapping.
    is_item_action = i < len(action_type) and action_type[i] in item_action_names

    if is_item_action:
        # Same numeric test (str.isdigit) that the rest of the notebook uses to
        # decide whether a reference is an item id.
        mapping = {ref: int(ref) if ref.isdigit() else 0 for ref in seen_references}
    else:
        mapping = {ref: j for j, ref in enumerate(sorted(seen_references))}

    reference_to_ind.append(mapping)

In [11]:
def embed_session(session):
    """Turn the rows of one session into a (len(session), 366) feature matrix.

    session: sequence of csv rows (indexable by column) for the actions of a
    session that reference an item. Must contain at least one row.

    Column layout of every output row:
        0        seconds elapsed since the first row (difference of column 2)
        1, 2, 3  platform, city and device indices (columns 6, 7, 8)
        4..160   item_to_property vector of the referenced item; left at zero
                 when the reference (column 5) is not purely numeric
        161..    occurrence count of each filter listed in column 9
    """
    # Timestamp of the first action; all times are stored relative to it.
    start_time = int(session[0][2])

    features = np.zeros((len(session), 366))

    # Iterating a 2-D array yields row views, so writes land in `features`.
    for row, action in zip(features, session):
        row[0] = int(action[2]) - start_time
        row[1] = platform_to_ind[action[6]]
        row[2] = city_to_ind[action[7]]
        row[3] = device_to_ind[action[8]]

        item_ref = action[5]
        if item_ref.isdigit():
            row[4:161] = item_to_property[int(item_ref)]

        active_filters = action[9]
        if active_filters:
            for filter_name in active_filters.split('|'):
                row[161 + int(current_filters_to_ind[filter_name])] += 1.0

    return features

In [12]:
x = []         # one embed_session matrix per kept session
y = []         # position of the clicked item inside that session's impressions
choices = []   # impression item ids of the session's final clickout
prices = []    # prices aligned with `choices`


def _collect_session(sess):
    """Append one training example built from `sess`, if it qualifies.

    sess: list of csv rows (item-referencing actions only) of a single session.

    The session is kept only when it is non-empty, its last action is a
    'clickout item', and the clicked item id (column 5) occurs in the
    impression list (column 10). Otherwise nothing is appended, so x, y,
    choices and prices always stay the same length.
    """
    if not sess or sess[-1][4] != 'clickout item':
        return

    last = sess[-1]
    impressions = np.asarray(list(map(int, last[10].split('|'))))

    # Positions at which the clicked item appears among the impressions.
    hits = np.flatnonzero(impressions == int(last[5]))
    if hits.size == 0:
        return    # clicked item is not among the impressions: skip the session

    x.append(embed_session(sess))
    choices.append(impressions)
    prices.append(np.asarray(list(map(int, last[11].split('|')))))
    y.append(int(hits[0]))    # first match, stored as a plain Python int


with open('../Dataset/train.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row

    sess = []

    for line in tqdm(reader):
        if line[3] == '1' and len(sess) > 0:      # first action of the next session met
            _collect_session(sess)
            sess = []
        # Only collect actions that reference **an item**.
        if line[4] in ['clickout item', 'interaction item rating', 'interaction item info', 'interaction item image', 'interaction item deals', 'search for item']:
            sess.append(line)

    # The last session is not followed by another step-1 row, so flush it here.
    # _collect_session tolerates an empty list (e.g. a file without item actions).
    _collect_session(sess)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [13]:
# mask[k]: number of impressions of the k-th training session.
mask = [len(impressions) for impressions in choices]

# The sentinels -1 and 26 are included so the values for an empty `choices`
# are M == -1 and m == 26.
M = max([-1] + mask)
m = min([26] + mask)

print(f"min: {m}, max: {M}")
print(f"len(mask): {len(mask)}")

min: 1, max: 25
len(mask): 676718


In [14]:
# item[k]: matrix with one row per impression of session k. Each row is the
# item's property vector with the impression's price appended.
item = []
for idx in tqdm(range(len(choices))):
    impression_rows = [
        np.append(item_to_property[item_id], prices[idx][pos])
        for pos, item_id in enumerate(choices[idx])
    ]

    # Sessions with fewer than 25 impressions are padded with all-zero rows.
    impression_rows.extend(np.zeros(158) for _ in range(25 - len(choices[idx])))

    item.append(np.asarray(impression_rows, dtype=np.double))

HBox(children=(IntProgress(value=0, max=676718), HTML(value='')))




In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data

import matplotlib.pyplot as plt

In [None]:
class NN(nn.Module):
    """Unfinished model: an LSTM over session features plus a linear stack.

    NOTE(review): this cell has never been executed and the definition is
    incomplete; see the notes inside __init__.
    """
    
    def __init__(self, session_dim, item_dim):
        """Create the layers.

        session_dim: input feature size passed to the LSTM.
        item_dim: not used by the code visible here (TODO confirm intended use).
        """
        # NOTE(review): super().__init__() is not called before the submodules
        # are assigned; nn.Module normally requires that. Confirm and add.
        self.lstm = nn.LSTM(session_dim, 128, 1)  # hidden size 128, one layer
        self.layers = nn.Sequential(
            nn.Linear()  # NOTE(review): placeholder, no in/out feature sizes given yet
        )