In [1]:
import csv
import numpy as np
from tqdm import tqdm_notebook as tqdm

Collect all item ids that appear in the dataset

In [2]:
# Action types whose reference column (column 5) is expected to hold an item id.
ITEM_ACTIONS = (
    'clickout item', 'interaction item deals', 'interaction item image',
    'interaction item info', 'interaction item rating', 'search for item',
)


def _session_item_ids(path):
    """Return the set of integer item ids found in one session CSV.

    Two sources are scanned per row: the reference column (column 5) when the
    action type (column 4) is one of ITEM_ACTIONS and the value is numeric,
    and every '|'-separated id in column 10 when that column is non-empty.
    """
    found = set()
    with open(path, 'r') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row

        for line in tqdm(reader):
            if line[4] in ITEM_ACTIONS and line[5].isdigit():
                found.add(int(line[5]))
            if line[10] != '':
                found.update(map(int, line[10].split('|')))
    return found


# Start from the ids listed in the item metadata (column 0) ...
item_ids = set()
with open('../Dataset/item_metadata.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row

    for line in tqdm(reader):
        item_ids.add(int(line[0]))

# ... then add every id that only shows up in the train/test session logs.
for sessions_path in ('../Dataset/train.csv', '../Dataset/test.csv'):
    item_ids |= _session_item_ids(sessions_path)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [3]:
# Number of distinct item ids collected from the metadata and session files.
len(item_ids)

928080

Collect all the properties of items, and map them to integer indexes

In [4]:
# Assign each distinct property string (column 1 of the metadata, '|'-separated)
# the next free integer, in order of first appearance.
property_to_index = {}
with open('../Dataset/item_metadata.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row

    for line in tqdm(reader):
        for prop in line[1].split('|'):
            # setdefault leaves already-seen properties untouched; a new one
            # receives the current dictionary size as its index.
            property_to_index.setdefault(prop, len(property_to_index))
    index = len(property_to_index)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [5]:
# Number of distinct item properties found in the metadata.
len(property_to_index)

157

Map each item to a property list of length 157 (1 if the item has the property, else 0).  
These lists serve as the item embeddings; each item's price is concatenated to them later.

In [62]:
# Build one 0/1 indicator list per known item. The list width follows the
# property vocabulary instead of a hard-coded 157, so it stays correct if the
# metadata changes.
n_properties = len(property_to_index)
item_to_property = {item: [0] * n_properties for item in item_ids}

with open('../Dataset/item_metadata.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row

    for line in reader:
        flags = item_to_property[int(line[0])]
        # Items that never appear in the metadata keep their all-zero list.
        for prop in line[1].split('|'):
            flags[property_to_index[prop]] = 1

In [17]:
# Should equal len(item_ids): one property vector per collected item id.
len(item_to_property)

928080

In [18]:
# Spot-check the property indicator vector of a single item.
item_to_property[102947]

array([1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0])

Map other non-integer properties to integers

In [19]:
# Six independent, initially empty sets: one per categorical column whose
# distinct values are gathered from the session files.
user, action_type, platform, city, device, current_filters = (
    set() for _ in range(6)
)

In [21]:
# Gather the distinct values of each categorical column from both session
# files. One loop over the two paths replaces two copy-pasted blocks, so the
# column handling cannot drift apart between train and test.
for sessions_path in ('../Dataset/train.csv', '../Dataset/test.csv'):
    with open(sessions_path) as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row

        for line in tqdm(reader):
            user.add(line[0])
            action_type.add(line[4])
            platform.add(line[6])
            city.add(line[7])
            device.add(line[8])
            # The filter string is stored whole ('|'-joined); empty values are skipped.
            if line[9]:
                current_filters.add(line[9])

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [22]:
# Replace every vocabulary set with a sorted list so that positions are stable
# and can be used as integer indexes.
user, action_type, platform, city, device, current_filters = (
    sorted(values)
    for values in (user, action_type, platform, city, device, current_filters)
)

In [23]:
# Print the vocabulary sizes, one per line, in the order:
# user, action_type, platform, city, device, current_filters.
for vocabulary in (user, action_type, platform, city, device, current_filters):
    print(len(vocabulary))

948041
10
55
37843
3
74420


In [24]:
# Value -> position lookup for every sorted vocabulary list.
user_to_ind = {name: pos for pos, name in enumerate(user)}
action_to_ind = {name: pos for pos, name in enumerate(action_type)}
platform_to_ind = {name: pos for pos, name in enumerate(platform)}
city_to_ind = {name: pos for pos, name in enumerate(city)}
device_to_ind = {name: pos for pos, name in enumerate(device)}
current_filters_to_ind = {name: pos for pos, name in enumerate(current_filters)}

Map reference values to integers, keeping a separate bin for each action type

In [14]:
# One set of distinct reference strings (column 5) per action type, indexed by
# action_to_ind. The number of bins follows the action vocabulary rather than
# a hard-coded 10. Only train.csv is scanned here.
reference = [set() for _ in range(len(action_to_ind))]
with open('../Dataset/train.csv') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    for line in tqdm(reader):
        reference[action_to_ind[line[4]]].add(line[5])

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [26]:
# For each of the 10 action bins print "<non-numeric references> / <all references>".
for bin_idx in range(10):
    bin_refs = reference[bin_idx]
    non_numeric = sum(1 for ref in bin_refs if not ref.isdigit())
    print(f'{non_numeric} / {len(bin_refs)}')

8 / 8
0 / 289506
201 / 201
0 / 83016
0 / 198632
5 / 118613
0 / 87435
22844 / 22844
0 / 69741
13352 / 13352


A few reference values of the 'interaction item info' action are noise (not item ids)

In [27]:
# List the reference values in bin 5 that are not purely numeric.
for candidate in reference[5]:
    if candidate.isdigit():
        continue
    print(candidate)

Estació de Sants
Shinjuku Station
Lower Manhattan
Miyako Airport
unknown


In [15]:
# Action types whose reference is expected to be an item id. Their bin indexes
# are derived from action_to_ind instead of being hard-coded.
ITEM_ACTIONS = (
    'clickout item', 'interaction item deals', 'interaction item image',
    'interaction item info', 'interaction item rating', 'search for item',
)
item_action_bins = {action_to_ind[name] for name in ITEM_ACTIONS if name in action_to_ind}

# reference_to_ind[i] maps each raw reference string seen for action bin i to
# an integer.
reference_to_ind = [{} for _ in range(len(reference))]
for i, refs in enumerate(reference):
    if i in item_action_bins:
        # NOTE(review): item_to_ind is not defined in the cells visible here;
        # confirm it exists in the kernel before this cell is run.
        for ref in refs:
            # Non-numeric references are noise and collapse to 0.
            # NOTE(review): 0 may coincide with a real value of item_to_ind.
            reference_to_ind[i][ref] = int(item_to_ind[int(ref)]) if ref.isdigit() else 0
    else:
        # Sort before enumerating: iterating a set of strings directly yields
        # a different order in every kernel session, which made the assigned
        # indexes irreproducible.
        reference_to_ind[i] = {ref: j for j, ref in enumerate(sorted(refs))}

Preprocessing train.csv with the mapped values

In [16]:
# Rewrite train.csv with every categorical column replaced by its integer
# index. The original header row is copied through unchanged.
with open('../Dataset/train.csv', 'r') as f, \
        open('../Dataset/processed_train.csv', 'w', newline='') as w:
    reader = csv.reader(f)
    wr = csv.writer(w)
    wr.writerow(next(reader))

    # Baseline used until the first row with step == 1 is seen.
    # NOTE(review): presumably a dataset start timestamp - confirm.
    timestep = 1541037460
    sess_index = -1
    for row in tqdm(reader):
        row_time = int(row[2])
        if row[3] == '1':
            # A step of 1 opens a new session: reset the time origin and
            # move on to the next running session number.
            timestep = row_time
            sess_index += 1

        action_idx = action_to_ind[row[4]]
        wr.writerow([
            user_to_ind[row[0]],
            sess_index,
            row_time - timestep,  # time elapsed since the session's first row
            int(row[3]),
            action_idx,
            reference_to_ind[action_idx][row[5]],
            platform_to_ind[row[6]],
            city_to_ind[row[7]],
            device_to_ind[row[8]],
            current_filters_to_ind.get(row[9], 0),  # unknown/empty filters -> 0
            row[10],
            row[11],
        ])

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




Mapping filters to integers

In [28]:
# Split every '|'-joined filter string into its individual filter names and
# collect the distinct ones. The loop variable is deliberately not called
# `filter`, which would shadow the builtin for the rest of the kernel session.
filter_types = {
    filter_name
    for filters in current_filters
    for filter_name in filters.split('|')
}

In [29]:
# Number of distinct individual filter names.
len(filter_types)

205

In [30]:
# Give every filter name a fixed position. The names are sorted first because
# iterating a set of strings yields a different order in each kernel session,
# which would silently permute the filter features between runs. The loop
# variable also avoids shadowing the builtin `filter`.
filter_to_index = {
    filter_name: position
    for position, filter_name in enumerate(sorted(filter_types))
}

Extracting the training dataset

In [49]:
def embed_session(session):
    """Embed one session as a fixed-length vector of 362 floats.

    Parameters
    ----------
    session : sequence of rows (the caller passes a 2-D array of strings).
        Columns read: 2 (timestamp), 5 (reference), 6 (platform), 7 (city),
        8 (device) and 9 ('|'-joined filters).

    Returns
    -------
    numpy.ndarray of shape (362,)
        Slots 0-156 hold a weighted sum of the property vectors of the
        referenced items, slots from 157 on hold weighted filter counts, and
        slots 359, 360, 361 hold the platform, city and device indexes of the
        first row.

    Every row except the last contributes with weight
    ``(time until the next row / total session time) * position weight``,
    where the position weights grow linearly with the row position and sum
    to 1. The last row contributes nothing.

    When the session spans zero seconds the time fraction would be 0/0 and
    fill the vector with NaN; each row is then given an equal time share.
    """
    n_actions = len(session)
    total_time = int(session[-1][2]) - int(session[0][2])
    # Linearly increasing weights 1..n-1, normalised by n*(n-1)/2 so they sum to 1.
    position_weights = np.arange(1, n_actions) / (n_actions * (n_actions - 1) * 0.5)
    vec = np.zeros(362)

    for i in range(n_actions - 1):
        if total_time:
            stall = int(session[i + 1][2]) - int(session[i][2])
            multiplier = stall * position_weights[i] / total_time
        else:
            # Zero-duration session: fall back to an equal time share per row.
            multiplier = position_weights[i] / (n_actions - 1)

        if session[i][5].isdigit():
            # asarray makes the product element-wise whether the stored
            # property vector is a list or an ndarray.
            vec[0:157] += multiplier * np.asarray(item_to_property[int(session[i][5])], dtype=float)
        if session[i][9]:
            for f in session[i][9].split('|'):
                vec[157 + int(filter_to_index[f])] += multiplier

    # NOTE(review): slots 359-361 are also the filter slots for
    # filter_to_index values 202-204, so those filter weights get overwritten
    # here. Widening the vector would fix it but requires changing the
    # session_dim the model is built with.
    vec[359] = platform_to_ind[session[0][6]]
    vec[360] = city_to_ind[session[0][7]]
    vec[361] = device_to_ind[session[0][8]]

    return vec

In [46]:
# Parallel lists filled while scanning train.csv: session embeddings (x),
# clicked item ids (y), the choices shown at clickout and their prices.
x, y, choices, prices = [], [], [], []

In [50]:
# Action types that are kept when a session is assembled.
ITEM_ACTIONS = (
    'clickout item', 'interaction item deals', 'interaction item image',
    'interaction item info', 'interaction item rating', 'search for item',
)


def _flush_session(sess):
    """Turn a finished session into one training example, if it qualifies.

    A session qualifies when it holds more than one kept row and its last row
    is a 'clickout item'. The embedding goes to x, the '|'-split columns 10
    and 11 of the clickout row go to choices and prices, and the clicked
    reference (column 5) goes to y.
    """
    if len(sess) > 1 and sess[-1][4] == 'clickout item':
        x.append(embed_session(np.array(sess)))
        choices.append(sess[-1][10].split('|'))
        prices.append(sess[-1][11].split('|'))
        y.append(int(sess[-1][5]))


with open('../Dataset/train.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row

    sess = []

    for line in tqdm(reader):
        if line[3] == '1':
            # A step of 1 starts a new session: emit the previous one first.
            _flush_session(sess)
            sess = []
        # Plain `if`, not `elif`: the row that opens a session belongs to it
        # and was previously dropped whenever a preceding session existed.
        if line[4] in ITEM_ACTIONS:
            sess.append(line)

    # The last session in the file has no following step-1 row to trigger it.
    _flush_session(sess)

  


`item[i]` holds the choices shown to the user at clickout time for example `i`.  
Each choice is encoded by its property vector, with its price appended at the end.

In [69]:
def _encode_choices(shown_items, shown_prices):
    """Return a 2-D array with one row per shown item: its property flags followed by its price.

    list(...) makes the `+` below a concatenation whether the stored property
    vector is a list or an ndarray (for an ndarray, `+` would instead add the
    price to every flag).
    """
    return np.asarray([
        list(item_to_property[int(item_id)]) + [int(shown_prices[pos])]
        for pos, item_id in enumerate(shown_items)
    ])


_encoded_choices = [_encode_choices(choices[j], prices[j]) for j in range(len(choices))]

if len({matrix.shape for matrix in _encoded_choices}) == 1:
    # Every example shows the same number of items: a regular 3-D array, as before.
    item = np.stack(_encoded_choices)
else:
    # Examples with differing numbers of shown items cannot form a rectangular
    # array, and np.asarray on ragged input is an error in recent NumPy.
    # Keep one 2-D array per example in an explicit object array instead.
    item = np.empty(len(_encoded_choices), dtype=object)
    for j, matrix in enumerate(_encoded_choices):
        item[j] = matrix

Model definition and testing

In [53]:
import torch
import torch.nn as nn
import torch.optim as optim

from itertools import islice

In [101]:
class FM(nn.Module):
    """Bilinear scorer between a session feature vector and a set of candidate items.

    Both inputs are projected into a shared d-dimensional space by Q and P;
    the score is their dot product plus a linear bias term for each side.
    """

    def __init__(self, d, session_dim, item_dim):
        """
        d: size of the shared latent space
        session_dim: length of a session feature vector
        item_dim: length of an item feature vector
        """
        super().__init__()

        # All parameters start uniform in [0, 1). They are created once; a
        # second assignment through `.data` only redrew the same distribution.
        self.Q = nn.Parameter(torch.rand(session_dim, d))
        self.P = nn.Parameter(torch.rand(item_dim, d))
        self.bq = nn.Parameter(torch.rand(session_dim))
        self.bp = nn.Parameter(torch.rand(item_dim))

    def forward(self, session, item):
        """
        session: (batch_size, 1, session_dim)
        item: (batch_size, n_items, item_dim)

        returns (batch_size, 1, n_items)
        """
        session_latent = torch.matmul(session, self.Q)  # (batch_size, 1, d)
        item_latent = torch.matmul(item, self.P)  # (batch_size, n_items, d)
        # bmm (batch_size, 1, d) * (batch_size, d, n_items) => (batch_size, 1, n_items)
        interaction = torch.bmm(session_latent, item_latent.transpose(1, 2))
        # matmul (batch_size, 1, session_dim) * (session_dim) => (batch_size, 1), unsqueezed to (batch_size, 1, 1)
        session_bias = torch.matmul(session, self.bq).unsqueeze(dim=2)
        # matmul (batch_size, n_items, item_dim) * (item_dim) => (batch_size, n_items), unsqueezed to (batch_size, 1, n_items)
        item_bias = torch.matmul(item, self.bp).unsqueeze(dim=1)
        return interaction + session_bias + item_bias


In [102]:
# Use the GPU when one is available and fall back to the CPU otherwise, so
# the cell also runs on machines without CUDA.
# (Named model_device because `device` already holds the device vocabulary.)
model_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = FM(d=100, session_dim=362, item_dim=158).to(model_device)

In [103]:
# Adagrad over all model parameters, with the optimizer's default settings.
optimizer = optim.Adagrad(model.parameters())

In [99]:
# Single-example batch for a smoke test. Tensors are placed on whatever
# device the model lives on rather than on a hard-coded 'cuda'.
smoke_device = next(model.parameters()).device
session0 = torch.Tensor(x[0]).unsqueeze(0).unsqueeze(1).to(smoke_device)  # (1, 1, session_dim)
item0 = torch.Tensor(item[0]).unsqueeze(0).to(smoke_device)  # (1, n_items, item_dim)

In [110]:
# Two-example batch. The arrays are stacked with NumPy first, because building
# a tensor from a Python list of ndarrays converts element by element and is
# very slow. Tensors follow the model's device rather than a hard-coded 'cuda'.
batch_device = next(model.parameters()).device
session01 = torch.from_numpy(np.stack([x[0], x[1]])).float().unsqueeze(1).to(batch_device)
item01 = torch.from_numpy(np.stack([item[0], item[1]])).float().to(batch_device)

In [112]:
# Smoke test: the output keeps the singleton middle axis of the session input.
model(session01, item01).shape

torch.Size([2, 1, 25])