In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
path = '/Users/ivoliv/data/yoochoose'
NUMROWS = 1e6

In [3]:
!ls -l $path

total 3754144
-rw-r--r--@ 1 ivoliv  staff        3973 Nov  6  2014 dataset-README.txt
-rw-r--r--  1 ivoliv  staff    55583744 Nov  5  2014 yoochoose-buys.dat
-rw-r--r--  1 ivoliv  staff  1486798186 Nov  5  2014 yoochoose-clicks.dat
-rw-r--r--  1 ivoliv  staff   371725851 Nov  5  2014 yoochoose-test.dat


In [4]:
def print_uniques(df):
    name =[x for x in globals() if globals()[x] is df][0]
    print('== {} =='.format(name))
    print('{:<15}: {:,}'.format('total obs', len(df)))
    print('unique values:')
    for c in df.columns:
        print(' {:<14}: {:,}'.format(c, len(df[c].unique())))

def filter_sessions(table, filter_table):
    filter_sessions_ = filter_table['sessionID'].unique()
    table = table[table['sessionID'].isin(filter_sessions_)]
    return table.sort_values(by=['sessionID', 'timestamp'])

In [5]:
clicks = pd.read_csv(os.path.join(path, 'yoochoose-clicks.dat'), header=None, nrows=NUMROWS,
                     names=['sessionID', 'timestamp', 'itemID', 'category'])
buys = pd.read_csv(os.path.join(path, 'yoochoose-buys.dat'), header=None, nrows=NUMROWS,
                   names=['sessionID', 'timestamp', 'itemID', 'price', 'quantity'])

### Only keep buy sessions with positive number of items

In [6]:
buys = buys[buys['quantity'] > 0]
buys=buys.sort_values(by=['sessionID', 'timestamp'])

### Only keep click sessions related to positive buys

In [7]:
clicks = filter_sessions(clicks, buys)
print_uniques(clicks)

== clicks ==
total obs      : 115,338
unique values:
 sessionID     : 16,551
 timestamp     : 115,321
 itemID        : 9,038
 category      : 1


### Filter buys to only relate to remaining (led to positive) clicks

In [8]:
buys = filter_sessions(buys, clicks)
print_uniques(buys)

== buys ==
total obs      : 31,728
unique values:
 sessionID     : 16,551
 timestamp     : 31,453
 itemID        : 4,327
 price         : 348
 quantity      : 19


In [9]:
# Click count statistics
clicks.groupby('sessionID')['sessionID'].count().describe()

count    16551.000000
mean         6.968642
std          6.949923
min          1.000000
25%          3.000000
50%          5.000000
75%          9.000000
max        121.000000
Name: sessionID, dtype: float64

In [10]:
clicks[clicks['sessionID'] == 11]

Unnamed: 0,sessionID,timestamp,itemID,category
24,11,2014-04-03T10:44:35.672Z,214821275,0
25,11,2014-04-03T10:45:01.674Z,214821275,0
26,11,2014-04-03T10:45:29.873Z,214821371,0
27,11,2014-04-03T10:46:12.162Z,214821371,0
28,11,2014-04-03T10:46:57.355Z,214821371,0
29,11,2014-04-03T10:53:22.572Z,214717089,0
30,11,2014-04-03T10:53:49.875Z,214563337,0
31,11,2014-04-03T10:55:19.267Z,214706462,0
32,11,2014-04-03T10:55:47.327Z,214717436,0
33,11,2014-04-03T10:56:30.520Z,214743335,0


In [11]:
buys[buys['sessionID'] == 11]

Unnamed: 0,sessionID,timestamp,itemID,price,quantity
10,11,2014-04-03T11:04:11.417Z,214821371,1046,1
11,11,2014-04-03T11:04:18.097Z,214821371,1046,1


In [12]:
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
import tqdm as tqdm

  from collections import Sequence


In [13]:
class SessionDataset(Dataset):
    def __init__(self, clicks_df, buys_df, 
                 train_split=0.7, dev_test_split=0.5, random_state=123):
        sessionIDs = clicks['sessionID'].unique()
        
        assert sum(sessionIDs != buys['sessionID'].unique()) == 0, \
            "sessionIDs of clicks and buys have to match, be sure to filter and sort by sessions ID"
    
        print('Indexing... ', flush=True, end='')
        sessionIDs = clicks['sessionID'].unique()
        clicks_df['idx'] = clicks['sessionID'].apply(lambda x: np.argwhere(sessionIDs==x)[0][0])
        buys_df['idx'] = buys['sessionID'].apply(lambda x: np.argwhere(sessionIDs==x)[0][0])
        indices = clicks['idx'].unique()
        print('Done.', flush=True)
        
        print('Processing clicks... ', flush=True, end='')
        clicks_items = {}
        for i in clicks['idx'].unique():
            clicks_items[i] = list(clicks_df[clicks_df['idx'] == i]['itemID'])
        print('Done.', flush=True)

        print('Processing buys... ', flush=True, end='')
        buys_items = {}
        for i in buys['idx'].unique():
            buys_items[i] = list(buys_df[buys_df['idx'] == i]['itemID'])
        print('Done.', flush=True)
        
        self.items = np.unique(np.concatenate([clicks['itemID'], buys['itemID']]))
        self.item_to_i= {}
        self.i_to_item= {}
        for i, k in enumerate(self.items):
            self.item_to_i[k] = i
            self.i_to_item[i] = k
        self.nitems = len(self.items)
        
        self.X_train, X_left = train_test_split(
            indices, test_size=1-train_split, random_state=random_state, shuffle=True)

        self.X_val, self.X_test = train_test_split(
            X_left, test_size=dev_test_split, random_state=random_state, shuffle=True)
        
        self.clicks_items_train = {i: clicks_items[k] for i,k in enumerate(self.X_train)}
        self.clicks_items_val = {i: clicks_items[k] for i,k in enumerate(self.X_val)}
        self.clicks_items_test = {i: clicks_items[k] for i,k in enumerate(self.X_test)}
 
        self.buys_items_train = {i: buys_items[k] for i,k in enumerate(self.X_train)}
        self.buys_items_val = {i: buys_items[k] for i,k in enumerate(self.X_val)}
        self.buys_items_test = {i: buys_items[k] for i,k in enumerate(self.X_test)}
        
        self._lookup_dict = {'train': (self.clicks_items_train, self.buys_items_train),
                             'val': (self.clicks_items_val, self.buys_items_val),
                             'test': (self.clicks_items_test, self.buys_items_test)}
        
        self.set_split('train')
        
    def __len__(self):
        return len(self._lookup_dict[self._split][0])
        
    def set_split(self, split):
        self._split = split
    
    def __getitem__(self, index):
        clicks_items, buys_items = self._lookup_dict[self._split]
        
        one_hot_clicks = np.zeros(self.nitems)
        
        for item in clicks_items[index]:
            one_hot_clicks[self.item_to_i[item]] = 1
            
        first_item_bought = self.item_to_i[buys_items[index][0]]
        
        return {'idx': index,
                'x_data': torch.tensor(one_hot_clicks).float(),
                'y_target': torch.tensor(first_item_bought).view(-1)}
    
    def getitem_raw(self, index):
        clicks_items, buys_items = self._lookup_dict[self._split]
        
        return {'idx': index,
                'x_data': clicks_items[index],
                'y_target': buys_items[index]}
        
    def __str__(self):
        strval = 'sessions in ' + self._split + ': {:,}'.format(len(self._lookup_dict[self._split][0])) + '\n'
        strval += 'items in ' + self._split + ': {:,}'.format(self.nitems)
        return strval

In [14]:
dataset = SessionDataset(clicks, buys)

Indexing... Done.
Processing clicks... Done.
Processing buys... Done.


In [15]:
print(dataset)

sessions in train: 11,585
items in train: 9,038


In [16]:
batch_size = 16
dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                        drop_last=False, shuffle=True)

In [17]:
for i in dataloader:
    idx = i['idx'][0].item()
    print(i['idx'][0])
    print(sum(i['x_data'][0]))
    print(i['y_target'][0])
    print(i)
    break

tensor(3212)
tensor(5.)
tensor([7534])
{'idx': tensor([ 3212,  2470, 10432,  7665,  5191,  4245,  3917,  1374,  5342,  9561,
         8737,  9021,  3124, 10444,  2195,   887]), 'x_data': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), 'y_target': tensor([[7534],
        [7784],
        [7368],
        [7032],
        [2601],
        [ 157],
        [7618],
        [6260],
        [5369],
        [4824],
        [5873],
        [7406],
        [6619],
        [7053],
        [6637],
        [2802]])}


In [18]:
idx

3212

In [19]:
clicks[clicks['idx'] == dataset.X_train[idx]]

Unnamed: 0,sessionID,timestamp,itemID,category,idx
669230,198889,2014-04-06T17:47:21.655Z,214827005,0,10313
669231,198889,2014-04-06T17:47:22.062Z,214827005,0,10313
669232,198889,2014-04-06T17:47:22.785Z,214827005,0,10313
669233,198889,2014-04-06T17:48:14.282Z,214827007,0,10313
669234,198889,2014-04-06T17:48:14.879Z,214827007,0,10313
669235,198889,2014-04-06T17:48:15.388Z,214827007,0,10313
669236,198889,2014-04-06T17:48:43.735Z,214827007,0,10313
669237,198889,2014-04-06T17:48:44.387Z,214827007,0,10313
669238,198889,2014-04-06T17:49:03.398Z,214834865,0,10313
669239,198889,2014-04-06T17:49:50.765Z,214834865,0,10313


In [20]:
buys[buys['idx'] == dataset.X_train[idx]]

Unnamed: 0,sessionID,timestamp,itemID,price,quantity,idx
13002,198889,2014-04-06T18:21:28.741Z,214827007,837,1,10313
13003,198889,2014-04-06T18:21:29.023Z,214834865,523,1,10313
13004,198889,2014-04-06T18:21:29.079Z,214826925,837,1,10313
13005,198889,2014-04-06T18:21:29.471Z,214827007,837,1,10313
13006,198889,2014-04-06T18:21:29.648Z,214834865,523,1,10313
13007,198889,2014-04-06T18:21:29.718Z,214826925,837,1,10313


In [21]:
from torch import nn
import torch.nn.functional as F

In [22]:
class productMLP(nn.Module):
    def __init__(self, input_size, h_sizes, output_size):
        super(productMLP, self).__init__()
        
        self.fc1 = nn.Linear(input_size, h_sizes[0])
        
        self.hidden = nn.ModuleList()
        for i in range(len(h_sizes)-1):
            self.hidden.append(nn.Linear(h_sizes[i], h_sizes[i+1]))
            
        self.last = nn.Linear(h_sizes[i], output_size)
        
    def forward(self, inputs):
        
        x = F.relu(self.fc1(inputs))
        for h in self.hidden:
            x = F.relu(h(x))
        output = self.last(x)
        
        return output

In [23]:
model = productMLP(dataset.nitems, [100,100], dataset.nitems)

In [24]:
print(model)

productMLP(
  (fc1): Linear(in_features=9038, out_features=100, bias=True)
  (hidden): ModuleList(
    (0): Linear(in_features=100, out_features=100, bias=True)
  )
  (last): Linear(in_features=100, out_features=9038, bias=True)
)


In [25]:
data = next(iter(dataloader))

In [26]:
x_data = data['x_data']
y_target = data['y_target']

In [27]:
y_predict = model(x_data)
y_predict

tensor([[ 0.0576, -0.0649,  0.0051,  ..., -0.1050, -0.0360,  0.0554],
        [ 0.0605, -0.0638,  0.0063,  ..., -0.1079, -0.0352,  0.0567],
        [ 0.0606, -0.0648,  0.0081,  ..., -0.1058, -0.0381,  0.0579],
        ...,
        [ 0.0593, -0.0603,  0.0078,  ..., -0.1018, -0.0340,  0.0541],
        [ 0.0601, -0.0632,  0.0080,  ..., -0.1051, -0.0371,  0.0557],
        [ 0.0601, -0.0636,  0.0074,  ..., -0.1070, -0.0380,  0.0568]],
       grad_fn=<ThAddmmBackward>)

In [28]:
ce_loss = nn.CrossEntropyLoss()

In [29]:
y_target.view(-1)

tensor([2353, 6065, 7389, 6953, 7427, 3113, 7477, 8938, 3992,  661, 8713, 7412,
        7494, 6797, 4838, 1330])

In [30]:
y_predict.shape

torch.Size([16, 9038])

In [31]:
loss = ce_loss(y_predict, y_target.view(-1))

In [32]:
loss

tensor(9.1115, grad_fn=<NllLossBackward>)

In [33]:
from torch.optim import Adam

In [34]:
optim = Adam(params=model.parameters(), lr=0.01)
batch_size = 16

In [35]:
def run_batch(model, runtype='train', device=torch.device('cpu')):
    
    model.to(device)
    
    if runtype == 'train':
        model.train()
    dataset.set_split(runtype)
    batch_size = 16
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            drop_last=False, shuffle=True)
    
    running_loss = 0
    
    for data in dataloader:

        model.zero_grad()
        
        x_data = data['x_data'].to(device)
        y_target = data['y_target'].to(device)
    
        y_predict = model(x_data)
        
        loss = ce_loss(y_predict, y_target.view(-1))
        
        running_loss += loss.item()
        
        loss.backward()
        
        optim.step()
        
    running_loss /= len(dataset)
    
    return running_loss

In [None]:
EPOCHS = 30
hist = []
for epoch in range(EPOCHS):
    loss_train = run_batch(model, 'train')
    loss_val = run_batch(model, 'val')
    print('{} {:.4} {:.4}'.format(epoch+1, loss_train, loss_val))
    hist.append((loss_train, loss_val))

1 0.3828 0.3437
2 0.2703 0.2777
3 0.231 0.2401
4 0.2081 0.2236
5 0.1919 0.2049
6 0.1818 0.1898
7 0.1756 0.1823
8 0.1714 0.1814
9 0.1628 0.1753
10 0.1618 0.169


In [None]:
hist = np.array(hist)
plt.plot(hist[:,0], 'r', hist[:,1], 'g')