In [1]:
import pandas as pd
import numpy as np

import torch
from torch import nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim

import time
import datetime
import os
from tqdm import tqdm

In [2]:
result_dir = "Results"
if not os.path.exists(result_dir):
    os.makedirs(result_dir)

In [3]:
class Dataset(object):
    def __init__(self, path, itemMap=None):
        self.sessionKey = 'SessionId'
        self.itemKey = 'ItemId'
        self.timeKey = 'Time'
        self.ItemIdx = 'ItemIdx'
        self.data = pd.read_csv(path, sep=',', dtype={self.sessionKey: int, self.itemKey: int, self.timeKey: float})

        # アイテムマップの作成し結合（評価時は学習データのアイテムマップを使用する）
        self.itemMap = itemMap;
        if (self.itemMap is None):
            itemIds = self.data[self.itemKey].unique()
            itemMap = pd.Series(data=np.arange(len(itemIds)), index=itemIds);
            item_indices = itemMap[itemIds].values
            itemDict = {
                self.itemKey: itemIds,
                self.ItemIdx: item_indices
            }
            self.itemMap = pd.DataFrame(itemDict)
        self.data = pd.merge(self.data, self.itemMap, on=self.itemKey, how='inner')
        
        # セッション数の累積合計リスト（セッションの開始位置リスト）を取得
        self.data.sort_values([self.sessionKey, self.timeKey], inplace=True)
        session_nItem = self.data.groupby(self.sessionKey).size()
        cum_session_nItem = session_nItem.cumsum()
        self.sessionsArray = np.r_[0, cum_session_nItem.values]

        self.nItems = len(self.itemMap)

class DataGenerator():
    def __init__(self, dataset, nSample=2048):
        self.data = dataset.data
        self.nItems = dataset.nItems
        self.sessionsArray = dataset.sessionsArray

        self.batchSize = 32
        self.nSample = nSample   
        self.sampleAlpha = 0.75
        self.sampleBuffer = 10000000
        
        if self.nSample:
            self.popularArray = self.createPopular(dataset.itemMap, dataset.itemKey);
        self.sessionIdxArr = np.argsort(self.data.groupby(dataset.sessionKey)[dataset.timeKey].min().values) 
        self.totalIters = ((len(self.data) - len(self.sessionsArray)) // self.batchSize)

    def __iter__(self):
        dataItems = self.data.ItemIdx.values;
        sessionsArray = self.sessionsArray
        sessionIdxArr = self.sessionIdxArr
        
        iters = np.arange(self.batchSize)
        maxiter = iters.max()
        start = sessionsArray[sessionIdxArr[iters]]
        end = sessionsArray[sessionIdxArr[iters] + 1]      
        nSessions = len(sessionsArray) - 1
        
        finished = False
        finishedMask = (end - start <= 1)
        validMask = (iters < nSessions)
        
        while not finished:
            minlen = (end - start).min()
            outIdx = dataItems[start]

            for i in range(minlen - 1):
                inIdx = outIdx
                outIdx = dataItems[start + i + 1]
                if (self.nSample):
                    if(self.sampleBuffer):
                        if(self.samplePointer == self.generatelength):
                            self.negativSamples = self.generateNegSamples(self.popularArray, self.generatelength)
                            self.samplePointer = 0
                        sample = self.negativSamples[self.samplePointer]
                        self.samplePointer += 1;
                    else:
                        sample = self.generateNegSamples(self.pop, 1);
                    y = np.hstack([outIdx, sample])
                else:
                    y = outIdx;
                    
                input = torch.LongTensor(inIdx)
                target = torch.LongTensor(y)
                yield input, target, finishedMask, validMask
                                
                finishedMask[:] = False;
                validMask[:] = True
                
            start = start + minlen - 1
            finishedMask = (end - start <= 1)
            nFinished = finishedMask.sum()
            iters[finishedMask] = maxiter + np.arange(1, nFinished + 1)
            maxiter += nFinished;
            
            validMask = (iters < nSessions)
            nValid = validMask.sum()
            
            if (nValid == 0):
                finished = True;
                break;
   
            iters[~validMask] = 0;         
            sessions = sessionIdxArr[iters[finishedMask]]
            start[finishedMask] = sessionsArray[sessions]
            end[finishedMask] = sessionsArray[sessions + 1]
            iters = iters[validMask]
            start = start[validMask]
            end = end[validMask]

    def generateNegSamples(self, popularArray, length):
        sample = np.searchsorted(popularArray, np.random.rand(self.nSample * length))
        if length > 1:
            sample = sample.reshape((length, self.nSample))
        return sample

    def createPopular(self, itemMap, itemKey):
        popularArray = self.data.groupby(itemKey).size()
        itemIds = itemMap.loc[:, itemKey].values

        print(popularArray)
        
        # 人気度の確率分布を作成
        popularArray = popularArray[itemIds].values ** self.sampleAlpha
        popularArray = popularArray.cumsum() / popularArray.sum()

        # ネガティブサンプリング
        self.generatelength = (self.sampleBuffer // self.nSample)
        self.negativSamples = np.searchsorted(popularArray, np.random.rand(self.nSample * self.generatelength))
        self.negativSamples = self.negativSamples.reshape((self.generatelength, self.nSample))

        self.samplePointer = 0

        return popularArray

In [4]:
dataFolder = '../../../data/'
trainDataFile = 'Train.csv'
validDataFile = 'Valid.csv'

sessionKey = 'SessionId';
itemKey = 'ItemId';
timeKey = 'Time';

trainPath = os.path.join(dataFolder, trainDataFile)
validPath = os.path.join(dataFolder, validDataFile)

trainDataSet = Dataset(trainPath)
validDataSet = Dataset(validPath, itemMap=trainDataSet.itemMap)

In [5]:
class GRU4Rec(nn.Module):
    def __init__(self, inputSize, outputSize):

        super(GRU4Rec, self).__init__()

        self.inputSize = inputSize
        self.outputSize = outputSize
        self.batchSize = 32
        self.hiddenSize = 100
        self.nLayers = 1
        self.sigma = 0.0
        self.negative = True

        self.embeddingDim = -1
        
        self.dropoutHidden = 0.0
        self.dropoutEmbed = 0.0
        self.initAsNormal = False
        
        self.device = torch.device('cuda' if torch.cuda.is_available()  else 'cpu')

        self.gru = nn.GRU(self.inputSize, self.hiddenSize, self.nLayers, bias=False, dropout=self.dropoutHidden)
        self.onehotBuffer = torch.FloatTensor(self.batchSize, self.outputSize)
        self.linear = nn.Linear(self.hiddenSize, self.outputSize)
        self.Tanh = nn.Tanh()

        self.initParams();
        
        self = self.to(self.device)

    def forward(self, input, hidden, target=None):

        # 1-of-Nエンコーディング
        embedded = self.onehotEncode(input)
        embedded = embedded.unsqueeze(0)

        
        output, hNew = self.gru(embedded, hidden) 
        output = output.view(-1, output.size(-1))                   
        output = self.linear(output) 
        output = self.Tanh(output)
        
        return output, hNew

    def onehotEncode(self, input):
        
        self.onehotBuffer.zero_()
        index = input.view(-1, 1)
        onehot = self.onehotBuffer[:len(index)].scatter_(1, index, 1)
        
        return onehot

    def initHidden(self, batchSize):
        h0 = torch.zeros(self.nLayers, int(batchSize), self.hiddenSize).to(self.device)
        return h0
    
    def resetHidden(self, hidden, finishedMask, validMask):
        if any(finishedMask):
            hidden[:, finishedMask, :] = 0 

        if any((~validMask)):
            hidden = hidden[:, validMask, :]
            
        return hidden.data;
    
    def initParams(self):
        for name, param in self.gru.named_parameters():
            if 'weight' in name:
                WR, WZ, WN = param.chunk(3, 0)
                self.initMatrix(WR)
                self.initMatrix(WZ)
                self.initMatrix(WN)
    
            elif 'bias' in name :
                param.data.zero_()
        
        self.initMatrix(self.linear.weight)
        self.linear.bias.data.zero_()
        
    def initMatrix(self, param):
        
        shape = list(param.shape)
        sigma = np.sqrt(6.0 / np.sum(shape))
        param.data.uniform_(-sigma, sigma)

In [6]:
class LossFunction(nn.Module):
    def __init__(self, lossType='top1', useCuda=True, bpreg=1.0):
        
        super(LossFunction, self).__init__()
        self.lossType = lossType
        self.useCuda = useCuda
        lossType = lossType.lower()

        if lossType == 'top1':
            self._lossFn = TOP1Loss()
        else:
            raise NotImplementedError

    def forward(self, input, target=None):
        return self._lossFn(input, target)

class TOP1Loss(nn.Module):
    def __init__(self):
        super(TOP1Loss, self).__init__()
    def forward(self, input, target=None):
        diff = -(input.diag().view(-1, 1).expand_as(input) - input)
        loss = torch.sigmoid(diff).mean() + torch.sigmoid(input ** 2).mean()
        return loss

In [7]:
class Optimizer:
    def __init__(self, params, optimizerType='adagrad', lr=0.05, momentum=0, weightDecay=0, eps=1e-6):

        optimizerType = optimizerType.lower()
        if optimizerType == 'adagrad':
            self.optimizer = optim.Adagrad(params, lr=lr, weight_decay=weightDecay, eps=1e-6)
        else:
            raise NotImplementedError

    def zero_grad(self):
        self.optimizer.zero_grad()

    def step(self):
        self.optimizer.step()

In [8]:
def getRecall(indices, targets): 
    targets = targets.view(-1, 1).expand_as(indices)
    hits = (targets == indices).nonzero()
    if len(hits) == 0:
        return 0
    n_hits = (targets == indices).nonzero()[:, :-1].size(0)
    recall = float(n_hits) / targets.size(0)
    return recall

def getMrr(indices, targets):
    tmp = targets.view(-1, 1)
    targets = tmp.expand_as(indices)
    hits = (targets == indices).nonzero()
    ranks = hits[:, -1] + 1
    ranks = ranks.float()
    rranks = torch.reciprocal(ranks)
    mrr = torch.sum(rranks).data / targets.size(0)
    return mrr
    
def calc(indices, targets, k=20):
    _, indices = torch.topk(indices, k, -1)
    recall = getRecall(indices, targets)
    mrr = getMrr(indices, targets)
    return recall, mrr

In [9]:
class Evaluation(object):
    def __init__(self, model, lossFunc=None, k=20):
        self.model = model
        self.lossFunc = lossFunc
        self.topk = k
        self.device = model.device

    def evalute(self, validGenerator):
        self.model.eval()
        losses = []
        recalls = []
        mrrs = []
        with torch.no_grad():
            batchSize = validGenerator.batchSize
            hidden = self.model.initHidden(batchSize)
            for ii , (input, target, finishedMask, validMask) in tqdm(enumerate(validGenerator),
                                                                     total=validGenerator.totalIters,
                                                                     miniters=1000, position=0, leave=True):
                input = input.to(self.device)
                target = target.to(self.device)
                hidden = self.model.resetHidden(hidden, finishedMask, validMask)
                logit, hidden = self.model(input, hidden)
               
                if(self.lossFunc is not None):
                    loss = self.lossFunc(logit, target)
                    if(~np.isnan(loss.item())):
                        losses.append(loss.item())

                recall, mrr = calc(logit, target, k=self.topk)
                recalls.append(recall)
                mrrs.append(mrr.cpu().numpy())
                
        if(len(losses)):
            meanLoss = np.mean(losses)
        else :
            meanLoss = 0
                    
        meanRecall = np.mean(recalls)
        meanMrr = np.mean(mrrs)

        return meanLoss, meanRecall, meanMrr

In [10]:
class Trainer(object):
    def __init__(self, model, trainGenerator, validGenerator, optim, lossFunc, topN, resultDir):
                
        self.topN = topN
        self.model = model
        self.optim = optim
        self.lossFunc = lossFunc
        self.resultDir = resultDir
        self.device = model.device
        self.evalutor = Evaluation(self.model, self.lossFunc, k=topN)
        
        self.trainGenerator = trainGenerator
        self.validGenerator = validGenerator
        
    def train(self, nEpochs=10):
        for epoch in range(nEpochs):
            st = time.time()
            print('Start Epoch #', epoch)
            
            trainLoss = self.trainEpoch(epoch)
            validLoss, recall, mrr = self.evalutor.evalute(self.validGenerator)
            
            print("Epoch: {}, train loss: {:.4f}, validloss: {:.4f}, recall: {:.4f}, mrr: {:.4f}, time: {}".format(epoch, trainLoss, validLoss, recall, mrr, time.time() - st))
            self.saveModel(epoch, validLoss, trainLoss, recall, mrr) 

    def trainEpoch(self, epoch):
        losses = []
        self.model.train()
        batchSize = float(self.trainGenerator.batchSize)
        hidden = self.model.initHidden(batchSize)
        negative = self.model.negative
        
        for _ , (input, target, finishedMask, validMask) in tqdm(enumerate(self.trainGenerator), total=self.trainGenerator.totalIters, miniters=1000, position=0, leave=True):
            input = input.to(self.device)
            target = target.to(self.device)            
            hidden = self.model.resetHidden(hidden, finishedMask, validMask)
            logit, hidden = self.model(input, hidden, target)

            loss = self.lossFunc(logit, target)        
            loss = (float(len(input)) / batchSize) * loss
            
            if(~np.isnan(loss.item())):
                losses.append(loss.item())
                loss.backward()
                self.optim.step() 
                self.optim.zero_grad()
            
        meanLoss = np.mean(losses)
        return meanLoss
    
    def saveModel(self, epoch, validLoss, trainLoss, recall, mrr):
        checkPoints = {
              'model': self.model,
              'epoch': epoch,
              'optim': self.optim,
              'validLoss': validLoss,
              'trainLoss': trainLoss,
              'recall': recall,
              'mrr': mrr
        }
        modelName = os.path.join(self.resultDir, "model_{0:05d}.pt".format(epoch))
        torch.save(checkPoints, modelName)
        print("Save model as %s" % modelName)

In [11]:
inputSize = trainDataSet.nItems 
outputSize = inputSize 
hiddenSize = 100   
nLayers = 1 
batchSize = 32 
negative = True 
embeddingDim = -1 
dropoutHidden = 0.0
dropoutEmbed = 0.0
sigma = 0.0
initAsNormal = False
cuda = torch.cuda.is_available() 

lr = 0.1
weightDecay = 0.0
momentum = 0.0
bpreg = 1.0
nEpochs = 10

timeSort = True
trainRandomOrder=False
sampleAlpha=0.75
trainNSample = 2048 
validNSample = 0
sampleStore = 10000000
topN = 20

resultDir = 'Results'

In [12]:
hiddenSize = 100
nEpochs = 1
batchSize = 32
dropoutHidden = 0.0
dropoutEmbed = 0.0
lr = 0.2
trainNSample = 2048
sampleAlpha=0.0
bpreg = 1.0
timeSort = False
embeddingDim = -1

In [13]:
model = GRU4Rec(inputSize=inputSize, outputSize=outputSize)

In [14]:
optimizer = Optimizer(model.parameters(), lr=lr, weightDecay=weightDecay, momentum=momentum)   

In [15]:
lossFunc = LossFunction(useCuda=cuda, bpreg=bpreg)

In [16]:
trainGenerator = DataGenerator(trainDataSet, nSample=trainNSample)
validGenerator = DataGenerator(validDataSet, nSample=validNSample)

ItemId
5          1332
11        25520
12         9062
13        28385
14        20896
          ...  
302376        2
304023        1
306650        1
307663        2
326359        3
Length: 4602, dtype: int64


In [17]:
trainer = Trainer(model, trainGenerator=trainGenerator, validGenerator=validGenerator, optim=optimizer, lossFunc=lossFunc, topN=topN, resultDir=resultDir)

In [18]:
trainer.train(nEpochs)

Start Epoch # 0


147621it [49:42, 49.49it/s]                                                                                            
49109it [06:06, 134.16it/s]                                                                                            

Epoch: 0, train loss: 0.7380, validloss: 0.7398, recall: 0.0547, mrr: 0.0111, time: 3349.142196893692
Save model as Results\model_00000.pt



