In [37]:
import torch
import torch.nn as nn
import os
import sys
import pickle as pk
import numpy as np
import random

from sklearn.metrics import roc_auc_score


In [38]:
workspace_dir = '.'
try:
    from google.colab import drive
    drive.mount( '/content/drive/' )

    workspace_dir = os.path.join( '.' , 'drive', 'My Drive', 'DIN-pytorch')
    sys.path.append( workspace_dir)
    ! rm -rf data
    ! tar zxf "{workspace_dir}/data.tar.gz" -C ./
    ! tar zxf "{workspace_dir}/loader.tar.gz" -C ./
    ! ls -al data   
except ImportError:
    pass

In [39]:
from model import DIN, DIEN, DynamicGRU
from DataLoader import MyDataSet

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [40]:
#Model hyper parameter
MAX_LEN = 100
EMBEDDING_DIM = 18
# HIDDEN_SIZE_ATTENTION = [80, 40]
# HIDDEN_SIZE_FC = [200, 80]
# ACTIVATION_LAYER = 'LeakyReLU' # lr = 0.01


# Adam
LR = 1e-3
BETA1 = 0.5
BETA2 = 0.99

# Train
BATCH_SIZE = 1
EPOCH_TIME = 8
TEST_ITER = 1000

RANDOM_SEED = 19940808

USE_CUDA = True

In [41]:
train_file = os.path.join( 'D:/myf/data/rec_DIN_pytorch1/data', "local_train_splitByUser")
test_file  = os.path.join( 'D:/myf/data/rec_DIN_pytorch1/data', "local_test_splitByUser")
uid_voc    = os.path.join( 'D:/myf/data/rec_DIN_pytorch1/data', "uid_voc.pkl")
mid_voc    = os.path.join( 'D:/myf/data/rec_DIN_pytorch1/data', "mid_voc.pkl")
cat_voc    = os.path.join( 'D:/myf/data/rec_DIN_pytorch1/data', "cat_voc.pkl")

In [42]:
if USE_CUDA and torch.cuda.is_available():
    print( "Cuda is avialable" )
    device = torch.device('cuda')
    dtype = torch.cuda.FloatTensor
else:
    device = torch.device( 'cpu')
    dtype = torch.FloatTensor

In [43]:
device

device(type='cpu')

In [44]:
# Stable the random seed
def same_seeds(seed = RANDOM_SEED):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU.
    np.random.seed(seed)  
    random.seed(seed) 
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Initilize  parameters
def weights_init( m):
    try:
        classname = m.__class__.__name__
        if classname.find( 'BatchNorm') != -1:
            nn.init.normal_( m.weight.data, 1.0, 0.02)
            nn.init.constant_( m.bias.data, 0)
        elif classname.find( 'Linear') != -1:
            nn.init.normal_( m.weight.data, 0.0, 0.02)
        elif classname.find( 'Embedding') != -1:
            m.weight.data.uniform_(-1, 1)
    except AttributeError:
        print( "AttributeError:", classname)
    


def eval_output( scores, target, loss_function = torch.nn.functional.binary_cross_entropy_with_logits):
    loss = loss_function( scores.type( dtype) , target.type( dtype))

    y_pred = scores.sigmoid().round()
    accuracy = ( y_pred == target).type( dtype).mean()

    auc = roc_auc_score( target.cpu().detach(), scores.cpu().detach() )
    return loss, accuracy, auc

In [45]:
# The dict mapping description(string) to type index(int) 
# A more graceful api https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html#sklearn.preprocessing.LabelEncoder not used in this project

user_map = pk.load( open( uid_voc, 'rb')); n_uid = len( user_map)
material_map = pk.load( open( mid_voc, 'rb')); n_mid = len( material_map)
category_map = pk.load( open( cat_voc, 'rb')); n_cat = len( category_map)

In [46]:
same_seeds( RANDOM_SEED)

dataset_train = MyDataSet( train_file, user_map, material_map, category_map, max_length = MAX_LEN)
dataset_test = MyDataSet( test_file, user_map, material_map, category_map, max_length = MAX_LEN)

loader_train = torch.utils.data.DataLoader( dataset_train, batch_size = BATCH_SIZE, shuffle = True)
loader_test = torch.utils.data.DataLoader( dataset_test, batch_size = BATCH_SIZE, shuffle = False)

# with open( 'data/loader.pk', 'rb') as fin:
#     loader_train, loader_test = pk.load(fin) 

In [47]:
 for i, data in enumerate(loader_train):
        print(data)
        print(data[0].shape)
        print(data[1].shape)
        print(data[2].shape)
        print(data[3].shape)
        print(data[4].shape)
        print(data[5].shape)
        print(data[6].shape)
        print(data[7].shape)
        print(data[8].shape)
        print(data[9].shape)
        break

[tensor([352760]), tensor([[41667,  4996, 42136, 73494, 68549, 10342, 29868,  5004, 24174,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]]), tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0,

列1,2,3是历史特征（序列）

列0,5,6,7,8是 单一特征（非序列）

列9是label

列0-8分别是：user, material_historical, category_historical, mask, sequential_length, \
material, category, material_historical_neg, category_historical_neg = data

此外，MAX_LEN = 100

In [48]:
# Get model and initialize it
# model = DIEN(  n_uid, n_mid, n_cat, EMBEDDING_DIM).to( device)
model = DIN(  n_uid, n_mid, n_cat, EMBEDDING_DIM ).to( device)
model.apply( weights_init)

# Set loss function and optimizer
optimizer = torch.optim.Adam( model.parameters(), LR, ( BETA1, BETA2))

model.train(); iter = 0
for epoch in range( EPOCH_TIME):

    for i, data in enumerate( loader_train):
        iter += 1

        # transform data to target device
   
        data = [ item.to( device) if item != None else None for item in data]
        target = data.pop(-1)     
        
        model.zero_grad()

        scores = model( data, neg_sample = False)
        
        loss, accuracy, auc = eval_output( scores, target)

        loss.backward()
        optimizer.step( )
        
        print( "\r[%d/%d][%d/%d]\tloss:%.5f\tacc:%.5f\tauc:%.5f"%( epoch + 1, EPOCH_TIME, i + 1, len( loader_train), loss.item(), accuracy.item(), auc.item() ) ,end='')

        if iter % TEST_ITER == 0:
            model.eval()
            with torch.no_grad():
                score_list = []; target_list = []
                for data in  loader_test:
                    data = [ item.to( device) if item != None else None for item in data]
                    
                    target = data.pop(-1)

                    scores = model( data, neg_sample = False)
                    score_list.append( scores)
                    target_list.append( target)
                scores = torch.cat( score_list, dim = -1)
                target = torch.cat( target_list, dim = -1)
                loss, accuracy, auc = eval_output( scores, target)
                print( "\tTest Set\tloss:%.5f\tacc:%.5f\tauc:%.5f"%( loss.item(), accuracy.item(), auc.item() ) )
            model.train()

AttributeError: InputEmbedding


IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

### For Test

In [None]:
x=2
[x * 8]+[1,2,3]

[16, 1, 2, 3]

In [None]:
em = nn.Embedding( 5, 4)
x =torch.tensor([0,1,2,3,4,1,2,3])
em(x)

tensor([[ 0.1555,  0.2382, -1.1286, -1.4313],
        [ 0.5782,  0.5709, -1.8673,  0.0081],
        [-0.7404, -0.4467,  1.7358,  1.0103],
        [-0.4847, -1.8467,  0.6648, -1.3730],
        [ 1.1842, -0.4927, -0.7235, -3.0902],
        [ 0.5782,  0.5709, -1.8673,  0.0081],
        [-0.7404, -0.4467,  1.7358,  1.0103],
        [-0.4847, -1.8467,  0.6648, -1.3730]], grad_fn=<EmbeddingBackward>)

In [None]:
B=2
D= 8
T =10
fact = torch.ones(B,T,D)
scores = torch.ones(B,1,T)
torch.matmul( scores, fact).squeeze().shape

torch.Size([2, 8])

In [None]:
x=torch.tensor([1,2,3])
y=torch.tensor([1,2,3])
x * y

tensor([1, 4, 9])