## Rating csv


In [1]:
import pandas as pd
import datetime
import numpy as  np
from tqdm import tqdm
from scipy.sparse import csr_matrix
np.random.seed(1337)
with open('./kaggle/rating_train.csv', 'r') as f:
    ls = f.readlines()[1:]
u_map = {}

dates = []
foods = []
users = []
    


with tqdm(total=len(ls)) as pbar:
    for l in ls:
        date_str, user, food = l.strip().split(',')
        date = datetime.datetime.strptime(date_str, '%Y-%m-%d')
        user, food = int(user), int(food)
        if user not in u_map:
            u_map[user] = []
        u_map[user].append( (date, food) )
        
        dates.append(date)
        users.append(user)
        foods.append(food)
        pbar.update(1)
        

user_map = {u:i for i, u in enumerate(set(users))}        
food_map = {f:i for i, f in enumerate(set(foods))}


# for ranking sparse matrix
rows = [user_map[u] for u in users]
cols = [food_map[f] for f in foods]
R = csr_matrix((np.ones([len(rows), ]), (rows, cols)), shape=(len(user_map), len(food_map)))

pos_count = np.array(np.sum(R, axis=0)).flatten()
neg_count = len(ls) - pos_count

class_weight =  1. / 2.*neg_count
pos_weight = neg_count / pos_count
print R.shape
print neg_count.shape
print pos_count.shape

100%|██████████| 2681494/2681494 [00:19<00:00, 134161.20it/s]


(2608, 5532)
(5532,)
(5532,)


## User csv


In [2]:
import pandas as pd
import numpy as np
import keras
import os
from constants import MAX_TEXT_SEQ_LEN, MAX_NUM_WORDS, EMBEDDING_DIM
from keras.preprocessing.sequence import pad_sequences



csv = pd.read_csv('./kaggle/user.csv')
print csv.columns
# print 'userid,username,age,gender,location,city,state,title,about_me,reasons,inspirations,friends_count'
# for csv.iterrows
texts = []
id_list = []
age_list = []
gender_list = []
print 'Starting read texts.'
for row in csv.iterrows():
    r = row[1]
    s = ''
    s += r['about_me'] if not pd.isnull(r['about_me']) else ''
    s += r['reasons'] if not pd.isnull(r['reasons']) else ''
    s += r['inspirations'] if not pd.isnull(r['inspirations']) else ''
    id_list.append(r['userid'])
    age_list.append(r['age'])
    gender_list.append(r['gender'])
    texts.append(s)

# normalize age
valid_age_list = [age for age in age_list if not np.isnan(age)]
m, std = np.mean(valid_age_list), np.std(valid_age_list)
for i,age in enumerate(age_list):
    if not np.isnan(age):
        age_list[i] = float(age-m) / std
    else:
        age_list[i] = 0
buf = [len(s) for s in texts]
print np.max(buf), np.mean(buf), np.std(buf)

tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~ ',
                                   lower=True, split=' ', char_level=False, oov_token=None)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
sequences = tokenizer.texts_to_sequences(texts)
pad_data = data = pad_sequences(sequences, maxlen=MAX_TEXT_SEQ_LEN, padding='post', truncating='post')


print('Preparing embedding matrix.')
embeddings_index = {}
# with open(os.path.join('./','glove', 'glove.6B.%dd.txt' % EMBEDDING_DIM), 'r', encoding='utf8') as f:
with open(os.path.join('./','glove', 'glove.6B.%dd.txt' % EMBEDDING_DIM), 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

        
# 
userid_map = {user:i for i, user in enumerate(id_list)}
def get_user_feature_fn(userid):
    idx = userid_map[userid]
    age = age_list[idx]
    gender = 1 if gender_list[idx] == 'Female' else 0
    text_seq = pad_data[idx, :]
    x = np.array([age, gender])
    return x, text_seq
u, u_text = get_user_feature_fn(8526)
print u.shape, u_text.shape
print u, u_text
print embedding_matrix.shape

Using TensorFlow backend.


Index([u'userid', u'username', u'age', u'gender', u'location', u'city',
       u'state', u'title', u'about_me', u'reasons', u'inspirations',
       u'friends_count'],
      dtype='object')
Starting read texts.
6990 567.4152607361963 737.8887977118112
Found 13852 unique tokens.
Preparing embedding matrix.
(2,) (2000,)
[-0.53151114  1.        ] [  4 372  19 ...   0   0   0]
(2001, 300)


#### prepare sample generator


In [13]:
from constants import MAX_SEQ_LEN

val_num = len(u_map) // 10
idx = np.random.permutation(len(u_map))
train_idx, val_idx = idx[val_num:], idx[:val_num]
train_u_map = {k:u_map[k] for k in u_map.keys()[val_num:]}
val_u_map = {k:u_map[k] for k in u_map.keys()[:val_num]}
def batch_boostrap_generator(batch_size, u_map, food_map, max_history_len, get_user_feature_fn):
    G = boostrap_generator(u_map, food_map, max_history_len, get_user_feature_fn)
    while True:
        X = []
        Y = []
        U = []
        U_text = []
        
        for i in range(batch_size):
            x, y, u, u_text = next(G)
            X.append(np.expand_dims(x, axis=0))
            Y.append(np.expand_dims(y, axis=0))
            U.append(np.expand_dims(u, axis=0))
            U_text.append(np.expand_dims(u_text, axis=0))
        yield np.vstack(X), np.vstack(Y), np.vstack(U), np.vstack(U_text)
def boostrap_generator(u_map, food_map, max_history_len, get_user_feature_fn):
    while True:
        keys = u_map.keys()
        for user_idx in np.random.permutation(len(u_map)):
            user = keys[user_idx]
            X = np.zeros([max_history_len, len(food_map)])
            history = u_map[user]
            ds = np.array([d for d,f in history])
            fs = np.array([f for d,f in history])
            date_list = sorted(set(ds))
            rest_date_set = date_list[-7:]
            sorted_idx = np.argsort(ds)
            ds = ds[sorted_idx]
            fd = fs[sorted_idx]
            
            date_idx = 0
            now_date = ds[0]
            y = np.zeros([len(food_map),])
            for food, date in zip(fs,ds):
                if date in rest_date_set:
                    y[food_map[food]] = 1
                else:
                    if date != now_date:
                        date_idx+=1
                        now_date = date
                    X[date_idx, food_map[food]] = 1
            u, u_text = get_user_feature_fn(user)
            
            yield X, y, u, u_text
            
    

G = batch_boostrap_generator(32, train_u_map, food_map, max_history_len=MAX_SEQ_LEN, get_user_feature_fn=get_user_feature_fn)
val_G = batch_boostrap_generator(32//2, val_u_map, food_map, max_history_len=MAX_SEQ_LEN, get_user_feature_fn=get_user_feature_fn)
x, pad_mask, u, u_text = next(G)
print x.shape, pad_mask.shape, u.shape, u_text.shape
# x, pad_mask = next(val_G)
# print x.shape, pad_mask.shape

# G2 = boostrap_generator(train_u_map, food_map, max_history_len=MAX_SEQ_LEN)
# x, x_len = next(G2)
# print x.shape
# print pad_mask[3,:,3]

(32, 165, 5532) (32, 5532) (32, 2) (32, 2000)


### Transformer with ALS embedding Training

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.init import xavier_normal_
# construct neuron network

def scaled_dot_attention(Q, K, V, mask):
    assert Q.size()[-1] == K.size()[-1]
    assert len(Q.size()) == 3 and len(K.size()) == 3 and len(V.size()) == 3
    dk = torch.tensor(K.size()[-1], dtype=torch.float32, requires_grad=False).cuda()
    out = torch.matmul(Q,K.permute(0,2,1)) / torch.sqrt(dk) 
    if mask is not None:
        out.masked_fill_(mask, -float('inf'))
    return torch.matmul(F.softmax(out, dim=-1), V)

def positional_encoding(d_model, pos):
    assert d_model % 2 == 0
    pos = torch.tensor(pos, dtype=torch.float32, requires_grad=False)
    pe = torch.zeros([1,d_model], dtype=torch.float32, requires_grad=False)
    for i in range(D_MODEL//2):
        a = torch.tensor(10000, dtype=torch.float32, requires_grad=False)
        b = torch.tensor(2.*i/float(D_MODEL), dtype=torch.float32, requires_grad=False)
        c = pos / torch.pow(a, b)
        pe[0, 2*i] = torch.sin(c)
        pe[0, 2*i+1] = torch.cos(c)
    return pe
                            
class Transformer_v4(nn.Module):

    def __init__(self, layer_num, dk, dv, dm, h, p_drop, d_ff, use_cuda=True, posi_cache_length=200):
        super(Transformer_v4, self).__init__()
#         for construct cache positional encoding matrix.
        self.d_model = dm
        self.use_cuda = use_cuda
        
        self.encoder = Stack_Encoder(layer_num, dk, dv, dm, h, p_drop, d_ff)
        self.emb_drop = nn.Dropout(p_drop)
        self.init_pos_mat(posi_cache_length)

    def forward(self, Q):
    
        
#         decoder
        batch, Q_len, d = Q.size()
        
        try:
            Q = Q + self.get_pos_mat(Q_len)
        except RuntimeError, e:
            if e.message == 'TensorIterator expected type torch.cuda.FloatTensor but got torch.FloatTensor':
                if Q.is_cuda != self.get_pos_mat(K_len).is_cuda:
                    print('Make sure cache positional matrix is same type of tensor with input, both cuda tensor or not.\nBy setting argument use_cuda=True to set cache positional encoding matrix as a cuda tensor.')
            raise
        
        Q = self.emb_drop(Q)
        
        en_out = self.encoder(Q)
        return en_out
    
#     To speed up the positional encoding by construct an cache matrix. 
    def init_pos_mat(self, cache_length):
        print('init postional matrix with length : %d ' % cache_length)
        self.positional_matrix = torch.cat([positional_encoding(self.d_model, i) for i in range(0,cache_length)], dim=0)
        self.positional_matrix.requires_grad = False
        if self.use_cuda:
            self.positional_matrix = self.positional_matrix.cuda()
            
        
    def get_pos_mat(self, length):
        if length > self.positional_matrix.shape[0]:
            print('input sequence length reach positional matrix maximum length. %d ' % length)
            ret = torch.cat([positional_encoding(self.d_model, i) for i in range(length)], dim=0)
            ret.requires_grad = False
            print('Increase positional matrix maximum length. %d ' % length)
            self.positional_matrix = ret
            if self.use_cuda:
                self.positional_matrix = self.positional_matrix.cuda()
            return ret
        else:
            return self.positional_matrix[:length]
        

    
    

class Stack_Encoder(nn.Module):
    """
    Stacked Decoder
    """
    def __init__(self, layer_num, dk, dv, dm, h, p_drop, d_ff,):
        super(Stack_Encoder, self).__init__()
        self.encoders = nn.ModuleList([Encoder(dk, dv, dm, h, p_drop, d_ff) for i in range(layer_num)])
        
        
    def forward(self, Q):
        # ModuleList can act as an iterable, or be indexed using ints
        for lay in self.encoders:
            Q = lay(Q)
        return Q           

class Encoder(nn.Module):
    def __init__(self, dk, dv, dm, h, p_drop, d_ff):
        super(Encoder, self).__init__()
        
#         query attention residual block
        self.Q_attention_lay = nn.Linear(dm, dm)
        self.Q_att_drop = nn.Dropout(p_drop)

#         feed forward residual block
        self.fcn = nn.Linear(dm, dm)
        self.linear_drop = nn.Dropout(p_drop)
        

    def forward(self, Q):
#         query attention
        Q_attention_out = self.Q_attention_lay(scaled_dot_attention(Q, Q, Q, mask=None))
        Q_att_out = self.Q_att_drop(Q_attention_out)
        
#         feed forward
        linear_out = self.fcn(Q_att_out)
        return linear_out

    
# Transformer paper baseline hyper-parameters
STACKED_NUM = 1
H = 4
D_MODEL = 128
DK = DV = D_MODEL//H
P_DROP = 0.05
D_FF = D_MODEL*4




    

import torch
import torch.nn as nn
import torch.nn.functional as F### Transformer with ALS embedding Training
# import Transformer/

import numpy as np
from constants import FOOD_NUM, USER_NUM
class Net(nn.Module):

    def __init__(self, dm, p_drop, emb_mat):
        super(Net, self).__init__()
        self.drop = nn.Dropout(p_drop)
        self.food_emb = Food_embedding(FOOD_NUM, dm, 1, p_drop)
        self.user_emb = User_embedding(dm, 3, emb_mat, p_drop, activation_fn=F.relu)
        self.transformer = Transformer_v4(STACKED_NUM, DK, DV, D_MODEL, H, P_DROP, D_FF, use_cuda=True).cuda()
        self.history_att_w = nn.Parameter(torch.zeros([1, dm], dtype=torch.float))
        torch.nn.init.xavier_normal_(self.history_att_w)
        self.output_linear = nn.Linear(2*dm, FOOD_NUM)

    def forward(self, history, u, u_text):
#         print(K.size(), get_pos_mat(MAX_SEQUENCE_LENGTH).size())
        
        x = self.food_emb(history)
        batch, x_len, d = x.size()
        
        x = self.transformer(x)
        att_w = self.history_att_w.view(1,1, -1).repeat(batch, 1, 1)
        att_out = scaled_dot_attention(att_w, x, x, mask=None).squeeze(1)
        
#         x = torch.sigmoid(x)
        u_out = self.user_emb(u, u_text)
    
        y = self.output_linear(torch.cat([att_out, u_out], dim=-1))
        
        return y
        
class Food_embedding(nn.Module):
    def __init__(self, c_in, dm, layer_num, p_drop, activation_fn=F.selu):
        super(Food_embedding, self).__init__()
        self.activation_fn = activation_fn
        self.drop = nn.Dropout(p_drop)
        assert layer_num >= 1
        self.first_linear = nn.Linear(c_in, dm)
        self.linears = nn.ModuleList([nn.Linear(dm, dm) for i in range(layer_num-1)])
        

    def forward(self, x):
#         print(K.size(), get_pos_mat(MAX_SEQUENCE_LENGTH).size())
        x = self.first_linear(x)
        for lay in self.linears:
            x = self.activation_fn(lay(x))
            if lay != self.linears[-1]:
                x = self.drop(x)
        return x

class User_embedding(nn.Module):
    def __init__(self, dm, layer_num, emb_mat, p_drop, activation_fn=F.selu):
        super(User_embedding, self).__init__()
        self.activation_fn = activation_fn
        self.drop = nn.Dropout(p_drop)
        assert layer_num >= 1
        self.emb = nn.Embedding(emb_mat.shape[0], emb_mat.shape[1], padding_idx=0)
        self.emb.weight = nn.Parameter(torch.FloatTensor(emb_mat))
        self.emb.weight.requires_grad_(False)
        self.emb_linear = nn.Linear(emb_mat.shape[1], dm)
        self.att_weight = nn.Parameter(torch.zeros([1, dm-2**(layer_num+1), ], dtype=torch.float))
        torch.nn.init.xavier_normal_(self.att_weight)
        self.linears = nn.ModuleList([nn.Linear(2**(i+1), 2**(i+2)) for i in range(layer_num)])
        

    def forward(self, u, u_text):
        for lay in self.linears:
            u = self.activation_fn(lay(u))
#         for text
        u_text = self.emb(u_text)
        u_text = self.activation_fn(self.emb_linear(u_text))
        batch, seq, d = u_text.size()
        att_w = self.att_weight.view(1,1, -1).repeat(batch, 1, 1)
        Q = torch.cat([u.unsqueeze(1),att_w], dim=-1)
        u_att = scaled_dot_attention(Q, u_text, u_text, mask=None)
        u_att.squeeze_(1)
        
        return u_att
    
batch = 7
dm = D_MODEL
Q = torch.rand([batch, 18, FOOD_NUM]).cuda()
u = torch.rand([batch, 2]).cuda()
u_text = torch.randint(MAX_NUM_WORDS,[batch, MAX_TEXT_SEQ_LEN], dtype=torch.long).cuda()
model = Net(dm, 0.1, embedding_matrix).cuda()
o = model(Q, u, u_text)
# print t
print(o.size())
# print o

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(count_parameters(model))




init postional matrix with length : 200 
torch.Size([7, 5532])
2201936


In [None]:
from collections import deque
from tqdm import tqdm as tqdm

import time
def dump_log(model, n_iter, loss, val_loss, acc, val_acc, precision, val_precision, recall, val_recall, f1, val_f1, log_file_stream, tmp_model_path):
    log_text = '%.7d<split>%.5f<split>%.5f<split>%.5f<split>%.5f<split>%.5f<split>%.5f<split>%.5f<split>%.5f<split>%.5f<split>%.5f\n' % (n_iter, loss, val_loss, acc, val_acc, precision, val_precision, recall, val_recall, f1, val_f1)
    log_file_stream.write(log_text)
    if n_iter % 10 == 0 :
        log_file_stream.flush()
        torch.save(model, tmp_model_path)
def normal_acc(pred, label):
    assert len(label.size()) == 2
    label = label.type(torch.uint8)
    mask = torch.ones_like(label)
#     
    acc = pred == label
    acc = torch.sum(acc).item() / float(torch.sum(mask))
#     
    TP = float(torch.sum(pred.masked_select(label)).item())
    TP_FP = torch.sum(pred.masked_select(mask)).item()
    precision = TP / TP_FP if TP_FP != 0. else 0.
    recall = TP / torch.sum(label).item()
# 
    f1 = 2.*precision*recall / (precision+recall) if (precision+recall) != 0  else 0
    
    return acc, precision, recall, f1
        
acc_q = deque(maxlen=1000)
precision_q = deque(maxlen=1000)
recall_q = deque(maxlen=1000)
loss_q = deque(maxlen=1000)
f1_q = deque(maxlen=1000)

val_acc_q = deque(maxlen=10000)
val_loss_q = deque(maxlen=10000)
val_precision_q = deque(maxlen=10000)
val_recall_q = deque(maxlen=10000)
val_f1_q = deque(maxlen=1000)

t = time.time()
best_f1  = 0
best_loss = float('inf')

epochs = 100
batch_size = 16
G = batch_boostrap_generator(batch_size, train_u_map, food_map, max_history_len=MAX_SEQ_LEN, get_user_feature_fn=get_user_feature_fn)
val_G = batch_boostrap_generator(batch_size, val_u_map, food_map, max_history_len=MAX_SEQ_LEN, get_user_feature_fn=get_user_feature_fn)
# criterion = nn.BCEWithLogitsLoss(weight=torch.FloatTensor(class_weight).cuda(), reduction='none', pos_weight=torch.FloatTensor(pos_weight).cuda())
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
print 'start training.'
with open('log-hybrid.txt', 'w') as f:
    with open('best-hybrid.txt', 'w') as best_log:
        iters = 100000000
        with tqdm(total=iters) as pbar:
            for it in range(iters):
                optimizer.zero_grad()
                model.train()
                seq, y, u, u_text = next(G)
                seq = torch.FloatTensor(seq).cuda()
                y = torch.FloatTensor(y).cuda()
                u = torch.FloatTensor(u).cuda()
                u_text = torch.LongTensor(u_text).cuda()
                
                seq.requires_grad_(False)
                u.requires_grad_(False)
                y.requires_grad_(False)
                u_text.requires_grad_(False)
                
                x = seq

                output = model(x, u, u_text)
                loss = criterion(output, y)
                pred = output > 0.5
                label = y


                acc, precision, recall, f1 = normal_acc(pred, label)
                acc_q.append(acc)
                precision_q.append(precision)
                recall_q.append(recall)
                f1_q.append(f1)
                loss.backward()

                optimizer.step()
                with torch.no_grad():
                    model.eval()
                    seq, y, u, u_text = next(val_G)
                    seq = torch.FloatTensor(seq).cuda()
                    y = torch.FloatTensor(y).cuda()
                    u = torch.FloatTensor(u).cuda()
                    u_text = torch.LongTensor(u_text).cuda()

                    seq.requires_grad_(False)
                    y.requires_grad_(False)
                    u.requires_grad_(False)
                    u_text.requires_grad_(False)


                    x = seq
                    output = model(x, u, u_text)
                    val_loss = criterion(output, y)
    
                    pred = output > 0.5

                    label = y

                    val_acc, val_precision, val_recall, val_f1 = normal_acc(pred, label)
                    val_acc_q.append(val_acc)
                    val_precision_q.append(val_precision)
                    val_recall_q.append(val_recall)
                    val_f1_q.append(val_f1)
                
                acc = np.mean(acc_q)
                precision = np.mean(precision_q)
                recall = np.mean(recall_q)
                f1 = np.mean(f1_q)
                
                val_acc = np.mean(val_acc_q)
                val_precision = np.mean(val_precision_q)
                val_recall = np.mean(val_recall_q)
                val_f1 = np.mean(val_f1_q)
                
#                     pbar.set_postfix_str('acc : %.3f, val_acc : %.3f, loss : %.3f, val_loss : %.3f \t %.3f, %.3f, %.3f, %.3f' % (acc, val_acc, loss.item(), val_loss.item(), a,b,c,d), refresh=False)
                pbar.set_postfix_str('acc : %.3f, val_acc : %.3f, precision : %.3f, val_precision : %.3f, recall : %.3f, val_recall : %.3f, loss : %.3f, val_loss : %.3f, f1 : %.3f, val_f1 : %.3f' % (acc, val_acc, precision, val_precision, recall, val_recall, loss.item(), val_loss.item(), f1, val_f1), refresh=False)
                pbar.update(batch_size)
                dump_log(model, (it+1)*batch_size, loss, val_loss, acc, val_acc, precision, val_precision, recall, val_recall, f1, val_f1, f,'./tmp-hybrid.pt')
                if val_f1 > best_f1 and it > 100:
                    torch.save(model, './best-hybrid.pt')
                    best_f1 = val_f1
                    best_log.write('%d\t%.5f\n' % ((it+1)*batch_size, best_f1))
                    best_log.flush()

# Train model
print("Optimization Finished!")
# print("Total time elapsed: {:.4f}s".format(time.time() - t_total))

  0%|          | 0/100000000 [00:00<?, ?it/s]

start training.


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  0%|          | 444368/100000000 [1:45:04<379:55:08, 72.79it/s, acc : 0.999, val_acc : 0.989, precision : 0.974, val_precision : 0.179, recall : 0.910, val_recall : 0.157, loss : 0.002, val_loss : 0.159, f1 : 0.941, val_f1 : 0.162]

In [59]:
a = torch.FloatTensor([0,0,1,1,0,0])
torch.

tensor([0, 0, 1, 1, 0, 0], dtype=torch.uint8)
tensor([0, 0, 0, 1, 0, 1], dtype=torch.uint8)
tensor([0., 0., 1., 0., 0., 1.])
tensor([2., 2., 2., 4., 2., 4.])
tensor(1.)


In [68]:
a = torch.FloatTensor([0,0,1,1,0,0])
b = torch.ByteTensor([0,0,0,1,0,1])
a.requires_grad_(False)
print a.clone().requires_grad

False


### testing with Transformer model

In [None]:
from constants import MAX_SEQ_LEN

def batch_boostrap_generator(batch_size, u_map, food_map, Y_map, max_history_len, flip):
    G = boostrap_generator(u_map, food_map, Y_map, max_history_len, flip)
    while True:
        X = []
        pos_Y = []
        neg_Y = []
        for i in range(batch_size):
            x, pos_y, neg_y = next(G)
            x = np.pad(x, ((0,max_history_len-x.shape[0]),(0,0)), 'constant', constant_values=0)
            X.append(np.expand_dims(x, axis=0))
            pos_Y.append(np.expand_dims(pos_y, axis=0))
            neg_Y.append(np.expand_dims(neg_y, axis=0))
        yield np.vstack(X), np.vstack(pos_Y), np.vstack(neg_Y) 

        
flip = True       
model = torch.load('./best.pt')
model.eval()
batch_size = 256
food_buf = torch.LongTensor(np.arange(len(food_map))).view(1,-1).repeat(batch_size, 1).cuda()
answer_sheet = np.zeros([len(u_map), len(food_map)])

with torch.no_grad():
    with tqdm(total=len(u_map)*len(food_map)) as pbar:
        keys = u_map.keys()
        ran = range((len(keys) // batch_size) +1) if len(keys) % batch_size != 0 else range(len(keys) // batch_size)
        for i in ran:
            a = i*batch_size
            b = (i+1)*batch_size if (i+1)*batch_size < len(keys) else len(keys)
            X = []
            for user in keys[a:b]:
                x = np.zeros([MAX_SEQ_LEN, len(food_map)])
                history = u_map[user]
                ds = np.array([d for d,f in history])
                fs = np.array([f for d,f in history])
                sorted_idx = np.flip(np.argsort(ds), axis=-1) if flip else np.argsort(ds)
                ds = ds[sorted_idx]
                fd = fs[sorted_idx]

                date_idx = 0
                now_date = ds[0]
                for food, date in zip(fs,ds):
                    if date != now_date:
                        date_idx+=1
                        now_date = date
                    x[date_idx, food_map[food]] = 1
                X.append(np.expand_dims(x,axis=0))
            
            X = torch.FloatTensor(np.vstack(X)).cuda()
            for food_idx  in range(len(food_map)):
                target = food_buf[:b-a, food_idx:food_idx+1]
                output = model(target, X)
                answer_sheet[a:b, food_idx:food_idx+1] =  output.cpu()
                pbar.update(batch_size)
                
np.save('./output_sheet', answer_sheet)
print answer_sheet.shape
print'Done'

rev_food_map = {v:k for k,v in food_map.items()}
k=20
a = ''
buf = []
with open('predict.csv', 'w') as f:
    f.write('userid,foodid\n')
    for i,user in enumerate(u_map.keys()):
        s = ''
        for food_idx in reversed(np.argsort(answer_sheet[i,:])[-k:]):
            s += ' %d' % rev_food_map[food_idx]
        f.write('%d,%s\n' % (user, s) )
        buf.append(a == s)
        a = s
print buf
print 'done'

    
            


In [None]:
from constants import MAX_SEQ_LEN

print 'V2'        
flip = True       
model = torch.load('./best_v2.pt')
model.eval()
batch_size = 256
food_buf = torch.LongTensor(np.arange(len(food_map))).view(1,-1).repeat(batch_size, 1).cuda()
answer_sheet = np.zeros([len(u_map), len(food_map)])

with torch.no_grad():
    with tqdm(total=len(u_map)*len(food_map)) as pbar:
        keys = u_map.keys()
        ran = range((len(keys) // batch_size) +1) if len(keys) % batch_size != 0 else range(len(keys) // batch_size)
        for i in ran:
            a = i*batch_size
            b = (i+1)*batch_size if (i+1)*batch_size < len(keys) else len(keys)
            X = []
            for user in keys[a:b]:
                x = np.zeros([MAX_SEQ_LEN, len(food_map)])
                history = u_map[user]
                ds = np.array([d for d,f in history])
                fs = np.array([f for d,f in history])
                sorted_idx = np.flip(np.argsort(ds), axis=-1) if flip else np.argsort(ds)
                ds = ds[sorted_idx]
                fd = fs[sorted_idx]

                date_idx = 0
                now_date = ds[0]
                for food, date in zip(fs,ds):
                    if date != now_date:
                        date_idx+=1
                        now_date = date
                    x[date_idx, food_map[food]] = 1
                X.append(np.expand_dims(x,axis=0))
            
            X = torch.FloatTensor(np.vstack(X)).cuda()
            for food_idx  in range(len(food_map)):
                target = food_buf[:b-a, food_idx:food_idx+1]
                output = model(target, X)
                answer_sheet[a:b, food_idx:food_idx+1] =  output.cpu()
                pbar.update(batch_size)
                
np.save('./output_sheet2', answer_sheet)
print answer_sheet.shape
print'Done'

rev_food_map = {v:k for k,v in food_map.items()}
k=20
a = ''
buf = []
with open('predict2.csv', 'w') as f:
    f.write('userid,foodid\n')
    for i,user in enumerate(u_map.keys()):
        s = ''
        for food_idx in reversed(np.argsort(answer_sheet[i,:])[-k:]):
            s += ' %d' % rev_food_map[food_idx]
        f.write('%d,%s\n' % (user, s) )
        buf.append(a == s)
        a = s
print buf
print 'done'

    
            


In [None]:
i = 0 
a = [b for b in reversed(np.argsort(answer_sheet[i,:])[-k:])]
print a

# i=1
a = [b for b in reversed(np.argsort(answer_sheet[i,:])[-k:])]
print a



In [None]:
print answer_sheet[0,:]
print answer_sheet[1,:]

In [None]:
# print acc_q
# print output.shape
# c = output < 0.5
# print output < 0.5

print pred.shape
print label.shape
print torch.sum(pred == label.type(torch.uint8)).item() / float(output.shape[0])

In [None]:
a = torch.tensor([[0.49]])
b = torch.zeros_like(a)
print a.shape
print a > 0.5
c = a > 0.5
print b,c
print b.type(torch.uint8) == c


In [None]:
salnj;kvahjk
# print u_map[6]
buf  = [] 
def ck(ds):
    l = max(ds) - min(ds)
    min_d = min(ds)
    
#     d_list = [(d-min_d) for d in ds]
#     d_list = sorted(set(d_list))
    d_list = sorted(set(ds))
#     print l, len(d_list)
    for i,d in enumerate(d_list):
        if i == 0:
            continue
        d_ = d_list[i-1]
        if (d - d_).days.real != 1:
            d_diff = (d - d_).days.real
            buf.append(d_diff)
#             print d_diff
#             assert d_diff < 10
def cf(ds):
    l = max(ds) - min(ds)
    min_d = min(ds)
    
#     d_list = [(d-min_d) for d in ds]
#     d_list = sorted(set(d_list))
    d_list = sorted(set(ds))
    
#     print l, len(d_list)
    for i,d in enumerate(d_list):
        if i == 0:
            continue
        d_ = d_list[i-1]
        if (d - d_).days.real != 1:
            d_diff = (d - d_).days.real
            buf.append(d_diff)
#             print d_diff
#             assert d_diff < 10
                        
    
# for i in [6,8,12]:
with tqdm(total=len(u_map)) as pbar:
    buf = []
    buf2 = []
    for i in u_map.keys():
        ds = [d for d,f in u_map[i]]
        threshold = ((max(ds) - min(ds)).days.real * (4./5.))
        min_d = min(ds)
        fl = []
        m = {}
        for d,f in u_map[i]:
            if (d - min_d).days.real > threshold:
                fl.append(f)
            if d not in m:
                m[d] = []
            m[d].append(f)
        for k,v in m.items():    
            buf2.append(len(set(v)))
#         print len(u_map[i]), len(fl), len(set(fl))
        buf.append(len(set(fl)))
        #     print i, min(ds), max(ds), len(ds)
#         ck(ds)
        pbar.update(1)
# d6 = [d for d,f in u_map[6]]
# ck(d6)
    
#     print ck(ds)
# print u_map[33]
print np.mean(buf), np.std(buf)
print np.mean(buf2), np.std(buf2)


In [None]:


print min(dates), max(dates),  max(dates)- min(dates)


In [None]:
print min(dates)
m = min(dates)
print max(dates)
a = dates[0]
print dir(a - min(dates))
print a
# def normalize_date(min_date, max_date, date):
# for k in u_map.keys()[:10]:
#     print len(u_map[k])
ds = [(d-m).total_seconds() / (60*60*24) for d in dates]
# import numpy as np
print np.mean(ds)
print np.min(ds), np.max(ds)

In [None]:
import keras
from os.path import join
import os
from bs4 import BeautifulSoup as BS
from constants import MAX_SEQUENCE_LENGTH, MAX_NUM_WORDS, EMBEDDING_DIM
from keras.preprocessing.sequence import pad_sequences
import numpy as np
np.random.seed(1337)
def quote_title_abstract(xml_path):
    with open(xml_path, 'r') as f:
        data = f.read()
    soup = BS(data)
    title, abstract = soup.find('title').text, soup.find('abstract').text
    return title.strip(), abstract.strip()

# text preprocessing
data_path = join('./','kaggle/')
xml_dir = join(data_path, 't2-doc')
xml_list = [f for f in os.listdir(xml_dir) if f.endswith('.xml')]
# print(len(xml_list))


texts = []

for xml in xml_list:
    path = join(xml_dir,xml)
    title, abstract = quote_title_abstract(path)
    text = title + '' + abstract
    texts.append(text)
#     texts.append(title)
#     texts.append(abstract)
print('read all %d xml files.' % len(xml_list))
tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~ ',
                                   lower=True, split=' ', char_level=False, oov_token=None)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
xml_id_map = {}
for i,xml in enumerate(xml_list):
    node_id = int(xml.replace('.xml',''))
    xml_id_map[node_id] = data[i,:]


print('Preparing embedding matrix.')
embeddings_index = {}
# with open(os.path.join('./','glove', 'glove.6B.%dd.txt' % EMBEDDING_DIM), 'r', encoding='utf8') as f:
with open(os.path.join('./','glove', 'glove.6B.%dd.txt' % EMBEDDING_DIM), 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

print('done')