## Rating csv


In [2]:
import pandas as pd
import datetime
import numpy as  np
from tqdm import tqdm
from scipy.sparse import csr_matrix
np.random.seed(1337)
with open('./kaggle/rating_train.csv', 'r') as f:
    ls = f.readlines()[1:]
u_map = {}

dates = []
foods = []
users = []
    


with tqdm(total=len(ls)) as pbar:
    for l in ls:
        date_str, user, food = l.strip().split(',')
        date = datetime.datetime.strptime(date_str, '%Y-%m-%d')
        user, food = int(user), int(food)
        if user not in u_map:
            u_map[user] = []
        u_map[user].append( (date, food) )
        
        dates.append(date)
        users.append(user)
        foods.append(food)
        pbar.update(1)
        

user_map = {u:i for i, u in enumerate(set(users))}        
food_map = {f:i for i, f in enumerate(set(foods))}


# for ranking sparse matrix
rows = [user_map[u] for u in users]
cols = [food_map[f] for f in foods]
R = csr_matrix((np.ones([len(rows), ]), (rows, cols)), shape=(len(user_map), len(food_map)))

pos_count = np.array(np.sum(R, axis=0)).flatten()
neg_count = len(ls) - pos_count

class_weight =  1. / 2.*neg_count
pos_weight = neg_count / pos_count
print R.shape
print neg_count.shape
print pos_count.shape

100%|██████████| 2681494/2681494 [00:20<00:00, 130701.89it/s]


(2608, 5532)
(5532,)
(5532,)


## User csv


In [1]:
import pandas as pd
import numpy as np
import keras
import os
from constants import MAX_TEXT_SEQ_LEN, MAX_NUM_WORDS, EMBEDDING_DIM
from keras.preprocessing.sequence import pad_sequences



csv = pd.read_csv('./kaggle/user.csv')
print csv.columns
# print 'userid,username,age,gender,location,city,state,title,about_me,reasons,inspirations,friends_count'
# for csv.iterrows
texts = []
id_list = []
age_list = []
gender_list = []
print 'Starting read texts.'
for row in csv.iterrows():
    r = row[1]
    s = ''
    s += r['about_me'] if not pd.isnull(r['about_me']) else ''
    s += r['reasons'] if not pd.isnull(r['reasons']) else ''
    s += r['inspirations'] if not pd.isnull(r['inspirations']) else ''
    id_list.append(r['userid'])
    age_list.append(r['age'])
    gender_list.append(r['gender'])
    texts.append(s)

# normalize age
valid_age_list = [age for age in age_list if not np.isnan(age)]
m, std = np.mean(valid_age_list), np.std(valid_age_list)
for i,age in enumerate(age_list):
    if not np.isnan(age):
        age_list[i] = float(age-m) / std
    else:
        age_list[i] = 0
buf = [len(s) for s in texts]
print np.max(buf), np.mean(buf), np.std(buf)

tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~ ',
                                   lower=True, split=' ', char_level=False, oov_token=None)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
sequences = tokenizer.texts_to_sequences(texts)
pad_data = data = pad_sequences(sequences, maxlen=MAX_TEXT_SEQ_LEN, padding='post', truncating='post')


print('Preparing embedding matrix.')
embeddings_index = {}
# with open(os.path.join('./','glove', 'glove.6B.%dd.txt' % EMBEDDING_DIM), 'r', encoding='utf8') as f:
with open(os.path.join('./','glove', 'glove.6B.%dd.txt' % EMBEDDING_DIM), 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

        
# 
userid_map = {user:i for i, user in enumerate(id_list)}
def get_user_feature_fn(userid):
    idx = userid_map[userid]
    age = age_list[idx]
    gender = 1 if gender_list[idx] == 'Female' else 0
    text_seq = pad_data[idx, :]
    x = np.array([age, gender])
    return x, text_seq
u, u_text = get_user_feature_fn(8526)
print u.shape, u_text.shape
print u, u_text
print embedding_matrix.shape

Using TensorFlow backend.


Index([u'userid', u'username', u'age', u'gender', u'location', u'city',
       u'state', u'title', u'about_me', u'reasons', u'inspirations',
       u'friends_count'],
      dtype='object')
Starting read texts.
6990 567.4152607361963 737.8887977118112
Found 13852 unique tokens.
Preparing embedding matrix.
(2,) (2000,)
[-0.53151114  1.        ] [  4 372  19 ...   0   0   0]
(2001, 300)


In [6]:
print len([s for s in texts if len(s) > 1])/ float(len(user_map))

0.720475460123


#### prepare sample generator


In [3]:
from constants import MAX_SEQ_LEN


idx = np.random.permutation(len(u_map))
val_num = len(u_map)//10
train_idx, val_idx = idx[val_num:], idx[:val_num]
train_u_map = {k:u_map[k] for k in u_map.keys()[val_num:]}
val_u_map = {k:u_map[k] for k in u_map.keys()[:val_num]}
def batch_boostrap_generator(batch_size, u_map, food_map, max_history_len, get_user_feature_fn):
    G = boostrap_generator(u_map, food_map, max_history_len, get_user_feature_fn)
    while True:
        X = []
        pad_masks = []
        U = []
        U_text = []
        for i in range(batch_size):
            x, x_len, u, u_text = next(G)
            X.append(np.expand_dims(x, axis=0))
            U.append(np.expand_dims(u, axis=0))
            U_text.append(np.expand_dims(u_text, axis=0))
            pad_mask = np.zeros_like(x)
            for idx in range(x_len):
                pad_mask[idx, :] = (max_history_len-x_len+idx+1)*0.3
            pad_masks.append(np.expand_dims(pad_mask, axis=0))
        yield np.vstack(X), np.vstack(pad_masks), np.vstack(U), np.vstack(U_text)
def boostrap_generator(u_map, food_map, max_history_len, get_user_feature_fn):
    while True:
        keys = u_map.keys()
        for user_idx in np.random.permutation(len(u_map)):
            user = keys[user_idx]
            X = np.zeros([max_history_len, len(food_map)])
            history = u_map[user]
            ds = np.array([d for d,f in history])
            fs = np.array([f for d,f in history])
            sorted_idx = np.argsort(ds)
            ds = ds[sorted_idx]
            fd = fs[sorted_idx]
            
            date_idx = 0
            now_date = ds[0]
            for food, date in zip(fs,ds):
                if date != now_date:
                    date_idx+=1
                    now_date = date
                X[date_idx, food_map[food]] = 1
            x_len = date_idx+1
#             user feature
            u, u_text = get_user_feature_fn(user)
            yield X, x_len, u, u_text
            
    

G = batch_boostrap_generator(32, train_u_map, food_map, max_history_len=MAX_SEQ_LEN, get_user_feature_fn=get_user_feature_fn)
# val_G = batch_boostrap_generator(32//2, val_u_map, food_map, max_history_len=MAX_SEQ_LEN)

x, pad_mask, u, u_text = next(G)
print x.shape, pad_mask.shape, u.shape, u_text.shape
# x, pad_mask = next(val_G)
# print x.shape, pad_mask.shape

# G2 = boostrap_generator(train_u_map, food_map, max_history_len=MAX_SEQ_LEN)
# x, x_len = next(G2)
# print x.shape
# print pad_mask[3,:,3]

(32, 165, 5532) (32, 165, 5532) (32, 2) (32, 2000)


### Transformer model with one-for-all BCE loss

In [4]:
# import imp
# from os.path import expanduser
# home = expanduser("~")
# Transformer = imp.load_source('Transformer', '%s/git/grandchallenge/ASR/Transformer.py' % home)


In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.init import xavier_normal_
# construct neuron network

def scaled_dot_attention(Q, K, V, mask):
    assert Q.size()[-1] == K.size()[-1]
    assert len(Q.size()) == 3 and len(K.size()) == 3 and len(V.size()) == 3
    dk = torch.tensor(K.size()[-1], dtype=torch.float32, requires_grad=False).cuda()
    out = torch.matmul(Q,K.permute(0,2,1)) / torch.sqrt(dk) 
    if mask is not None:
        out.masked_fill_(mask, -float('inf'))
    return torch.matmul(F.softmax(out, dim=-1), V)

def positional_encoding(d_model, pos):
    assert d_model % 2 == 0
    pos = torch.tensor(pos, dtype=torch.float32, requires_grad=False)
    pe = torch.zeros([1,d_model], dtype=torch.float32, requires_grad=False)
    for i in range(D_MODEL//2):
        a = torch.tensor(10000, dtype=torch.float32, requires_grad=False)
        b = torch.tensor(2.*i/float(D_MODEL), dtype=torch.float32, requires_grad=False)
        c = pos / torch.pow(a, b)
        pe[0, 2*i] = torch.sin(c)
        pe[0, 2*i+1] = torch.cos(c)
    return pe
                            
class Transformer_v3(nn.Module):

    def __init__(self, layer_num, dk, dv, dm, h, p_drop, d_ff, use_mask, use_cuda=True, posi_cache_length=200):
        super(Transformer_v3, self).__init__()
#         for construct cache positional encoding matrix.
        self.d_model = dm
        self.use_cuda = use_cuda
        
        self.decoder = Stack_Decoder(layer_num, dk, dv, dm, h, p_drop, d_ff, use_mask)
        self.emb_drop = nn.Dropout(p_drop)
        self.init_pos_mat(posi_cache_length)

    def forward(self, Q):
    
        
#         decoder
        batch, Q_len, d = Q.size()
        
        try:
            Q = Q + self.get_pos_mat(Q_len)
        except RuntimeError, e:
            if e.message == 'TensorIterator expected type torch.cuda.FloatTensor but got torch.FloatTensor':
                if Q.is_cuda != self.get_pos_mat(K_len).is_cuda:
                    print('Make sure cache positional matrix is same type of tensor with input, both cuda tensor or not.\nBy setting argument use_cuda=True to set cache positional encoding matrix as a cuda tensor.')
            raise
        
        Q = self.emb_drop(Q)
        
        de_out = self.decoder(Q)
        return de_out
    
#     To speed up the positional encoding by construct an cache matrix. 
    def init_pos_mat(self, cache_length):
        print('init postional matrix with length : %d ' % cache_length)
        self.positional_matrix = torch.cat([positional_encoding(self.d_model, i) for i in range(0,cache_length)], dim=0)
        self.positional_matrix.requires_grad = False
        if self.use_cuda:
            self.positional_matrix = self.positional_matrix.cuda()
            
        
    def get_pos_mat(self, length):
        if length > self.positional_matrix.shape[0]:
            print('input sequence length reach positional matrix maximum length. %d ' % length)
            ret = torch.cat([positional_encoding(self.d_model, i) for i in range(length)], dim=0)
            ret.requires_grad = False
            print('Increase positional matrix maximum length. %d ' % length)
            self.positional_matrix = ret
            if self.use_cuda:
                self.positional_matrix = self.positional_matrix.cuda()
            return ret
        else:
            return self.positional_matrix[:length]
        

    
    

class Stack_Decoder(nn.Module):
    """
    Stacked Decoder
    """
    def __init__(self, layer_num, dk, dv, dm, h, p_drop, d_ff, use_mask):
        super(Stack_Decoder, self).__init__()
        self.decoders = nn.ModuleList([Decoder(dk, dv, dm, h, p_drop, d_ff, use_mask) for i in range(layer_num)])
        
        
    def forward(self, Q):
        # ModuleList can act as an iterable, or be indexed using ints
        for lay in self.decoders:
            Q = lay(Q)
        return Q           

class Decoder(nn.Module):
    def __init__(self, dk, dv, dm, h, p_drop, d_ff, use_mask):
        super(Decoder, self).__init__()
        self.use_mask = use_mask
        
#         query attention residual block
        self.Q_attention_lay = nn.Linear(dm, dm)
        self.Q_att_drop = nn.Dropout(p_drop)

#         feed forward residual block
        self.fcn = nn.Linear(dm, dm)
        self.linear_drop = nn.Dropout(p_drop)
        

    def forward(self, Q):
        if self.use_mask:
            batch, Q_len, d = Q.size()
            mask = self.mask_matrix(batch, Q_len)
        else:
            mask = None
#         query attention
        Q_attention_out = self.Q_attention_lay(scaled_dot_attention(Q, Q, Q, mask=mask))
        Q_att_out = self.Q_att_drop(Q_attention_out)
        
#         feed forward
        linear_out = self.fcn(Q_att_out)
        return linear_out
    def mask_matrix(self, batch, Q_len):
#         ByteTensor
        mask = torch.zeros([1, Q_len, Q_len], dtype=torch.uint8, requires_grad=False)
        for i in range(Q_len):
            mask[0,i,i+1:] = 1
        return mask.repeat(batch,1, 1).cuda()


    
# Transformer paper baseline hyper-parameters
STACKED_NUM = 2
H = 4
D_MODEL = 128
DK = DV = D_MODEL//H
P_DROP = 0.05
D_FF = D_MODEL*4




    
# bat = 3
# Q = torch.rand([bat, 13, D_MODEL]).cuda()
# model = Transformer_v2(STACKED_NUM, DK, DV, D_MODEL, H, P_DROP, D_FF, use_mask=True, use_cuda=True).cuda()
# o = model(Q)
# print(o.size())

# Q = torch.rand([bat, 47, D_MODEL]).cuda()
# o = model(Q)
# print(o.size())
# # # print o
# def count_parameters(model):
#     return sum(p.numel() for p in model.parameters() if p.requires_grad)
# print(count_parameters(model))



In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F### Transformer with ALS embedding Training
# import Transformer/

import numpy as np
from constants import FOOD_NUM, USER_NUM
class Net(nn.Module):

    def __init__(self, dm, p_drop, emb_mat):
        super(Net, self).__init__()
        self.drop = nn.Dropout(p_drop)
        self.food_emb = Food_embedding(FOOD_NUM, dm, 1, p_drop)
        self.user_emb = User_embedding(dm, 3, emb_mat, p_drop, activation_fn=F.relu)
        self.transformer = Transformer_v3(STACKED_NUM, DK, DV, D_MODEL, H, P_DROP, D_FF, use_mask=True, use_cuda=True).cuda()

        self.output_linear = nn.Linear(2*dm, FOOD_NUM)

    def forward(self, history, u, u_text):
#         print(K.size(), get_pos_mat(MAX_SEQUENCE_LENGTH).size())
        
        x = self.food_emb(history)
        batch, x_len, d = x.size()
        
        x = self.transformer(x)
#         x = torch.sigmoid(x)
        u_out = self.user_emb(u, u_text).unsqueeze(1).repeat(1, x_len, 1)
        x = self.output_linear(torch.cat([x, u_out], dim=-1))
        
        return x
        
class Food_embedding(nn.Module):
    def __init__(self, c_in, dm, layer_num, p_drop, activation_fn=F.selu):
        super(Food_embedding, self).__init__()
        self.activation_fn = activation_fn
        self.drop = nn.Dropout(p_drop)
        assert layer_num >= 1
        self.first_linear = nn.Linear(c_in, dm)
        self.linears = nn.ModuleList([nn.Linear(dm, dm) for i in range(layer_num-1)])
        

    def forward(self, x):
#         print(K.size(), get_pos_mat(MAX_SEQUENCE_LENGTH).size())
        x = self.first_linear(x)
        for lay in self.linears:
            x = self.activation_fn(lay(x))
            if lay != self.linears[-1]:
                x = self.drop(x)
        return x

class User_embedding(nn.Module):
    def __init__(self, dm, layer_num, emb_mat, p_drop, activation_fn=F.selu):
        super(User_embedding, self).__init__()
        self.activation_fn = activation_fn
        self.drop = nn.Dropout(p_drop)
        assert layer_num >= 1
        self.emb = nn.Embedding(emb_mat.shape[0], emb_mat.shape[1], padding_idx=0)
        self.emb.weight = nn.Parameter(torch.FloatTensor(emb_mat))
        self.emb.weight.requires_grad_(False)
        self.emb_linear = nn.Linear(emb_mat.shape[1], dm)
        self.att_weight = nn.Parameter(torch.zeros([1, dm-2**(layer_num+1), ], dtype=torch.float))
        torch.nn.init.xavier_normal_(self.att_weight)
        self.linears = nn.ModuleList([nn.Linear(2**(i+1), 2**(i+2)) for i in range(layer_num)])
        

    def forward(self, u, u_text):
        for lay in self.linears:
            u = self.activation_fn(lay(u))
#         for text
        u_text = self.emb(u_text)
        u_text = self.activation_fn(self.emb_linear(u_text))
        batch, seq, d = u_text.size()
        att_w = self.att_weight.view(1,1, -1).repeat(batch, 1, 1)
        Q = torch.cat([u.unsqueeze(1),att_w], dim=-1)
        u_att = scaled_dot_attention(Q, u_text, u_text, mask=None)
        u_att.squeeze_(1)
        
        return u_att
    
batch = 7
dm = D_MODEL
Q = torch.rand([batch, 18, FOOD_NUM]).cuda()
u = torch.rand([batch, 2]).cuda()
u_text = torch.randint(MAX_NUM_WORDS,[batch, MAX_TEXT_SEQ_LEN], dtype=torch.long).cuda()
model = Net(dm, 0.1, embedding_matrix).cuda()
o = model(Q, u, u_text)
# print t
print(o.size())
# print o

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(count_parameters(model))




init postional matrix with length : 200 
torch.Size([7, 18, 5532])
2234832


In [None]:
from collections import deque
from tqdm import tqdm as tqdm

import time
def dump_log(model, n_iter, loss, val_loss, acc, val_acc, precision, val_precision, recall, val_recall, f1, val_f1, log_file_stream, tmp_model_path):
    log_text = '%.7d<split>%.5f<split>%.5f<split>%.5f<split>%.5f<split>%.5f<split>%.5f<split>%.5f<split>%.5f<split>%.5f<split>%.5f\n' % (n_iter, loss, val_loss, acc, val_acc, precision, val_precision, recall, val_recall, f1, val_f1)
    log_file_stream.write(log_text)
    if n_iter % 10 == 0 :
        log_file_stream.flush()
        torch.save(model, tmp_model_path)
def normal_acc(pred, label, pad_mask):
    label = label.type(torch.uint8)
    mask = (pad_mask != 0).type(torch.uint8)
#     
    acc = pred == label
    acc = acc.masked_select(mask)
    acc = torch.sum(acc).item() / float(torch.sum(mask).item())
#     
    TP = float(torch.sum(pred.masked_select(label)).item())
    TP_FP = torch.sum(pred.masked_select(mask)).item()
#     precision = TP / TP_FP if TP_FP != 0 else float('nan')
    precision = TP / TP_FP if TP_FP != 0. else 0.
    recall = TP / torch.sum(label).item()
# 
    f1 = 2.*precision*recall / (precision+recall) if (precision+recall) != 0  else 0
    
    return acc, precision, recall, f1
def rev_mask(m):
    out = torch.ones_like(m, dtype=torch.uint8, requires_grad=False)
    out.masked_fill_(m, 0)
    return out
        
acc_q = deque(maxlen=1000)
precision_q = deque(maxlen=1000)
recall_q = deque(maxlen=1000)
loss_q = deque(maxlen=1000)
f1_q = deque(maxlen=1000)

val_acc_q = deque(maxlen=10000)
val_loss_q = deque(maxlen=10000)
val_precision_q = deque(maxlen=10000)
val_recall_q = deque(maxlen=10000)
val_f1_q = deque(maxlen=1000)

t = time.time()
best_f1  = 0
best_loss = float('inf')

epochs = 100
batch_size = 16
G = batch_boostrap_generator(batch_size, train_u_map, food_map, max_history_len=MAX_SEQ_LEN, get_user_feature_fn=get_user_feature_fn)
val_G = batch_boostrap_generator(batch_size, val_u_map, food_map, max_history_len=MAX_SEQ_LEN, get_user_feature_fn=get_user_feature_fn)
# criterion = nn.BCEWithLogitsLoss(weight=torch.FloatTensor(class_weight).cuda(), reduction='none', pos_weight=torch.FloatTensor(pos_weight).cuda())
criterion = nn.BCEWithLogitsLoss(reduction='none')
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
print 'start training.'
with open('log-hybrid.txt', 'w') as f:
    with open('best-hybrid.txt', 'w') as best_log:
        iters = 100000000
        with tqdm(total=iters) as pbar:
            for it in range(iters):
                optimizer.zero_grad()
                model.train()
                seq, pad_mask, u, u_text = next(G)
                seq = torch.FloatTensor(seq).cuda()
                pad_mask = torch.FloatTensor(pad_mask).cuda()
                u = torch.FloatTensor(u).cuda()
                u_text = torch.LongTensor(u_text).cuda()
                
                seq.requires_grad_(False)
                pad_mask.requires_grad_(False)
                u.requires_grad_(False)
                u_text.requires_grad_(False)
                
                x = seq[:,:-1,:]
                y = seq[:,1:,:]

                output = model(x, u, u_text)
#                     a,b = foo(output, y)
#                     loss = normal_loss(criterion, output, y)

#                 loss = torch.sum(criterion(output, y) * pad_mask[:,1:, :]) / torch.sum(pad_mask[:,1:, :])
                loss = torch.sum(criterion(output, y) * pad_mask[:,1:, :]) 
                pred = output > 0.5
                label = y


                acc, precision, recall, f1 = normal_acc(pred, label, pad_mask[:,1:,:])
                acc_q.append(acc)
                precision_q.append(precision)
                recall_q.append(recall)
                f1_q.append(f1)
                loss.backward()

                optimizer.step()
                with torch.no_grad():
                    model.eval()
                    seq, pad_mask, u, u_text = next(val_G)
                    seq = torch.FloatTensor(seq).cuda()
                    pad_mask = torch.FloatTensor(pad_mask).cuda()
                    u = torch.FloatTensor(u).cuda()
                    u_text = torch.LongTensor(u_text).cuda()

                    seq.requires_grad_(False)
                    pad_mask.requires_grad_(False)
                    u.requires_grad_(False)
                    u_text.requires_grad_(False)


                    x = seq[:,:-1,:]
                    y = seq[:,1:,:]
                    output = model(x, u, u_text)
#                     c,d = foo(output, y)
#                         val_loss = normal_loss(criterion, output, y)
#                         val_loss = criterion(output, y)
#                         pred = output > 0.5
#                     val_loss = torch.sum(criterion(output, y)* pad_mask[:,1:, :]) / torch.sum(pad_mask[:,1:, :])
                    val_loss = torch.sum(criterion(output, y)* pad_mask[:,1:, :]) 
    
                    pred = output > 0.5

                    label = y

                    val_acc, val_precision, val_recall, val_f1 = normal_acc(pred, label, pad_mask[:,1:,:])
                    val_acc_q.append(val_acc)
                    val_precision_q.append(val_precision)
                    val_recall_q.append(val_recall)
                    val_f1_q.append(val_f1)
                
                acc = np.mean(acc_q)
                precision = np.mean(precision_q)
                recall = np.mean(recall_q)
                f1 = np.mean(f1_q)
                
                val_acc = np.mean(val_acc_q)
                val_precision = np.mean(val_precision_q)
                val_recall = np.mean(val_recall_q)
                val_f1 = np.mean(val_f1_q)
                
#                     pbar.set_postfix_str('acc : %.3f, val_acc : %.3f, loss : %.3f, val_loss : %.3f \t %.3f, %.3f, %.3f, %.3f' % (acc, val_acc, loss.item(), val_loss.item(), a,b,c,d), refresh=False)
                pbar.set_postfix_str('acc : %.3f, val_acc : %.3f, precision : %.3f, val_precision : %.3f, recall : %.3f, val_recall : %.3f, loss : %.3f, val_loss : %.3f, f1 : %.3f, val_f1 : %.3f' % (acc, val_acc, precision, val_precision, recall, val_recall, loss.item(), val_loss.item(), f1, val_f1), refresh=False)
                pbar.update(batch_size)
                dump_log(model, (it+1)*batch_size, loss, val_loss, acc, val_acc, precision, val_precision, recall, val_recall, f1, val_f1, f,'./tmp-hybrid.pt')
                if val_f1 > best_f1 and it > 100:
                    torch.save(model, './best-hybrid.pt')
                    best_f1 = val_f1
                    best_log.write('%d\t%.5f\n' % ((it+1)*batch_size, best_f1))
                    best_log.flush()

# Train model
print("Optimization Finished!")
# print("Total time elapsed: {:.4f}s".format(time.time() - t_total))

  0%|          | 0/100000000 [00:00<?, ?it/s]

start training.


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  1%|          | 754416/100000000 [7:12:05<757:05:16, 36.41it/s, acc : 0.998, val_acc : 0.998, precision : 0.759, val_precision : 0.429, recall : 0.148, val_recall : 0.095, loss : 1656430.250, val_loss : 2543757.000, f1 : 0.246, val_f1 : 0.154] 