In [1]:
import numpy as np
from tqdm import tqdm
from constants import MAX_SEQ_LEN

with open('../hw_train.csv', 'rb') as f:
    ls = f.readlines()[1:]
# count vocabulary and max sequence length
lyrics_map = {}
bos_token = 's'.decode('utf8')
eos_token = 'e'.decode('utf8')
line_token = 'l'.decode('utf8')
oov_token = 'o'.decode('utf8')
none_lyrics_words = [u'編詞', u'作曲', u'作詞', u'編曲', u'監製']
vocab = [bos_token, eos_token, line_token, oov_token]
lens = []
c = 0 
with tqdm(total=len(ls)) as pbar:
    for l in ls:
        l = l.decode('utf8')
        no, l = l.strip().split(',')
        no = int(no)
        if no not in lyrics_map:
            lyrics_map[no] = []
        lyrics_map[no].append(l)
        pbar.update(1)
# split into Q,K pair
with tqdm(total=len(lyrics_map)) as pbar:
    for k,v in lyrics_map.items():
        line_num = len(v)
        V = [bos_token, ]
        Q = [bos_token, ]
        for i,l in enumerate(v):
            if len(l) <= 1:
                continue
            exist_none_layrics_word = False
            for nw in none_lyrics_words:
                if nw in l:
                    exist_none_layrics_word = True
                    break
            if exist_none_layrics_word:
                continue
            
            for ws in l.strip().split(' '):
                for w in ws:
                    if i <= line_num//2:
                        V.append(w)
                    else:
                        Q.append(w)
                    vocab.append(w)
            if i <= line_num//2:
                V.append(line_token)
            else:
                Q.append(line_token)
        
        V.append(eos_token)
        Q.append(eos_token)
        if len(Q) == 2 or len(V) == 2:
            lyrics_map.pop(k)
            pbar.update(1)
            continue
        lens.append(len(Q))
        lens.append(len(V))
        lyrics_map[k] = (Q, V)
        pbar.update(1)
            
print 'vocabulary size : %d, max len : %.0f, min len : %.0f, mean : %.0f, std : %.0f' % (len(set(vocab)), np.max(lens), np.min(lens), np.mean(lens), np.std(lens))
print 'lyrics number : %d' % len(lyrics_map)
# print np.max(lens), np.mean(lens), np.std(lens)



100%|██████████| 667085/667085 [00:01<00:00, 502697.37it/s]
100%|██████████| 18338/18338 [00:02<00:00, 7550.44it/s]


vocabulary size : 6450, max len : 4016, min len : 5, mean : 180, std : 117
lyrics number : 18223


In [2]:
import numpy as np
import keras
import pickle
from constants import MAX_SEQ_LEN, VOCAB_DIM
from keras.preprocessing.sequence import pad_sequences

texts = vocab

tokenizer = keras.preprocessing.text.Tokenizer(num_words=VOCAB_DIM, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~ ',
                                   lower=False, split=' ', char_level=True, oov_token=oov_token)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
with open('tokenizer','wb') as f:
    pickle.dump(tokenizer, f)

Using TensorFlow backend.


Found 6450 unique tokens.


In [3]:
from constants import MAX_SEQ_LEN
from keras.preprocessing.sequence import pad_sequences

idx = np.random.permutation(len(lyrics_map))
val_num = len(lyrics_map)//10
train_idx, val_idx = idx[val_num:], idx[:val_num]
train_u_map = {k:lyrics_map[k] for k in lyrics_map.keys()[val_num:]}
val_u_map = {k:lyrics_map[k] for k in lyrics_map.keys()[:val_num]}
def boostrap_generator(lyrics_map, max_seq_len, tokenizer):
    while True:
        keys = lyrics_map.keys()
        for idx in np.random.permutation(len(lyrics_map)):
            no = keys[idx]
            Q, K = lyrics_map[no]
            Q = np.array([tokenizer.word_index[w] if w in tokenizer.word_index else tokenizer.word_index[oov_token] for w in Q], dtype=np.int32).reshape(1,-1)
            K = np.array([tokenizer.word_index[w] if w in tokenizer.word_index else tokenizer.word_index[oov_token] for w in K], dtype=np.int32).reshape(1,-1)
            yield Q, K
    
bat = 4
G = boostrap_generator(train_u_map, MAX_SEQ_LEN, tokenizer)
q, k = next(G)
    
print q.shape, k.shape


(1, 242) (1, 233)


### Attention

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.init import xavier_normal_
from constants import VOCAB_DIM, D_MODEL, MAX_SEQ_LEN
# construct neuron network
class Seq2seq_att(nn.Module):

    def __init__(self, dm, num_lay):
        super(Seq2seq_att, self).__init__()
#         for construct cache positional encoding matrix.
        self.emb = nn.Embedding(VOCAB_DIM+1, dm, padding_idx=0)
        self.encoder = nn.GRU(dm, dm, num_lay, batch_first=True)
        self.decoder = nn.GRU(dm, dm, num_lay, batch_first=True)
        self.linear = nn.Linear(dm, VOCAB_DIM+1)

    def forward(self, Q, K):
        K = self.emb(K)
        Q = self.emb(Q)
        en_out, hn = self.encoder(K) 
        Q, _ = self.decoder(Q, hn)
        batch, q_len, _ = Q.size()
        batch, k_len, _ = K.size()
        att_in = torch.cat([en_out, Q], dim=1)
        Q_mask = self.Q_mask_matrix(batch, q_len)
        K_mask = torch.zeros([batch, q_len, k_len], dtype=torch.uint8).cuda()
        
        mask = torch.cat([K_mask, Q_mask], dim=-1)
        out = self.dot_attention(Q, att_in, att_in, mask=mask)
        
        
        
        y = self.linear(out)
        return y
    def Q_mask_matrix(self, batch, Q_len):
#         ByteTensor
        mask = torch.zeros([1, Q_len, Q_len], dtype=torch.uint8, requires_grad=False)
        for i in range(Q_len):
            mask[0,i,i+1:] = 1
        return mask.repeat(batch,1, 1).cuda()
    
    def dot_attention(self, Q, K, V, mask):
        assert Q.size()[-1] == K.size()[-1]
        assert len(Q.size()) == 3 and len(K.size()) == 3 and len(V.size()) == 3
        out = torch.matmul(Q,K.permute(0,2,1))
        if mask is not None:
            out.masked_fill_(mask, -float('inf'))
        return torch.matmul(F.softmax(out, dim=-1), V)


bat = 7
lay_num = 2
model = Seq2seq_att(D_MODEL, lay_num).cuda()
# print(o.size())
q, k = next(G)
q = torch.LongTensor(q).cuda()
k = torch.LongTensor(k).cuda()
# Q = torch.randint(VOCAB_DIM+1, [bat, MAX_SEQ_LEN]).cuda()
# K = torch.randint(VOCAB_DIM+1, [bat, MAX_SEQ_LEN]).cuda()

o = model(q, k)
print(o.size())
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
print(count_parameters(model))



torch.Size([1, 129, 6451])
2054195


In [5]:
from collections import deque
from tqdm import tqdm as tqdm

import time
def dump_log(model, n_iter, loss, val_loss, acc, val_acc, train_pred, train_label, val_pred, val_label, log_file_stream, tmp_model_path):
    log_text = '%.7d<split>%.5f<split>%.5f<split>%.5f<split>%.5f<split>%s<split>%s<split>%s<split>%s\n' % (n_iter, loss, val_loss, acc, val_acc, train_pred, train_label, val_pred, val_label)
    log_file_stream.write(log_text.encode('utf8'))
    if n_iter % 10 == 0 :
        log_file_stream.flush()
        torch.save(model, tmp_model_path)
def normal_acc(pred, label):
    pred = torch.argmax(pred, dim=-1)
    mask = torch.ones_like(label, dtype=torch.uint8)
#     
    acc = pred == label
    acc = torch.sum(acc).item() / float(torch.sum(mask).item())
    
    return acc
def seq2text(output, index_word):
    assert len(output.size()) == 1
    seq = output
    s = ''
    for i in seq:
        i = int(i.item())
        if i == 0:
            continue
        if i not in index_word:
            w = oov_token
        else:
            w = index_word[i]
        s += w
    return s
    
def rev_mask(m):
    out = torch.ones_like(m, dtype=torch.uint8, requires_grad=False)
    out.masked_fill_(m, 0)
    return out
def scheduled_sampling_rate(acc):
    return (acc)/3.
def scheduled_sampling_label(output, label, rate):
    assert len(label.size()) == 2 and len(output.size()) == 2
    ret = torch.zeros_like(label)
    for b in range(label.size()[0]):
        for i in range(label.size()[1]):
            r = np.random.random()
            if r < rate:
                ret[b,i] = output[b,i]
            else:
                ret[b,i] = label[b,i]
    return ret
                
acc_q = deque(maxlen=100)
loss_q = deque(maxlen=10)
acc = 0
val_acc_q = deque(maxlen=100)
val_loss_q = deque(maxlen=10)

t = time.time()
best_loss = float('inf')

epochs = 100
batch_size = 1
G = boostrap_generator(train_u_map, MAX_SEQ_LEN, tokenizer)
val_G = boostrap_generator(val_u_map, MAX_SEQ_LEN, tokenizer)
# criterion = nn.CrossEntropyLoss(reduction='none')
weight = torch.ones([VOCAB_DIM+1,], dtype=torch.float)
weight[tokenizer.word_index[line_token]] = 100.

# criterion = nn.CrossEntropyLoss(weight=weight.cuda())
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
print 'start training.'
with open('log-att.txt', 'w') as f:
    with open('best-att.txt', 'w') as best_log:
        iters = 100000000
        with tqdm(total=iters) as pbar:
            for it in range(iters):
                optimizer.zero_grad()
                model.train()
                q, k = next(G)
                q = torch.LongTensor(q).cuda()
                k = torch.LongTensor(k).cuda()

                q.requires_grad_(False)
                k.requires_grad_(False)
                
                with torch.no_grad():
                    output = model(q, k)
                q = scheduled_sampling_label(torch.argmax(output, dim=-1), q[:,:-1], scheduled_sampling_rate(acc))
            

                output = model(q, k)
                y = q[:,1:]
#                 loss = torch.sum(criterion(output, y) * pad_mask[:,1:, :]) 
                loss = criterion(output[:,:-1,:].permute(0,2,1), y) 
                label = y

                pred = output[:,:-1,:,]
                acc = normal_acc(pred, label)
                acc_q.append(acc)
                train_pred = seq2text(torch.argmax(output[0,:,:], dim=-1), tokenizer.index_word)
                train_label= seq2text(k[0,:], tokenizer.index_word)
                loss.backward()
                loss_q.append(loss.item())
                
                optimizer.step()
                with torch.no_grad():
                    model.eval()
                    q, k = next(val_G)
                    q = torch.LongTensor(q).cuda()
                    k = torch.LongTensor(k).cuda()
                    
                    q.requires_grad_(False)
                    k.requires_grad_(False)
                    
                    output = model(q, k)
                    y = q[:,1:]
                    val_loss = criterion(output[:,:-1,:,].permute(0,2,1), y) 
                    label = y

                    pred = output[:,:-1,:,]
                    val_acc = normal_acc(pred, label)
                    val_acc_q.append(val_acc)
                    val_pred = seq2text(torch.argmax(output[0,:,:], dim=-1), tokenizer.index_word)
                    val_label= seq2text(k[0,:], tokenizer.index_word)
                    val_loss_q.append(val_loss.item())


                
                acc = np.mean(acc_q)
                val_acc = np.mean(val_acc_q)
                loss = np.mean(acc_q)
                val_loss = np.mean(val_acc_q)
                
#                     pbar.set_postfix_str('acc : %.3f, val_acc : %.3f, loss : %.3f, val_loss : %.3f \t %.3f, %.3f, %.3f, %.3f' % (acc, val_acc, loss.item(), val_loss.item(), a,b,c,d), refresh=False)
                pbar.set_postfix_str('acc : %.3f, val_acc : %.3f, loss : %.3f, val_loss : %.3f' % (acc, val_acc, loss.item(), val_loss.item()), refresh=False)
                pbar.update(batch_size)
                dump_log(model, (it+1)*batch_size, loss, val_loss, acc, val_acc, train_pred, train_label, val_pred, val_label, f,'./tmp-att.pt')
                if val_loss.item() < best_loss and it > 100:
                    torch.save(model, './best-att.pt')
                    best_loss = val_loss
                    best_log.write('%d\t%.5f\n' % ((it+1)*batch_size, best_loss))
                    best_log.flush()
                if it % 2000 == 0 and it >= 100: 
                    print 'train pred : %s\ntrain label : %s' % (train_pred, train_label)
                    print 'validation pred : %s\nvalidation label : %s' % (val_pred, val_label)

# Train model
print("Optimization Finished!")
# print("Total time elapsed: {:.4f}s".format(time.time() - t_total))

  0%|          | 0/100000000 [00:00<?, ?it/s]

start training.


  "type " + obj.__name__ + ". It won't be checked "
  0%|          | 2004/100000000 [01:53<1356:48:06, 20.47it/s, acc : 0.172, val_acc : 0.161, loss : 0.172, val_loss : 0.161]

train pred : 我们你l要你们己l果llll我样l我们有你你你l我lll我经不你的我要llll我是你的lllllllll我们有你你你l我lll我你你的的ll以lll我是l的在你的ll是你lll我们你你你你lllll我经不你的不要我lll我是你的lll柔llll我我们有你你你lllll我你你的的ll以lll我是l的在你的我你是你么ll我
train label : s收拾好行李我要去哪里l只要跟着你可是梦一推就醒l车外的风景有你的身影l多希望是你牵起我这身白衣裙l是我的婚礼对面不是你l我控制不了我自己如果你出现在这里l我终于成了别人的女人l曾经为你奋不顾身的人l只为你偶尔的温柔越走越深l我终于成了别人的女人l等到最后无路可退的人l还担心留给你的爱还是那么深l车外的风景有你的身影l多希望是你牵起我这身白衣裙l是我的婚礼对面不是你le
validation pred : 我们心lll我们心lll我们心lll我的你次ll是l要lll我l
validation label : s我的吧的吧l我的吧的吧l我的吧的吧说着一句谁也听不懂的话l我结吧结吧l我结吧结吧l我结吧结吧唱着一句谁也听不懂的歌le


  0%|          | 4003/100000000 [03:24<1340:01:41, 20.73it/s, acc : 0.205, val_acc : 0.211, loss : 0.205, val_loss : 0.211]

train pred : 我你见你惜l此llll我llll亮lllll我起lll界l经不有ll我的经不有l一丽的界l我春llllllllll一的的个人待l的ll我一丽的l界l经l抱ll我的经不有l样丽l界l的
train label : s心升明月l飞鸟归山林落日入东海l我心上的人你从哪里来l青山随云走大地沿河流l这深情一片等待谁收留l这广阔的天地如何安放我l我如何安放这广阔天地l我心深似海你宛如明月l这般美如画却遥不可及le
validation pred : 你的不经ll不你lll我你的果的lll我是是你你么l我l的一ll的ll的心诺l你的你的果你受l你是样的开l的我的
validation label : s当我想你的时候l我的心在颤抖l当我想你的时候l泪水也悄悄的滑落l当我想你的时候l才知道寂寞是什么l当我想你的时候l谁听我诉说le


  0%|          | 6003/100000000 [04:58<1372:47:01, 20.23it/s, acc : 0.214, val_acc : 0.225, loss : 0.214, val_loss : 0.225]

train pred : 我不我l天l然要ll是lll不在ll的我l样去llll我里l个都美l我想l能lll我要l去我寞的有ll个人l么l什天lll个歌l我少年春l生l心l成了己l我要要不经lllll我寞的有ll要ll样l寞l人l以lll我
train label : s寂寞中想起需要我么l闷闷地想听一首歌l初恋的年头都与我们渡过l曾是醉人情节l再想极傻l玩乐声遮掩不了痛楚l共大家一起苦笑么l多少青年人l伤了变成自我l谁亦也曾年少爱玩火l莫为昨日独自难过l又是现实又是无心的错l没有开始知道这原是结果le
validation pred : 我的的我是能再ll的心丽l我想你你天的眼ll我雨你的得我我样l我么不是不见lll我前不不能让我l我的不能l我l我的我的l去l市lll的l去lll我们你身天l间ll市ll默l论l我想有你一你的命l着ll是我么多ll我的现在我的心命l着我llllll我的的我是能再ll的心丽l我的的我是能再ll的心丽l我息
validation label : s我想是不期的而遇l你在二月的春风里l怎么就会遇见了你l从此我不再是我l你也不再是你l我想是春天的旨意l风把你吹到了这里l怎么就会遇见了你l从此我不再是我l你也不再是你l你在我走过的城市里走我走过的脚印l我在你千里之外的城市里默默无闻l我没有成为你生活牵绊你总是那么单纯l你出现了我的生活带着前一天的余温le


  0%|          | 7672/100000000 [06:28<1405:30:39, 19.76it/s, acc : 0.220, val_acc : 0.221, loss : 0.220, val_loss : 0.221] 


RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED

In [None]:
print output.shape
print y.shape
print q.shape