In [43]:
import torch
import torch.nn as nn
import time
import random
import math
import zipfile
import numpy as np
device = torch.device('cuda' if torch.cuda.is_available()else 'cpu')

In [44]:
def load_data_lyrics():
    with zipfile.ZipFile('/home/data/jaychou_lyrics.txt.zip') as zin:
        with zin.open('jaychou_lyrics.txt')as f:
            corpus_chars = f.read().decode('utf-8')
    corpus_chars = corpus_chars.replace('\n',' ').replace('\r',' ')
    corpus_chars = corpus_chars[:10000]
    idx_to_char = list(set(corpus_chars))
    char_to_idx = dict([[char,i]for i,char in enumerate(idx_to_char)])
    vocab_size = len(idx_to_char)
    corpus_iter = [char_to_idx[char]for char in corpus_chars]
    return idx_to_char,char_to_idx,vocab_size,corpus_iter
idx_to_char,char_to_idx,vocab_size,corpus_iter = load_data_lyrics()
corpus_iter[:10]

[219, 572, 181, 62, 961, 928, 275, 219, 572, 819]

In [45]:
def data_iter_random(corpus_iter,batch_size,num_steps,device=None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available()else 'cpu')
    num_examples = (len(corpus_iter)-1)//num_steps
    num_epochs = num_examples//batch_size
    example_iter = list(range(num_examples))
    random.shuffle(example_iter)
    def _data(pos):
        return corpus_iter[pos:pos+num_steps]
    for i in range(num_epochs):
        i = i*batch_size
        batch_iter = example_iter[i:i+batch_size]
        X = [_data(j*num_steps)for j in batch_iter]
        Y = [_data(j*num_steps+1)for j in batch_iter]
        yield torch.tensor(X,dtype=torch.float32,device=device),torch.tensor(Y,dtype=torch.float32,device=device)

In [46]:
x = list(range(30))
print(x)
for X,Y in data_iter_random(x,2,6):
    print(X,Y)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
tensor([[18., 19., 20., 21., 22., 23.],
        [ 6.,  7.,  8.,  9., 10., 11.]], device='cuda:0') tensor([[19., 20., 21., 22., 23., 24.],
        [ 7.,  8.,  9., 10., 11., 12.]], device='cuda:0')
tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [12., 13., 14., 15., 16., 17.]], device='cuda:0') tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [13., 14., 15., 16., 17., 18.]], device='cuda:0')


In [47]:
def data_iter_consecution(corpus_iter,batch_size,num_steps,device=None):
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available()else 'cpu')
    corpus_iter = torch.tensor(corpus_iter,dtype=torch.float32,device=device)
    data_len = len(corpus_iter)
    num_batch = data_len//batch_size
    iteration = corpus_iter[0:batch_size*num_batch].view(batch_size,num_batch)
    num_epochs = (num_batch-1)//num_steps
    for i in range(num_epochs):
        i = i *num_steps
        x = iteration[:,i:i+num_steps]
        y = iteration[:,i+1:i+num_steps+1]
        yield x,y


In [48]:
x = list(range(30))
print(x)
for X,Y in data_iter_consecution(x,2,6):
    print(X,Y)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]
tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [15., 16., 17., 18., 19., 20.]], device='cuda:0') tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [16., 17., 18., 19., 20., 21.]], device='cuda:0')
tensor([[ 6.,  7.,  8.,  9., 10., 11.],
        [21., 22., 23., 24., 25., 26.]], device='cuda:0') tensor([[ 7.,  8.,  9., 10., 11., 12.],
        [22., 23., 24., 25., 26., 27.]], device='cuda:0')


In [49]:
def onehot(x,n_class,dtype=torch.float32):
    x = x.long()
    res = torch.zeros(x.shape[0],n_class,dtype=torch.float32,device=x.device)
    res.scatter_(1,x.view(-1,1),1)
    return res

In [50]:
def to_onehot(x,n_class):
    return [onehot(x[:,i],n_class)for i in range(x.shape[1])]

In [51]:
num_inputs,num_hiddens,num_outputs = vocab_size,256,vocab_size

In [52]:
def get_params():
    def _one(shape):
        ts = torch.tensor(np.random.normal(0,0.01,size=shape),dtype=torch.float32,device=device)
        return nn.Parameter(ts,requires_grad=True)
    def _three():
       return ( _one((num_inputs,num_hiddens)),
                _one((num_hiddens,num_hiddens)),
                nn.Parameter(torch.zeros(num_hiddens,device=device,dtype=torch.float32),requires_grad=True))
    w_xz,w_hz,b_z = _three()
    w_xr,w_hr,b_r = _three()
    w_xh,w_hh,b_h = _three()
    
    w_hq = _one((num_hiddens,num_outputs))
    b_q = nn.Parameter(torch.zeros(num_outputs,device=device,dtype=torch.float32),requires_grad=True)
    
    return nn.ParameterList([w_xz,w_hz,b_z,w_xr,w_hr,b_r, w_xh,w_hh,b_h,w_hq,b_q])

In [53]:
def init_gru_states(batch_size,num_hiddens,device):
    return (torch.zeros((batch_size,num_hiddens),device=device))

In [54]:
def GRU(inputs,state,params):
    w_xz,w_hz,b_z,w_xr,w_hr,b_r, w_xh,w_hh,b_h,w_hq,b_q= params
    H = state
    outputs = []
    for x in inputs:
        R = torch.matmul(x,w_xr)+torch.matmul(H,w_hr)+b_r
        R = torch.sigmoid(R)
        Z = torch.matmul(x,w_xz)+torch.matmul(H,w_hz)+b_z
        Z = torch.sigmoid(Z)
        H_tilda = torch.matmul(x,w_xh)+R*torch.matmul(H,w_hh)+b_h
        H_tilda = torch.tanh(H_tilda)
        H = Z*H +(1-Z)*H_tilda
        Y = torch.matmul(H,w_hq)+b_q
        outputs.append(Y)
    return outputs,H

In [55]:
def pred_gru(prefil,gru,num_chars,params,init_gru_states,num_hiddens,vocab_size,device,idx_to_char,char_to_idx):
    state = init_gru_states(1,num_hiddens,device)
    output = [char_to_idx[prefil[0]]]
    for t in range(num_chars+len(prefil)-1):
        x = to_onehot(torch.tensor([[output[-1]]],device=device),vocab_size)
        (Y,state) = gru(x,state,params)
        if t<len(prefil)-1:
            output.append(char_to_idx[prefil[t+1]])
        else:
            output.append(int(Y[0].argmax(dim=1).item()))
    return ''.join([idx_to_char[i]for i in output])

In [56]:
prefil='分开'
params = get_params()
pred_gru(prefil,GRU,10,params,init_gru_states,num_hiddens,vocab_size,device,idx_to_char,char_to_idx)

'分开耳熟手格格元站因诉词'

In [57]:
def clip_gradient(params,theta,device):
    norm = torch.tensor([0.0],device=device)
    for param in params:
        norm += (param.grad.data**2).sum()
    norm = norm.sqrt().item()
    if norm > theta:
        for param in params:
            param.grad.data *=(theta/norm)

In [58]:
def sgd(params,lr,batch_size):
    for param in params:
        param.data -=(lr*param.grad)/batch_size

In [62]:
def train_and_pred_GRU(gru,get_params,init_gru_states,num_hiddens,vocab_size,device,corpus_iter,char_to_idx,idx_to_char,is_data_random,num_epochs,num_steps,lr,clipping_theta,batch_size,pred_period,pred_len,prefixes):
    loss = nn.CrossEntropyLoss()
    if is_data_random:
        data_iter_fn = data_iter_random
    else:
        data_iter_fn = data_iter_consecution
    params = get_params()   
    for epoch in range(num_epochs):
        if not is_data_random:
            state = init_gru_states(batch_size,num_hiddens,device)
        l_sum = 0.0
        n =0 
        start = time.time()
        data_iter = data_iter_fn(corpus_iter,batch_size,num_steps,device)
        for X,Y in data_iter:
            if state is not None:
                if isinstance(state,tuple):
                    state = (state[0].detach(),state[1].detach())
                else:
                    state = state.detach()
            if is_data_random:
                state = init_gru_states(batch_size,num_hiddens,device)
            else:
                for s in state:
                    s.detach()
            inputs = to_onehot(X,vocab_size) 
            y_hat,state= gru(inputs,state,params)
            y_hat = torch.cat(y_hat,dim=0)
            Y = torch.transpose(Y,0,1).contiguous().view(-1)
            l = loss(y_hat,Y.long())
            if params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            clip_gradient(params,clipping_theta,device)
            sgd(params,lr,1)
            l_sum += l.item()*Y.shape[0]
            n += Y.shape[0]
        if (epoch+1)% pred_period ==0:
            print('epoch:%d,pre:%.5f,time:%.f'%(epoch+1,math.exp(l_sum/n),time.time()-start))
            for prefix in prefixes:
                print('---',pred_gru(prefix,gru,pred_len,params,init_gru_states,num_hiddens,vocab_size,device,idx_to_char,char_to_idx))

In [63]:
num_epochs =250
num_steps = 35
batch_size = 32
lr =1e2
clipping_theta = 1e-2
pred_period = 50
pred_len=50
prefixes=['分开','不分开']
train_and_pred_GRU(GRU,get_params,init_gru_states,num_hiddens,vocab_size,device,corpus_iter,
char_to_idx,idx_to_char,False,num_epochs,num_steps,
lr,clipping_theta,batch_size,pred_period,pred_len,prefixes)


epoch:50,pre:108.79664,time:0
--- 分开 我想你 你不了 我不要 我想你 我想你 我想你 我想你 我想你 我想你 我想你 我想你 我想你 我
--- 不分开 我想你 你不了 我不要 我想你 我想你 我想你 我想你 我想你 我想你 我想你 我想你 我想你 我
epoch:100,pre:12.11292,time:0
--- 分开 我想要这样牵着你的手不放  爱在我以以以以单单没有                        
--- 不分开 爱不不觉 经不再再不想 不知不觉 你已经这样奏 后知后觉 我的好好节活 让我不觉 你爱我 说你的甜
epoch:150,pre:1.75199,time:0
--- 分开 一个云酒 在人海中的溪边 情绪激动 一颗心到现在还在抽痛 还为分手前那句抱歉 在感动 穿梭时间的画
--- 不分开 你已经离开我 不知不觉 我跟了这节奏 后知后觉 又过了一个秋 后知后觉 我该好好生活 我该好好生活
epoch:200,pre:1.08422,time:0
--- 分开 小弄事 废诉我 印地安的传说 还真是 瞎透了 什么都有 沙漠之中怎么会有泥鳅 话说完飞过一只海鸥 
--- 不分开 整作云吗开嘛我有攻       古巴比伦王颁布了汉摩拉比法典 刻在黑色的玄武岩 距今已经三千七百多
epoch:250,pre:1.03417,time:0
--- 分开 小弄我 已沉为田手留白 所有人在莫铁铁香许下心愿 看远方的星如果听的见 它一定实现它一定实现 载著
--- 不分开 你已经离开我 不知不觉 我跟了这节奏 后知后觉 后知后觉 迷迷蒙蒙 你给的梦 出现裂缝 隐隐作痛 
