In [64]:
import torch
import torch.nn as nn
import time
import zipfile
import math
import random
import numpy as np
# Prefer the GPU when PyTorch can see one; otherwise everything runs on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [65]:
def load_data_lyrics(zip_path='/home/data/jaychou_lyrics.txt.zip',
                     member='jaychou_lyrics.txt', max_chars=10000):
    """Load a lyrics corpus from a zip archive and index it character by character.

    Args:
        zip_path: Path of the zip archive to open.
        member: Name of the UTF-8 text file inside the archive.
        max_chars: Keep only the first ``max_chars`` characters of the text
            (after newlines are replaced). ``None`` keeps the whole text.

    Returns:
        A tuple ``(char_to_idx, idx_to_char, vocab_size, corpus_iter)`` where
        ``idx_to_char`` is the list of distinct characters, ``char_to_idx`` is
        its inverse mapping, ``vocab_size`` is ``len(idx_to_char)`` and
        ``corpus_iter`` is the corpus encoded as a list of integer indices.
    """
    with zipfile.ZipFile(zip_path) as zin:
        with zin.open(member) as f:
            corpus_chars = f.read().decode('utf-8')
    # Treat line breaks as plain spaces so the corpus is one long sequence.
    corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
    if max_chars is not None:
        corpus_chars = corpus_chars[:max_chars]
    # Sort the vocabulary: iterating a bare set of strings yields a different
    # order from one interpreter run to the next (hash randomisation), which
    # would silently change every index between kernel restarts.
    idx_to_char = sorted(set(corpus_chars))
    char_to_idx = {char: i for i, char in enumerate(idx_to_char)}
    vocab_size = len(idx_to_char)
    corpus_iter = [char_to_idx[char] for char in corpus_chars]
    return char_to_idx, idx_to_char, vocab_size, corpus_iter

In [66]:
# Build the character vocabulary and the index-encoded corpus once; these
# module-level names are read by later cells.
char_to_idx,idx_to_char,vocab_size,corpus_iter = load_data_lyrics()
def data_iter_random(corpus_iter,batch_size,num_steps,device=None):
    """Yield mini-batches of randomly ordered, non-overlapping windows.

    The sequence is cut into consecutive windows of ``num_steps`` items. The
    window order is shuffled and ``batch_size`` windows are stacked per batch.
    Windows that do not fill a complete batch are dropped.

    Args:
        corpus_iter: Indexable sequence of numbers.
        batch_size: Number of windows per batch.
        num_steps: Length of each window.
        device: Target device; defaults to CUDA when available, else CPU.

    Yields:
        ``(x, y)`` float tensors of shape ``(batch_size, num_steps)``, where
        ``y`` holds the same windows as ``x`` shifted one position ahead.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # The "- 1" leaves room for the target, which sits one position ahead.
    window_count = (len(corpus_iter) - 1) // num_steps
    batch_count = window_count // batch_size
    window_order = list(range(window_count))
    random.shuffle(window_order)
    for first in range(0, batch_count * batch_size, batch_size):
        offsets = [w * num_steps for w in window_order[first:first + batch_size]]
        inputs = [corpus_iter[o:o + num_steps] for o in offsets]
        targets = [corpus_iter[o + 1:o + 1 + num_steps] for o in offsets]
        yield (torch.tensor(inputs, dtype=torch.float, device=device),
               torch.tensor(targets, dtype=torch.float, device=device))

In [67]:
# Sanity-check the random sampler on a toy sequence 0..29: every target row
# should equal the matching input row shifted by one. The sequence and the
# loop variables get distinct names so the input list is not rebound to a
# tensor while it is being iterated.
toy_sequence = list(range(30))
for x_batch,y_batch in data_iter_random(toy_sequence,2,6):
    print(x_batch,y_batch)

tensor([[12., 13., 14., 15., 16., 17.],
        [18., 19., 20., 21., 22., 23.]], device='cuda:0') tensor([[13., 14., 15., 16., 17., 18.],
        [19., 20., 21., 22., 23., 24.]], device='cuda:0')
tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [ 6.,  7.,  8.,  9., 10., 11.]], device='cuda:0') tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [ 7.,  8.,  9., 10., 11., 12.]], device='cuda:0')


In [68]:
def data_iter_consecution(corpus_iter,batch_size,num_steps,device=None):
    """Yield mini-batches whose rows continue from one batch to the next.

    The sequence is laid out as ``batch_size`` equally long rows (the tail
    that does not divide evenly is dropped) and then walked left to right in
    steps of ``num_steps`` columns.

    Args:
        corpus_iter: Sequence of numbers convertible to a tensor.
        batch_size: Number of rows in every batch.
        num_steps: Number of columns (time steps) per batch.
        device: Target device; defaults to CUDA when available, else CPU.

    Yields:
        ``(x, y)`` float32 tensors of shape ``(batch_size, num_steps)``, where
        ``y`` is ``x`` shifted one column to the right.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    row_len = len(corpus_iter) // batch_size
    flat = torch.tensor(corpus_iter, dtype=torch.float32, device=device)
    grid = flat[0:row_len * batch_size].view(batch_size, row_len)
    # One column is held back so the shifted target never runs off the row.
    window_count = (row_len - 1) // num_steps
    for left in range(0, window_count * num_steps, num_steps):
        right = left + num_steps
        yield grid[:, left:right], grid[:, left + 1:right + 1]

In [69]:
def onehot(x,n_class,dtype=torch.float32):
    """One-hot encode a vector of class indices.

    Args:
        x: Tensor of class indices; it is cast to ``long`` before use.
            Presumably 1-D, since one output row is made per ``x.shape[0]``.
        n_class: Number of classes, i.e. the width of the result.
        dtype: dtype of the returned tensor.

    Returns:
        Tensor of shape ``(x.shape[0], n_class)`` on ``x``'s device with a
        single 1 per row at the column given by the matching index.
    """
    index_column = x.long().view(-1, 1)
    encoded = torch.zeros(x.shape[0], n_class, dtype=dtype, device=x.device)
    # scatter_ writes in place and returns the same tensor.
    return encoded.scatter_(1, index_column, 1)

In [70]:
def to_onehot(x,n_class):
    """Turn a 2-D index tensor into a list of one-hot matrices, one per column.

    Args:
        x: Tensor whose second dimension indexes the steps to encode.
        n_class: Number of classes passed on to ``onehot``.

    Returns:
        A list with ``x.shape[1]`` entries; entry ``i`` is ``onehot`` applied
        to column ``i`` of ``x``.
    """
    encoded_steps = []
    for step in range(x.shape[1]):
        encoded_steps.append(onehot(x[:, step], n_class))
    return encoded_steps

In [71]:
# Demo: a (2, 5) index tensor becomes a list of 5 one-hot matrices, each of
# shape (2, 10).
x = torch.arange(10).view(2,5)
to_onehot(x,10)

[tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]]),
 tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]]),
 tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]]),
 tensor([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]]),
 tensor([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])]

In [72]:
# Layer sizes read by get_params(): the input and output widths both equal the
# vocabulary size, and the hidden state has 256 units.
num_inputs,num_hiddens,num_outputs = vocab_size,256,vocab_size
def get_params():
    """Create the trainable LSTM parameters.

    Sizes come from the module-level ``num_inputs``, ``num_hiddens`` and
    ``num_outputs``; tensors are placed on the module-level ``device``.
    Weights are drawn from N(0, 0.01) and biases start at zero.

    Returns:
        ``nn.ParameterList`` in the order: (w_x, w_h, b) for the forget gate,
        input gate, output gate and candidate cell, followed by the output
        layer weight and bias.
    """
    def _gaussian(shape):
        values = np.random.normal(0, 0.01, size=shape)
        tensor = torch.tensor(values, dtype=torch.float32, device=device)
        return torch.nn.Parameter(tensor, requires_grad=True)

    def _zero_bias(width):
        bias = torch.zeros(width, dtype=torch.float32, device=device)
        return torch.nn.Parameter(bias, requires_grad=True)

    collected = []
    # Four identically shaped triples: forget, input, output, candidate.
    for _ in range(4):
        collected.append(_gaussian((num_inputs, num_hiddens)))
        collected.append(_gaussian((num_hiddens, num_hiddens)))
        collected.append(_zero_bias(num_hiddens))
    # Output projection from the hidden state.
    collected.append(_gaussian((num_hiddens, num_outputs)))
    collected.append(_zero_bias(num_outputs))
    return nn.ParameterList(collected)

In [73]:
def init_state_lstm(batch_size,num_hiddens,device):
    """Return the initial LSTM state: two separate zero tensors.

    Args:
        batch_size: Number of rows in each state tensor.
        num_hiddens: Number of columns in each state tensor.
        device: Device the tensors are created on.

    Returns:
        A 2-tuple of zero tensors, each of shape ``(batch_size, num_hiddens)``.
    """
    shape = (batch_size, num_hiddens)
    hidden = torch.zeros(shape, device=device)
    cell = torch.zeros(shape, device=device)
    return hidden, cell

In [74]:
def LSTM(inputs,state,params):
    """Run an LSTM over a sequence of per-step inputs.

    Args:
        inputs: Iterable of per-step input tensors; each is multiplied by the
            input-to-hidden weights.
        state: Tuple ``(H, C)`` with the initial hidden and cell state.
        params: The 14 parameters in the order (w_x, w_h, b) for the forget,
            input and output gates and the candidate cell, then the output
            layer weight and bias.

    Returns:
        ``(outputs, (H, C))`` where ``outputs`` holds one output-layer result
        per step and ``(H, C)`` is the state after the last step.
    """
    (w_xf, w_hf, b_f,
     w_xi, w_hi, b_i,
     w_xo, w_ho, b_o,
     w_xc, w_hc, b_c,
     w_h, b_h) = params
    hidden, cell = state

    def _pre_activation(step_input, prev_hidden, w_x, w_hh, bias):
        return torch.matmul(step_input, w_x) + torch.matmul(prev_hidden, w_hh) + bias

    outputs = []
    for step_input in inputs:
        forget_gate = torch.sigmoid(_pre_activation(step_input, hidden, w_xf, w_hf, b_f))
        input_gate = torch.sigmoid(_pre_activation(step_input, hidden, w_xi, w_hi, b_i))
        output_gate = torch.sigmoid(_pre_activation(step_input, hidden, w_xo, w_ho, b_o))
        candidate = torch.tanh(_pre_activation(step_input, hidden, w_xc, w_hc, b_c))
        # Keep part of the old cell state and mix in the gated candidate.
        cell = forget_gate * cell + input_gate * candidate
        hidden = output_gate * torch.tanh(cell)
        outputs.append(torch.matmul(hidden, w_h) + b_h)
    return outputs, (hidden, cell)

In [75]:
def pred_LSTM(prefix,num_chars,lstm,params,init,num_hiddens,vocab_size,idx_to_char,char_to_idx,device=None):
    """Generate ``num_chars`` characters that continue ``prefix``.

    The prefix is fed one character at a time to warm up the state; after
    that the most likely character of each step becomes the next input.

    Args:
        prefix: Non-empty seed string; every character must be a key of
            ``char_to_idx``.
        num_chars: Number of characters to generate after the prefix.
        lstm: Callable ``lstm(inputs, state, params) -> (outputs, state)``.
        params: Model parameters passed through to ``lstm``.
        init: Callable ``init(batch_size, num_hiddens, device)`` returning the
            initial state.
        num_hiddens: Hidden size passed to ``init``.
        vocab_size: Width of the one-hot input.
        idx_to_char: Sequence mapping an index to its character.
        char_to_idx: Mapping from a character to its index.
        device: Device to run on. Defaults to the device of the first entry
            of ``params``.

    Returns:
        The prefix followed by the generated characters, as one string.
    """
    if device is None:
        # Follow the parameters instead of relying on a module-level global.
        device = next(iter(params)).device
    state = init(1, num_hiddens, device)
    outputs = [char_to_idx[prefix[0]]]
    # Generation needs no gradients; skip building the autograd graph.
    with torch.no_grad():
        for t in range(num_chars + len(prefix) - 1):
            X = to_onehot(torch.tensor([[outputs[-1]]], device=device), vocab_size)
            # Call the network that was passed in, not a fixed global one.
            output, state = lstm(X, state, params)
            if t < len(prefix) - 1:
                # Still inside the prefix: feed the known next character.
                outputs.append(char_to_idx[prefix[t + 1]])
            else:
                outputs.append(int(output[0].argmax(dim=1).item()))
    return ''.join([idx_to_char[i] for i in outputs])

In [76]:
# Smoke-test generation with freshly initialised (untrained) parameters.
# `params` and `init` are module-level names that the training call reuses.
prefix = '分开'
params=get_params()
init = init_state_lstm
pred_LSTM(prefix,10,LSTM,params,init,num_hiddens,vocab_size,idx_to_char,char_to_idx)

'分开阻辈可欢消苏中土蟑抢'

In [77]:
def clip_gradient(params,theta,device):
    """Rescale all gradients in place so their joint L2 norm is at most ``theta``.

    Args:
        params: Re-iterable collection of tensors whose ``.grad`` is set.
        theta: Largest allowed global gradient norm.
        device: Device on which the squared-norm accumulator is created.

    Returns:
        None. Gradients are left untouched when the norm does not exceed
        ``theta``.
    """
    squared_sum = torch.tensor([0.0], device=device)
    for p in params:
        squared_sum += (p.grad.data ** 2).sum()
    total_norm = squared_sum.sqrt().item()
    # Written as "not >" so a NaN norm also falls through without scaling.
    if not total_norm > theta:
        return
    shrink = theta / total_norm
    for p in params:
        p.grad.data *= shrink

In [84]:
def sgd(param,lr,batch_size):
    """Apply one in-place step of mini-batch stochastic gradient descent.

    Args:
        param: Iterable of tensors to update. The name is singular for
            backward compatibility, but it is a collection.
        lr: Learning rate.
        batch_size: Divisor applied to the scaled gradient.

    Returns:
        None. Each tensor's ``.data`` is updated in place.
    """
    # Iterate over the argument itself; the loop used to read a module-level
    # `params`, so whatever the caller passed in was ignored.
    for p in param:
        if p.grad is None:
            # This tensor took no part in the last backward pass.
            continue
        p.data -= (p.grad * lr) / batch_size

In [85]:
def train_and_pred_lstm(net,params,init,num_epochs,num_hiddens,
                        lr,num_steps,batch_size,is_iter_random,device,clipping_rate,
                        idx_to_char,char_to_idx,corpus_iter,pred_period,vocab_size,
                        pred_len,prefixes):
    """Train a character-level recurrent model and print samples periodically.

    Args:
        net: Callable ``net(inputs, state, params) -> (outputs, state)``.
        params: Indexable, re-iterable collection of trainable tensors.
        init: Callable ``init(batch_size, num_hiddens, device)`` returning the
            initial state.
        num_epochs: Number of passes over the corpus.
        num_hiddens: Hidden size passed to ``init``.
        lr: Learning rate passed to ``sgd``.
        num_steps: Number of time steps per mini-batch.
        batch_size: Number of sequences per mini-batch.
        is_iter_random: ``True`` selects ``data_iter_random`` and a fresh
            state for every batch; ``False`` selects ``data_iter_consecution``
            and carries the state across batches within an epoch.
        device: Device used for the data and the state.
        clipping_rate: Maximum global gradient norm passed to ``clip_gradient``.
        idx_to_char: Sequence mapping an index to its character.
        char_to_idx: Mapping from a character to its index.
        corpus_iter: Corpus encoded as a list of indices.
        pred_period: Print perplexity and samples every ``pred_period`` epochs.
        vocab_size: Number of distinct characters.
        pred_len: Number of characters to generate per sample.
        prefixes: Seed strings for the generated samples.

    Returns:
        None. ``params`` are updated in place and progress is printed.
    """
    loss = nn.CrossEntropyLoss()
    data_iter_fn = data_iter_random if is_iter_random else data_iter_consecution
    for epoch in range(num_epochs):
        if not is_iter_random:
            # Consecutive sampling: one state per epoch, carried across batches.
            state = init(batch_size, num_hiddens, device)
        l_sum = 0.0
        n = 0
        start = time.time()
        for X, Y in data_iter_fn(corpus_iter, batch_size, num_steps, device):
            if is_iter_random:
                # Random windows are unrelated, so start from a fresh state.
                state = init(batch_size, num_hiddens, device)
            # Cut the graph at the batch boundary so backward() does not reach
            # into earlier batches. Tensor.detach() returns a new tensor and
            # is not in-place, so the result has to be re-assigned.
            if isinstance(state, tuple):
                state = tuple(s.detach() for s in state)
            elif state is not None:
                state = state.detach()
            inputs = to_onehot(X, vocab_size)
            y_hat, state = net(inputs, state, params)
            # Stack the per-step outputs to (num_steps * batch_size, vocab_size).
            y_hat = torch.cat(y_hat, dim=0)
            # Flatten the targets time-major so they line up with y_hat.
            y = torch.transpose(Y, 0, 1).contiguous().view(-1)
            l = loss(y_hat, y.long())
            # Zero each gradient that exists, checking every parameter rather
            # than only the first one.
            for param in params:
                if param.grad is not None:
                    param.grad.data.zero_()
            l.backward()
            clip_gradient(params, clipping_rate, device)
            # The loss is already a mean, so no further batch-size division.
            sgd(params, lr, 1)
            l_sum += l.item() * y.shape[0]
            n += y.shape[0]
        if (epoch + 1) % pred_period == 0:
            print('Epoch:%d,pre:%.3f,time:%.1f'%(epoch+1,math.exp(l_sum/n),time.time()-start))
            for prefix in prefixes:
                print('---',pred_LSTM(prefix,pred_len,net,params,init,num_hiddens,vocab_size,idx_to_char,char_to_idx))

In [87]:
# Training configuration.
num_epochs =250
num_steps = 35
batch_size = 32
# Gradients are clipped to a global L2 norm of at most clipping_rate before
# each SGD step, so with these two values a single update has norm at most
# lr * clipping_rate = 1.
lr = 1e2
clipping_rate = 1e-2
# Print perplexity and samples every pred_period epochs; each sample has
# pred_len generated characters.
pred_period = 50
pred_len = 50
prefixes = ['分开','不分开']
# The positional False is is_iter_random, i.e. consecutive sampling is used.
train_and_pred_lstm(LSTM,params,init,num_epochs,num_hiddens,lr,num_steps,batch_size,False,
device,clipping_rate,idx_to_char,char_to_idx,corpus_iter,pred_period,vocab_size,pred_len,prefixes)

Epoch:50,pre:51.900,time:0.3
--- 分开 我想你 你不的我不着 我想不着 我不的我不着 一色在 干不了 我不了 有不么 我的在 我的的在 你
--- 不分开 我想我我不着 我不不的 我不了的爱爱 一不么 干什了 我不了这不棍 一色在在 快什了空 我不么 不
Epoch:100,pre:13.584,time:0.3
--- 分开 一说我 你着我的手 说  你很很的吧? 我不着你想想 我  我 说眼眼睛看着我 别发抖 快给我抬起
--- 不分开 你想我 我不么 我说么 我说就 我想就 我不能 我不能 我不走 我不风 旧再风 旧说风 说说梦 说
Epoch:150,pre:4.120,time:0.3
--- 分开 一直你 你来的手的手 说通 却想再考倒我?? 我给你的黑色幽 说不到我的路堡 就是是童了故每  有
--- 不分开 你来定么不着我 甩是我的手腔幽 在不去了了嵩山 学少林跟武嵩 快使用双截棍 哼哼哈兮 快使用双截棍
Epoch:200,pre:2.119,time:0.3
--- 分开 一直说 你不眼的玩笑 我通啊 是又考倒倒我 说散 你想很久了吧? 败给你的黑色幽默 不散 你想很久
--- 不分开 你在经么开我 不开 看又我的倒活 说散 你又很久了吧? 我给去拆黑色幽幽 说散 你想很久了吧? 我
Epoch:250,pre:1.471,time:0.3
--- 分开 一轻说 你你的玩前 喜师 他念念 有词的 对酋我 才酋开我 我变就很主 这你是我 你过了一演出 一
--- 不分开 你作经开开我 不开开觉不手 抛这线进们 单过过人球 篮下妙传 手手了这节奏 篮知后觉 手人了人过秋
