In [1]:
import torch
import torch.nn as nn
import time
import zipfile
import math
import random
import numpy as np
# Notebook-wide compute device: CUDA when PyTorch reports it available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available()else 'cpu')

In [2]:
def load_data_lyrics(zip_path='/home/data/jaychou_lyrics.txt.zip',
                     member='jaychou_lyrics.txt',
                     max_chars=10000):
    """Load a character-level corpus from a zipped UTF-8 text file.

    Args:
        zip_path: Path of the zip archive to read.
        member: Name of the text file inside the archive.
        max_chars: Keep only the first ``max_chars`` characters (after newline
            replacement). ``None`` keeps the whole text.

    Returns:
        A 4-tuple ``(char_to_idx, idx_to_char, vocab_size, corpus_iter)``:
        a dict mapping each character to its index, the list of characters
        ordered by index, the number of distinct characters, and the corpus
        encoded as a list of indices.
    """
    with zipfile.ZipFile(zip_path) as zin:
        with zin.open(member) as f:
            corpus_chars = f.read().decode('utf-8')
    # Line breaks are turned into spaces so the corpus is one continuous line.
    corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
    corpus_chars = corpus_chars[:max_chars]
    # sorted() makes the vocabulary order deterministic. Iterating a bare set
    # of strings depends on hash randomisation, so the char <-> index mapping
    # would change between interpreter runs.
    idx_to_char = sorted(set(corpus_chars))
    char_to_idx = {char: i for i, char in enumerate(idx_to_char)}
    vocab_size = len(idx_to_char)
    corpus_iter = [char_to_idx[char] for char in corpus_chars]
    return char_to_idx, idx_to_char, vocab_size, corpus_iter

In [3]:
# Load the corpus once; the four results become notebook-level globals.
char_to_idx,idx_to_char,vocab_size,corpus_iter = load_data_lyrics()
def data_iter_random(corpus_iter,batch_size,num_steps,device=None):
    """Yield mini-batches of randomly ordered, non-overlapping subsequences.

    Args:
        corpus_iter: Sliceable sequence of token indices.
        batch_size: Number of subsequences per batch.
        num_steps: Length of every subsequence.
        device: Target device; defaults to CUDA when available, else CPU.

    Yields:
        ``(x, y)`` float tensors of shape ``(batch_size, num_steps)`` where
        ``y`` holds the tokens one position after those in ``x``.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # One token is held back because every label is shifted right by one.
    total_examples = (len(corpus_iter) - 1) // num_steps
    total_batches = total_examples // batch_size
    order = list(range(total_examples))
    random.shuffle(order)
    for batch_no in range(total_batches):
        chosen = order[batch_no * batch_size:(batch_no + 1) * batch_size]
        starts = [k * num_steps for k in chosen]
        inputs = [corpus_iter[s:s + num_steps] for s in starts]
        labels = [corpus_iter[s + 1:s + 1 + num_steps] for s in starts]
        yield (torch.tensor(inputs, dtype=torch.float, device=device),
               torch.tensor(labels, dtype=torch.float, device=device))

In [4]:
# Demo: sample batches of 2 subsequences of length 6 from the integers 0..29.
x = list(range(30))
# NOTE: the loop targets rebind the global `x` (the list above) to the yielded tensor.
for x,y in data_iter_random(x,2,6):
    print(x,y)

tensor([[ 6.,  7.,  8.,  9., 10., 11.],
        [12., 13., 14., 15., 16., 17.]], device='cuda:0') tensor([[ 7.,  8.,  9., 10., 11., 12.],
        [13., 14., 15., 16., 17., 18.]], device='cuda:0')
tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [18., 19., 20., 21., 22., 23.]], device='cuda:0') tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [19., 20., 21., 22., 23., 24.]], device='cuda:0')


In [5]:
def data_iter_consecution(corpus_iter,batch_size,num_steps,device=None):
    """Yield mini-batches whose rows continue from one batch to the next.

    The corpus is cut into ``batch_size`` equally long rows (surplus tokens at
    the end are dropped) and a window of ``num_steps`` columns slides over
    them without overlap.

    Args:
        corpus_iter: Sequence of token indices.
        batch_size: Number of rows, i.e. parallel streams.
        num_steps: Window width in tokens.
        device: Target device; defaults to CUDA when available, else CPU.

    Yields:
        ``(x, y)`` float32 tensor views of shape ``(batch_size, num_steps)``
        where ``y`` is ``x`` shifted one column to the right.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    row_len = len(corpus_iter) // batch_size
    flat = torch.tensor(corpus_iter, dtype=torch.float32, device=device)
    grid = flat[:row_len * batch_size].view(batch_size, row_len)
    # The last column of a row can only serve as a label, hence row_len - 1.
    window_count = (row_len - 1) // num_steps
    for start in range(0, window_count * num_steps, num_steps):
        stop = start + num_steps
        yield grid[:, start:stop], grid[:, start + 1:stop + 1]

In [6]:
def onehot(x,n_class,dtype=torch.float32):
    """One-hot encode a 1-D tensor of class indices.

    Args:
        x: Tensor of class indices; it is cast to ``long`` before use.
        n_class: Number of classes, i.e. the width of the result.
        dtype: dtype of the returned tensor.

    Returns:
        Tensor of shape ``(x.shape[0], n_class)`` on ``x``'s device with a 1
        in column ``x[i]`` of row ``i`` and 0 elsewhere.
    """
    column_index = x.long().view(-1, 1)
    encoded = torch.zeros((x.shape[0], n_class), dtype=dtype, device=x.device)
    # scatter_ writes in place and returns the same tensor.
    return encoded.scatter_(1, column_index, 1)

In [7]:
def to_onehot(x,n_class):
    """Turn a 2-D index tensor into a list of per-column one-hot matrices.

    Args:
        x: 2-D tensor of class indices.
        n_class: Number of classes.

    Returns:
        A list with ``x.shape[1]`` entries; entry ``t`` is ``onehot`` applied
        to column ``t`` of ``x``.
    """
    per_step = []
    for t in range(x.shape[1]):
        per_step.append(onehot(x[:, t], n_class))
    return per_step

In [8]:
# Demo: a 2x5 index tensor becomes a list of five (2, 10) one-hot matrices, one per column.
x = torch.arange(10).view(2,5)
to_onehot(x,10)

[tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 1., 0., 0., 0., 0.]]),
 tensor([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 1., 0., 0., 0.]]),
 tensor([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]]),
 tensor([[0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]]),
 tensor([[0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])]

In [9]:
# Model sizes: input and output widths both equal the vocabulary size; 256 hidden units.
num_inputs,num_hiddens,num_outputs = vocab_size,256,vocab_size
def get_params():
    """Create the trainable parameters of the hand-written LSTM.

    Sizes and device come from the module-level ``num_inputs``,
    ``num_hiddens``, ``num_outputs`` and ``device``. Weights are drawn from a
    normal distribution (mean 0, std 0.01) via ``np.random``; biases are zero.

    Returns:
        ``nn.ParameterList`` ordered as
        ``[w_xf, w_hf, b_f, w_xi, w_hi, b_i, w_xo, w_ho, b_o,
        w_xc, w_hc, b_c, w_h, b_h]``.
    """
    def _weight(shape):
        values = np.random.normal(0, 0.01, size=shape)
        tensor = torch.tensor(values, dtype=torch.float32, device=device)
        return nn.Parameter(tensor, requires_grad=True)

    def _bias(size):
        zeros = torch.zeros(size, dtype=torch.float32, device=device)
        return nn.Parameter(zeros, requires_grad=True)

    def _gate_triple():
        # Input-to-hidden weight, hidden-to-hidden weight, bias.
        return [
            _weight((num_inputs, num_hiddens)),
            _weight((num_hiddens, num_hiddens)),
            _bias(num_hiddens),
        ]

    collected = []
    # Four triples in order: forget gate, input gate, output gate, candidate cell.
    for _ in range(4):
        collected.extend(_gate_triple())
    # Output projection from the hidden state.
    collected.append(_weight((num_hiddens, num_outputs)))
    collected.append(_bias(num_outputs))
    return nn.ParameterList(collected)

In [10]:
def init_state_lstm(batch_size,num_hiddens,device):
    """Build the initial LSTM state.

    Args:
        batch_size: Number of rows of each state tensor.
        num_hiddens: Number of columns of each state tensor.
        device: Device on which the tensors are allocated.

    Returns:
        A tuple of two separate zero tensors, each of shape
        ``(batch_size, num_hiddens)``.
    """
    shape = (batch_size, num_hiddens)
    first = torch.zeros(shape, device=device)
    second = torch.zeros(shape, device=device)
    return first, second

In [11]:
def LSTM(inputs,state,params):
    """Run an LSTM over a sequence of time-step inputs.

    Args:
        inputs: Iterable of per-step input matrices.
        state: Tuple ``(H, C)`` holding the hidden state and the cell state.
        params: The 14 parameters in the order
            ``w_xf, w_hf, b_f, w_xi, w_hi, b_i, w_xo, w_ho, b_o,
            w_xc, w_hc, b_c, w_h, b_h``.

    Returns:
        ``(outputs, (H, C))``: the list of per-step output matrices and the
        state after the last step.
    """
    (w_xf, w_hf, b_f,
     w_xi, w_hi, b_i,
     w_xo, w_ho, b_o,
     w_xc, w_hc, b_c,
     w_h, b_h) = params
    hidden, cell = state

    def _pre_activation(x_t, h_prev, w_x, w_hh, bias):
        # Affine transform shared by the three gates and the candidate cell.
        return torch.matmul(x_t, w_x) + torch.matmul(h_prev, w_hh) + bias

    step_outputs = []
    for x_t in inputs:
        forget_gate = torch.sigmoid(_pre_activation(x_t, hidden, w_xf, w_hf, b_f))
        input_gate = torch.sigmoid(_pre_activation(x_t, hidden, w_xi, w_hi, b_i))
        output_gate = torch.sigmoid(_pre_activation(x_t, hidden, w_xo, w_ho, b_o))
        candidate = torch.tanh(_pre_activation(x_t, hidden, w_xc, w_hc, b_c))
        cell = forget_gate * cell + input_gate * candidate
        hidden = output_gate * torch.tanh(cell)
        step_outputs.append(torch.matmul(hidden, w_h) + b_h)
    return step_outputs, (hidden, cell)

In [12]:
def pred_LSTM(prefix,num_chars,lstm,params,init,num_hiddens,vocab_size,idx_to_char,char_to_idx):
    """Generate text by greedy decoding, starting from ``prefix``.

    Args:
        prefix: Non-empty seed string; every character must be a key of
            ``char_to_idx``.
        num_chars: Number of characters to generate after the prefix.
        lstm: Callable ``lstm(inputs, state, params) -> (outputs, state)``
            used as the network.
        params: Parameters forwarded to ``lstm``.
        init: Callable ``init(batch_size, num_hiddens, device)`` returning the
            initial state.
        num_hiddens: Hidden size forwarded to ``init``.
        vocab_size: Number of classes used for one-hot encoding.
        idx_to_char: Sequence mapping an index to its character.
        char_to_idx: Mapping from a character to its index.

    Returns:
        The prefix followed by ``num_chars`` generated characters.
    """
    state = init(1, num_hiddens, device)
    outputs = [char_to_idx[prefix[0]]]
    # Decoding needs no gradients; no_grad avoids building an autograd graph.
    with torch.no_grad():
        for t in range(num_chars + len(prefix) - 1):
            # The previously emitted index is the input of this step.
            X = to_onehot(torch.tensor([[outputs[-1]]], device=device), vocab_size)
            # Call the network that was passed in, not the module-level `LSTM`.
            output, state = lstm(X, state, params)
            if t < len(prefix) - 1:
                # Still consuming the prefix: feed the known next character.
                outputs.append(char_to_idx[prefix[t + 1]])
            else:
                # Past the prefix: pick the most likely next character.
                outputs.append(int(output[0].argmax(dim=1).item()))
    return ''.join(idx_to_char[i] for i in outputs)

In [13]:
# Sanity check: generate 10 characters with freshly initialised (untrained) parameters.
prefix = '分开'
params=get_params()
init = init_state_lstm
pred_LSTM(prefix,10,LSTM,params,init,num_hiddens,vocab_size,idx_to_char,char_to_idx)

'分开弃霜烁根容闭状根整界'

In [14]:
def clip_gradient(params,theta,device):
    """Rescale gradients in place so their global L2 norm is at most ``theta``.

    Args:
        params: Iterable of tensors/parameters. Entries whose ``grad`` is
            ``None`` are ignored.
        theta: Maximum allowed L2 norm over all gradients together.
        device: Device on which the norm accumulator is allocated.

    Returns:
        None. Gradients are modified in place only when the norm exceeds
        ``theta``.
    """
    # Materialise once so a one-shot iterable can be traversed twice, and skip
    # parameters that have not received a gradient.
    grads = [p.grad for p in params if p.grad is not None]
    if not grads:
        return
    norm = torch.tensor([0.0], device=device)
    with torch.no_grad():
        for g in grads:
            norm += (g ** 2).sum()
        norm = norm.sqrt().item()
        if norm > theta:
            scale = theta / norm
            for g in grads:
                g *= scale

In [15]:
def sgd(param,lr,batch_size):
    """Apply one in-place step of plain stochastic gradient descent.

    Each tensor ``p`` is updated as ``p -= p.grad * lr / batch_size``.

    Args:
        param: Iterable of parameters to update. The singular name is kept so
            existing keyword callers keep working.
        lr: Learning rate.
        batch_size: Divisor applied to the gradient.

    Returns:
        None. Parameters are updated in place; entries without a gradient are
        left untouched.
    """
    # Iterate over the argument itself. The previous body looped over the
    # module-level `params`, which silently ignored what the caller passed.
    with torch.no_grad():
        for p in param:
            if p.grad is None:
                continue
            p.sub_((p.grad * lr) / batch_size)

In [16]:
def train_and_pred_lstm(net,params,init,num_epochs,num_hiddens,
                        lr,num_steps,batch_size,is_iter_random,device,clipping_rate,
                        idx_to_char,char_to_idx,corpus_iter,pred_period,vocab_size,
                        pred_len,prefixes):
    """Train a character-level recurrent model and periodically print samples.

    Args:
        net: Callable ``net(inputs, state, params) -> (outputs, state)``.
        params: Indexable collection of trainable parameters, updated in place.
        init: Callable ``init(batch_size, num_hiddens, device)`` returning the
            initial state.
        num_epochs: Number of passes over the corpus.
        num_hiddens: Hidden size forwarded to ``init``.
        lr: Learning rate forwarded to ``sgd``.
        num_steps: Number of time steps per mini-batch.
        batch_size: Number of sequences per mini-batch.
        is_iter_random: If true use ``data_iter_random`` and reset the state
            for every batch; otherwise use ``data_iter_consecution`` and carry
            the (detached) state across the batches of an epoch.
        device: Device for the data and the state.
        clipping_rate: Gradient-norm threshold forwarded to ``clip_gradient``.
        idx_to_char: Sequence mapping an index to its character.
        char_to_idx: Mapping from a character to its index.
        corpus_iter: Corpus encoded as token indices.
        pred_period: Print perplexity and samples every ``pred_period`` epochs.
        vocab_size: Number of classes used for one-hot encoding.
        pred_len: Number of characters generated per sample.
        prefixes: Seed strings for the printed samples.

    Returns:
        None. Progress is printed.
    """
    loss = nn.CrossEntropyLoss()
    data_iter_fn = data_iter_random if is_iter_random else data_iter_consecution
    for epoch in range(num_epochs):
        if not is_iter_random:
            # Consecutive sampling: one state per epoch, carried across batches.
            state = init(batch_size, num_hiddens, device)
        l_sum = 0.0
        n = 0
        start = time.time()
        for X, Y in data_iter_fn(corpus_iter, batch_size, num_steps, device):
            if is_iter_random:
                # Batches are unrelated, so start from a fresh state each time.
                state = init(batch_size, num_hiddens, device)
            elif isinstance(state, tuple):
                # detach() returns a new tensor, so the result must be
                # re-assigned; this cuts back-propagation at the batch boundary.
                state = tuple(s.detach() for s in state)
            elif state is not None:
                state = state.detach()
            inputs = to_onehot(X, vocab_size)
            y_hat, state = net(inputs, state, params)
            # Stack the per-step outputs; rows are ordered step by step.
            y_hat = torch.cat(y_hat, dim=0)
            # Transpose the labels so their order matches the rows of y_hat.
            y = torch.transpose(Y, 0, 1).contiguous().view(-1)
            l = loss(y_hat, y.long())
            # Zero each gradient individually instead of assuming that all
            # parameters have one whenever the first one does.
            for param in params:
                if param.grad is not None:
                    param.grad.zero_()
            l.backward()
            clip_gradient(params, clipping_rate, device)
            # The loss is already a mean, so no further division is needed.
            sgd(params, lr, 1)
            l_sum += l.item() * y.shape[0]
            n += y.shape[0]
        if (epoch + 1) % pred_period == 0:
            # Perplexity is undefined when the iterator produced no batch.
            perplexity = math.exp(l_sum / n) if n else float('nan')
            print('Epoch:%d,pre:%.3f,time:%.1f'%(epoch+1,perplexity,time.time()-start))
            for prefix in prefixes:
                print('---',pred_LSTM(prefix,pred_len,net,params,init,num_hiddens,vocab_size,idx_to_char,char_to_idx))

In [17]:
# Hyper-parameters for the training run below.
num_epochs =250
num_steps = 35
batch_size = 32
# Learning rate and gradient-norm clipping threshold passed to the trainer.
lr = 1e2
clipping_rate = 1e-2
# Print perplexity and samples every `pred_period` epochs, `pred_len` characters per sample.
pred_period = 50
pred_len = 50
prefixes = ['分开','不分开']
# is_iter_random=False selects consecutive sampling (data_iter_consecution).
train_and_pred_lstm(LSTM,params,init,num_epochs,num_hiddens,lr,num_steps,batch_size,False,
device,clipping_rate,idx_to_char,char_to_idx,corpus_iter,pred_period,vocab_size,pred_len,prefixes)

Epoch:50,pre:162.488,time:0.2
--- 分开 我不的我 我不不 我不不 我不不 我不不 我不不 我不不 我不不 我不不 我不不 我不不 我不不 
--- 不分开 我想你你的我 我想不你 我不不 我不不 我不不 我不不 我不不 我不不 我不不 我不不 我不不 我
Epoch:100,pre:33.983,time:0.3
--- 分开 我想你你的微笑 一一  说想你的你有 一话  说想我的你有 有你  说你我的你有 有你 我想你的你
--- 不分开 我不你这已我有一一悲 我想想你的爱笑 你你 我想想的我有你 我想想你的爱笑 我想要 你不我 我不要
Epoch:150,pre:6.094,time:0.3
--- 分开 我想带这生经 一个个人 不来一直 我不就的抽 我面放好 全没用空 不人不动 没没有没 没有放纵 没
--- 不分开 我不经这生我 不要不觉 我跟了这节奏 后知后觉 又过了一个秋 后知后觉 我该好好生活 我该好好生活
Epoch:200,pre:1.891,time:0.3
--- 分开 我已带这生嵩 就你依 一直走的我想就你在一着  为知来起起个大不住 不懂 你的黑色幽默 想通 却又
--- 不分开 我已经这生单 我想能 你爱我 我想要这样布 对你依依不舍 连隔壁邻居都猜到我现在的感受 河边的风 
Epoch:250,pre:1.228,time:0.3
--- 分开 我轻轻的话快 后着着对不起 藤蔓植物 爬满了伯爵的坟墓 古堡里一片荒芜 长满杂草的泥土 不会骑扫把
--- 不分开 你已经离开我 不知不觉 我跟了这节奏 后知后觉 又过了一个秋 后知后觉 我该好好生活 我该好好生活


In [18]:
# Shell escape: run the `nvidia-smi` command-line tool and show its report.
!nvidia-smi

Sat Apr 30 09:10:56 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.60.02    Driver Version: 510.60.02    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A5000    On   | 00000000:25:00.0 Off |                  Off |
| 30%   33C    P8    15W / 230W |   2209MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces