In [1]:
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor
import torch.nn.functional as F
import torch.optim as optim
import glob

In [2]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax()
     
    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def init_hidden(self):
        return Variable(torch.zeros(1, self.hidden_size))

In [3]:
from thulac import thulac
thu = thulac(seg_only=True)

class MyDataset():
        
    def __init__(self):
        self.data = {0: [], 1: [], 2: [], 3: [], 4: []}
        self.word_vec = self.load_word_vector()
        self.vector_dim = 300
        
    def load_word_vector(self, in_name='data/sgns.financial.word'):
        """
        加载ACL2018词向量
        """
        word_vec = {}
        print('加载词向量中 ...')
        for i, line in enumerate(in_name):
    #         if i <= 10:
    #             continue
            if i > 250000:
                break
            words = line.strip().split(' ')
            word = words[0]
            word_vec[word] = Tensor([float(num) for num in words[1:]])
    #         except UnicodeDecodeError:
    #             print("编码问题，行 {}".format(i))
        print('加载词完成！一共 {}个词'.format(len(word_vec)))
        return word_vec

    def load(self):
        """
        加载原始文本
        """
        for in_name in glob.glob('data/labelled/*.txt'):
            for i, line in enumerate(open(in_name)):
                if line.strip() == '':
                    continue
                label = line.split('\t')[0]
                # 1234：四种情绪，-：没有情绪，x：不确定
                if label in ['1', '2', '3', '4', '-']:
                    if label == '-' or label == 'x':
                        y = int('0')
                    else:
                        y = int(label)

                X = self.line_to_tensor(line.split('\t')[1])
                self.data[y].append(X)
                       
    def line_to_tensor(self, line):
        """
        一句话转向量
        """
        list_vec = []
        for w in thu.cut(line): # 对分词结果进行处理
            w = w[0]
            if w in self.word_vec:
                list_vec.append(self.word_vec[w])
        tensor = torch.as_tensor([list_vec])
        return tensor
    

dataset = MyDataset()
dataset.load()

Model loaded succeed
加载词向量中 ...
加载词完成！一共 15个词


RuntimeError: sizes must be non-negative

In [None]:
# 设置参数
input_size, hidden_size, output_size = 300, 100, 5
learning_rate = 1e-5
EPOCH = 5000

# 定义函数
mod = MyNet(input_size, hidden_size, output_size)

# 定义损失函数
loss_fn = nn.MSELoss(size_average=False)

# 优化器
optimizer = torch.optim.SGD(mod.parameters(), lr=learning_rate)

In [38]:
## 开始训练 ##
for t in range(EPOCH):
    
    # 向前传播
    y_pred = mod(x_data)
    
    # 计算损失
    loss = loss_fn(y_pred, y_data)
    # 显示损失
    if t % 500 == 0:
        print(loss.data.item())
    
    # 在我们进行梯度更新之前，
    # 先使用optimier对象提供的清除已经积累的梯度。
    optimizer.zero_grad()
    
    # 计算梯度
    loss.backward()
    
    # 更新梯度
    optimizer.step()

11615.4599609375
8750.8349609375
8617.62109375
8526.04296875
8466.537109375
8412.2197265625
8342.642578125
8262.72265625
8200.2548828125
8079.6328125
