In [1]:
import pandas as pd

# Load the preprocessed dataset. Per the original note: every sentence is 15 tokens long,
# y is a binary label, and the vocabulary lives in data/sst2/vocab.txt.
data = pd.read_csv('data/sst2/data.csv')

# Show the frame as the cell output.
data

Unnamed: 0,x,y
0,"101,5342,2047,3595,8496,2013,1996,18643,3197,1...",0
1,"101,3397,2053,15966,1010,2069,4450,2098,18201,...",0
2,"101,2008,7459,2049,3494,1998,10639,2015,2242,2...",1
3,"101,3464,12580,8510,2000,3961,1996,2168,2802,1...",0
4,"101,2006,1996,5409,7195,1011,1997,1011,1996,10...",0
...,...,...
64995,"101,1996,2569,3896,2024,1036,2446,1011,3670,29...",0
64996,1018552349410200000000000,1
64997,10115299102000000000000,1
64998,"101,2000,4608,1996,6510,1997,2010,13805,2015,1...",1


In [2]:
import torch


#定义数据集
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a frame of comma-separated token ids and labels.

    Each row is expected to unpack into exactly two values ``(x, y)``:
    ``x`` is a string of comma-separated integer token ids and ``y`` is a
    value convertible with ``int``.

    Args:
        frame: Optional pandas DataFrame to read rows from. When omitted,
            the notebook-level ``data`` frame is used, so ``Dataset()``
            behaves as it did before this argument existed.
    """

    def __init__(self, frame=None):
        super().__init__()
        # Only keep an explicit frame; the global fallback is resolved lazily
        # in _source() so that rebinding `data` later is still picked up.
        self._frame = frame

    def _source(self):
        """Return the frame backing this dataset (explicit frame or global `data`)."""
        return data if self._frame is None else self._frame

    def __len__(self):
        """Return the number of rows in the backing frame."""
        return len(self._source())

    def __getitem__(self, i):
        """Return ``(x, y)`` for row ``i``: a LongTensor of token ids and an int label."""
        # Fetch the i-th row by position and unpack its two columns.
        x, y = self._source().iloc[i]

        # Split the comma-separated ids and convert them to a tensor.
        # The comprehension variable is deliberately not named `i`, to avoid
        # shadowing the index parameter.
        x = torch.LongTensor([int(token) for token in x.split(',')])

        # The label only needs a plain int conversion.
        y = int(y)

        return x, y


# Instantiate the dataset.
dataset = Dataset()

# Show the number of samples and the first (x, y) pair.
len(dataset), dataset[0]

(65000,
 (tensor([  101,  5342,  2047,  3595,  8496,  2013,  1996, 18643,  3197,   102,
              0,     0,     0,     0,     0]),
  0))

In [3]:
# Data loader: shuffled batches of 8 samples; drop_last=True discards a
# final incomplete batch.
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=8,
                                     shuffle=True,
                                     drop_last=True)

# Show the number of batches and one sample batch.
len(loader), next(iter(loader))

(8125,
 [tensor([[  101,  1045,  2293,  2009,  1012,  1012,  1012,  3109,  1010,  1045,
           14145,  2080,  1012,   102,     0],
          [  101,  2064,  3191,  1996,  4942, 27430,  1006,  1996,  3850,  2003,
            7042,  1999,  3059,  1007,   102],
          [  101,  7093, 11450,  2055,  2028,  2518,  1010,  2005,  2035,  2049,
           26161,  1998, 27451,  1010,   102],
          [  101,  5019,  1997, 21014, 15401,   102,     0,     0,     0,     0,
               0,     0,     0,     0,     0],
          [  101, 15594, 20957,   102,     0,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0],
          [  101,  1045,  1005,  1049,  3497,  2000,  2156,  2035,  2095,   102,
               0,     0,     0,     0,     0],
          [  101,  2878, 14045,  1998,   102,     0,     0,     0,     0,     0,
               0,     0,     0,     0,     0],
          [  101,  1999,  8489,  1997,  2293,  1005,  2003,  1996,  2472,  1005,
            1055

In [4]:
# Recurrent (GRU-cell) sentence classifier
class Model(torch.nn.Module):
    """Sentence classifier: embed token ids, scan them with a GRU cell, classify the last state."""

    def __init__(self):
        """Create the embedding table, the recurrent cell and the output projection."""
        super().__init__()

        # Embedding table: 30522 vocabulary entries, each mapped to a 100-d vector.
        self.embed = torch.nn.Embedding(30522, 100)

        # One recurrent cell, applied once per token position.
        self.cell = torch.nn.GRUCell(100, 512)

        # Linear head turning the final hidden state into 2 class scores.
        self.fc = torch.nn.Linear(512, 2)

    def forward(self, x):
        """Return class scores of shape [batch, 2] for token ids of shape [batch, seq]."""
        # [batch, seq] -> [batch, seq, 100]
        embedded = self.embed(x)

        # No memory yet; passing None lets the cell start from its default state.
        hidden = None

        # Walk the sentence front to back, one [batch, 100] slice per position,
        # carrying the [batch, 512] hidden state along.
        for token_vec in embedded.unbind(dim=1):
            hidden = self.cell(token_vec, hidden)

        # Classify the whole sentence from the state after the last position.
        # [batch, 512] -> [batch, 2]
        logits = self.fc(hidden)
        return logits


# Build the model; it is kept as a notebook-level global that train() optimizes.
model = Model()

# Smoke test: a dummy batch of 8 sequences of 15 token ids should give an [8, 2] output.
model(torch.ones(8, 15).long()).shape

torch.Size([8, 2])

In [5]:
#训练
def train():
    """Train the notebook-level ``model`` on ``loader`` and save it to 'model/7.model'.

    Runs 2 epochs with Adam (lr=1e-4) and cross-entropy loss. Every 2000
    steps it prints the epoch, step, loss and accuracy of the current batch.
    After training, the whole module object is written with ``torch.save``.
    """
    import os

    save_path = 'model/7.model'
    # Create the target directory before training so that a missing folder
    # is not discovered only when saving, after all epochs have already run.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    loss_fun = torch.nn.CrossEntropyLoss()
    model.train()

    for epoch in range(2):
        for i, (x, y) in enumerate(loader):
            out = model(x)
            loss = loss_fun(out, y)

            loss.backward()
            optimizer.step()
            # Clear gradients so the next step does not accumulate onto them.
            optimizer.zero_grad()

            if i % 2000 == 0:
                # Accuracy on the current batch only, as a rough progress signal.
                acc = (out.argmax(dim=1) == y).sum().item() / len(y)
                print(epoch, i, loss.item(), acc)

    torch.save(model, save_path)


# Run training; progress lines are printed every 2000 steps.
train()

0 0 0.6832275986671448 0.625
0 2000 0.9678245186805725 0.5
0 4000 0.5564815998077393 0.625
0 6000 0.3704407811164856 0.875
0 8000 0.7264536619186401 0.75
1 0 0.5351651310920715 0.625
1 2000 0.7487736940383911 0.625
1 4000 0.4652150869369507 0.75
1 6000 0.3317089378833771 0.875
1 8000 0.12732546031475067 1.0


In [6]:
#测试
@torch.no_grad()
def test():
    """Load the saved model and print its accuracy over up to 100 batches of ``loader``.

    The batches come from the notebook-level ``loader``, which is the same
    loader ``train()`` iterates over, so the printed number is not a
    held-out score.
    """
    # NOTE: torch.load unpickles the complete module object, which can execute
    # arbitrary code from the file; only load checkpoints you created yourself.
    # On PyTorch versions where torch.load defaults to weights_only=True, this
    # call additionally needs weights_only=False to load a fully pickled module.
    model = torch.load('model/7.model')
    model.eval()

    correct = 0
    total = 0
    # Take the batches from a single pass over the loader. Calling
    # next(iter(loader)) per step would build a new iterator (and reshuffle
    # the whole dataset) every time and could repeat samples across batches.
    # zip with range(100) stops after 100 batches without fetching a 101st.
    for _, (x, y) in zip(range(100), loader):
        out = model(x).argmax(dim=1)

        correct += (out == y).sum().item()
        total += len(y)

    print(correct / total)


# Run the evaluation; the accuracy is printed.
test()

0.8325
