In [1]:
'''
实验目的：实现基于RNN的文本分类

实验内容：
1）词嵌入初始化方式：随机embedding、加载glove
2）CNN/RNN的特征抽取
3）Dropout


参考：
https://arxiv.org/abs/1408.5882
https://github.com/yokusama/CNN_Sentence_Classification
https://torchtext.readthedocs.io/en/latest/
http://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/
https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/


https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/02-intermediate/bidirectional_recurrent_neural_network/main.py#L39-L58

'''

In [2]:
import os
import time
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import sklearn
print(os.getcwd())

dir_all_data='data\\task2_all_data.tsv'

BATCH_SIZE=10

cpu=True   #True   False 
if cpu :
    USE_CUDA = False
    DEVICE = torch.device('cpu')
else:
    USE_CUDA = torch.cuda.is_available()
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    torch.cuda.set_device(0)


D:\workspace\nlp_beginer_solution


In [3]:
#从文件中读取数据
data_all=pd.read_csv(dir_all_data,sep='\t')
#print(all_data.shape)    #(156060, 4)
#print(all_data.keys())   #['PhraseId', 'SentenceId', 'Phrase', 'Sentiment']
idx =np.arange(data_all.shape[0])
#print(data_all.head())
#print(type(idx))   #<class 'numpy.ndarray'>

In [4]:
#shuffle，划分验证集、测试集,并保存
seed=0
np.random.seed(seed)
#print(idx)
np.random.shuffle(idx)  
#print(idx)

train_size=int(len(idx) * 0.6)
test_size =int(len(idx) * 0.8)

data_all.iloc[idx[:train_size], :].to_csv('data/task2_train.csv',index=False)
data_all.iloc[idx[train_size:test_size], :].to_csv("data/task2_test.csv", index=False)
data_all.iloc[idx[test_size:], :].to_csv("data/task2_dev.csv", index=False)


In [5]:
#Torchtext采用声明式方法加载数据
from torchtext import data
PAD_TOKEN='<pad>'
TEXT = data.Field(sequential=True,batch_first=True, lower=True, pad_token=PAD_TOKEN)
LABEL = data.Field(sequential=False, batch_first=True, unk_token=None)


In [6]:
#读取数据

datafields = [("PhraseId", None), # 不需要的filed设置为None
              ("SentenceId", None),
              ('Phrase', TEXT),
              ('Sentiment', LABEL)]
train_data = data.TabularDataset(path='data/task2_train.csv', format='csv',
                                fields=datafields)
dev_data  = data.TabularDataset(path='data/task2_dev.csv', format='csv',
                                fields=datafields)
test_data = data.TabularDataset(path='data/task2_test.csv', format='csv',
                                fields=datafields)


In [7]:
print(len(train_data))

93637


In [8]:
#构建词典，字符映射到embedding
TEXT.build_vocab(train_data,  vectors= 'glove.6B.50d',   #可以提前下载好
                 unk_init= lambda x:torch.nn.init.uniform_(x, a=-0.25, b=0.25))
LABEL.build_vocab(train_data)

#得到索引，PAD_TOKEN='<pad>'
PAD_INDEX = TEXT.vocab.stoi[PAD_TOKEN]
TEXT.vocab.vectors[PAD_INDEX] = 0.0

In [9]:
#得到词向量
#pretrained_embeddings=TEXT.vocab.vectors
#print(type(TEXT.vocab.vectors))
#print(TEXT.vocab.vectors)

In [10]:
#构建迭代器
train_iterator = data.BucketIterator(train_data, batch_size=BATCH_SIZE, 
                                     train=True, shuffle=True,device=DEVICE)

dev_iterator = data.Iterator(dev_data, batch_size=len(dev_data), train=False,
                         sort=False, device=DEVICE)

test_iterator = data.Iterator(test_data, batch_size=len(test_data), train=False,
                          sort=False, device=DEVICE)


In [11]:
#部分参数设置
embedding_choice='glove'   #  'static'    'non-static'
num_embeddings = len(TEXT.vocab)
embedding_dim =50
dropout_p=0.5

vocab_size=len(TEXT.vocab)
label_num=len(LABEL.vocab)
print(vocab_size,label_num)

hidden_size=20  #隐藏单元数
num_layers=1  #层数

16473 6


In [12]:
from torch import nn
class LSTM(nn.Module):
    def __init__(self):
        super(LSTM,self).__init__()
        
        self.embedding_choice=embedding_choice        
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        
        if self.embedding_choice==  'rand':
            self.embedding=nn.Embedding(num_embeddings,embedding_dim)
        if self.embedding_choice==  'glove':
            self.embedding = nn.Embedding(num_embeddings, embedding_dim, 
                padding_idx=PAD_INDEX).from_pretrained(TEXT.vocab.vectors, freeze=True)
        #input_size (输入的特征维度),hidden_size ,num_layers 
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True,bidirectional=True)
        self.dropout = nn.Dropout(dropout_p)    
        self.fc = nn.Linear(hidden_size * 2, label_num)  # 2 for bidirection
        
        
        
    def forward(self,x):      # (Batch_size, Length) 
        # Set initial hidden and cell states 
        # h_n (num_layers * num_directions, batch, hidden_size)
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size)
        # c_n (num_layers * num_directions, batch, hidden_size): 
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size)
        
        if USE_CUDA:
            h0=h0.cuda()
            c0=c0.cuda()
        
        x=self.embedding(x)     #(Batch_size, Length) 
                                       #(Batch_size,  Length, Dimention) 
        
        out, _ = self.lstm(x, (h0, c0))   #(Batch_size, Length，Dimention) 
                                        # (batch_size, Length, hidden_size)  
        out=self.dropout(out)
        
        out = self.fc(out[:, -1, :])   # (batch_size, Length, hidden_size)  
                                        # (batch_size, label_num)  
        return out 

In [13]:
#构建模型

model=LSTM()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)#创建优化器SGD
criterion = nn.CrossEntropyLoss()   #损失函数

if USE_CUDA:
    model.cuda()


In [15]:
import time
epoch=100
best_accuracy=0.0
start_time=time.time()

for i in range(epoch):
    model.train()
    total_loss=0.0
    accuracy=0.0
    total_correct=0.0
    total_data_num = len(train_iterator.dataset)
    steps = 0.0
    #训练
    for batch in train_iterator:
        steps+=1
        #print(steps)
        optimizer.zero_grad() #  梯度缓存清零
        
        batch_text=batch.Phrase
        batch_label=batch.Sentiment
        out=model(batch_text)    #[batch_size, label_num]
        loss = criterion(out, batch_label)
        total_loss = total_loss + loss.item() 

        loss.backward()
        optimizer.step()        

        correct = (torch.max(out, dim=1)[1]  #get the indices
                   .view(batch_label.size()) == batch_label).sum()
        total_correct = total_correct + correct.item()

        if steps%100==0:
            print("Epoch %d_%.3f%%:  Training average Loss: %f, Training accuracy: %f"
                      %(i, steps * train_iterator.batch_size*100/len(train_iterator.dataset),total_loss/steps, total_correct/total_data_num))  

    #验证
    model.eval()
    total_loss=0.0
    accuracy=0.0
    total_correct=0.0
    total_data_num = len(train_iterator.dataset)
    steps = 0.0    
    for batch in dev_iterator:
        steps+=1
        batch_text=batch.Phrase
        batch_label=batch.Sentiment
        out=model(batch_text)
        loss = criterion(out, batch_label)
        total_loss = total_loss + loss.item()
        
        correct = (torch.max(out, dim=1)[1].view(batch_label.size()) == batch_label).sum()
        total_correct = total_correct + correct.item()
        
        print("Epoch %d :  Verification average Loss: %f, Verification accuracy: %f%%,Total Time:%f"
          %(i, total_loss/steps, total_correct*100/total_data_num,time.time()-start_time))  
        
        if best_accuracy < total_correct/total_data_num :
            best_accuracy =total_correct/total_data_num 
            torch.save(model,'model_dict/model_lstm/epoch_%d_accuracy_%f'%(i,total_correct/total_data_num))
            print('Model is saved in model_dict/model_lstm/epoch_%d_accuracy_%f'%(i,total_correct/total_data_num))
            #torch.cuda.empty_cache()

Epoch 0_1.068%:  Training average Loss: 1.217845, Training accuracy: 0.005372
Epoch 0_2.136%:  Training average Loss: 1.207918, Training accuracy: 0.010744
Epoch 0_3.204%:  Training average Loss: 1.196013, Training accuracy: 0.016511
Epoch 0_4.272%:  Training average Loss: 1.199137, Training accuracy: 0.022149
Epoch 0_5.340%:  Training average Loss: 1.191290, Training accuracy: 0.028066
Epoch 0_6.408%:  Training average Loss: 1.203007, Training accuracy: 0.033555
Epoch 0_7.476%:  Training average Loss: 1.209848, Training accuracy: 0.039141
Epoch 0_8.544%:  Training average Loss: 1.212190, Training accuracy: 0.044566
Epoch 0_9.612%:  Training average Loss: 1.214535, Training accuracy: 0.050194
Epoch 0_10.680%:  Training average Loss: 1.211269, Training accuracy: 0.055843
Epoch 0_11.747%:  Training average Loss: 1.210257, Training accuracy: 0.061482
Epoch 0_12.815%:  Training average Loss: 1.209050, Training accuracy: 0.067132
Epoch 0_13.883%:  Training average Loss: 1.202895, Training a

Epoch 1_9.612%:  Training average Loss: 1.110178, Training accuracy: 0.053622
Epoch 1_10.680%:  Training average Loss: 1.114960, Training accuracy: 0.059175
Epoch 1_11.747%:  Training average Loss: 1.111443, Training accuracy: 0.065423
Epoch 1_12.815%:  Training average Loss: 1.112253, Training accuracy: 0.071329
Epoch 1_13.883%:  Training average Loss: 1.113320, Training accuracy: 0.077074
Epoch 1_14.951%:  Training average Loss: 1.111852, Training accuracy: 0.083140
Epoch 1_16.019%:  Training average Loss: 1.109581, Training accuracy: 0.089035
Epoch 1_17.087%:  Training average Loss: 1.110177, Training accuracy: 0.094770
Epoch 1_18.155%:  Training average Loss: 1.109195, Training accuracy: 0.100729
Epoch 1_19.223%:  Training average Loss: 1.110222, Training accuracy: 0.106443
Epoch 1_20.291%:  Training average Loss: 1.111304, Training accuracy: 0.112317
Epoch 1_21.359%:  Training average Loss: 1.109590, Training accuracy: 0.118457
Epoch 1_22.427%:  Training average Loss: 1.108294, Tr

Epoch 2_18.155%:  Training average Loss: 1.047913, Training accuracy: 0.104681
Epoch 2_19.223%:  Training average Loss: 1.047750, Training accuracy: 0.110811
Epoch 2_20.291%:  Training average Loss: 1.047459, Training accuracy: 0.116973
Epoch 2_21.359%:  Training average Loss: 1.048419, Training accuracy: 0.122900
Epoch 2_22.427%:  Training average Loss: 1.046117, Training accuracy: 0.129254
Epoch 2_23.495%:  Training average Loss: 1.045626, Training accuracy: 0.135449
Epoch 2_24.563%:  Training average Loss: 1.044556, Training accuracy: 0.141664
Epoch 2_25.631%:  Training average Loss: 1.044317, Training accuracy: 0.147730
Epoch 2_26.699%:  Training average Loss: 1.043084, Training accuracy: 0.154010
Epoch 2_27.767%:  Training average Loss: 1.042129, Training accuracy: 0.160161
Epoch 2_28.835%:  Training average Loss: 1.041928, Training accuracy: 0.166377
Epoch 2_29.903%:  Training average Loss: 1.041392, Training accuracy: 0.172613
Epoch 2_30.971%:  Training average Loss: 1.039398, T

Epoch 3_26.699%:  Training average Loss: 1.000904, Training accuracy: 0.157395
Epoch 3_27.767%:  Training average Loss: 1.001579, Training accuracy: 0.163653
Epoch 3_28.835%:  Training average Loss: 0.999978, Training accuracy: 0.170008
Epoch 3_29.903%:  Training average Loss: 1.000848, Training accuracy: 0.175999
Epoch 3_30.971%:  Training average Loss: 1.000611, Training accuracy: 0.182449
Epoch 3_32.039%:  Training average Loss: 1.000747, Training accuracy: 0.188675
Epoch 3_33.107%:  Training average Loss: 1.001705, Training accuracy: 0.194795
Epoch 3_34.175%:  Training average Loss: 1.001657, Training accuracy: 0.201010
Epoch 3_35.242%:  Training average Loss: 1.000604, Training accuracy: 0.207407
Epoch 3_36.310%:  Training average Loss: 1.001224, Training accuracy: 0.213687
Epoch 3_37.378%:  Training average Loss: 1.001705, Training accuracy: 0.219999
Epoch 3_38.446%:  Training average Loss: 1.001985, Training accuracy: 0.226364
Epoch 3_39.514%:  Training average Loss: 1.002544, T

Epoch 4_36.310%:  Training average Loss: 0.990657, Training accuracy: 0.214936
Epoch 4_37.378%:  Training average Loss: 0.990298, Training accuracy: 0.221291
Epoch 4_38.446%:  Training average Loss: 0.990209, Training accuracy: 0.227666
Epoch 4_39.514%:  Training average Loss: 0.990609, Training accuracy: 0.233903
Epoch 4_40.582%:  Training average Loss: 0.989069, Training accuracy: 0.240546
Epoch 4_41.650%:  Training average Loss: 0.987856, Training accuracy: 0.247231
Epoch 4_42.718%:  Training average Loss: 0.987472, Training accuracy: 0.253735
Epoch 4_43.786%:  Training average Loss: 0.987275, Training accuracy: 0.260186
Epoch 4_44.854%:  Training average Loss: 0.987628, Training accuracy: 0.266252
Epoch 4_45.922%:  Training average Loss: 0.988529, Training accuracy: 0.272339
Epoch 4_46.990%:  Training average Loss: 0.988419, Training accuracy: 0.278554
Epoch 4_48.058%:  Training average Loss: 0.988924, Training accuracy: 0.284823
Epoch 4_49.126%:  Training average Loss: 0.989621, T

Epoch 5_44.854%:  Training average Loss: 0.976648, Training accuracy: 0.267320
Epoch 5_45.922%:  Training average Loss: 0.976613, Training accuracy: 0.273588
Epoch 5_46.990%:  Training average Loss: 0.976740, Training accuracy: 0.279751
Epoch 5_48.058%:  Training average Loss: 0.976782, Training accuracy: 0.286019
Epoch 5_49.126%:  Training average Loss: 0.976340, Training accuracy: 0.292502
Epoch 5_50.194%:  Training average Loss: 0.975956, Training accuracy: 0.298963
Epoch 5_51.262%:  Training average Loss: 0.975924, Training accuracy: 0.305392
Epoch 5_52.330%:  Training average Loss: 0.976166, Training accuracy: 0.311618
Epoch 5_53.398%:  Training average Loss: 0.975793, Training accuracy: 0.318229
Epoch 5_54.466%:  Training average Loss: 0.975879, Training accuracy: 0.324519
Epoch 5_55.534%:  Training average Loss: 0.976465, Training accuracy: 0.330660
Epoch 5_56.602%:  Training average Loss: 0.976157, Training accuracy: 0.337036
Epoch 5_57.670%:  Training average Loss: 0.976876, T

Epoch 6_53.398%:  Training average Loss: 0.975139, Training accuracy: 0.317909
Epoch 6_54.466%:  Training average Loss: 0.974739, Training accuracy: 0.324306
Epoch 6_55.534%:  Training average Loss: 0.974581, Training accuracy: 0.330767
Epoch 6_56.602%:  Training average Loss: 0.974211, Training accuracy: 0.337153
Epoch 6_57.670%:  Training average Loss: 0.974536, Training accuracy: 0.343304
Epoch 6_58.737%:  Training average Loss: 0.974262, Training accuracy: 0.349958
Epoch 6_59.805%:  Training average Loss: 0.973877, Training accuracy: 0.356462
Epoch 6_60.873%:  Training average Loss: 0.973582, Training accuracy: 0.362880
Epoch 6_61.941%:  Training average Loss: 0.974028, Training accuracy: 0.369031
Epoch 6_63.009%:  Training average Loss: 0.973811, Training accuracy: 0.375258
Epoch 6_64.077%:  Training average Loss: 0.973448, Training accuracy: 0.381761
Epoch 6_65.145%:  Training average Loss: 0.973812, Training accuracy: 0.388052
Epoch 6_66.213%:  Training average Loss: 0.973640, T

Epoch 7_63.009%:  Training average Loss: 0.965060, Training accuracy: 0.378974
Epoch 7_64.077%:  Training average Loss: 0.965522, Training accuracy: 0.385147
Epoch 7_65.145%:  Training average Loss: 0.965660, Training accuracy: 0.391480
Epoch 7_66.213%:  Training average Loss: 0.966069, Training accuracy: 0.397770
Epoch 7_67.281%:  Training average Loss: 0.965925, Training accuracy: 0.404060
Epoch 7_68.349%:  Training average Loss: 0.966252, Training accuracy: 0.410415
Epoch 7_69.417%:  Training average Loss: 0.966053, Training accuracy: 0.417004
Epoch 7_70.485%:  Training average Loss: 0.965284, Training accuracy: 0.423647
Epoch 7_71.553%:  Training average Loss: 0.965339, Training accuracy: 0.430065


KeyboardInterrupt: 

In [17]:
#测试
PATH='model_dict/model_lstm/epoch_5_accuracy_0.200765'
model = torch.load(PATH)

total_loss=0.0
accuracy=0.0
total_correct=0.0
total_data_num = len(train_iterator.dataset)
steps = 0.0    
start_time=time.time()
for batch in test_iterator:
    steps+=1
    batch_text=batch.Phrase
    batch_label=batch.Sentiment
    out=model(batch_text)
    loss = criterion(out, batch_label)
    total_loss = total_loss + loss.item()

    correct = (torch.max(out, dim=1)[1].view(batch_label.size()) == batch_label).sum()
    total_correct = total_correct + correct.item()

print("Test average Loss: %f, Test accuracy: %f，Total time: %f"
  %(total_loss/steps, total_correct/total_data_num,time.time()-start_time) ) 

Test average Loss: 0.958765, Test accuracy: 0.200936，Total time: 4.736815
