In [1]:
'''
实验目的：实现基于RNN的文本分类

实验内容：
1）词嵌入初始化方式：随机embedding、加载glove
2）CNN/RNN的特征抽取
3）Dropout


参考：
https://arxiv.org/abs/1408.5882
https://github.com/yokusama/CNN_Sentence_Classification
https://torchtext.readthedocs.io/en/latest/
http://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/
https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/


https://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/02-intermediate/bidirectional_recurrent_neural_network/main.py#L39-L58

'''

'\n实验目的：实现基于RNN的文本分类\n\n实验内容：\n1）词嵌入初始化方式：随机embedding、加载glove\n2）CNN/RNN的特征抽取\n3）Dropout\n\n\n参考：\nhttps://arxiv.org/abs/1408.5882\nhttps://github.com/yokusama/CNN_Sentence_Classification\nhttps://torchtext.readthedocs.io/en/latest/\nhttp://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/\nhttps://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/\n\n\nhttps://github.com/yunjey/pytorch-tutorial/blob/master/tutorials/02-intermediate/bidirectional_recurrent_neural_network/main.py#L39-L58\n\n'

In [2]:
import os
import time
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import sklearn
print(os.getcwd())

dir_all_data='data\\task2_all_data.tsv'

BATCH_SIZE=10

cpu=True   #True   False 
if cpu :
    USE_CUDA = False
    DEVICE = torch.device('cpu')
else:
    USE_CUDA = torch.cuda.is_available()
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    torch.cuda.set_device(0)


D:\workspace\nlp_beginer_solution


In [3]:
#从文件中读取数据
data_all=pd.read_csv(dir_all_data,sep='\t')
#print(all_data.shape)    #(156060, 4)
#print(all_data.keys())   #['PhraseId', 'SentenceId', 'Phrase', 'Sentiment']
idx =np.arange(data_all.shape[0])
#print(data_all.head())
#print(type(idx))   #<class 'numpy.ndarray'>

In [4]:
# #shuffle，划分验证集、测试集,并保存
# seed=0
# np.random.seed(seed)
# #print(idx)
# np.random.shuffle(idx)  
# #print(idx)

# train_size=int(len(idx) * 0.6)
# test_size =int(len(idx) * 0.8)

# data_all.iloc[idx[:train_size], :].to_csv('data/task2_train.csv',index=False)
# data_all.iloc[idx[train_size:test_size], :].to_csv("data/task2_test.csv", index=False)
# data_all.iloc[idx[test_size:], :].to_csv("data/task2_dev.csv", index=False)


In [5]:
#shuffle，只划分测试集,并保存
seed=0
np.random.seed(seed)
#print(idx)
np.random.shuffle(idx)  
#print(idx)

#train_size=int(len(idx) * 0.6)
test_size =int(len(idx) * 0.8)

data_all.iloc[idx[:test_size], :].to_csv('data/task2_train2.csv',index=False)
data_all.iloc[idx[test_size:], :].to_csv("data/task2_test2.csv", index=False)


In [6]:
#Torchtext采用声明式方法加载数据
from torchtext import data
PAD_TOKEN='<pad>'
TEXT = data.Field(sequential=True,batch_first=True, lower=True, pad_token=PAD_TOKEN)
LABEL = data.Field(sequential=False, batch_first=True, unk_token=None)


In [7]:
#读取数据

datafields = [("PhraseId", None), # 不需要的filed设置为None
              ("SentenceId", None),
              ('Phrase', TEXT),
              ('Sentiment', LABEL)]
train_data = data.TabularDataset(path='data/task2_train2.csv', format='csv',
                                fields=datafields)
# dev_data  = data.TabularDataset(path='data/task2_dev.csv', format='csv',
#                                 fields=datafields)
test_data = data.TabularDataset(path='data/task2_test2.csv', format='csv',
                                fields=datafields)


In [8]:
print(len(train_data))

124849


In [9]:
#构建词典，字符映射到embedding
TEXT.build_vocab(train_data,  vectors= 'glove.6B.50d',   #可以提前下载好
                 unk_init= lambda x:torch.nn.init.uniform_(x, a=-0.25, b=0.25))
LABEL.build_vocab(train_data)

#得到索引，PAD_TOKEN='<pad>'
PAD_INDEX = TEXT.vocab.stoi[PAD_TOKEN]
TEXT.vocab.vectors[PAD_INDEX] = 0.0

In [10]:
#得到词向量
#pretrained_embeddings=TEXT.vocab.vectors
#print(type(TEXT.vocab.vectors))
#print(TEXT.vocab.vectors)

In [11]:
#构建迭代器
train_iterator = data.BucketIterator(train_data, batch_size=BATCH_SIZE, 
                                     train=True, shuffle=True,device=DEVICE)

# dev_iterator = data.Iterator(dev_data, batch_size=len(dev_data), train=False,
#                          sort=False, device=DEVICE)

test_iterator = data.Iterator(test_data, batch_size=len(test_data), train=False,
                          sort=False, device=DEVICE)


In [12]:
#部分参数设置
embedding_choice='glove'   #  'static'    'non-static'
num_embeddings = len(TEXT.vocab)
embedding_dim =50
dropout_p=0.5

vocab_size=len(TEXT.vocab)
label_num=len(LABEL.vocab)
print(vocab_size,label_num)

hidden_size=50  #隐藏单元数
num_layers=2  #层数

16525 6


In [16]:
from torch import nn
class LSTM(nn.Module):
    def __init__(self):
        super(LSTM,self).__init__()
        
        self.embedding_choice=embedding_choice        
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        
        if self.embedding_choice==  'rand':
            self.embedding=nn.Embedding(num_embeddings,embedding_dim)
        if self.embedding_choice==  'glove':
            self.embedding = nn.Embedding(num_embeddings, embedding_dim, 
                padding_idx=PAD_INDEX).from_pretrained(TEXT.vocab.vectors, freeze=True)
        #input_size (输入的特征维度),hidden_size ,num_layers 
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers,
                            batch_first=True,dropout=dropout_p,bidirectional=True)
        self.dropout = nn.Dropout(dropout_p)    
        self.fc = nn.Linear(hidden_size * 2, label_num)  # 2 for bidirection
        
        
        
    def forward(self,x):      # (Batch_size, Length) 
        # Set initial hidden and cell states 
        # h_n (num_layers * num_directions, batch, hidden_size)
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size)
        # c_n (num_layers * num_directions, batch, hidden_size): 
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size)
        
        if USE_CUDA:
            h0=h0.cuda()
            c0=c0.cuda()
        
        x=self.embedding(x)     #(Batch_size, Length) 
                                       #(Batch_size,  Length, Dimention) 
        
        out, _ = self.lstm(x, (h0, c0))   #(Batch_size, Length，Dimention) 
                                        # (batch_size, Length, hidden_size)  
        out=self.dropout(out)
        
        out = self.fc(out[:, -1, :])   # (batch_size, Length, hidden_size)  
                           # (batch_size, label_num)  
        return out 

In [17]:
#构建模型

model=LSTM()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)#创建优化器SGD
criterion = nn.CrossEntropyLoss()   #损失函数

if USE_CUDA:
    model.cuda()


In [None]:
import time
epoch=100
best_accuracy=0.0
start_time=time.time()

for i in range(epoch):
    model.train()
    total_loss=0.0
    accuracy=0.0
    total_correct=0.0
    total_data_num = len(train_iterator.dataset)
    steps = 0.0
    #训练
    for batch in train_iterator:
        steps+=1
        #print(steps)
        optimizer.zero_grad() #  梯度缓存清零
        
        batch_text=batch.Phrase
        batch_label=batch.Sentiment
        out=model(batch_text)    #[batch_size, label_num]
        loss = criterion(out, batch_label)
        total_loss = total_loss + loss.item() 

        loss.backward()
        optimizer.step()        

        correct = (torch.max(out, dim=1)[1]  #get the indices
                   .view(batch_label.size()) == batch_label).sum()
        total_correct = total_correct + correct.item()

        if steps%100==0:
            print("Epoch %d_%.3f%%:  Training average Loss: %f, Training accuracy: %f"
                      %(i, steps * train_iterator.batch_size*100/len(train_iterator.dataset),total_loss/steps, total_correct/total_data_num))  

    #测试
    model.eval()
    total_loss=0.0
    accuracy=0.0
    total_correct=0.0
    total_data_num = len(train_iterator.dataset)
    steps = 0.0    
    for batch in test_iterator:
        steps+=1
        batch_text=batch.Phrase
        batch_label=batch.Sentiment
        out=model(batch_text)
        loss = criterion(out, batch_label)
        total_loss = total_loss + loss.item()
        
        correct = (torch.max(out, dim=1)[1].view(batch_label.size()) == batch_label).sum()
        total_correct = total_correct + correct.item()
        
        print("Epoch %d :  Verification average Loss: %f, Verification accuracy: %f%%,Total Time:%f"
          %(i, total_loss/steps, total_correct*100/total_data_num,time.time()-start_time))  
        
        if best_accuracy < total_correct/total_data_num :
            best_accuracy =total_correct/total_data_num 
            torch.save(model,'model_dict/model_lstm/epoch_%d_accuracy_%f'%(i,total_correct/total_data_num))
            print('Model is saved in model_dict/model_lstm/epoch_%d_accuracy_%f'%(i,total_correct/total_data_num))
            #torch.cuda.empty_cache()

Epoch 0_0.801%:  Training average Loss: 1.440362, Training accuracy: 0.003564
Epoch 0_1.602%:  Training average Loss: 1.365597, Training accuracy: 0.007761
Epoch 0_2.403%:  Training average Loss: 1.331405, Training accuracy: 0.011846
Epoch 0_3.204%:  Training average Loss: 1.323612, Training accuracy: 0.015747
Epoch 0_4.005%:  Training average Loss: 1.295128, Training accuracy: 0.019960
Epoch 0_4.806%:  Training average Loss: 1.280781, Training accuracy: 0.024005
Epoch 0_5.607%:  Training average Loss: 1.272381, Training accuracy: 0.028146
Epoch 0_6.408%:  Training average Loss: 1.261118, Training accuracy: 0.032495
Epoch 0_7.209%:  Training average Loss: 1.261520, Training accuracy: 0.036284
Epoch 0_8.010%:  Training average Loss: 1.255846, Training accuracy: 0.040489
Epoch 0_8.811%:  Training average Loss: 1.254589, Training accuracy: 0.044494
Epoch 0_9.612%:  Training average Loss: 1.255967, Training accuracy: 0.048338
Epoch 0_10.413%:  Training average Loss: 1.251414, Training accu

In [None]:
#测试
PATH='model_dict/model_lstm/epoch_5_a'
model = torch.load(PATH)

total_loss=0.0
accuracy=0.0
total_correct=0.0
total_data_num = len(train_iterator.dataset)
steps = 0.0    
start_time=time.time()
for batch in test_iterator:
    steps+=1
    batch_text=batch.Phrase
    batch_label=batch.Sentiment
    out=model(batch_text)
    loss = criterion(out, batch_label)
    total_loss = total_loss + loss.item()

    correct = (torch.max(out, dim=1)[1].view(batch_label.size()) == batch_label).sum()
    total_correct = total_correct + correct.item()

print("Test average Loss: %f, Test accuracy: %f，Total time: %f"
  %(total_loss/steps, total_correct/total_data_num,time.time()-start_time) ) 

In [None]:
import time
epoch=100
best_accuracy=0.0
start_time=time.time()

for i in range(epoch):
    if i==1:
        break
    model.train()
    total_loss=0.0
    accuracy=0.0
    total_correct=0.0
    total_data_num = len(train_iterator.dataset)
    steps = 0.0
    #训练
    j=0
    for batch in train_iterator:
        j+=1
        if j==3:
            break
        steps+=1
        #print(steps)
        optimizer.zero_grad() #  梯度缓存清零
        
        batch_text=batch.Phrase
        batch_label=batch.Sentiment
        print(batch_label)
        #print(batch_text.size())
        #print(batch_text)
        #print(model.embedding(batch_text))
        
        

#         batch_label=batch.Sentiment
        
#         out=model(batch_text)    #[batch_size, label_num]
#         loss = criterion(out, batch_label)
#         total_loss = total_loss + loss.item() 

#         loss.backward()
#         optimizer.step()        

#         correct = (torch.max(out, dim=1)[1]  #get the indices
#                    .view(batch_label.size()) == batch_label).sum()
#         total_correct = total_correct + correct.item()

#         if steps%100==0:
#             print("Epoch %d_%.3f%%:  Training average Loss: %f, Training accuracy: %f"
#                       %(i, steps * train_iterator.batch_size*100/len(train_iterator.dataset),total_loss/steps, total_correct/total_data_num))  

    