In [1]:
'''
实验目的：实现基于CNN、RNN的文本分类

实验内容：
1）词嵌入初始化方式：随机embedding、word2vec、word2vec_fun-tured、加载glove
2）CNN/RNN的特征抽取
3）Dropout


参考：
https://arxiv.org/abs/1408.5882
https://github.com/yokusama/CNN_Sentence_Classification
https://torchtext.readthedocs.io/en/latest/
http://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/

'''

'\n实验目的：实现基于CNN、RNN的文本分类\n\n实验内容：\n1）词嵌入初始化方式：随机embedding、word2vec、word2vec_fun-tured、加载glove\n2）CNN/RNN的特征抽取\n3）Dropout\n\n\n参考：\nhttps://arxiv.org/abs/1408.5882\nhttps://github.com/yokusama/CNN_Sentence_Classification\nhttps://torchtext.readthedocs.io/en/latest/\nhttp://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/\n\n'

In [17]:
import os
import time
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import sklearn
print(os.getcwd())

dir_all_data='data\\task2_all_data.tsv'

BATCH_SIZE=10

cpu=True   #True   False 
if cpu :
    USE_CUDA = False
    DEVICE = torch.device('cpu')
else:
    USE_CUDA = torch.cuda.is_available()
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    torch.cuda.set_device(0)


D:\workspace\nlp_beginer_solution


In [3]:
#从文件中读取数据
data_all=pd.read_csv(dir_all_data,sep='\t')
#print(all_data.shape)    #(156060, 4)
#print(all_data.keys())   #['PhraseId', 'SentenceId', 'Phrase', 'Sentiment']
idx =np.arange(data_all.shape[0])
#print(data_all.head())
#print(type(idx))   #<class 'numpy.ndarray'>

In [4]:
#shuffle，划分验证集、测试集,并保存
seed=0
np.random.seed(seed)
#print(idx)
np.random.shuffle(idx)  
#print(idx)

train_size=int(len(idx) * 0.6)
test_size =int(len(idx) * 0.8)

data_all.iloc[idx[:train_size], :].to_csv('data/task2_train.csv',index=False)
data_all.iloc[idx[train_size:test_size], :].to_csv("data/task2_test.csv", index=False)
data_all.iloc[idx[test_size:], :].to_csv("data/task2_dev.csv", index=False)


In [5]:
#Torchtext采用声明式方法加载数据
from torchtext import data
PAD_TOKEN='<pad>'
TEXT = data.Field(sequential=True,batch_first=True, lower=True, pad_token=PAD_TOKEN)
LABEL = data.Field(sequential=False, batch_first=True, unk_token=None)


In [6]:
#读取数据

datafields = [("PhraseId", None), # 不需要的filed设置为None
              ("SentenceId", None),
              ('Phrase', TEXT),
              ('Sentiment', LABEL)]
train_data = data.TabularDataset(path='data/task2_train.csv', format='csv',
                                fields=datafields)
dev_data  = data.TabularDataset(path='data/task2_dev.csv', format='csv',
                                fields=datafields)
test_data = data.TabularDataset(path='data/task2_test.csv', format='csv',
                                fields=datafields)


In [7]:
print(len(train_data))

93637


In [8]:
#构建词典，字符映射到embedding
TEXT.build_vocab(train_data,  vectors= 'glove.6B.50d',   #可以提前下载好
                 unk_init= lambda x:torch.nn.init.uniform_(x, a=-0.25, b=0.25))
LABEL.build_vocab(train_data)
PAD_INDEX = TEXT.vocab.stoi[PAD_TOKEN]
TEXT.vocab.vectors[PAD_INDEX] = 0.0

In [9]:
#得到词向量
pretrained_embeddings=TEXT.vocab.vectors
print(type(TEXT.vocab.vectors))
print(TEXT.vocab.vectors)

<class 'torch.Tensor'>
tensor([[-0.1919, -0.2418,  0.0356,  ..., -0.1342, -0.1067,  0.1024],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4180,  0.2497, -0.4124,  ..., -0.1841, -0.1151, -0.7858],
        ...,
        [ 0.2282,  0.0205, -0.4533,  ...,  0.5120,  1.0239, -0.8895],
        [ 0.1172,  1.0841, -0.0531,  ..., -1.2205,  0.7453,  2.0112],
        [ 0.1703,  0.3512,  1.5457,  ..., -0.4095, -0.4091,  0.5303]])


In [10]:
#构建迭代器
train_iterator = data.BucketIterator(train_data, batch_size=BATCH_SIZE, 
                                     train=True, shuffle=True,device=DEVICE)

dev_iterator = data.Iterator(dev_data, batch_size=len(dev_data), train=False,
                         sort=False, device=DEVICE)

test_iterator = data.Iterator(test_data, batch_size=len(test_data), train=False,
                          sort=False, device=DEVICE)


In [11]:
#部分参数设置
embedding_choice='rand'   #  'static'    'non-static'
num_embeddings = len(TEXT.vocab)
embedding_dim =50
dropout_p=0.5
filters_num=100

In [12]:
vocab_size=len(TEXT.vocab)
label_num=len(LABEL.vocab)
print(vocab_size,label_num)

16473 6


In [13]:
import torch.nn.functional as F
class CNN(nn.Module):
    def __init__(self):
        super(CNN,self).__init__()
        self.embedding_choice=embedding_choice
        self.vocab_size=vocab_size
        
        if self.embedding_choice==  'rand':
            self.embedding=nn.Embedding(num_embeddings,embedding_dim)
            
        self.conv1 = nn.Conv2d(in_channels=1,out_channels=filters_num ,  #卷积产生的通道
                               kernel_size=(3, embedding_dim), padding=(2,0))
        
        self.conv2 = nn.Conv2d(in_channels=1,out_channels=filters_num ,  #卷积产生的通道
                               kernel_size=(4, embedding_dim), padding=(3,0))
        
        self.conv3 = nn.Conv2d(in_channels=1,out_channels=filters_num ,  #卷积产生的通道
                               kernel_size=(5, embedding_dim), padding=(4,0))
        
        self.dropout = nn.Dropout(dropout_p)
        
        self.fc = nn.Linear(filters_num * 3, label_num)
        
    def forward(self,x):      # (Batch_size, Length) 
        x=self.embedding(x).unsqueeze(1)      #(Batch_size, Length, Dimention) 
                                       #(Batch_size, 1, Length, Dimention) 
        
        x1 = F.relu(self.conv1(x)).squeeze(3)    #(Batch_size, filters_num, length+padding, 1) 
                                          #(Batch_size, filters_num, length+padding) 
        x1 = F.max_pool1d(x1, x1.size(2)).squeeze(2)  #(Batch_size, filters_num, 1)
                                               #(Batch_size, filters_num) 
         
        x2 = F.relu(self.conv2(x)).squeeze(3)  
        x2 = F.max_pool1d(x2, x2.size(2)).squeeze(2)      
        
        x3 = F.relu(self.conv3(x)).squeeze(3)  
        x3 = F.max_pool1d(x3, x3.size(2)).squeeze(2)      
        
        x = torch.cat((x1, x2, x3), dim=1)  #(Batch_size, filters_num *3 )
        x = self.dropout(x)      #(Batch_size, filters_num *3 )
        out = self.fc(x)       #(Batch_size, label_num  )
        return out
    

In [14]:
#构建模型

model=CNN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)#创建优化器SGD
criterion = nn.CrossEntropyLoss()   #损失函数

if USE_CUDA:
    model.cuda()

In [None]:
import time
epoch=100
best_accuracy=0.0
start_time=time.time()

for i in range(epoch):
    model.train()
    total_loss=0.0
    accuracy=0.0
    total_correct=0.0
    total_data_num = len(train_iterator.dataset)
    steps = 0.0
    #训练
    for batch in train_iterator:
        steps+=1
        #print(steps)
        optimizer.zero_grad() #  梯度缓存清零
        
        batch_text=batch.Phrase
        batch_label=batch.Sentiment
        out=model(batch_text)    #[batch_size, label_num]
        loss = criterion(out, batch_label)
        total_loss = total_loss + loss.item() 

        loss.backward()
        optimizer.step()        

        correct = (torch.max(out, dim=1)[1]  #get the indices
                   .view(batch_label.size()) == batch_label).sum()
        total_correct = total_correct + correct.item()

        if steps%100==0:
            print("Epoch %d_%.3f%%:  Training average Loss: %f, Training accuracy: %f"
                      %(i, steps * train_iterator.batch_size*100/len(train_iterator.dataset),total_loss/steps, total_correct/total_data_num))  

    #验证
    model.eval()
    total_loss=0.0
    accuracy=0.0
    total_correct=0.0
    total_data_num = len(train_iterator.dataset)
    steps = 0.0    
    for batch in dev_iterator:
        steps+=1
        batch_text=batch.Phrase
        batch_label=batch.Sentiment
        out=model(batch_text)
        loss = criterion(out, batch_label)
        total_loss = total_loss + loss.item()
        
        correct = (torch.max(out, dim=1)[1].view(batch_label.size()) == batch_label).sum()
        total_correct = total_correct + correct.item()
        
        print("Epoch %d :  Verification average Loss: %f, Verification accuracy: %f%%,Total Time:%f"
          %(i, total_loss/steps, total_correct*100/total_data_num,time.time()-start_time))  
        
        if best_accuracy < total_correct/total_data_num :
            best_accuracy =total_correct/total_data_num 
            torch.save(model,'model_dict/epoch_%d_accuracy_%f'%(i,total_correct/total_data_num))
            print('Model is saved in model_dict/epoch_%d_accuracy_%f'%(i,total_correct/total_data_num))
            #torch.cuda.empty_cache()

Epoch 0_1.068%:  Training average Loss: 1.405351, Training accuracy: 0.004987
Epoch 0_2.136%:  Training average Loss: 1.381584, Training accuracy: 0.009975
Epoch 0_3.204%:  Training average Loss: 1.363787, Training accuracy: 0.014898
Epoch 0_4.272%:  Training average Loss: 1.341407, Training accuracy: 0.020131
Epoch 0_5.340%:  Training average Loss: 1.332115, Training accuracy: 0.025706
Epoch 0_6.408%:  Training average Loss: 1.323313, Training accuracy: 0.030981
Epoch 0_7.476%:  Training average Loss: 1.317740, Training accuracy: 0.036321
Epoch 0_8.544%:  Training average Loss: 1.309630, Training accuracy: 0.041757
Epoch 0_9.612%:  Training average Loss: 1.307138, Training accuracy: 0.047033
Epoch 0_10.680%:  Training average Loss: 1.309347, Training accuracy: 0.052148
Epoch 0_11.747%:  Training average Loss: 1.308197, Training accuracy: 0.057285
Epoch 0_12.815%:  Training average Loss: 1.305253, Training accuracy: 0.062326
Epoch 0_13.883%:  Training average Loss: 1.302527, Training a

  "type " + obj.__name__ + ". It won't be checked "


Model is saved in model_dict/epoch_0_accuracy_0.194870
Epoch 1_1.068%:  Training average Loss: 0.998286, Training accuracy: 0.006525
Epoch 1_2.136%:  Training average Loss: 1.030809, Training accuracy: 0.012516
Epoch 1_3.204%:  Training average Loss: 1.022446, Training accuracy: 0.019010
Epoch 1_4.272%:  Training average Loss: 1.033006, Training accuracy: 0.025278
Epoch 1_5.340%:  Training average Loss: 1.032399, Training accuracy: 0.031643
Epoch 1_6.408%:  Training average Loss: 1.028389, Training accuracy: 0.038030
Epoch 1_7.476%:  Training average Loss: 1.029686, Training accuracy: 0.044438
Epoch 1_8.544%:  Training average Loss: 1.032250, Training accuracy: 0.050717
Epoch 1_9.612%:  Training average Loss: 1.033162, Training accuracy: 0.057029
Epoch 1_10.680%:  Training average Loss: 1.035210, Training accuracy: 0.063436
Epoch 1_11.747%:  Training average Loss: 1.034795, Training accuracy: 0.069748
Epoch 1_12.815%:  Training average Loss: 1.033592, Training accuracy: 0.076070
Epoch 

Epoch 2_9.612%:  Training average Loss: 0.936946, Training accuracy: 0.060094
Epoch 2_10.680%:  Training average Loss: 0.930214, Training accuracy: 0.067057
Epoch 2_11.747%:  Training average Loss: 0.929712, Training accuracy: 0.073689
Epoch 2_12.815%:  Training average Loss: 0.931131, Training accuracy: 0.080289
Epoch 2_13.883%:  Training average Loss: 0.926615, Training accuracy: 0.087348
Epoch 2_14.951%:  Training average Loss: 0.927692, Training accuracy: 0.094023
Epoch 2_16.019%:  Training average Loss: 0.927529, Training accuracy: 0.100804
Epoch 2_17.087%:  Training average Loss: 0.926201, Training accuracy: 0.107746
Epoch 2_18.155%:  Training average Loss: 0.927995, Training accuracy: 0.114335
Epoch 2_19.223%:  Training average Loss: 0.929285, Training accuracy: 0.121074
Epoch 2_20.291%:  Training average Loss: 0.930432, Training accuracy: 0.127695
Epoch 2_21.359%:  Training average Loss: 0.930083, Training accuracy: 0.134455
Epoch 2_22.427%:  Training average Loss: 0.929119, Tr

Epoch 3_18.155%:  Training average Loss: 0.863564, Training accuracy: 0.118564
Epoch 3_19.223%:  Training average Loss: 0.861860, Training accuracy: 0.125730
Epoch 3_20.291%:  Training average Loss: 0.860101, Training accuracy: 0.132800


In [18]:
#测试
PATH='model_dict/epoch_6_accuracy_0.213879'
model = torch.load(PATH)

total_loss=0.0
accuracy=0.0
total_correct=0.0
total_data_num = len(train_iterator.dataset)
steps = 0.0    
start_time=time.time()
for batch in test_iterator:
    steps+=1
    batch_text=batch.Phrase
    batch_label=batch.Sentiment
    out=model(batch_text)
    loss = criterion(out, batch_label)
    total_loss = total_loss + loss.item()

    correct = (torch.max(out, dim=1)[1].view(batch_label.size()) == batch_label).sum()
    total_correct = total_correct + correct.item()

print("Test average Loss: %f, Test accuracy: %f，Total time: %f"
  %(total_loss/steps, total_correct/total_data_num,time.time()-start_time) ) 

Test average Loss: 0.942344, Test accuracy: 0.212843，Total time: 16.872462
