In [1]:
'''
实验目的：实现基于CNN、RNN的文本分类

实验内容：
1）词嵌入初始化方式：随机embedding、word2vec、word2vec_fun-tured、加载glove
2）CNN/RNN的特征抽取
3）Dropout


补充：
1）
2）

参考：
https://arxiv.org/abs/1408.5882
https://github.com/yokusama/CNN_Sentence_Classification
https://torchtext.readthedocs.io/en/latest/
http://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext/

'''

In [2]:
import os
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import sklearn
print(os.getcwd())

dir_all_data='data\\task2_all_data.tsv'

BATCH_SIZE=3

cpu=True
if cpu :
    USE_CUDA = False
    DEVICE = torch.device('cpu')
else:
    USE_CUDA = torch.cuda.is_available()
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

D:\workspace\nlp_beginer_solution


In [3]:
#从文件中读取数据
data_all=pd.read_csv(dir_all_data,sep='\t')
#print(all_data.shape)    #(156060, 4)
#print(all_data.keys())   #['PhraseId', 'SentenceId', 'Phrase', 'Sentiment']
idx =np.arange(data_all.shape[0])
#print(data_all.head())
#print(type(idx))   #<class 'numpy.ndarray'>

In [4]:
#shuffle，划分验证集、测试集,并保存
seed=0
np.random.seed(seed)
#print(idx)
np.random.shuffle(idx)  
#print(idx)

train_size=int(len(idx) * 0.6)
test_size =int(len(idx) * 0.8)

data_all.iloc[idx[:train_size], :].to_csv('data/task2_train.csv',index=False)
data_all.iloc[idx[train_size:test_size], :].to_csv("data/task2_test.csv", index=False)
data_all.iloc[idx[test_size:], :].to_csv("data/task2_dev.csv", index=False)


In [5]:
#Torchtext采用声明式方法加载数据
from torchtext import data
PAD_TOKEN='<pad>'
TEXT = data.Field(sequential=True,batch_first=True, lower=True, pad_token=PAD_TOKEN)
LABEL = data.Field(sequential=False, batch_first=True, unk_token=None)


In [6]:
#读取数据

datafields = [("PhraseId", None), # 不需要的filed设置为None
              ("SentenceId", None),
              ('Phrase', TEXT),
              ('Sentiment', LABEL)]
train_data = data.TabularDataset(path='data/task2_train.csv', format='csv',
                                fields=datafields)
dev_data  = data.TabularDataset(path='data/task2_dev.csv', format='csv',
                                fields=datafields)
test_data = data.TabularDataset(path='data/task2_test.csv', format='csv',
                                fields=datafields)


In [7]:
print(train_data)
print(train_data[1])
print(train_data[1].__dict__.keys())
print(train_data[1].__dict__.values())
print(train_data[1].Phrase[:100])

<torchtext.data.dataset.TabularDataset object at 0x000002AE0D64A4A8>
<torchtext.data.example.Example object at 0x000002AE0D64A6D8>
dict_keys(['Phrase', 'Sentiment'])
dict_values([['escape', 'movie'], '2'])
['escape', 'movie']


In [8]:
print(len(train_data))

93637


In [9]:
#构建词典，字符映射到embedding
VECTORS='glove.6B.100d'
TEXT.build_vocab(train_data,  vectors= 'glove.6B.100d',
                 unk_init= lambda x:torch.nn.init.uniform_(x, a=-0.25, b=0.25))
LABEL.build_vocab(train_data)

PAD_INDEX = TEXT.vocab.stoi[PAD_TOKEN]
TEXT.vocab.vectors[PAD_INDEX] = 0.0

In [10]:
#得到词向量
pretrained_embeddings=TEXT.vocab.vectors
print(type(TEXT.vocab.vectors))
print(TEXT.vocab.vectors)

<class 'torch.Tensor'>
tensor([[-0.0700, -0.1790,  0.0099,  ...,  0.1509,  0.1249, -0.1951],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.6252,  0.8329,  1.4060,  ..., -0.4336, -0.3156,  0.2991],
        [ 0.1580, -0.2077,  0.0084,  ..., -1.2656, -0.2771, -0.3230],
        [-0.2909, -0.7498, -0.2997,  ..., -0.4769,  0.4399, -0.2560]])


In [11]:
#构建迭代器
train_iterator = data.BucketIterator(train_data, batch_size=BATCH_SIZE, 
                                     train=True, shuffle=True,device=DEVICE)

dev_iterator = data.Iterator(dev_data, batch_size=len(dev_data), train=False,
                         sort=False, device=DEVICE)

test_iterator = data.Iterator(test_data, batch_size=len(test_data), train=False,
                          sort=False, device=DEVICE)




In [12]:
#迭代方式
#b=next(iter(train_iterator))
#print(vars(b).keys())
#print(b.Phrase[:, :])
#print(b.Sentiment[:])


In [72]:
embedding_choice='rand'   #  'static'    'non-static'
num_embeddings = len(TEXT.vocab)
embedding_dim =100
dropout_p=0.5
filters_num=100
label_num=5

In [16]:


class CNN(nn.Module):
    def __init__(self):
        super(CNN,self).__init__()
        self.embedding_choice=embedding_choice
        self.vocab_size=vocab_size
        
        if self.embedding_choice==  'rand':
            self.embedding=nn.Embedding(num_embeddings,embedding_dim)
            
        self.conv1 = nn.Conv2d(in_channels=1,out_channels=filters_num ,  #卷积产生的通道
                               kernel_size=(3, embedding_dim), padding=(2,0))
        
        self.conv2 = nn.Conv2d(in_channels=1,out_channels=filters_num ,  #卷积产生的通道
                               kernel_size=(4, embedding_dim), padding=(3,0))
        
        self.conv3 = nn.Conv2d(in_channels=1,out_channels=filters_num ,  #卷积产生的通道
                               kernel_size=(5, embedding_dim), padding=(4,0))
        
        self.dropout = nn.Dropout(dropout_p)
        
        self.fc = nn.Linear(filters_num * 3, label_num)
        
    def forward(self,x):
        x=self.embedding(x).unsqueeze(1) # (N, 1, W, D)   ？？？
        
        x = F.relu(conv(x)).squeeze(3)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        
        
        return x

In [14]:
batch_data=next(iter(train_iterator))
print(vars(batch_data).keys())
print(vars(batch_data).values())

dict_keys(['batch_size', 'dataset', 'fields', 'input_fields', 'target_fields', 'Phrase', 'Sentiment'])
dict_values([3, <torchtext.data.dataset.TabularDataset object at 0x000002AE0D64A4A8>, dict_keys(['PhraseId', 'SentenceId', 'Phrase', 'Sentiment']), ['Phrase', 'Sentiment'], [], tensor([[    2,  1601,  3802,   463,    47,     7,    24],
        [ 1240,    20,     1,     1,     1,     1,     1],
        [    4, 12720,     1,     1,     1,     1,     1]]), tensor([0, 0, 0])])


In [15]:
train_text, train_labels = batch_data.Phrase, batch_data.Sentiment
print(train_text.shape)
print(train_labels.shape)

torch.Size([3, 7])
torch.Size([3])


In [73]:
#实验一下维度
embedding=nn.Embedding(num_embeddings,embedding_dim)
x=train_text
print(x.shape)
x=embedding(x).unsqueeze(1)
print(x.shape)

torch.Size([3, 7])
torch.Size([3, 1, 7, 100])


In [74]:
conv1 = nn.Conv2d(in_channels=1,out_channels=filters_num ,  #卷积产生的通道
                       kernel_size=(3, embedding_dim), padding=(2,0))

conv2 = nn.Conv2d(in_channels=1,out_channels=filters_num ,  #卷积产生的通道
                       kernel_size=(4, embedding_dim), padding=(3,0))

conv3 = nn.Conv2d(in_channels=1,out_channels=filters_num ,  #卷积产生的通道
                       kernel_size=(5, embedding_dim), padding=(4,0))


In [75]:
print(x.shape)
x=conv1(x).squeeze(3)
print(x.shape)

torch.Size([3, 1, 7, 100])
torch.Size([3, 100, 9])


In [76]:
import torch.nn.functional as F
print(x.shape)
x = F.max_pool1d(x, x.size(2)).squeeze(2)
print(x.shape)

torch.Size([3, 100, 9])
torch.Size([3, 100])


In [71]:
print(x.shape)
x=conv2(x).squeeze(3)
print(x.shape)

torch.Size([3, 100])


RuntimeError: Expected 4-dimensional input for 4-dimensional weight 100 1, but got 2-dimensional input of size [3, 100] instead