### 文本情感分类

In [1]:
import torch
from torch import nn
from torch.nn import functional
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from tqdm import tqdm
import jieba

### 读取数据训练词向量模型

#### 读取微博评论数据

In [2]:
weibo_senti_100k = pd.read_csv("../data/weibo_senti_100k.csv", encoding="utf-8")
sentences = weibo_senti_100k['review']
labels = weibo_senti_100k['label']

print(len(sentences), len(labels))

119988 119988


#### 读取停用词

In [3]:
cn_stop_words = open("../data/cn_stopwords.txt", mode="r", encoding="utf-8").readlines()
cn_stop_words = [word.strip() for word in cn_stop_words]
print(len(cn_stop_words))

753


#### 分词并使用word2vec训练词向量

In [4]:
word_count_dict = {} # 统计词频
sentences_list = [] # 存储分词后的词组
for sentence in sentences:
    sentence = sentence.strip()
    seq_list = jieba.cut(sentence, cut_all=False)
    seq_res = []
    # 过滤停用词
    for seq in seq_list:
        if seq not in cn_stop_words:
            seq_res.append(seq)
        if seq in word_count_dict.keys():
            word_count_dict[seq] += 1
        else:
            word_count_dict[seq] = 1
    sentences_list.append(seq_res)

print(len(word_count_dict))
print(len(sentences_list))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\WANGTI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.604 seconds.
Prefix dict has been built successfully.


202388
119988


In [5]:
# 定义PAD 和 UNK
UNK = "<UNK>" # 如果当前词词表中不存在时, 使用UNK
PAD = "<PAD>" # 如果当前句子不满足指定长度, 使用PAD填充

# 存储词表
word_list = [word for word,count in word_count_dict.items()]
word_idx_dict = {word:index for index,word in enumerate(word_list)}
word_idx_dict.update({UNK:len(word_idx_dict), PAD: len(word_idx_dict) + 1})
ff = open("out/weibo_dict", "w", encoding="utf-8")
for item in word_idx_dict.keys():
    ff.writelines("{},{}\n".format(item, word_idx_dict[item]))

#### 使用word2vec训练词向量

In [6]:
from model_config import RNNConfig, Word2VecConfig

rnn_config = RNNConfig()
word2vec_config = Word2VecConfig()

In [7]:

word2vec_model = Word2Vec(
    sentences=sentences_list,
    min_count=1,
    vector_size=word2vec_config.vector_size,
    workers=word2vec_config.workers,
    window=word2vec_config.window
)

word2vec_model.train(sentences_list,
                     total_examples=word2vec_model.corpus_total_words, epochs=word2vec_model.epochs)

(11528071, 13765395)

In [8]:
word2vec_model.save("out/weibo_word2vec.model")

In [9]:
vec_tensor = torch.from_numpy(word2vec_model.wv['帅'])
vec_tensor.size()

  """Entry point for launching an IPython kernel.


torch.Size([100])

#### 将数据转换为词向量表

In [10]:
max_seq_len = 32
sentences_len = len(sentences)
PAD = np.ndarray(shape=(100,))
UNK = np.ndarray(shape=(100,))
print(PAD.shape)

(100,)


In [11]:

data = []
for i in tqdm(range(sentences_len)):

    sentence = sentences[i]
    label = labels[i]

    sentence = sentence.strip()
    label = int(label)
    word_list = jieba.cut(sentence, cut_all=False)

    vector_seq = []
    # 过滤停用词
    for seq in word_list:

        if seq in cn_stop_words:
            continue

        if seq in word2vec_model.wv:
            vector_seq.append(word2vec_model.wv[seq])
        else:
            vector_seq.append(UNK)
    # 如果句子长度不够的话, 使用PAD进行填充
    if len(vector_seq) < max_seq_len:
        vector_seq += [PAD for i in range(max_seq_len - len(vector_seq))]

    data.append([label, vector_seq])
datasets = torch.from_numpy(np.array(data))
print(datasets.size())

100%|██████████| 119988/119988 [01:05<00:00, 1832.56it/s]


TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [None]:
vocab_size = len(word_list)
# 初始化词向量表
word_vectors = torch.randn([vocab_size, max_seq_len, 300])
for i in range(0, vocab_size):
    word = word_list[i]
    if word in word2vec_model.wv:
        vector = word2vec_model.wv[word]
        word_vectors[i, :] = torch.from_numpy(vector)
print(len(word_vectors))

In [None]:
word_vectors.shape

### 定义模型

In [None]:
class WordEmbed(nn.Module):
    def __init__(self, hidden_dim, seq_max_len, word_vectors, drop_prob=0.0):
        """
            初始化, 并定义模型
        :param embedding_dim: embedding后每个单词生成词向量的维度
        :param seq_max_len: 句子最大长度
        """
        super(WordEmbed).__init__()
        self.word_embeddings = nn.Embedding.from_pretrained(word_vectors)  # 加载词向量
        self.word_embeddings.weight.requires_grad = False  # 关闭计算
        self.rnn_node = nn.RNN(word_vectors.size(1), hidden_dim, bidirectional=True, dropout=drop_prob)
        self.dropout = nn.Dropout(drop_prob)
        self.linear = nn.Linear(in_features=hidden_dim * 2, out_features=1)

    def forward(self, sentence_inputs, batch_size=128):
        # shape size of :
        #
        embeds = self.word_embeddings(sentence_inputs)


In [None]:
model = WordEmbed()