### 微博情感分类

In [1]:
import pandas as pd
import jieba
import os
import torch
from gensim.models.word2vec import Word2Vec

In [2]:
torch.cuda.is_available()

True

### 1. 读取数据

In [3]:
weibo_data = pd.read_csv("../data/weibo_senti_100k.csv")
weibo_data.head()

Unnamed: 0,label,review
0,1,﻿更博了，爆照了，帅的呀，就是越来越爱你！生快傻缺[爱你][爱你][爱你]
1,1,@张晓鹏jonathan 土耳其的事要认真对待[哈哈]，否则直接开除。@丁丁看世界 很是细心...
2,1,姑娘都羡慕你呢…还有招财猫高兴……//@爱在蔓延-JC:[哈哈]小学徒一枚，等着明天见您呢/...
3,1,美~~~~~[爱你]
4,1,梦想有多大，舞台就有多大![鼓掌]


In [4]:
weibo_labels = weibo_data['label']
weibo_review = weibo_data['review']

### 2. 对数据进行分词并过滤停用词

In [5]:
# 读取停用词
stop_words = open("../data/cn_stopwords.txt", "r", encoding="utf-8").readlines()
stop_words = [word.strip() for word in stop_words]
stop_words.append(" ")
stop_words.append("\n")

print(len(stop_words))
print(stop_words[:3])

755
['[', ']', '\\']


In [6]:
dataset_len = len(weibo_labels)

review_data_list = []
label_data_list = []

for i in range(dataset_len):
    # 对每行句子去除左右的空格
    sentence = weibo_review[i].strip()
    label = weibo_labels[i]
    # 对句子进行分词, 分词得到的是一个迭代的对象, 可以使用for循环获取数据
    seq_list = jieba.cut(sentence, cut_all=False)
    # 过滤停用词
    seq_res = []
    for seq in seq_list:
        # 去除停用词
        if seq in stop_words:
            continue
        seq_res.append(seq)
    if len(seq_res) > 0:
        review_data_list.append(seq_res)
        label_data_list.append(label)

print(len(review_data_list))
print(len(label_data_list))
print(review_data_list[:2])
print(label_data_list[:2])

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\WANGTI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.601 seconds.
Prefix dict has been built successfully.


119945
119945
[['\ufeff', '更博', '爆照', '帅', '越来越', '爱', '生快', '傻', '缺', '爱', '爱', '爱'], ['张晓鹏', 'jonathan', '土耳其', '事要', '认真对待', '直接', '开除', '丁丁', '世界', '细心', '酒店', 'OK']]
[1, 1]


In [7]:
features = [" ".join(data) for data in review_data_list]

# 保存分词后的数据
save_data = pd.DataFrame({
    'label': pd.Series(label_data_list),
    'review': pd.Series(features)
})
save_data.to_csv("weibo/weibo_cut.csv", sep=',', header=True, index=False, mode="w")  # index 表示是否需要行号

### 加载word2vec模型

In [8]:
word2vec_model_path = "weibo/word2vec.model"
model = Word2Vec.load(word2vec_model_path)

In [9]:
# 获取单词词表
word_list = model.wv.index_to_key
print(len(word_list))
print(word_list[:10])

201747
[':', '泪', '~', '嘻嘻', '爱', '抓狂', '鼓掌', '…', '回复', '-']


### 将数据转换为词向量

In [10]:
features_unk = []

for sentence in review_data_list:
    seq_res = []
    for seq in sentence:
        # 将不在词表中的词替换为<unk>
        if seq not in word_list:
            seq_res.append('<unk>')
        else:
            seq_res.append(seq)
    features_unk.append(seq_res)

print(len(features_unk))

119945


In [11]:
# 将数据转换为向量
vec_features = [model.wv[feature] for feature in features_unk]
vec_features[:3]

[array([[ 1.8741280e-02,  1.1425072e-03,  8.9426833e-04, ...,
          7.4016671e-03,  1.2883464e-02, -4.6873275e-02],
        [ 2.6914781e-02,  3.8821152e-03, -2.2450440e-02, ...,
         -3.1217936e-02,  1.1910747e-02, -3.3909071e-02],
        [ 4.9475200e-02, -1.5874540e-02,  6.3550696e-03, ...,
          1.7243981e-02,  9.9601997e-03, -4.4792011e-02],
        ...,
        [ 1.5110646e+00, -1.7500892e+00,  3.4322922e+00, ...,
         -5.2136872e-02, -1.8677931e-01,  1.3562311e+00],
        [ 1.5110646e+00, -1.7500892e+00,  3.4322922e+00, ...,
         -5.2136872e-02, -1.8677931e-01,  1.3562311e+00],
        [ 1.5110646e+00, -1.7500892e+00,  3.4322922e+00, ...,
         -5.2136872e-02, -1.8677931e-01,  1.3562311e+00]], dtype=float32),
 array([[-5.1415071e-02,  1.9149600e-01, -4.2778301e-01, ...,
          4.5438632e-02,  1.6648443e-01, -3.4303448e-01],
        [-2.1516225e-01,  1.8336062e-01, -3.9351913e-01, ...,
         -4.3308019e-04,  1.5172122e-01, -2.7362645e-01],
        [-

In [16]:
print(len(vec_features))
print(vec_features[:1])

119945
[array([[ 1.8741280e-02,  1.1425072e-03,  8.9426833e-04, ...,
         7.4016671e-03,  1.2883464e-02, -4.6873275e-02],
       [ 2.6914781e-02,  3.8821152e-03, -2.2450440e-02, ...,
        -3.1217936e-02,  1.1910747e-02, -3.3909071e-02],
       [ 4.9475200e-02, -1.5874540e-02,  6.3550696e-03, ...,
         1.7243981e-02,  9.9601997e-03, -4.4792011e-02],
       ...,
       [ 1.5110646e+00, -1.7500892e+00,  3.4322922e+00, ...,
        -5.2136872e-02, -1.8677931e-01,  1.3562311e+00],
       [ 1.5110646e+00, -1.7500892e+00,  3.4322922e+00, ...,
        -5.2136872e-02, -1.8677931e-01,  1.3562311e+00],
       [ 1.5110646e+00, -1.7500892e+00,  3.4322922e+00, ...,
        -5.2136872e-02, -1.8677931e-01,  1.3562311e+00]], dtype=float32)]


### 创建数据迭代器

In [12]:

features_dataset = torch.tensor(vec_features)
label_dataset = torch.tensor(weibo_labels)

ValueError: expected sequence of length 12 at dim 1 (got 22)

### 搭建RNN模型

In [None]:
from torch import nn
from torch.nn import Sequential
from torch.nn import RNN
from torch.nn import Sigmoid
from torch.nn import Embedding
from torch.nn import Linear

