In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
"""
3. 尝试不同分词工具进行文本分词，观察模型训练结果。
"""
import os
import pandas as pd
import jieba
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence

In [8]:
# 原生分词方法，即将文本按字符切分
def build_vocab_native(comments_data):
    """
    构建词汇表
    :param comments_data: 评论数据
    :return: 词汇表字典
    """
    vocab = {}
    vocab['PAD'] = 0  # 填充符
    vocab['UNK'] = 1  # 未知词
    for _, comment in comments_data:
        words = list(comment.strip())  # 按字符切分
        for word in words:
            if word not in vocab:
                vocab[word] = len(vocab)  # 为新词分配索引
    torch.save(vocab, 'comments_vocab_3_native.pth')  # 保存词汇表-原生分词
    return vocab
def build_vocab_jieba(comments_data):
    """
    构建词汇表
    :param comments_data: 评论数据
    :return: 词汇表字典
    """
    vocab = {}
    vocab['PAD'] = 0  # 填充符
    vocab['UNK'] = 1  # 未知词
    for _, comment in comments_data:
        # 使用jieba分词
        words = list(jieba.cut(comment.strip()))
        for word in words:
            if word not in vocab:
                vocab[word] = len(vocab)  # 为新词分配索引
    torch.save(vocab, 'comments_vocab_3_jieba.pth')  # 保存词汇表-jieba分词
    return vocab

def get_vocab(vocab_type,comments_data):
    """
    获取词汇表
    :param vocab_type: 分词工具类型
    :return: 词汇表字典
    """
    if vocab_type == 'jieba':
        # 尝试加载jieba分词的词汇表
        if os.path.exists('/kaggle/output/comments_vocab_3_jieba.pth'):
            return torch.load('/kaggle/output/comments_vocab_3_jieba.pth')
        return build_vocab_jieba(comments_data)  # 如果不存在，则构建新的词汇表
    else:
        # 尝试加载原生分词的词汇表
        if os.path.exists('/kaggle/output/comments_vocab_3_native.pth'):
            return torch.load('/kaggle/output/comments_vocab_3_native.pth')
        return build_vocab_native(comments_data)

class CommentsClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes):
        super(CommentsClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)  # padding_idx=0
        self.rnn = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids):
        # input_ids: (batch_size, seq_len)
        embedded = self.embedding(input_ids)  #embedded shape: (batch_size, seq_len, embedding_dim)
        output, (hidden, _) = self.rnn(embedded) # output shape: (batch_size, seq_len, hidden_size)
        output = self.fc(output[:, -1, :])  # 取最后一个时间步的输出 shape: (batch_size, num_classes)
        return output

In [9]:
# 设备
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# 加载评论数据
comment_file = '/kaggle/input/cleaned-dmsc-csv/cleaned_DMSC.csv'
df = pd.read_csv(comment_file)
comments_data = df.values.tolist() # 将DataFrame转换为列表

In [10]:
# 构建词汇表
build_vocab_native(comments_data)  # 构建原生分词的词汇表

{'PAD': 0,
 'UNK': 1,
 '非': 2,
 '常': 3,
 '失': 4,
 '望': 5,
 '，': 6,
 '剧': 7,
 '本': 8,
 '完': 9,
 '全': 10,
 '敷': 11,
 '衍': 12,
 '了': 13,
 '事': 14,
 '主': 15,
 '线': 16,
 '情': 17,
 '没': 18,
 '突': 19,
 '破': 20,
 '大': 21,
 '家': 22,
 '可': 23,
 '以': 24,
 '理': 25,
 '解': 26,
 '所': 27,
 '有': 28,
 '的': 29,
 '人': 30,
 '物': 31,
 '都': 32,
 '缺': 33,
 '乏': 34,
 '动': 35,
 '机': 36,
 '正': 37,
 '邪': 38,
 '之': 39,
 '间': 40,
 '、': 41,
 '妇': 42,
 '联': 43,
 '内': 44,
 '部': 45,
 '什': 46,
 '么': 47,
 '火': 48,
 '花': 49,
 '。': 50,
 '团': 51,
 '结': 52,
 '-': 53,
 '分': 54,
 '裂': 55,
 '三': 56,
 '段': 57,
 '式': 58,
 '虽': 59,
 '然': 60,
 '老': 61,
 '套': 62,
 '但': 63,
 '其': 64,
 '实': 65,
 '也': 66,
 '利': 67,
 '用': 68,
 '积': 69,
 '攒': 70,
 '下': 71,
 '来': 72,
 '形': 73,
 '象': 74,
 '魅': 75,
 '力': 76,
 '搞': 77,
 '出': 78,
 '意': 79,
 '思': 80,
 '写': 81,
 '得': 82,
 '肤': 83,
 '浅': 84,
 '平': 85,
 '面': 86,
 '场': 87,
 '上': 88,
 '调': 89,
 '度': 90,
 '混': 91,
 '乱': 92,
 '呆': 93,
 '板': 94,
 '满': 95,
 '屏': 96,
 '铁': 97,
 '甲': 98,
 '审': 99,
 '美': 

In [11]:
build_vocab_jieba(comments_data)  # 构建jieba分词

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.614 seconds.
Prefix dict has been built successfully.


{'PAD': 0,
 'UNK': 1,
 '非常': 2,
 '失望': 3,
 '，': 4,
 '剧本': 5,
 '完全': 6,
 '敷衍了事': 7,
 '主线': 8,
 '剧情': 9,
 '没': 10,
 '突破': 11,
 '大家': 12,
 '可以': 13,
 '理解': 14,
 '可': 15,
 '所有': 16,
 '的': 17,
 '人物': 18,
 '都': 19,
 '缺乏': 20,
 '动机': 21,
 '正邪': 22,
 '之间': 23,
 '、': 24,
 '妇联': 25,
 '内部': 26,
 '没什么': 27,
 '火花': 28,
 '。': 29,
 '团结': 30,
 '-': 31,
 '分裂': 32,
 '三段式': 33,
 '虽然': 34,
 '老套': 35,
 '但': 36,
 '其实': 37,
 '也': 38,
 '利用': 39,
 '积攒': 40,
 '下来': 41,
 '形象': 42,
 '魅力': 43,
 '搞': 44,
 '出': 45,
 '意思': 46,
 '写得': 47,
 '肤浅': 48,
 '平面': 49,
 '场面': 50,
 '上': 51,
 '调度': 52,
 '混乱': 53,
 '呆板': 54,
 '满屏': 55,
 '铁甲': 56,
 '审美疲劳': 57,
 '只有': 58,
 '笑': 59,
 '点算': 60,
 '得': 61,
 '差强人意': 62,
 '2015': 63,
 '年度': 64,
 '最': 65,
 '作品': 66,
 '以为': 67,
 '面面俱到': 68,
 '实则': 69,
 '画蛇添足': 70,
 '；': 71,
 '主题深刻': 72,
 '老调重弹': 73,
 '推陈出新': 74,
 '俗不可耐': 75,
 '很': 76,
 'high': 77,
 '劲': 78,
 '不足': 79,
 '气': 80,
 '！': 81,
 '一集': 82,
 '趣味': 83,
 '全无': 84,
 '这集': 85,
 '点': 86,
 '明显': 87,
 '刻意': 88,
 '到': 89,
 '心虚': 90,
 '全片':

In [12]:
# 加载词汇表
vocab_jieba = get_vocab('jieba',comments_data)
print('jieba词汇表大小:', len(vocab_jieba))
vocab_native = get_vocab('native',comments_data)
print('原生分词词汇表大小:', len(vocab_native))

jieba词汇表大小: 287571
原生分词词汇表大小: 9311


In [17]:
# 划分训练集和测试集
train_data, test_data = train_test_split(comments_data, test_size=0.2, random_state=42)

# 自定义训练数据批次加载数据处理函数
# collate_fn函数用于将每个batch的数据转换为tensor
def collate_fn(batch,vocab_type):
    comments,labels = [],[]  # 分别存储评论和标签
    for label, comment in batch:
        if vocab_type == 'jieba':
            # 使用jieba分词
            words = list(jieba.cut(comment))
            vocab = vocab_jieba
        else:
            # 原生分词方法
            words = list(comment.strip())
            vocab = vocab_native
        comments.append(torch.tensor([vocab.get(word, vocab['UNK']) for word in words]))
        labels.append(label)
    
    # 将评论和标签转换为tensor
    commt = pad_sequence(comments, batch_first=True, padding_value=vocab['PAD']) # 填充为相同长度
    labels = torch.tensor(labels)
    return commt, labels

embedding_dim = 100
hidden_size = 128
num_classes = 2

In [18]:
def train_model(vocab_type):
    print(f"Training model with {vocab_type} vocabulary...")
    vocab_type = 'jieba' if vocab_type == 'jieba' else 'native'
    train_dataloader = DataLoader(train_data, batch_size=512, shuffle=True, collate_fn=lambda x: collate_fn(x, vocab_type=vocab_type))
    if vocab_type == 'jieba':
        vocab = vocab_jieba  
    else:
        vocab = vocab_native

    # 构建模型
    model = CommentsClassifier(len(vocab), embedding_dim, hidden_size, num_classes).to(device)
    # 定义损失函数和优化器
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    
    # 模型训练
    num_epochs = 2
    for epoch in range(num_epochs):
        for i, (commt, labels) in enumerate(train_dataloader):
            commt = commt.to(device) # 评论
            labels = labels.to(device) # 标签
            # 前向传播
            outputs = model(commt)
            # 计算损失
            loss = criterion(outputs, labels)
            # 反向传播和优化
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # 打印训练信息
            if (i+1) % 1000 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_dataloader)}], Loss: {loss.item():.4f}')
    # 保存模型
    torch.save(model.state_dict(), 'comments_classifier_3_'+vocab_type+'.pth') #权重和偏置

In [22]:
def eval_model(vocab_type):
    vocab_type = 'jieba' if vocab_type == 'jieba' else 'native'
    test_dataloader = DataLoader(test_data, batch_size=512, shuffle=False, collate_fn=lambda x: collate_fn(x, vocab_type=vocab_type))
    if vocab_type == 'jieba':
        vocab = vocab_jieba
    else:
        vocab = vocab_native

    # 加载模型
    model = CommentsClassifier(len(vocab), embedding_dim, hidden_size, num_classes).to(device)
    model.load_state_dict(torch.load('/kaggle/working/comments_classifier_3_'+vocab_type+'.pth'))
    
    # 模型评估
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for cmt, lbl in test_dataloader:
            cmt = cmt.to(device)
            lbl = lbl.to(device)
            outputs = model(cmt)
            _, predicted = torch.max(outputs.data, 1)
            total += lbl.size(0)
            correct += (predicted == lbl).sum().item()
        accuracy = 100 * correct / total
        print(f'Accuracy of the model on the {vocab_type} test set: {accuracy:.2f}%')

In [20]:
# 模型训练 & 评估
train_model('jieba')

Training model with jieba vocabulary...
Epoch [1/2], Step [1000/2569], Loss: 0.4469
Epoch [1/2], Step [2000/2569], Loss: 0.3083
Epoch [2/2], Step [1000/2569], Loss: 0.2358
Epoch [2/2], Step [2000/2569], Loss: 0.2526


In [23]:
eval_model('jieba')

Accuracy of the model on the jieba test set: 90.33%


In [24]:
train_model('native')

Training model with native vocabulary...
Epoch [1/2], Step [1000/2569], Loss: 0.4595
Epoch [1/2], Step [2000/2569], Loss: 0.2098
Epoch [2/2], Step [1000/2569], Loss: 0.2585
Epoch [2/2], Step [2000/2569], Loss: 0.1929


In [25]:
eval_model('native')

Accuracy of the model on the native test set: 91.35%


In [26]:
def test_model(texts_test,vocab_type):
    vocab_type = 'jieba' if vocab_type == 'jieba' else 'native'
    if vocab_type == 'jieba':
        vocab = vocab_jieba
    else:
        vocab = vocab_native
    text_test_index = []
    for text in texts_test:
        idx_seq = [vocab.get(word,vocab['UNK']) for word in jieba.cut(text)]
        text_test_index.append(idx_seq)
    # 填充序列
    text_test_index = pad_sequence([torch.tensor(idx_seq) for idx_seq in text_test_index], batch_first=True, padding_value=vocab['PAD'])
    # 加载模型
    model = CommentsClassifier(len(vocab), embedding_dim, hidden_size, num_classes).to(device)
    model.load_state_dict(torch.load('/kaggle/working/comments_classifier_3_'+vocab_type+'.pth'))
    # 推理
    with torch.no_grad():
        model.eval()
        text_test_index = text_test_index.to(device)
        logits = model(text_test_index)
        _, predicted = torch.max(logits, 1)
    # 打印预测结果
    for text, pred in zip(texts_test, predicted):
        sentiment = '正面' if pred.item() == 1 else '负面'
        print(f'vocab_type={vocab_type} ,评论: "{text}" 的预测情感为: {sentiment}')

In [28]:
# 测试模型
texts_test = ["我喜欢这部电影", "太难看了"]
test_model(texts_test,'jieba')
test_model(texts_test,'native')

vocab_type=jieba ,评论: "我喜欢这部电影" 的预测情感为: 负面
vocab_type=jieba ,评论: "太难看了" 的预测情感为: 正面
vocab_type=native ,评论: "我喜欢这部电影" 的预测情感为: 负面
vocab_type=native ,评论: "太难看了" 的预测情感为: 负面
