# 任务：LSTM意图分类

LSTM（Long Short-Term Memory）是一种特殊的循环神经网络，在文本分类任务中表现良好。LSTM可以通过对输入文本进行序列建模来捕捉文本中的长期依赖关系，并对文本进行分类。

- 步骤1：搭建LSTM模型，具体结构为Embedding层、LSTM层和全连接层；
    - Embedding层：将输入的文本转换为词向量表示，降低维度并保留语义信息；
    - LSTM层：使用长短期记忆单元处理词向量序列，学习文本中的上下文信息，并输出隐藏状态；
    -全连接层：将LSTM层的最后一个隐藏状态作为特征输入，使用softmax函数输出每个类别的概率。
- 步骤2：使用任务3中的词向量初始化Embedding层
- 步骤3：LSTM模型的训练，验证和预测
- 步骤4：通过上述步骤，请回答下面问题
    - Embedding层的精度与初始化方式相关吗？
    - LSTM模型精度与文本最大长度是否相关？

In [36]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import jieba

# 一. 数据处理
## 1.1 数据加载

In [6]:
import pandas as pd

data_dir = 'https://mirror.coggle.club/dataset/coggle-competition/'
train_data = pd.read_csv(data_dir + 'intent-classify/train.csv', sep='\t', header=None)
test_data = pd.read_csv(data_dir + 'intent-classify/test.csv', sep='\t', header=None)

In [7]:
print(train_data.head(10))

                          0              1
0         还有双鸭山到淮阴的汽车票吗13号的   Travel-Query
1                   从这里怎么回家   Travel-Query
2          随便播放一首专辑阁楼里的佛里的歌     Music-Play
3                 给看一下墓王之王嘛  FilmTele-Play
4     我想看挑战两把s686打突变团竞的游戏视频     Video-Play
5       我想看和平精英上战神必备技巧的游戏视频     Video-Play
6  2019年古装爱情电视剧小女花不弃的花絮播放一下     Video-Play
7        找一个2004年的推理剧给我看一会呢  FilmTele-Play
8            自驾游去深圳都经过那些地方啊   Travel-Query
9        给我转播今天的女子双打乒乓球比赛现场     Video-Play


In [195]:
# label2idx, idx2label
labels = train_data[1].tolist()
all_labels = set(labels)
print(all_labels)

labels2idx = {val: key for key, val in enumerate(all_labels)}
idx2labels = {key: val for key, val in enumerate(all_labels)}

print(labels2idx)
print(idx2labels)

{'Other', 'Video-Play', 'Travel-Query', 'Audio-Play', 'TVProgram-Play', 'Music-Play', 'Radio-Listen', 'Alarm-Update', 'FilmTele-Play', 'Calendar-Query', 'HomeAppliance-Control', 'Weather-Query'}
{'Other': 0, 'Video-Play': 1, 'Travel-Query': 2, 'Audio-Play': 3, 'TVProgram-Play': 4, 'Music-Play': 5, 'Radio-Listen': 6, 'Alarm-Update': 7, 'FilmTele-Play': 8, 'Calendar-Query': 9, 'HomeAppliance-Control': 10, 'Weather-Query': 11}
{0: 'Other', 1: 'Video-Play', 2: 'Travel-Query', 3: 'Audio-Play', 4: 'TVProgram-Play', 5: 'Music-Play', 6: 'Radio-Listen', 7: 'Alarm-Update', 8: 'FilmTele-Play', 9: 'Calendar-Query', 10: 'HomeAppliance-Control', 11: 'Weather-Query'}


## 1.2 构造Dataset类

In [180]:
class CustomDataset(Dataset):
    def __init__(self, texts, labels, vocab, labels2idx, do_train=True):
        self.texts = texts
        self.labels = labels
        self.vocab = vocab
        self.labels2idx = labels2idx
        self.do_train = do_train

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        input_ids = self.text_to_index(text)
        if self.do_train:
            label = self.labels2idx[self.labels[idx]]
            return input_ids, label
        else:
            return input_ids

    def text_to_index(self, text):
        return [self.vocab[token] for token in jieba.lcut(text)]

In [181]:
x_train = train_data[0].tolist()
y_train = train_data[1]
x_test = test_data[0].tolist()


# 构建词汇表
vocab = {}
for text in x_train + x_test:
    for token in jieba.lcut(text):
        if token not in vocab:
            vocab[token] = len(vocab)
            
# print(vocab)

In [182]:
train_dataset = CustomDataset(x_train, y_train, vocab, labels2idx, do_train=True)
test_dataset = CustomDataset(x_test, None, vocab, labels2idx, do_train=False)

## 1.3 封装DataLoader

In [204]:
# 定义 DataLoader
# 定义 DataLoader
def collate_fn(batch):
    batch_input_ids, batch_labels = zip(*batch)
    max_length = max(len(input_ids) for input_ids in batch_input_ids)
    padded_input_ids = [input_ids + [0] * (max_length - len(input_ids)) for input_ids in batch_input_ids]
    return torch.tensor(padded_input_ids), torch.tensor(batch_labels)


def collate_fn_test(batch):
    batch_input_ids = batch
    max_length = max(len(input_ids) for input_ids in batch_input_ids)
    padded_input_ids = [input_ids + [0] * (max_length - len(input_ids)) for input_ids in batch_input_ids]
    return torch.tensor(padded_input_ids)


train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, collate_fn=collate_fn_test)

In [205]:
print(train_dataset[1])
print(test_dataset[1])

([9, 10, 11, 12], 2)
[21, 27, 123, 4351, 3241, 4, 193]


# 2. 搭建LSTM模型

In [185]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = lstm_out[:, -1, :]  # 使用最后一个时刻的隐藏状态作为输出
        
        output = self.fc(lstm_out)
        return output

In [196]:
# 实例化模型
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 128
output_dim = len(set(all_labels))  # 输出类别数量
print(output_dim)
print(all_labels)
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim)

12
{'Other', 'Video-Play', 'Travel-Query', 'Audio-Play', 'TVProgram-Play', 'Music-Play', 'Radio-Listen', 'Alarm-Update', 'FilmTele-Play', 'Calendar-Query', 'HomeAppliance-Control', 'Weather-Query'}


In [197]:
# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

## 3. 训练模型

In [199]:
# 训练模型
num_epochs = 1
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(torch.tensor(inputs))
#         print(outputs)
#         print(labels)
        
        loss = criterion(outputs, labels)
        
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {running_loss / len(train_loader)}")

  outputs = model(torch.tensor(inputs))


Epoch 1, Loss: 0.2727646979352661


## 4. 模型预测

In [211]:
model.eval()
y_pred = []
with torch.no_grad():
    for inputs in test_loader:
        inputs = torch.tensor(inputs)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        y_pred.extend(predicted.tolist())

df = pd.DataFrame(zip(test_data[0], y_pred))
df.columns = ['text', 'y_pred']
print(df.head(10))
df.to_csv('lstm.csv', index=False)

  inputs = torch.tensor(inputs)


                       text  y_pred
0              回放CCTV2的消费主张       8
1                 给我打开玩具房的灯      10
2            循环播放赵本山的小品相亲来听       3
3  15号上午10点带孩子去海洋馆的行程帮我制定下。       7
4                把智能扫地机器人关掉      10
5               帮我续播16集摩天大楼       8
6             放美国电影史密斯夫妇给我看       8
7                放刘禹锡的浪淘沙来听       5
8             查看6月6日是农历几月几号       9
9           放一个讲述美食的美国纪录片来看       1
