In [1]:
import tensorflow as tf
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
import numpy as np
import re
import warnings
warnings.filterwarnings("ignore")
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""


# 加载数据并进行预处理
df = pd.read_csv('data.csv')
df.dropna(inplace=True)  # 删除包含NaN的行
df['label'] = LabelEncoder().fit_transform(df['label'])  # 将标签编码为整数

# 数据编码（这里我们不再使用预训练的分词器，改为使用简单的词索引编码）
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df['phrase'])
encoded_texts = tokenizer.texts_to_sequences(df['phrase'])
max_length = 64  # 定义序列的最大长度
input_ids = tf.keras.preprocessing.sequence.pad_sequences(encoded_texts, maxlen=max_length, padding='post')

# 划分数据集为训练集和测试集
train_input_ids, test_input_ids, train_y, test_y = train_test_split(
    input_ids, df['label'].values, test_size=0.2, random_state=42)

# 创建训练数据集和测试数据集
train_dataset = tf.data.Dataset.from_tensor_slices((train_input_ids, train_y)).batch(128)
test_dataset = tf.data.Dataset.from_tensor_slices((test_input_ids, test_y)).batch(128)

# 定义类别数量和模型输入层
num_classes = 3
input_layer = Input(shape=(max_length,), dtype='int32')

# Embedding层和RNN层
embedded = Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=128)(input_layer)
rnn_output = SimpleRNN(64)(embedded)

# 输出层
outputs = Dense(units=num_classes, activation="softmax")(rnn_output)
model = Model(inputs=input_layer, outputs=outputs)

# 编译模型
adam = Adam(learning_rate=2e-5, epsilon=1e-08)
model.compile(loss="sparse_categorical_crossentropy", metrics=["accuracy"], optimizer=adam)

# 训练模型
model.fit(train_dataset, epochs=15)

# 保存模型
model.save('my_rnn_model.h5')

# 评估模型
results = model.evaluate(test_dataset)
print(f"Test Loss: {results[0]}, Test Accuracy: {results[1]}")

# 预测和计算 F1-score
predictions = model.predict(test_dataset)
predicted_labels = np.argmax(predictions, axis=1)
f1 = f1_score(test_y, predicted_labels, average='macro')
print(f"F1 Score: {f1}")


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test Loss: 0.45507779717445374, Test Accuracy: 0.8469404578208923
F1 Score: 0.8461400938610723
