In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from transformers import RobertaTokenizer
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

# Repeated-run experiment: train a 3-layer LSTM text classifier on RoBERTa
# token ids several times from scratch and report the mean accuracy and
# macro-F1 on a fixed held-out split.

# Experiment configuration. Naming these once keeps the result buffers, the
# tokenizer padding length and the model input shape from drifting apart.
NUM_RUNS = 10    # independent train/evaluate repetitions to average over
MAX_LEN = 64     # token sequence length after padding/truncation
BATCH_SIZE = 32  # mini-batch size used for training
EPOCHS = 15      # training epochs per run

# Per-run result buffers
accuracies = np.zeros(NUM_RUNS)
f1_scores = np.zeros(NUM_RUNS)

# Load the data and preprocess it
df = pd.read_csv('data.csv')
df.dropna(inplace=True)  # drop rows that contain NaN in any column
df['label'] = LabelEncoder().fit_transform(df['label'])  # encode labels as integers

# Encode the text with RobertaTokenizer; every phrase is padded/truncated to
# exactly MAX_LEN token ids.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
encoded = tokenizer(
    df['phrase'].tolist(),
    padding='max_length',
    truncation=True,
    max_length=MAX_LEN,
    return_tensors="tf",
)
# NOTE(review): only the token ids are used below, so the LSTMs also read the
# padding positions. Presumably acceptable for short phrases; consider
# masking the padding if results look weak - TODO confirm.
input_ids = encoded['input_ids'].numpy()  # convert the encoded input_ids to a NumPy array

# Split into train and test sets (fixed seed, so all runs share the same split)
train_input_ids, test_input_ids, train_labels, test_labels = train_test_split(
    input_ids, df['label'].values, test_size=0.2, random_state=42
)

# Model hyper-parameters
vocab_size = tokenizer.vocab_size
embedding_dim = 64
lstm_units = 64
num_classes = df['label'].nunique()

for i in range(NUM_RUNS):
    # Drop the Keras global state left behind by the previous run; without
    # this, building many models in a loop keeps consuming more memory.
    tf.keras.backend.clear_session()

    # Build the model: embedding -> 3 stacked LSTMs -> 2 dense layers -> softmax
    input_layer = Input(shape=(MAX_LEN,), dtype='int32', name='input_ids')
    # `input_length` is omitted on purpose: the Input layer already fixes the
    # sequence length, and the argument is deprecated in Keras 3.
    embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_layer)
    lstm_layer1 = LSTM(lstm_units, return_sequences=True)(embedding_layer)
    lstm_layer2 = LSTM(lstm_units, return_sequences=True)(lstm_layer1)
    lstm_layer3 = LSTM(lstm_units)(lstm_layer2)  # last LSTM returns only the final state
    dense_layer1 = Dense(128, activation='relu')(lstm_layer3)  # first fully connected layer
    dense_layer2 = Dense(64, activation='relu')(dense_layer1)  # second fully connected layer
    output_layer = Dense(num_classes, activation='softmax')(dense_layer2)
    model = Model(inputs=input_layer, outputs=output_layer)

    # Compile the model; labels are integer class ids, hence the sparse loss
    model.compile(optimizer=Adam(learning_rate=2e-5), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Prepare the training data. `fit` does not shuffle a tf.data.Dataset
    # itself, so shuffle here (full-size buffer, reshuffled every epoch);
    # otherwise every epoch would see identical batches.
    train_dataset = (
        tf.data.Dataset.from_tensor_slices((train_input_ids, train_labels))
        .shuffle(buffer_size=len(train_input_ids))
        .batch(BATCH_SIZE)
    )

    # Train the model
    model.fit(train_dataset, epochs=EPOCHS)

    # Predict on the held-out set and take the most probable class per sample
    test_predictions = model.predict(test_input_ids)
    predicted_labels = np.argmax(test_predictions, axis=1)

    # Record accuracy and macro-averaged F1 for this run
    accuracies[i] = accuracy_score(test_labels, predicted_labels)
    f1_scores[i] = f1_score(test_labels, predicted_labels, average='macro')

# Average accuracy and F1 over all runs
average_accuracy = np.mean(accuracies)
average_f1_score = np.mean(f1_scores)

# Report model performance
print(f"average_accuracy: {average_accuracy:.4f}")
print(f"average_f1_score: {average_f1_score:.4f}")




Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/1