In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import f1_score, roc_auc_score, precision_score
import numpy as np

In [3]:
data_path = '/Users/liuyifeng/Desktop/信息系统/湖北.csv'
data = pd.read_csv(data_path)

In [4]:
# 为评分定义分类函数
def categorize_rating(rating):
    if rating >= 3:
        return 'positive'
    else:
        return 'negative'


In [5]:
# 首先确保评分列是整数或浮点数类型
data['评分'] = pd.to_numeric(data['评分'], errors='coerce')  # 尝试转换为数字，无法转换的变为NaN

# 然后应用分类函数
data['评分分类'] = data['评分'].apply(categorize_rating)


In [6]:
# 确保所有评论内容都是字符串类型
data['评论内容'] = data['评论内容'].fillna('')  # 将NaN值替换为空字符串
data['评论内容'] = data['评论内容'].astype(str)  # 确保所有内容都转换为字符串类型
# 文本分词
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data['评论内容'])
sequences = tokenizer.texts_to_sequences(data['评论内容'])
data_seq = pad_sequences(sequences, maxlen=100)

In [7]:
# 编码标签
label_encoder = LabelEncoder()
data_labels = label_encoder.fit_transform(data['评分分类'])
data_labels = to_categorical(data_labels)


In [8]:
# 分割数据集
X_train, X_test, y_train, y_test = train_test_split(data_seq, data_labels, test_size=0.2, random_state=42)


In [9]:
# 打印样本数量
print(f"总样本数量: {len(data_seq)}")
print(f"训练集样本数量: {len(X_train)}")
print(f"测试集样本数量: {len(X_test)}")

总样本数量: 51654
训练集样本数量: 41323
测试集样本数量: 10331


In [10]:
# 构建模型
model = Sequential()
model.add(Embedding(5000, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [11]:
# 训练模型
model.fit(X_train, y_train, batch_size=64, epochs=10, validation_data=(X_test, y_test))


Epoch 1/10
[1m646/646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 84ms/step - accuracy: 0.9762 - loss: 0.1185 - val_accuracy: 0.9804 - val_loss: 0.0954
Epoch 2/10
[1m646/646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 84ms/step - accuracy: 0.9820 - loss: 0.0900 - val_accuracy: 0.9804 - val_loss: 0.0953
Epoch 3/10
[1m646/646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 86ms/step - accuracy: 0.9813 - loss: 0.0908 - val_accuracy: 0.9805 - val_loss: 0.0964
Epoch 4/10
[1m646/646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 87ms/step - accuracy: 0.9816 - loss: 0.0867 - val_accuracy: 0.9807 - val_loss: 0.0976
Epoch 5/10
[1m646/646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 85ms/step - accuracy: 0.9821 - loss: 0.0861 - val_accuracy: 0.9806 - val_loss: 0.1030
Epoch 6/10
[1m646/646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 85ms/step - accuracy: 0.9815 - loss: 0.0880 - val_accuracy: 0.9807 - val_loss: 0.1010
Epoch 7/10
[1m6

<keras.src.callbacks.history.History at 0x2a2223430>

In [12]:
# 模型评估
predictions = model.predict(X_test)
predicted_classes = np.argmax(predictions, axis=1)
true_classes = np.argmax(y_test, axis=1)

precision = precision_score(true_classes, predicted_classes, average='macro')
f1 = f1_score(true_classes, predicted_classes, average='macro')
# 对于二分类问题，ROC-AUC 计算
predictions_proba = predictions[:, 1]
auc_roc = roc_auc_score(y_test[:, 1], predictions_proba)



[1m323/323[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step


In [14]:
print(f'准确率: {model.evaluate(X_test, y_test, verbose=0)[1] * 100:.2f}%')
print(f'精确度: {precision:.2f}')
print(f'AUC-ROC: {auc_roc:.2f}')


准确率: 98.06%
精确度: 0.87
AUC-ROC: 0.54
