In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

In [19]:
data_path = '/Users/liuyifeng/Desktop/信息系统/全国.csv'
data = pd.read_csv(data_path)

In [20]:
# 为评分定义分类函数
def categorize_rating(rating):
    if rating >= 4:
        return 'positive'
    elif rating == 3:
        return 'neutral'
    else:
        return 'negative'

In [21]:
# 首先确保评分列是整数或浮点数类型
data['评分'] = pd.to_numeric(data['评分'], errors='coerce')  # 尝试转换为数字，无法转换的变为NaN

# 然后应用分类函数
data['评分分类'] = data['评分'].apply(categorize_rating)


In [22]:
# 确保所有评论内容都是字符串类型
data['评论内容'] = data['评论内容'].fillna('')  # 将NaN值替换为空字符串
data['评论内容'] = data['评论内容'].astype(str)  # 确保所有内容都转换为字符串类型
# 文本分词
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data['评论内容'])
sequences = tokenizer.texts_to_sequences(data['评论内容'])
data_seq = pad_sequences(sequences, maxlen=100)

In [23]:
# 编码标签
label_encoder = LabelEncoder()
data_labels = label_encoder.fit_transform(data['评分分类'])
data_labels = to_categorical(data_labels)


In [24]:
# 分割数据集
X_train, X_test, y_train, y_test = train_test_split(data_seq, data_labels, test_size=0.2, random_state=42)


In [25]:
# 打印样本数量
print(f"总样本数量: {len(data_seq)}")
print(f"训练集样本数量: {len(X_train)}")
print(f"测试集样本数量: {len(X_test)}")

总样本数量: 1030506
训练集样本数量: 824404
测试集样本数量: 206102


In [26]:
# 构建模型
model = Sequential()
model.add(Embedding(5000, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [27]:
# 训练模型
model.fit(X_train, y_train, batch_size=64, epochs=10, validation_data=(X_test, y_test))


Epoch 1/10
[1m12882/12882[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1150s[0m 89ms/step - accuracy: 0.9245 - loss: 0.3116 - val_accuracy: 0.9251 - val_loss: 0.3055
Epoch 2/10
[1m12882/12882[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1109s[0m 86ms/step - accuracy: 0.9260 - loss: 0.3027 - val_accuracy: 0.9252 - val_loss: 0.3052
Epoch 3/10
[1m12882/12882[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1080s[0m 84ms/step - accuracy: 0.9260 - loss: 0.3024 - val_accuracy: 0.9253 - val_loss: 0.3056
Epoch 4/10
[1m12882/12882[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1095s[0m 85ms/step - accuracy: 0.9261 - loss: 0.3018 - val_accuracy: 0.9252 - val_loss: 0.3058
Epoch 5/10
[1m12882/12882[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1091s[0m 85ms/step - accuracy: 0.9260 - loss: 0.3019 - val_accuracy: 0.9252 - val_loss: 0.3056
Epoch 6/10
[1m12882/12882[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1126s[0m 87ms/step - accuracy: 0.9267 - loss: 0.2996 - val_accuracy: 0.9251

In [None]:
# 模型评估
predictions = model.predict(X_test)
predicted_classes = np.argmax(predictions, axis=1)
true_classes = np.argmax(y_test, axis=1)

precision = precision_score(true_classes, predicted_classes, average='macro')
recall = recall_score(true_classes, predicted_classes, average='macro')
f1 = f1_score(true_classes, predicted_classes, average='macro')


[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
print(f'准确率: {model.evaluate(X_test, y_test, verbose=0)[1] * 100:.2f}%')
print(f'精确度: {precision:.2f}')
print(f'召回率: {recall:.2f}')
print(f'F1 分数: {f1:.2f}')

准确率: 99.23%
精确度: 0.33
召回率: 0.33
F1 分数: 0.33
