In [3]:
import pandas as pd


In [4]:
data_path = '/Users/liuyifeng/Desktop/信息系统/北京.csv'
data = pd.read_csv(data_path)

In [5]:
data = data[['评论内容', '评分']]

# 检查缺失值
missing_values = data.isnull().sum()

# 查看评分分布，以决定如何将评分转换为情感标签
rating_distribution = data['评分'].value_counts()

missing_values, rating_distribution


(评论内容    0
 评分      0
 dtype: int64,
 评分
 5    3612
 4     217
 3      45
 2       3
 1       1
 Name: count, dtype: int64)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# 将评分转换为情感标签：正面(1)和负面(0)
data['情感标签'] = data['评分'].apply(lambda x: 1 if x > 3 else 0)

# 分离特征和标签
X = data['评论内容']
y = data['情感标签']

# 文本向量化
tfidf_vectorizer = TfidfVectorizer(max_features=1000, min_df=5, max_df=0.9, stop_words=['的', '了', '在', '是', '我'])
X_tfidf = tfidf_vectorizer.fit_transform(X)

# 分割数据集
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# 检查向量化后的特征维度和分割的数据集大小
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((3102, 279), (776, 279), (3102,), (776,))

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# 初始化模型
lr_model = LogisticRegression(random_state=42)
svm_model = SVC(random_state=42)
rf_model = RandomForestClassifier(random_state=42)

# 训练模型
lr_model.fit(X_train, y_train)
svm_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

# 预测测试集
lr_predictions = lr_model.predict(X_test)
svm_predictions = svm_model.predict(X_test)
rf_predictions = rf_model.predict(X_test)

# 评估模型
lr_accuracy = accuracy_score(y_test, lr_predictions)
lr_f1 = f1_score(y_test, lr_predictions)

svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_f1 = f1_score(y_test, svm_predictions)

rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_f1 = f1_score(y_test, rf_predictions)

(lr_accuracy, lr_f1), (svm_accuracy, svm_f1), (rf_accuracy, rf_f1)


((0.9922680412371134, 0.9961190168175937),
 (0.9922680412371134, 0.9961190168175937),
 (0.9922680412371134, 0.9961190168175937))