In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the raw messages from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='test_sentiment.csv')

# Print a header, then the column dtypes and non-null counts so that
# missing values are visible before any feature engineering.
print('数据基本信息：')
df.info()

In [None]:
# Character count of each message body; the .str accessor leaves missing
# bodies as NaN instead of raising.
df['message_length'] = (
    df['body']
    .str
    .len()
)

In [None]:
# Number of whitespace-separated words in each message body.
# The vectorised .str accessor is used instead of apply(lambda x: x.split()):
# a missing body (NaN is a float) would raise AttributeError in the lambda,
# whereas .str.split().str.len() propagates NaN, matching how
# message_length is computed. NaN values are skipped by the later mean().
df['word_count'] = df['body'].str.split().str.len()

In [None]:
# Parse the date column into datetimes; values that cannot be parsed
# become NaT (errors='coerce') rather than raising.
df['date'] = pd.to_datetime(
    df['date'],
    errors='coerce',
)

In [None]:
# Per (calendar month, sender) features:
#   message_frequency      - count of non-null message bodies
#   average_message_length - mean character count
#   average_word_count     - mean word count
monthly_features = (
    df.groupby([df['date'].dt.to_period('M'), 'from'])
    .agg(**{
        'message_frequency': ('body', 'count'),
        'average_message_length': ('message_length', 'mean'),
        'average_word_count': ('word_count', 'mean'),
    })
    .reset_index()
)

In [None]:
# Convert each message's sentiment label into a numeric score.
# Labels that are not keys of the mapping become NaN (Series.map behaviour).
df['score'] = df['sentiment'].map(
    {
        'Positive': 1,
        'Negative': -1,
        'Neutral': 0,
    }
)

In [None]:
# Total sentiment score per (calendar month, sender).
monthly_score = (
    df.groupby([df['date'].dt.to_period('M'), 'from'])['score']
    .sum()
    .reset_index()
)

In [None]:
# Join the monthly scores with the monthly features on (month, sender).
monthly_data = pd.merge(
    left=monthly_score,
    right=monthly_features,
    how='inner',
    on=['date', 'from'],
)

In [None]:
# Feature matrix: the three engineered monthly features.
X = monthly_data.loc[
    :,
    ['message_frequency', 'average_message_length', 'average_word_count'],
]
# Regression target: the monthly sentiment score.
y = monthly_data.loc[:, 'score']

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Fit an ordinary least-squares linear regression on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Predict the monthly score for the held-out rows.
y_pred = model.predict(X=X_test)

In [None]:
# Evaluate on the test split: mean squared error and coefficient of
# determination (R^2).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Report the evaluation metrics followed by the fitted coefficients
# (one coefficient per feature column, in the column order of X).
for label, value in (
    ('均方误差：', mse),
    ('决定系数：', r2),
    ('模型系数：', model.coef_),
):
    print(label, value)

In [None]:
# Scatter predicted against true test-set scores.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)

# Dashed black y = x reference line spanning the range of the true values;
# points on this line are perfect predictions.
bounds = [y_test.min(), y_test.max()]
ax.plot(bounds, bounds, 'k--', lw=2)

ax.set_xlabel('True value')
ax.set_ylabel('Predicted value')
ax.set_title('The prediction results of the linear regression model')
plt.show()