In [30]:
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

class R2Callback(tf.keras.callbacks.Callback):
    def __init__(self, validation_data):
        super().__init__()
        self.validation_data = validation_data

    def on_epoch_end(self, epoch, logs=None):
        # 在每个 epoch 结束时计算 R² 值
        y_pred = self.model.predict(self.validation_data[0])
        r2 = r2_score(self.validation_data[1], y_pred)
        logs['val_r2'] = r2
        print(f'Epoch {epoch+1}: val_r2 = {r2}')

df = pd.read_csv('filtered_data_final.csv')

df.drop(columns=['id'], inplace=True)


# 将 num_broadcasters 列中的 '5+' 替换为 '5'，并转换为 int64 类型
df['num_broadcasters'] = df['num_broadcasters'].replace('5+', '5').astype(int)

# 将 match_datetime 列转换为 datetime 类型
df['match_datetime'] = pd.to_datetime(df['match_datetime'])

# 按 match_datetime 排序
df.sort_values('match_datetime', inplace=True)

# 提取时间特征
df['year'] = df['match_datetime'].dt.year
df['month'] = df['match_datetime'].dt.month
df['day'] = df['match_datetime'].dt.day
df['hour'] = df['match_datetime'].dt.hour

# 删除原始的 match_datetime 列
df.drop(columns=['match_datetime'], inplace=True)

# 定义特征和目标
X = df.drop(columns=['attendance'])
y = df['attendance']

# 归一化特征
scaler = MinMaxScaler()
y = scaler.fit_transform(y)

# 拆分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 将训练集和测试集转换为 NumPy 数组并重塑以适应LSTM
X_train_reshaped = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_reshaped = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# 构建LSTM模型
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])),
    tf.keras.layers.LSTM(50),
    tf.keras.layers.Dense(1)
])

model.compile(optimizer='adam', loss='mean_squared_error')

# 创建 R2Callback 实例，并传递验证数据
r2_callback = R2Callback(validation_data=(X_test_reshaped, y_test))

# 训练模型并使用自定义回调
history = model.fit(X_train_reshaped, y_train, epochs=1000, batch_size=32, validation_split=0.2, callbacks=[r2_callback])

# 评估模型
loss = model.evaluate(X_test_reshaped, y_test)
print(f'Test Loss: {loss}')

# 使用模型进行预测
y_pred = model.predict(X_test_reshaped)


# 计算 R² 值
r2 = r2_score(y_test, y_pred)
print(f'R² Score: {r2}')

# 查看 history 对象中的键
print(history.history.keys())

# 绘制训练损失和验证损失
plt.figure(figsize=(12, 6))

# 绘制训练损失
plt.plot(history.history['loss'], label='Training Loss')

# 绘制验证损失
plt.plot(history.history['val_loss'], label='Validation Loss')

# 绘制验证 R²
plt.plot(history.history['val_r2'], label='Validation R²')

# 添加图例
plt.legend()

# 添加标题和轴标签
plt.title('Training and Validation Loss and R²')
plt.xlabel('Epochs')
plt.ylabel('Loss / R²')

# 显示图形
plt.show()


ValueError: Expected a 2-dimensional container but got <class 'pandas.core.series.Series'> instead. Pass a DataFrame containing a single row (i.e. single sample) or a single column (i.e. single feature) instead.

In [33]:
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

class R2Callback(tf.keras.callbacks.Callback):
    def __init__(self, validation_data):
        super().__init__()
        self.validation_data = validation_data

    def on_epoch_end(self, epoch, logs=None):
        # 在每个 epoch 结束时计算 R² 值
        y_pred = self.model.predict(self.validation_data[0])
        r2 = r2_score(self.validation_data[1], y_pred)
        logs['val_r2'] = r2
        print(f'Epoch {epoch+1}: val_r2 = {r2}')

df = pd.read_csv('filtered_data_final.csv')


# 假设 df 是你的数据框
df.drop(columns=['id'], inplace=True)

# 将 num_broadcasters 列中的 '5+' 替换为 '5'，并转换为 int64 类型
df['num_broadcasters'] = df['num_broadcasters'].replace('5+', '5').astype(int)

# 将 match_datetime 列转换为 datetime 类型
df['match_datetime'] = pd.to_datetime(df['match_datetime'])

# 按 match_datetime 排序
df.sort_values('match_datetime', inplace=True)

# 提取时间特征
df['year'] = df['match_datetime'].dt.year
df['month'] = df['match_datetime'].dt.month
df['day'] = df['match_datetime'].dt.day
df['hour'] = df['match_datetime'].dt.hour

# 删除原始的 match_datetime 列
df.drop(columns=['match_datetime'], inplace=True)

# 定义特征和目标
X = df.drop(columns=['attendance'])
y = df['attendance']

# 对 y 进行缩放
y = y.values.reshape(-1, 1)
scaler_y = MinMaxScaler()
y_scaled = scaler_y.fit_transform(y)

# 拆分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y_scaled, test_size=0.2, random_state=42)

# 将训练集和测试集转换为 NumPy 数组并重塑以适应LSTM
X_train_reshaped = X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_reshaped = X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1]))

# 构建LSTM模型
model = tf.keras.models.Sequential([
    tf.keras.layers.Input(shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2])),
    tf.keras.layers.LSTM(50),
    tf.keras.layers.Dense(1)
])

model.compile(optimizer='adam', loss='mean_squared_error')

# 创建 R2Callback 实例，并传递验证数据
r2_callback = R2Callback(validation_data=(X_test_reshaped, y_test))

# 训练模型并使用自定义回调
history = model.fit(X_train_reshaped, y_train, epochs=1000, batch_size=32, validation_split=0.2, callbacks=[r2_callback])

# 评估模型
loss = model.evaluate(X_test_reshaped, y_test)
print(f'Test Loss: {loss}')

# 使用模型进行预测
y_pred = model.predict(X_test_reshaped)

# 反缩放 y_pred 和 y_test
y_pred_rescaled = scaler_y.inverse_transform(y_pred)
y_test_rescaled = scaler_y.inverse_transform(y_test)

# 计算 R² 值
r2 = r2_score(y_test_rescaled, y_pred_rescaled)
print(f'R² Score: {r2}')

# 查看 history 对象中的键
print(history.history.keys())

# 绘制训练损失和验证损失
plt.figure(figsize=(12, 6))

# 绘制训练损失
plt.plot(history.history['loss'], label='Training Loss')

# 绘制验证损失
plt.plot(history.history['val_loss'], label='Validation Loss')

# 绘制验证 R²
plt.plot(history.history['val_r2'], label='Validation R²')

# 添加图例
plt.legend()

# 添加标题和轴标签
plt.title('Training and Validation Loss and R²')
plt.xlabel('Epochs')
plt.ylabel('Loss / R²')

# 显示图形
plt.show()


Epoch 1/1000
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step p - loss: 0.02
Epoch 1: val_r2 = -0.005013390369480231
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 0.0279 - val_loss: 0.0225 - val_r2: -0.0050
Epoch 2/1000
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 559us/step - loss: 0.023
Epoch 2: val_r2 = -0.009722840182714299
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0232 - val_loss: 0.0222 - val_r2: -0.0097
Epoch 3/1000
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 705us/step - loss: 0.02
Epoch 3: val_r2 = -0.008504801514650495
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0233 - val_loss: 0.0222 - val_r2: -0.0085
Epoch 4/1000
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 606us/step - loss: 0.022
Epoch 4: val_r2 = -4.8055883405950794e-06
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

KeyboardInterrupt: 