In [None]:
# 导入所需的库
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Build the data set.
# Start with the 2018 factor file only; both reads are capped at 100,000 rows
# to keep this trial run quick.
df_factor = pd.read_csv('../factor/1000/2018_1000.csv', nrows=100_000)
df_label = pd.read_csv('../factor/label.csv', nrows=100_000)
# Total number of missing cells in each frame (factors first, then labels).
print(df_factor.isna().sum().sum(), df_label.isna().sum().sum(), sep='\n')

In [None]:
# Keep only the label rows whose `time` column equals '10:00:00'.
df_label_1000 = df_label.loc[df_label['time'].eq('10:00:00')]
df_label_1000.head()

In [None]:
# Keep only the label rows whose `time` column equals '13:30:00'.
df_label_1330 = df_label.loc[df_label['time'].eq('13:30:00')]
df_label_1330.head()

In [None]:
# Preview the first rows of the raw factor frame.
df_factor.head()

In [None]:
# Assuming df_factor holds all the factors, pick the columns that would be
# standardized: every column from the third one onward.
# NOTE(review): presumably the first two columns are the date/code keys — confirm.
columns_to_scale = df_factor.columns[2:]
columns_to_scale

In [None]:
# Standardization is currently switched off; the disabled code is kept below.
# # Standardize only the selected columns
# scaler = StandardScaler()
# df_factor_scaled = scaler.fit_transform(df_factor[columns_to_scale])
# # Convert the standardized data back into a DataFrame with the correct column names
# df_factor_scaled = pd.DataFrame(df_factor_scaled, columns=columns_to_scale)
# # Join the unscaled columns with the standardized data
# df_factor_final = pd.concat([df_factor.iloc[:, :2].reset_index(drop=True), df_factor_scaled.reset_index(drop=True)], axis=1)

# With scaling disabled, the raw factor frame is used as-is
# (this binds a second name to the same object; no copy is made).
df_factor_final = df_factor
# print(df_factor.head())
# print(df_factor_scaled.head())
df_factor_final.head()

In [None]:
# Disabled PCA step. It reads df_factor_scaled, which in the visible cells is
# only produced by scaling code that is itself commented out.
# # Apply PCA
# pca = PCA(n_components=0.95)  # keep enough principal components to explain 95% of the variance
# principal_components = pca.fit_transform(df_factor_scaled)

# # Convert the principal components into a DataFrame
# df_pca = pd.DataFrame(data=principal_components, columns=[f'PC{i+1}' for i in range(principal_components.shape[1])])

# print("主成分解释的方差比例：", pca.explained_variance_ratio_)

In [None]:
# Order the 10:00 label rows by date first, then by stock code within each date.
df_label_1000_sorted = df_label_1000.sort_values(['date', 'code'])
df_label_1000_sorted.head()

In [None]:
# Order the 13:30 label rows by date first, then by stock code within each date.
df_label_1330_sorted = df_label_1330.sort_values(['date', 'code'])
df_label_1330_sorted.head()

In [None]:
# Join the factor rows with the 10:00 labels on (date, code) so that every
# factor row is paired with its label; rows without a match on both sides
# are dropped (inner join).
df_merged = df_factor_final.merge(df_label_1000_sorted, how='inner', on=['date', 'code'])
df_merged.head(10)

In [None]:
# Features: every merged column except the keys (date, code, time) and the
# two label columns.
X = df_merged.drop(columns=['date', 'code', 'time', 'ret_next_close_alpha', 'ret_next_5_close_alpha'])
# Target: the 'ret_next_close_alpha' label column.
y = df_merged['ret_next_close_alpha']
print(X.head(), y.head(), sep='\n')


In [None]:
# Report how many feature cells and how many labels are missing.
print(X.isnull().sum().sum())
print(y.isnull().sum())

# Rows without a label can neither be fitted nor scored, so leave them out
# of the split (LinearRegression rejects NaN targets).
has_label = y.notna()

# Split BEFORE imputing: filling gaps with the mean of the whole data set
# would let test-set statistics leak into the training features.
# NOTE(review): a random split mixes dates across train and test; if these are
# forward-looking returns a chronological split may be more appropriate — confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X[has_label], y[has_label], test_size=0.3, random_state=42
)

# Impute both splits with column means estimated on the training split only.
# X itself is left untouched, so re-running this cell gives the same result.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

In [None]:
# Fit an ordinary least-squares linear regression on the training split.
model = LinearRegression().fit(X_train, y_train)

# Predict the held-out split.
y_pred = model.predict(X_test)

# Score the predictions against the held-out labels.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# One fitted coefficient per feature column, three decimals each.
coefficients = ", ".join(f"{weight:.3f}" for weight in model.coef_)
print(f"Coefficients : {coefficients}")
# Fitted intercept (the constant term of the regression, not an error term),
# three decimals.
print(f"Intercept: {model.intercept_:.3f}")
print(f'Mean Squared Error: {mse:.3f}')
print(f'R^2 Score: {r2:.3f}')

In [None]:
# Scatter the held-out labels against the model's predictions.
fig, ax = plt.subplots()
# y_test / y_pred come from the test split, so label the points accordingly
# (they were previously tagged 'train data').
ax.scatter(y_test, y_pred, s=10, label='test data')
ax.set_xlabel('Actual Value')
ax.set_ylabel('Predicted Value')
ax.set_title('Actual Value vs Predicted Value')
# Without a legend call the scatter label is never drawn.
ax.legend()
plt.show()