In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold, train_test_split

In [5]:
# Load the raw table, then discard the "月份" column before modelling.
df_filtered = pd.read_csv('9个月总数据.csv')
df_filtered1 = df_filtered.drop("月份", axis=1)

# "最高报警等级" is the prediction target; every remaining column is a feature.
target_column = "最高报警等级"
y = df_filtered1[target_column]               # target variable
X = df_filtered1.drop(target_column, axis=1)  # feature matrix

In [7]:
# Hold out 20% of the rows as a test set. The split is stratified on the label
# and seeded so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Recursive feature elimination with cross-validation (RFECV), using a random
# forest as the base estimator.
model = RandomForestClassifier(random_state=42, n_estimators=100)
rfecv = RFECV(
    model,
    step=1,              # remove one feature per elimination round
    cv=5,                # 5-fold cross-validation
    scoring='accuracy',  # other metrics such as f1_macro or recall also work
    n_jobs=-1,           # run in parallel
)

# Fit on the training split only; RFECV picks the best feature count itself.
rfecv.fit(X_train, y_train)

# Report the outcome: number of features kept, the keep-mask, and the ranking
# (rank 1 means the feature was selected).
print("最优特征数量: %d" % rfecv.n_features_)
print("每个特征是否被保留: ", rfecv.support_)
print("特征的排名（1 表示被选中）: ", rfecv.ranking_)

最优特征数量: 9
每个特征是否被保留:  [False False False  True  True False  True False False  True False False
  True False False  True  True  True  True]
特征的排名（1 表示被选中）:  [ 9 10  6  1  1  2  1 11  3  1  5  7  1  4  8  1  1  1  1]


In [9]:
# Map RFECV's boolean keep-mask back onto the training column labels.
support_mask = rfecv.support_
selected_features = X_train.columns[support_mask]
print("被选中的特征有:", selected_features)

被选中的特征有: Index(['累计里程', '总电压', 'SOC', '驱动电机控制器温度', '驱动电机温度', '电池单体电压最高值', '电池单体电压最低值',
       '最高温度值', '最低温度值'],
      dtype='object')


In [12]:
# Keep only the chosen features. This hard-coded list matches the RFECV
# selection printed in the notebook output above; it replaces the Index object
# previously bound to `selected_features` with a plain list.
selected_features = [
    '累计里程',
    '总电压',
    'SOC',
    '驱动电机控制器温度',
    '驱动电机温度',
    '电池单体电压最高值',
    '电池单体电压最低值',
    '最高温度值',
    '最低温度值',
]

# Apply the same column subset to both splits.
X_selected = X_train.loc[:, selected_features]
X_test_selected = X_test.loc[:, selected_features]

In [14]:
from sklearn.preprocessing import StandardScaler

# Standardise the selected features (StandardScaler: zero mean, unit variance
# per column).
#
# The scaler is fitted on the TRAINING split only and then applied unchanged to
# the test split. Calling fit_transform on the test split would re-estimate the
# mean/variance from test data, so train and test would be scaled with
# different parameters and test-set statistics would leak into preprocessing.
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X_selected)
X_test_normalized = scaler.transform(X_test_selected)

# The scaler returns NumPy arrays; wrap them back into DataFrames so the
# feature names are preserved. No index is passed, so both frames get a fresh
# 0..n-1 index rather than the original row labels.
X_normalized = pd.DataFrame(X_normalized, columns=X_selected.columns)
X_test_normalized = pd.DataFrame(X_test_normalized, columns=X_test_selected.columns)
print(X_normalized.head())

       累计里程       总电压       SOC  驱动电机控制器温度    驱动电机温度  电池单体电压最高值  电池单体电压最低值  \
0  1.054413 -0.339277 -0.082250   0.724238  0.578705  -0.304956  -0.350595   
1 -1.412788  1.192977  1.091045  -0.278925  0.023618   1.198015   1.195322   
2 -0.151881  1.675035  1.569054  -0.880824 -1.209910   1.671479   1.680537   
3  0.868191 -1.728061 -2.168107   0.256095  0.331999  -1.714337  -1.744177   
4  0.244359 -0.953326 -0.734080   0.724238  0.702058  -0.976614  -0.948650   

      最高温度值     最低温度值  
0  1.036188  1.138364  
1 -0.911094 -0.817212  
2 -1.265145 -1.350551  
3  0.505111  0.427246  
4  0.505111  0.427246  


In [16]:
# Display the held-out test labels (the notebook renders a cell's last expression).
y_test

77718      0
717626     2
92323      0
32131      0
704861     0
          ..
1104917    0
1015239    0
766465     0
612291     0
1027851    0
Name: 最高报警等级, Length: 233234, dtype: int64

In [18]:
# 导出归一化后的训练特征
# Write the standardised feature matrices and the matching label columns to
# CSV. Row indices are omitted from every file, so features and labels line up
# by row position only.
exports = {
    "X_normalized.csv": X_normalized,            # standardised training features
    "X_test_normalized.csv": X_test_normalized,  # standardised test features
    "y_train.csv": y_train.to_frame(),           # training labels
    "y_test.csv": y_test.to_frame(),             # test labels
}
for csv_name, frame in exports.items():
    frame.to_csv(csv_name, index=False)