In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, classification_report
import joblib
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# 读取数据
trn_file = '../input/round2_train_X_extracted_features6.csv'
tst_file = '../input/round2_test_X_extracted_features6.csv'
# 读取数据
def load_data():
    train_data = pd.read_csv(trn_file)
    
    tmp_train_labels = pd.read_csv('../input/label.csv')
    train_labels = train_data[['file_name']].copy()
    train_labels = train_labels.merge(tmp_train_labels,on='file_name',how='left')
    train_labels = train_labels[['label_id']].copy()
    
    test_data = pd.read_csv(tst_file)

    return train_data.drop(['file_name'],axis=1), train_labels, test_data.drop(['file_name'],axis=1)

# 数据预处理
def preprocess_data(train_data, train_labels, test_data):
    # 确保数据类型正确
    train_data = train_data.astype(float)
    test_data = test_data.astype(float)
    
    # 处理缺失值
    train_data.fillna(train_data.mean(), inplace=True)
    test_data.fillna(test_data.mean(), inplace=True)

    train_data.replace([np.inf, -np.inf], np.nan, inplace=True)
    test_data.replace([np.inf, -np.inf], np.nan, inplace=True)

    test_data[test_data > np.finfo(np.float32).max] = 0
    
    return train_data, train_labels, test_data

35


In [2]:
def train_xgboost_with_cv(train_data, train_labels, test_data, n_splits=5):
    train_labels = train_labels.label_id.values
    # 计算类别权重来处理长尾分布
    classes = np.unique(train_labels)
    class_weights = dict()
    n_samples = len(train_labels)
    
    for c in classes:
        c_count = np.sum(train_labels == c)
        class_weights[c] = n_samples / (len(classes) * c_count)

    # XGBoost参数设置
    params = {
        'objective': 'multi:softprob',  # 多分类
        'num_class': len(classes),
        'max_depth': 15,  # 控制树的深度，防止过拟合
        'min_child_weight': 5,  # 增大这个值有助于处理类别不平衡
        'gamma': 0.15,  # 控制是否进一步分裂
        'subsample': 0.9,  # 随机采样，防止过拟合
        'colsample_bytree': 0.9,  # 特征采样
        'eta': 0.01,  # 较小的学习率
        # 'scale_pos_weight': 1,  # 类别不平衡时的缩放因子
        'seed': 1001
    }
    params["device"] = "cuda"
    params["tree_method"] = "hist"
    # 准备交叉验证
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_predictions = np.zeros((len(train_data), len(classes)))
    test_predictions = np.zeros((len(test_data), len(classes)))
    
    # 早停设置
    early_stopping_rounds = 80
    model_list = []
    # 开始交叉验证训练
    for fold, (train_idx, valid_idx) in enumerate(skf.split(train_data, train_labels)):
        print(f"\nFold {fold + 1}/{n_splits}")
        
        X_train, X_valid = train_data.iloc[train_idx,:], train_data.iloc[valid_idx,:]
        y_train, y_valid = train_labels[train_idx], train_labels[valid_idx]
        
        # 计算样本权重
        sample_weights = np.array([class_weights[label] for label in y_train])
        
        # 准备数据集
        dtrain = xgb.DMatrix(X_train, label=y_train, weight=sample_weights) 
        dvalid = xgb.DMatrix(X_valid, label=y_valid)
        dtest = xgb.DMatrix(test_data)
        
        # 设置评估指标
        def micro_f1_eval(preds, dtrain):
            labels = dtrain.get_label()
            preds = np.argmax(preds.reshape(len(labels), -1), axis=1)
            score = f1_score(labels, preds, average='micro')
            return 'micro_f1', score

        # 训练模型
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=1000000,
            evals=[(dtrain, 'train'), (dvalid, 'valid')],
            early_stopping_rounds=early_stopping_rounds,
            # feval=micro_f1_eval,
            verbose_eval=100
        )
        
        # 保存验证集预测
        oof_predictions[valid_idx] = model.predict(dvalid)
        
        # 累积测试集预测
        test_predictions += model.predict(dtest) / n_splits
        
        # 打印当前折的性能
        valid_preds = np.argmax(oof_predictions[valid_idx], axis=1)
        fold_score = f1_score(y_valid, valid_preds, average='micro')
        print(f"Fold {fold + 1} - Micro F1: {fold_score:.4f}")
        model_list.append(model)
    # 计算整体性能
    final_predictions = np.argmax(oof_predictions, axis=1)
    final_score = f1_score(train_labels, final_predictions, average='micro')
    print(f"\nOverall CV Micro F1: {final_score:.4f}")
    
    return oof_predictions, test_predictions, model_list

# 使用示例
# oof_preds, test_preds = train_xgboost_with_cv(train_data, train_labels, test_data)
# final_test_predictions = np.argmax(test_preds, axis=1)

In [3]:
# 加载数据
print("加载数据...")
train_data, train_labels, test_data = load_data()

# 预处理数据
print("预处理数据...")
train_data, train_labels, test_data = preprocess_data(train_data, train_labels, test_data)

# 训练和评估模型
print("开始交叉验证训练...")
oof_preds, test_preds, model_list = train_xgboost_with_cv(train_data, train_labels, test_data)

# 保存预测结果
test_pred_labels = np.argmax(test_preds, axis=1)
joblib.dump(test_preds, 'xgb599_test_preds_cv5.pkl')
pd.DataFrame(test_pred_labels, columns=['prediction']).to_csv('xgb_predictions.csv', index=False)

# 保存oof预测结果
oof_pred_labels = np.argmax(oof_preds, axis=1)
pd.DataFrame(oof_pred_labels, columns=['prediction']).to_csv('xgb_oof_predictions.csv', index=False)

print("\n模型训练和预测完成！")

加载数据...
预处理数据...
开始交叉验证训练...

Fold 1/5
[0]	train-mlogloss:4.19881	valid-mlogloss:4.31865
[100]	train-mlogloss:1.07204	valid-mlogloss:1.77713
[200]	train-mlogloss:0.56212	valid-mlogloss:1.28368
[300]	train-mlogloss:0.36223	valid-mlogloss:1.06867
[400]	train-mlogloss:0.27291	valid-mlogloss:0.95782
[500]	train-mlogloss:0.22830	valid-mlogloss:0.89328
[600]	train-mlogloss:0.20306	valid-mlogloss:0.85199
[700]	train-mlogloss:0.18648	valid-mlogloss:0.82313
[800]	train-mlogloss:0.17430	valid-mlogloss:0.80170
[900]	train-mlogloss:0.16443	valid-mlogloss:0.78480
[1000]	train-mlogloss:0.15622	valid-mlogloss:0.77112
[1100]	train-mlogloss:0.14924	valid-mlogloss:0.75990
[1200]	train-mlogloss:0.14297	valid-mlogloss:0.74998
[1300]	train-mlogloss:0.13751	valid-mlogloss:0.74142
[1400]	train-mlogloss:0.13274	valid-mlogloss:0.73413
[1500]	train-mlogloss:0.12846	valid-mlogloss:0.72763
[1600]	train-mlogloss:0.12468	valid-mlogloss:0.72221
[1700]	train-mlogloss:0.12128	valid-mlogloss:0.71737
[1800]	train-mloglo

In [4]:
# Overall CV Micro F1: 0.6157 442

In [5]:
import joblib
label_to_id = joblib.load('../input/label_to_id.pkl')
id_to_label = {}
for k, v in label_to_id.items():
    id_to_label[v] = k

In [6]:
probas_df = pd.DataFrame(test_preds)
probas_df['file_name'] = pd.read_csv(tst_file)[['file_name']]

sub_df = probas_df.groupby('file_name').mean().reset_index()
sub_df['label_id'] = sub_df.drop(["file_name"],axis=1).apply(lambda x:np.argmax(x), axis=1)

sub_df

Unnamed: 0,file_name,0,1,2,3,4,5,6,7,8,...,82,83,84,85,86,87,88,89,90,label_id
0,test_X0.pkl,0.000060,1.232871e-05,2.282307e-03,0.006137,0.000397,0.004596,0.000495,0.000103,0.002904,...,3.484735e-05,1.072983e-04,1.371812e-05,0.000076,3.646589e-05,4.116927e-05,6.586999e-05,0.000037,2.907312e-05,51
1,test_X1.pkl,0.021176,9.780277e-05,8.905210e-05,0.019581,0.106486,0.023305,0.000224,0.017831,0.021568,...,8.714462e-05,1.901604e-04,2.840542e-05,0.000271,1.123113e-04,9.119237e-05,7.117559e-05,0.000127,3.764833e-05,57
2,test_X10.pkl,0.000119,6.498522e-05,2.684947e-04,0.002898,0.001471,0.545198,0.006775,0.023097,0.015795,...,1.068637e-04,3.482195e-06,2.157888e-04,0.002388,1.451019e-04,1.138648e-05,9.258903e-05,0.000234,3.740419e-04,5
3,test_X100.pkl,0.000005,2.141899e-07,6.498588e-06,0.000170,0.000004,0.000525,0.000738,0.000002,0.000016,...,8.496332e-07,1.717012e-07,2.657921e-05,0.000005,1.273502e-06,3.999807e-07,4.522390e-07,0.000001,9.374557e-06,10
4,test_X1000.pkl,0.000091,1.103787e-06,1.077973e-06,0.000477,0.006270,0.000005,0.000004,0.000003,0.000070,...,1.724860e-06,4.204922e-06,3.875637e-07,0.000005,2.089438e-06,2.388873e-06,1.024304e-06,0.000002,5.989218e-07,57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315715,test_X99995.pkl,0.000463,4.102267e-03,9.775926e-07,0.000009,0.000026,0.000013,0.000003,0.000003,0.000010,...,1.198838e-06,1.329348e-06,3.680853e-07,0.000002,8.929646e-07,8.079657e-07,5.270440e-07,0.000001,6.431289e-07,19
315716,test_X99996.pkl,0.000003,1.056331e-06,2.795040e-06,0.000467,0.000014,0.000719,0.001266,0.015700,0.970333,...,4.569794e-06,6.723521e-06,2.502824e-06,0.000007,8.707401e-06,1.821481e-06,4.212082e-06,0.000004,4.619030e-06,8
315717,test_X99997.pkl,0.000771,2.305966e-06,2.755874e-06,0.991647,0.000008,0.000007,0.000111,0.000003,0.000010,...,1.527805e-06,2.236294e-05,6.892683e-07,0.000003,1.480505e-06,8.459906e-06,2.432602e-06,0.000003,8.532757e-07,3
315718,test_X99998.pkl,0.000007,1.221711e-06,4.906053e-06,0.000208,0.000057,0.001177,0.000047,0.003299,0.263276,...,5.401042e-06,8.178435e-07,1.979066e-06,0.000005,7.302872e-06,2.154533e-06,4.921566e-06,0.000005,2.745103e-06,10


In [7]:
sub_df['label_id'].nunique()

89

In [8]:
sub_df['label_id'].value_counts()

7     72087
8     33724
3     30158
6     19199
29    12828
      ...  
88       16
78       14
73       10
23        6
62        3
Name: label_id, Length: 89, dtype: int64

In [9]:
sub_df['reconstructed_label'] = sub_df['label_id'].map(id_to_label)
# 将合并标签拆分为原始的 94 列
reconstructed_labels = sub_df['reconstructed_label'].str.split('_', expand=True)

In [10]:
out_df = pd.concat([sub_df[['file_name']], reconstructed_labels], axis=1)
out_df.columns = pd.read_csv('../input/train_y_v0.1.0.csv').columns
out_df = out_df.replace('-1', 0.1)
out_df = out_df.replace( '1', 0.9)
out_df = out_df.replace( '0',  0)
out_df.head()

Unnamed: 0,filename,Active_Power_Sensor,Air_Flow_Sensor,Air_Flow_Setpoint,Air_Temperature_Sensor,Air_Temperature_Setpoint,Alarm,Angle_Sensor,Average_Zone_Air_Temperature_Sensor,Chilled_Water_Differential_Temperature_Sensor,...,Warmest_Zone_Air_Temperature_Sensor,Water_Flow_Sensor,Water_Temperature_Sensor,Water_Temperature_Setpoint,Wind_Direction_Sensor,Wind_Speed_Sensor,Zone_Air_Dewpoint_Sensor,Zone_Air_Humidity_Sensor,Zone_Air_Humidity_Setpoint,Zone_Air_Temperature_Sensor
0,test_X0.pkl,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,...,0.1,0.9,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
1,test_X1.pkl,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
2,test_X10.pkl,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
3,test_X100.pkl,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1
4,test_X1000.pkl,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,...,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1


In [11]:
out_df.dtypes

filename                        object
Active_Power_Sensor            float64
Air_Flow_Sensor                float64
Air_Flow_Setpoint              float64
Air_Temperature_Sensor         float64
                                ...   
Wind_Speed_Sensor              float64
Zone_Air_Dewpoint_Sensor       float64
Zone_Air_Humidity_Sensor       float64
Zone_Air_Humidity_Setpoint     float64
Zone_Air_Temperature_Sensor    float64
Length: 95, dtype: object

In [12]:
out_df.to_csv('../subs/baseline_xgb_cv5_06.csv', index=False)

In [13]:
out_df.shape

(315720, 95)