In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score, classification_report
import joblib
import xgboost as xgb
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')

# Paths to the extracted-feature CSVs for the train and test splits; read by
# load_data() and again when the submission is assembled.
trn_file = '../input/round2_train_X_extracted_features6.csv'
tst_file = '../input/round2_test_X_extracted_features6.csv'
# Load data
def load_data():
    """Read the train/test feature tables and the row-aligned training labels.

    Features come from the module-level ``trn_file`` / ``tst_file`` paths.
    Labels are looked up by ``file_name`` in ``../input/label.csv`` and
    returned in the same row order as the training features.

    Returns:
        tuple: ``(train_features, train_labels, test_features)`` where the
        feature frames have ``file_name`` dropped and ``train_labels`` is a
        single-column (``label_id``) DataFrame.

    Raises:
        pandas.errors.MergeError: if ``label.csv`` lists a ``file_name`` more
            than once (the merge would otherwise add rows and misalign labels
            with features).
        ValueError: if any training row has no matching label.
    """
    train_data = pd.read_csv(trn_file)
    test_data = pd.read_csv(tst_file)

    label_table = pd.read_csv('../input/label.csv')
    # A left merge keeps the row order of the training features.
    # validate='many_to_one' guarantees it also keeps the row count.
    train_labels = train_data[['file_name']].merge(
        label_table, on='file_name', how='left', validate='many_to_one'
    )[['label_id']].copy()

    unlabeled = train_labels['label_id'].isna()
    if unlabeled.any():
        raise ValueError(
            f"{int(unlabeled.sum())} training rows have no label_id in ../input/label.csv"
        )

    return (
        train_data.drop(columns=['file_name']),
        train_labels,
        test_data.drop(columns=['file_name']),
    )
# Data preprocessing
def preprocess_data(train_data, train_labels, test_data):
    """Cast features to float and remove values the model input cannot hold.

    Each feature frame is cleaned independently, in this order:

    1. cast to ``float``;
    2. turn ``+inf`` / ``-inf`` into NaN *before* imputing, so infinities do
       not poison the column means and do not reappear as NaN afterwards;
    3. fill NaN with the frame's own column means (a column that is entirely
       NaN stays NaN);
    4. zero out finite values whose magnitude exceeds the float32 range,
       since they would overflow once converted to float32.

    The input frames are not modified.

    Args:
        train_data: training feature DataFrame (numeric columns).
        train_labels: passed through unchanged.
        test_data: test feature DataFrame (numeric columns).

    Returns:
        tuple: ``(clean_train, train_labels, clean_test)``.
    """
    float32_max = np.finfo(np.float32).max

    def _clean(frame):
        # Replace infinities first so the means below are finite.
        frame = frame.astype(float).replace([np.inf, -np.inf], np.nan)
        frame = frame.fillna(frame.mean())
        # NaN comparisons are False, so remaining NaN cells are left alone.
        return frame.mask(frame.abs() > float32_max, 0.0)

    return _clean(train_data), train_labels, _clean(test_data)

In [2]:
def train_xgboost_with_cv(train_data, train_labels, test_data, n_splits=5):
    """Train one XGBoost multi-class model per stratified fold.

    Training rows are weighted by ``n_samples / (n_classes * class_count)`` to
    counter class imbalance. Each fold's booster is cached on disk as
    ``xgb_model_fold{k}.pkl`` in the working directory and reused if the file
    already exists. The cache key is the fold number only, so delete the files
    after changing the data or the parameters.

    Args:
        train_data: feature DataFrame, one row per training sample.
        train_labels: DataFrame with a ``label_id`` column holding integer
            class ids ``0 .. n_classes - 1``.
        test_data: feature DataFrame with the same columns as ``train_data``.
        n_splits: number of stratified folds.

    Returns:
        tuple: ``(oof_predictions, test_predictions)``; class-probability
        arrays of shape ``(len(train_data), n_classes)`` and
        ``(len(test_data), n_classes)``. Test probabilities are the mean over
        folds.

    Raises:
        ValueError: if the label ids are not exactly ``0 .. n_classes - 1``.
    """
    train_labels = train_labels.label_id.values

    # Class weights for the long-tailed label distribution.
    classes = np.unique(train_labels)
    n_classes = len(classes)
    # num_class and the argmax-based scoring below both assume that column k
    # of the probability matrix corresponds to label id k.
    if not np.array_equal(classes, np.arange(n_classes)):
        raise ValueError(
            "label_id values must be the contiguous integers 0..n_classes-1"
        )
    n_samples = len(train_labels)
    class_weights = {
        c: n_samples / (n_classes * np.sum(train_labels == c)) for c in classes
    }

    # XGBoost parameters.
    params = {
        'objective': 'multi:softprob',  # multi-class, outputs probabilities
        'num_class': n_classes,
        'max_depth': 15,
        'min_child_weight': 5,
        'gamma': 0.15,
        'subsample': 0.9,
        'colsample_bytree': 0.9,
        'eta': 0.01,
        'seed': 1001
    }
    params["device"] = "cuda"
    params["tree_method"] = "hist"

    # Cross-validation setup.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    oof_predictions = np.zeros((len(train_data), n_classes))
    test_predictions = np.zeros((len(test_data), n_classes))

    early_stopping_rounds = 80

    # The test matrix is the same for every fold; build it once.
    dtest = xgb.DMatrix(test_data)

    for fold, (train_idx, valid_idx) in enumerate(skf.split(train_data, train_labels)):
        print(f"\nFold {fold + 1}/{n_splits}")
        fold_model = f'xgb_model_fold{fold + 1}.pkl'

        X_valid = train_data.iloc[valid_idx, :]
        y_valid = train_labels[valid_idx]
        dvalid = xgb.DMatrix(X_valid, label=y_valid)

        if not os.path.exists(fold_model):
            X_train = train_data.iloc[train_idx, :]
            y_train = train_labels[train_idx]
            # Per-sample weights derived from the class weights.
            sample_weights = np.array([class_weights[label] for label in y_train])
            dtrain = xgb.DMatrix(X_train, label=y_train, weight=sample_weights)

            model = xgb.train(
                params,
                dtrain,
                num_boost_round=1000000,
                evals=[(dtrain, 'train'), (dvalid, 'valid')],
                early_stopping_rounds=early_stopping_rounds,
                verbose_eval=100
            )
            joblib.dump(model, fold_model)
        else:
            # NOTE: joblib.load unpickles the file; only load caches you wrote.
            model = joblib.load(fold_model)

        # xgb.train returns the booster from the LAST round, which includes
        # the rounds trained after the best validation score. Predict with
        # the trees up to the best iteration only. (0, 0) means "all trees"
        # and is used when no best iteration was recorded.
        best_iteration = getattr(model, 'best_iteration', None)
        if best_iteration is None:
            iteration_range = (0, 0)
        else:
            iteration_range = (0, best_iteration + 1)

        # Out-of-fold predictions for this fold's validation rows.
        oof_predictions[valid_idx] = model.predict(dvalid, iteration_range=iteration_range)

        # Accumulate the fold-averaged test prediction.
        test_predictions += model.predict(dtest, iteration_range=iteration_range) / n_splits

        # Report this fold's score.
        valid_preds = np.argmax(oof_predictions[valid_idx], axis=1)
        fold_score = f1_score(y_valid, valid_preds, average='micro')
        print(f"Fold {fold + 1} - Micro F1: {fold_score:.4f}")

    # Overall out-of-fold score.
    final_predictions = np.argmax(oof_predictions, axis=1)
    final_score = f1_score(train_labels, final_predictions, average='micro')
    print(f"\nOverall CV Micro F1: {final_score:.4f}")

    return oof_predictions, test_predictions

In [None]:
# Load data
print("加载数据...")
train_data, train_labels, test_data = load_data()

# Preprocess data
print("预处理数据...")
train_data, train_labels, test_data = preprocess_data(train_data, train_labels, test_data)

# Train and evaluate the model (returns out-of-fold and fold-averaged test probabilities)
print("开始交叉验证训练...")
oof_preds, test_preds = train_xgboost_with_cv(train_data, train_labels, test_data)

print("\n模型训练和预测完成！")

加载数据...


In [None]:
# Raise every class probability to the power 1/64, which compresses the values
# toward 1 while preserving their order within a row.
# NOTE(review): presumably done so the per-file mean taken later is less
# dominated by a few very confident rows — confirm the choice of 64.
test_preds_post = test_preds**(1/64)

In [None]:
# Invert the label -> id mapping so predicted ids can be turned back into
# label strings. joblib is already imported in the first cell.
# NOTE: joblib.load unpickles the file; only load mapping files you trust.
label_to_id = joblib.load('../code/label_to_id.pkl')
id_to_label = {label_id: label for label, label_id in label_to_id.items()}

In [None]:
# One row of class scores per test row, tagged with the file it came from.
# Only the file_name column is read back; to_numpy() makes the assignment
# positional and fails loudly if the row counts differ.
probas_df = pd.DataFrame(test_preds_post)
probas_df['file_name'] = pd.read_csv(tst_file, usecols=['file_name'])['file_name'].to_numpy()

# Average the scores of all rows belonging to the same file, then take the
# highest-scoring class. Score columns are 0..n_classes-1, so the positional
# argmax is the class id.
sub_df = probas_df.groupby('file_name').mean().reset_index()
sub_df['label_id'] = sub_df.drop(columns=['file_name']).to_numpy().argmax(axis=1)

sub_df.shape

In [None]:
# Number of distinct class ids that were predicted across the test files.
sub_df['label_id'].nunique()

In [None]:
# Map each predicted class id back to its combined label string; ids missing
# from id_to_label become NaN.
sub_df['reconstructed_label'] = sub_df['label_id'].map(id_to_label)
# Split the combined label back into the original 94 columns
reconstructed_labels = sub_df['reconstructed_label'].str.split('_', expand=True)

In [None]:
# Assemble the submission frame: file name followed by the per-target columns.
out_df = pd.concat([sub_df[['file_name']], reconstructed_labels], axis=1)
# Only the header of the reference file is needed, so read zero data rows.
# The assignment raises if the column counts do not match.
out_df.columns = pd.read_csv('../input/train_y_v0.1.0.csv', nrows=0).columns
# Translate the string label tokens into the numeric values written out.
out_df = out_df.replace('-1', 0.1)
out_df = out_df.replace( '1', 0.9)
out_df = out_df.replace( '0',  0)
out_df.head()

In [None]:
# Write the submission CSV without the index column.
out_df.to_csv('../subs/baseline_xgb_cv5_06.csv', index=False)

In [None]:
# Display the (rows, columns) of the submission frame as a final sanity check.
out_df.shape