In [1]:
import pandas as pd
import os
from functools import reduce

# Directory that contains the models' prediction outputs
root_folder = '/kaggle/input/consis-dsc-prediction'
# One sub-directory of root_folder per base model
model_folders = ['erniem', 'xlmr', 'deberta', 'cross', 'roberta', 'dvt']
# File names looked up inside every fold_<i> directory of each model
train_file = 'dev_predictions_with_probs.csv'
test_file = 'submit_with_probs_privatetest.csv'

def feature_engineering(root_folder, model_folders, file, train=True, n_folds=5):
    """Build a meta-feature table from per-fold probability predictions.

    For every model in ``model_folders`` the files
    ``<root_folder>/<model>/fold_<i>/<file>`` (``i`` in ``range(n_folds)``)
    are read and reduced to the ``id`` column plus the three probability
    columns. The folds of one model are then combined, the probability
    columns are prefixed with the model name, and all models are joined
    column-wise on ``id`` (inner join).

    Parameters
    ----------
    root_folder : str
        Directory holding one sub-directory per model.
    model_folders : iterable of str
        Model sub-directory names; also used as column prefixes.
    file : str
        CSV file name to read inside each ``fold_<i>`` directory.
    train : bool, default True
        If True, the fold files are stacked vertically (rows appended, index
        reset). If False, the probabilities are averaged per ``id`` across
        folds; the result is then ordered by ``id``.
    n_folds : int, default 5
        Number of ``fold_<i>`` directories to read per model.

    Returns
    -------
    pandas.DataFrame
        ``id`` followed by ``<model>_prob_intrinsic``, ``<model>_prob_no`` and
        ``<model>_prob_extrinsic`` for each model, in ``model_folders`` order.

    Raises
    ------
    ValueError
        If ``model_folders`` is empty (nothing to merge).
    """
    prob_columns = ['prob_intrinsic', 'prob_no', 'prob_extrinsic']
    model_dfs = []

    for model in model_folders:
        model_path = os.path.join(root_folder, model)

        # Load every fold's predictions, keeping only the columns we need
        df_list = []
        for i in range(n_folds):
            fold_path = os.path.join(model_path, f'fold_{i}', file)
            df = pd.read_csv(fold_path)
            df_list.append(df[['id'] + prob_columns])

        if train:
            # Stack the folds vertically
            processed_df = pd.concat(df_list, axis=0, ignore_index=True)
        else:
            # Average the probabilities of each id over the folds
            all_df = pd.concat(df_list)
            processed_df = all_df.groupby("id").mean().reset_index()

        # Prefix the probability columns so that models can be told apart
        processed_df = processed_df.rename(
            columns={col: f'{model}_{col}' for col in prob_columns}
        )

        model_dfs.append(processed_df)

    if not model_dfs:
        raise ValueError("model_folders must contain at least one model")

    # Join the models horizontally on 'id'.
    # NOTE(review): if an id occurs more than once within a model's frame,
    # this inner join multiplies rows — confirm ids are unique per model.
    final_df = reduce(lambda left, right: pd.merge(left, right, on='id'), model_dfs)
    return final_df

# Meta-features for training: fold predictions stacked row-wise per model
train_meta = feature_engineering(
    root_folder=root_folder,
    model_folders=model_folders,
    file=train_file,
    train=True,
)
train_meta

Unnamed: 0,id,erniem_prob_intrinsic,erniem_prob_no,erniem_prob_extrinsic,xlmr_prob_intrinsic,xlmr_prob_no,xlmr_prob_extrinsic,deberta_prob_intrinsic,deberta_prob_no,deberta_prob_extrinsic,cross_prob_intrinsic,cross_prob_no,cross_prob_extrinsic,roberta_prob_intrinsic,roberta_prob_no,roberta_prob_extrinsic,dvt_prob_intrinsic,dvt_prob_no,dvt_prob_extrinsic
0,db7a89c6-2a6a-42af-beef-58e557ecc819,0.015204,0.966010,0.018786,0.019164,0.965985,0.014851,0.038892,0.890406,0.070702,0.018710,0.923733,0.057557,0.012265,0.959150,0.028585,0.087124,0.882724,0.030151
1,10fca062-d343-4eca-8434-93c7a8aa5e0e,0.025574,0.005776,0.968650,0.068519,0.006468,0.925014,0.223679,0.014619,0.761702,0.018768,0.003533,0.977699,0.044158,0.016361,0.939481,0.112689,0.014523,0.872788
2,ece8eb9e-d6bb-407a-a567-d9531861c603,0.003886,0.990234,0.005881,0.005762,0.989063,0.005175,0.679647,0.205793,0.114560,0.072594,0.893676,0.033729,0.017415,0.963567,0.019018,0.039752,0.940535,0.019713
3,b613217f-df2a-491e-8326-25811a31eb09,0.308794,0.008378,0.682828,0.930336,0.003383,0.066281,0.739829,0.022905,0.237266,0.914723,0.003096,0.082181,0.771447,0.004139,0.224414,0.828249,0.040226,0.131525
4,6a00bac9-3de4-4144-90c6-4e080c160d43,0.986726,0.001893,0.011380,0.983213,0.004952,0.011835,0.973340,0.004083,0.022577,0.988244,0.000867,0.010889,0.992483,0.000393,0.007125,0.922774,0.021137,0.056089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,df859b5e-49ea-4b67-bc86-e01073521977,0.033319,0.002976,0.963705,0.094277,0.003014,0.902709,0.083899,0.006147,0.909954,0.031920,0.004306,0.963774,0.029042,0.005697,0.965261,0.064006,0.016565,0.919429
6996,05d911a1-6a5c-4ed6-8a4b-4a111200c193,0.950903,0.011749,0.037349,0.991391,0.002439,0.006170,0.949420,0.002340,0.048241,0.953631,0.008644,0.037725,0.983543,0.001848,0.014609,0.875540,0.069283,0.055177
6997,e8de77bc-3bea-4328-bf5d-3321ff767e69,0.009904,0.983390,0.006706,0.010554,0.986566,0.002880,0.053380,0.889967,0.056652,0.016064,0.940588,0.043348,0.008599,0.979765,0.011636,0.054726,0.928317,0.016958
6998,4f7d3a5c-d395-43f0-a941-9ae2aa4b8632,0.013788,0.976917,0.009295,0.018545,0.951016,0.030439,0.038515,0.887469,0.074016,0.010037,0.981783,0.008180,0.004185,0.969410,0.026404,0.147210,0.708837,0.143953


In [2]:
# Meta-features for the test file: probabilities averaged over folds per id
test_meta = feature_engineering(
    root_folder=root_folder,
    model_folders=model_folders,
    file=test_file,
    train=False,
)
test_meta

Unnamed: 0,id,erniem_prob_intrinsic,erniem_prob_no,erniem_prob_extrinsic,xlmr_prob_intrinsic,xlmr_prob_no,xlmr_prob_extrinsic,deberta_prob_intrinsic,deberta_prob_no,deberta_prob_extrinsic,cross_prob_intrinsic,cross_prob_no,cross_prob_extrinsic,roberta_prob_intrinsic,roberta_prob_no,roberta_prob_extrinsic,dvt_prob_intrinsic,dvt_prob_no,dvt_prob_extrinsic
0,0017c593-6648-4eb3-91ca-fd15a7c900d3,0.015454,0.965804,0.018742,0.007345,0.987312,0.005343,0.025299,0.954350,0.020351,0.007703,0.973846,0.018450,0.002935,0.992803,0.004262,0.044080,0.929475,0.026445
1,0028c3a9-1beb-4644-8047-e890aa8680c8,0.027923,0.938700,0.033377,0.624169,0.276705,0.099126,0.212251,0.675393,0.112356,0.451239,0.082003,0.466758,0.101108,0.867477,0.031415,0.320860,0.631154,0.047986
2,005197c9-45f9-44db-8442-5398d2f21999,0.027206,0.003586,0.969208,0.015833,0.003313,0.980854,0.030894,0.007363,0.961743,0.011185,0.001850,0.986965,0.018449,0.004879,0.976672,0.058649,0.016921,0.924429
3,005687f6-5ffe-4d0d-9838-c98434df57e1,0.057078,0.615861,0.327061,0.282155,0.090570,0.627275,0.078673,0.350546,0.570781,0.024267,0.010482,0.965250,0.009121,0.908293,0.082586,0.166623,0.022330,0.811047
4,006d47a4-4b66-4531-a747-97c6eb1e92f1,0.855520,0.069167,0.075313,0.180062,0.773223,0.046715,0.382950,0.460795,0.156255,0.591131,0.171922,0.236947,0.179059,0.642232,0.178708,0.488169,0.457006,0.054825
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,ff48052f-4717-4e85-8823-f754b9f3bf25,0.006389,0.986769,0.006842,0.018256,0.974350,0.007393,0.027080,0.960232,0.012688,0.015096,0.966359,0.018545,0.006791,0.988143,0.005066,0.046676,0.919877,0.033447
1996,ff5438f1-4987-41d1-aa30-3902190b16b8,0.978738,0.003199,0.018063,0.893180,0.038274,0.068546,0.956307,0.003259,0.040433,0.978979,0.002182,0.018839,0.995341,0.000461,0.004198,0.719856,0.125523,0.154621
1997,ffc58416-c1c5-43ec-a707-a5e55354c53f,0.030662,0.004237,0.965101,0.028510,0.006824,0.964666,0.062320,0.006705,0.930975,0.025298,0.002176,0.972526,0.054140,0.004788,0.941072,0.051634,0.012386,0.935980
1998,fff8b237-5029-451f-93a5-dd141328d552,0.952190,0.016029,0.031781,0.914793,0.050606,0.034602,0.602727,0.197334,0.199939,0.692344,0.040982,0.266674,0.671341,0.245320,0.083339,0.362852,0.568732,0.068415


In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the label data; the file provides 'id' and a string 'label' column
train_df = pd.read_csv('/kaggle/input/dsc-2025-dataset/train_dsc.csv')

# Fit the encoder on the string labels, then store their integer codes
le = LabelEncoder().fit(train_df['label'])
train_df['label_encoded'] = le.transform(train_df['label'])

# Attach the encoded label to each meta-feature row by 'id' (inner join)
merged_df = train_meta.merge(train_df[['id', 'label_encoded']], on='id')

# Target vector and feature matrix (everything except id and the target)
y = merged_df['label_encoded']
X = merged_df.drop(columns=['id', 'label_encoded'])

In [4]:
import os
import random
import numpy as np

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# ==========================
# 1. Fix all random seeds
# ==========================
seed = 22520465
# NOTE: PYTHONHASHSEED is read at interpreter start-up, so assigning it here
# does not affect the already-running kernel; it is only inherited by Python
# processes launched afterwards from this environment.
os.environ["PYTHONHASHSEED"] = str(seed)
random.seed(seed)
np.random.seed(seed)

# ==========================
# 2. Create the XGBoost estimator
# ==========================
# `use_label_encoder` is intentionally not passed: it is not an XGBClassifier
# parameter in the XGBoost versions that accept `device`, and only triggers a
# "Parameters ... are not used" warning. Labels are already integer-encoded.
xgb_estimator = XGBClassifier(
    eval_metric='mlogloss',
    random_state=seed,
    tree_method='hist',     # histogram algorithm (supported on CPU and GPU)
    device='cuda'           # CUDA GPU (switch to 'cpu' for fully deterministic runs)
)

# ==========================
# 3. Hyper-parameter grid
# ==========================
param_grid = {
    'n_estimators': [300, 500, 700],
    'max_depth': [3, 6],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1],
    'colsample_bytree': [0.8, 1],
    'gamma': [0, 0.1, 0.5],
}

# ==========================
# 4. CV with a fixed random_state
# ==========================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

# NOTE(review): n_jobs=-1 runs the candidate fits in parallel workers while the
# estimator itself is configured for device='cuda' — confirm that sharing the
# GPU between workers is intended.
grid_search = GridSearchCV(
    estimator=xgb_estimator,
    param_grid=param_grid,
    cv=cv,
    scoring='f1_macro',
    n_jobs=-1,
    verbose=2
)

# ==========================
# 5. Train & Inference
# ==========================
grid_search.fit(X, y)

print("Best params:", grid_search.best_params_)
print("Best CV F1 Macro:", grid_search.best_score_)
# Later cells use `model`; it is the fitted GridSearchCV object
model = grid_search

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=300, subsample=0.8; total time=   7.8s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=300, subsample=1; total time=   7.0s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=300, subsample=1; total time=   6.9s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=500, subsample=0.8; total time=  12.0s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=500, subsample=1; total time=  11.6s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=700, subsample=0.8; total time=  16.9s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=700, subsample=0.8; total time=  16.9s
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_depth=3, n_estimators=700, subsample=1; total time=  17.4s
[CV] END

In [5]:
# Once training has finished, persist `model` to disk with joblib
# (pickle would work as well)
import joblib
joblib.dump(value=model, filename="xgb_best_model.pkl")

['xgb_best_model.pkl']

In [6]:
# Input features for the test rows: every column except the identifier
X_test = test_meta.drop(columns=['id'])

# Predict the integer-encoded labels
y_test_pred = model.predict(X_test)

# Map the integer codes back to the original label names
y_test_label = le.inverse_transform(y_test_pred)

# Pair each id with its predicted label
submit_df = test_meta[['id']].assign(predict_label=y_test_label)

# Write the submission CSV without the index column
submit_df.to_csv('submit.csv', index=False)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


