In [2]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder

In [3]:
import lightgbm as lgb
import xgboost as xgb
import catboost as cb

In [4]:
import optuna

In [5]:
from autogluon.tabular import TabularPredictor

In [None]:
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score

## 读入数据

In [7]:
# Load the train/test splits, using the first CSV column as the row index.
train_df = pd.read_csv('../input/train.csv', index_col=0)
test_df = pd.read_csv('../input/test.csv', index_col=0)
# data.csv is semicolon-separated and keeps pandas' default positional index
# (presumably the original dataset the train/test data derive from; confirm).
ori_df = pd.read_csv('../input/data.csv', sep=';')

In [8]:
# Re-label ori_df's rows starting at 1,000,000 so they can be told apart from the
# train/test rows after concatenation (assumes train/test ids stay below 1,000,000; verify).
ori_df.index = np.arange(ori_df.shape[0]) + 1_000_000

In [9]:
# Stack all three sources row-wise so the preprocessing below is applied to them uniformly.
com_df = pd.concat([train_df, test_df, ori_df], axis=0)

In [10]:
# Every column except the label column is treated as an input feature.
ori_all_feats = [col for col in com_df.columns if col != 'Target']

In [11]:
# Columns handled as categorical: they are label-encoded and cast to pandas
# 'category' dtype further down. Names here are the raw CSV headers.
ori_cat_feats = [
    'Marital status',
    'Application mode',
    'Application order',
    'Course',
    'Daytime/evening attendance',
    'Previous qualification',
    'Nacionality',
    'Mother\'s qualification',
    'Father\'s qualification',
    'Mother\'s occupation',
    'Father\'s occupation',
    'Displaced',
    'Educational special needs',
    'Debtor',
    'Tuition fees up to date',
    'Gender',
    'Scholarship holder',
    'International',
]

# Columns handled as numeric (left as read from the CSVs). Names here are the raw CSV headers.
ori_num_feats = [
    'Previous qualification (grade)',
    'Admission grade',
    'Age at enrollment',
    'Curricular units 1st sem (credited)',
    'Curricular units 1st sem (enrolled)',
    'Curricular units 1st sem (evaluations)',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    'Curricular units 1st sem (without evaluations)',
    'Curricular units 2nd sem (credited)',
    'Curricular units 2nd sem (enrolled)',
    'Curricular units 2nd sem (evaluations)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)',
    'Curricular units 2nd sem (without evaluations)',
    'Unemployment rate',
    'Inflation rate',
    'GDP',
]

In [12]:
# Integer-encode each categorical column with its own LabelEncoder, then flag the
# column as pandas 'category' dtype.
for cat_col in ori_cat_feats:
    encoder = LabelEncoder()
    com_df[cat_col] = encoder.fit_transform(com_df[cat_col])
    com_df[cat_col] = com_df[cat_col].astype('category')

In [13]:
# Mapping from the string class labels to the integer codes used for training.
label2code = {
    'Graduate': 0,
    'Enrolled': 1,
    'Dropout': 2,
}

# Inverse mapping, used to turn predicted codes back into label strings.
code2label = {v: k for k, v in label2code.items()}

In [14]:
# Replace the string labels with their integer codes; any value that is not a key
# of label2code (including missing labels) becomes NaN.
# NOTE: not idempotent - re-running this cell on already-encoded values maps them all to NaN.
com_df['Target'] = com_df['Target'].map(label2code)

In [15]:
special_char = ["'", '/', ' ']
def normalize_feature_name(name):
    """Return `name` with every character of `special_char` turned into '_' and parentheses removed."""
    # One translation table does all substitutions in a single pass.
    table = {ord(ch): '_' for ch in special_char}
    table[ord('(')] = None
    table[ord(')')] = None
    return name.translate(table)

# Rename the feature-name lists and the frame's columns with the same function so they stay in sync.
ori_all_feats = list(map(normalize_feature_name, ori_all_feats))
ori_cat_feats = list(map(normalize_feature_name, ori_cat_feats))
ori_num_feats = list(map(normalize_feature_name, ori_num_feats))

com_df.columns = com_df.columns.map(normalize_feature_name)

In [16]:
# Split the combined frame back into its three sources by index label.
# .copy() gives each split its own data, so the column assignments made on these
# frames later on do not raise pandas' SettingWithCopyWarning.
train_df = com_df.loc[train_df.index].copy()
test_df = com_df.loc[test_df.index].copy()
ori_df = com_df.loc[ori_df.index].copy()

train_df.shape[0], test_df.shape[0], ori_df.shape[0]

(76518, 51012, 4424)

In [17]:
label = 'Target'

In [18]:
# Cast the label column of the two labelled splits to int
# (presumably it became float when label-less rows were concatenated in; verify).
train_df[label] = train_df[label].astype(int)
ori_df[label] = ori_df[label].astype(int)

## CV

In [19]:
def lgb_cv(params, train_df, test_df, feat_cols, cat_feat_cols, target_col, stratified=False, nfold=5, num_boost_round=10000):
    """Cross-validate a LightGBM multiclass model and average its test predictions.

    Args:
        params: Parameter dict passed to ``lgb.train``. Its ``num_class`` entry
            (3 if absent) sets the width of the prediction arrays.
        train_df: Training frame containing ``feat_cols`` and ``target_col``.
        test_df: Frame containing ``feat_cols`` to predict on.
        feat_cols: Feature column names.
        cat_feat_cols: Columns passed to LightGBM as ``categorical_feature``.
        target_col: Name of the class-code column in ``train_df``.
        stratified: Use ``StratifiedKFold`` instead of ``KFold``.
        nfold: Number of folds.
        num_boost_round: Boosting rounds passed to ``lgb.train``.

    Returns:
        Tuple ``(cv, oof, pred)``: out-of-fold accuracy, out-of-fold class
        probabilities (rows in ``train_df`` order) and the test probabilities
        averaged over the folds.
    """
    splitter_cls = StratifiedKFold if stratified else KFold
    folds = splitter_cls(n_splits=nfold, shuffle=True, random_state=42)

    target = train_df[target_col]
    n_class = int(params.get('num_class', 3))

    oof = np.zeros((train_df.shape[0], n_class), dtype=np.float64)
    pred = np.zeros((test_df.shape[0], n_class), dtype=np.float64)

    for i, (trn_idx, val_idx) in enumerate(folds.split(train_df.index, target.astype(int))):
        print(f'fold={i}', '- ' * 20)
        # folds.split yields positional indices, so rows must be selected with
        # .iloc; label-based .loc is only correct when the index is 0..n-1.
        trn_x = train_df.iloc[trn_idx][feat_cols]
        val_x = train_df.iloc[val_idx][feat_cols]
        trn_data = lgb.Dataset(trn_x, label=target.iloc[trn_idx], categorical_feature=cat_feat_cols)
        val_data = lgb.Dataset(val_x, label=target.iloc[val_idx], categorical_feature=cat_feat_cols)

        model = lgb.train(params, trn_data, num_boost_round, valid_sets=val_data, callbacks=[lgb.log_evaluation(200)])

        oof[val_idx] = model.predict(val_x, num_iteration=model.best_iteration)
        pred += model.predict(test_df[feat_cols], num_iteration=model.best_iteration) / nfold

    cv = accuracy_score(target, oof.argmax(axis=-1))
    return cv, oof, pred

In [20]:
def xgb_cv(params, train_df, test_df, feat_cols, cat_feat_cols, target_col, stratified=False, nfold=5, num_boost_round=10000):
    """Cross-validate an XGBoost multiclass model and average its test predictions.

    Args:
        params: Parameter dict passed to ``xgb.train``. Its ``num_class`` entry
            (3 if absent) sets the width of the prediction arrays.
        train_df: Training frame containing ``feat_cols`` and ``target_col``.
        test_df: Frame containing ``feat_cols`` to predict on.
        feat_cols: Feature column names.
        cat_feat_cols: Columns declared to XGBoost as categorical.
        target_col: Name of the class-code column in ``train_df``.
        stratified: Use ``StratifiedKFold`` instead of ``KFold``.
        nfold: Number of folds.
        num_boost_round: Maximum boosting rounds (early stopping after 400
            rounds without improvement on the validation fold).

    Returns:
        Tuple ``(cv, oof, pred)``: out-of-fold accuracy, out-of-fold class
        probabilities (rows in ``train_df`` order) and the test probabilities
        averaged over the folds.
    """
    splitter_cls = StratifiedKFold if stratified else KFold
    folds = splitter_cls(n_splits=nfold, shuffle=True, random_state=42)

    target = train_df[target_col]
    n_class = int(params.get('num_class', 3))
    # 'c' marks a categorical feature, 'q' a quantitative one.
    feature_types = ['c' if f in cat_feat_cols else 'q' for f in feat_cols]

    def _to_dmatrix(frame, labels=None):
        """Build a DMatrix that always carries the same feature names and types."""
        return xgb.DMatrix(
            frame[feat_cols], label=labels,
            feature_names=feat_cols,
            feature_types=feature_types,
            enable_categorical=True
        )

    oof = np.zeros((train_df.shape[0], n_class), dtype=np.float64)
    pred = np.zeros((test_df.shape[0], n_class), dtype=np.float64)

    # The test matrix is the same for every fold: build it once, with the same
    # feature metadata as the training matrices.
    tst_data = _to_dmatrix(test_df)

    for i, (trn_idx, val_idx) in enumerate(folds.split(train_df.index, target.astype(int))):
        print(f'fold={i}', '- ' * 20)
        # folds.split yields positional indices, so rows must be selected with
        # .iloc; label-based .loc is only correct when the index is 0..n-1.
        trn_data = _to_dmatrix(train_df.iloc[trn_idx], target.iloc[trn_idx])
        val_data = _to_dmatrix(train_df.iloc[val_idx], target.iloc[val_idx])

        model = xgb.train(params, trn_data, num_boost_round, evals=[(trn_data, 'train'), (val_data, 'valid')], verbose_eval=200, early_stopping_rounds=400)

        # Predict with the trees up to and including the best iteration found by early stopping.
        best_range = (0, model.best_iteration + 1)
        oof[val_idx] = model.predict(val_data, iteration_range=best_range)
        pred += model.predict(tst_data, iteration_range=best_range) / nfold

    cv = accuracy_score(target, oof.argmax(axis=-1))
    return cv, oof, pred

## Optuna调参

In [120]:
import pickle

In [114]:
def read_pickle(path):
    """Load and return the object pickled in the file at `path`.

    Only use on trusted files: unpickling can execute arbitrary code.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)

### XGBoost

In [21]:
def objective(trial):
    """Optuna objective for XGBoost: accuracy (1 - merror) from 3-fold stratified ``xgb.cv``.

    Samples one hyperparameter configuration from `trial`, cross-validates it on
    `train_df` and returns one minus the last ``test-merror-mean`` of the CV history.
    """
    feature_types = ['c' if f in ori_cat_feats else 'q' for f in ori_all_feats]
    dtrain = xgb.DMatrix(
        train_df[ori_all_feats],
        label=train_df[label],
        feature_names=ori_all_feats,
        feature_types=feature_types,
        enable_categorical=True,
    )

    # Settings that stay fixed across trials.
    fixed_params = {
        "verbosity": 0,
        'nthread': 48,
        "objective": "multi:softprob",
        'num_class': 3,
        'eval_metric': 'merror',
    }
    # Settings sampled by Optuna (suggest order kept as in the search definition).
    tuned_params = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1.0, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 1000),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 1e3, log=True),
    }
    param = {**fixed_params, **tuned_params}

    # Lets the study's pruner stop unpromising trials based on the CV test error.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "test-merror")
    cv_history = xgb.cv(
        param,
        dtrain,
        num_boost_round=10000,
        nfold=3,
        stratified=True,
        early_stopping_rounds=400,
        verbose_eval=200,
        callbacks=[pruning_callback],
    )

    return 1 - cv_history["test-merror-mean"].values[-1]

In [None]:
pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
# load_if_exists=True makes this cell re-runnable: the study is stored in
# db.sqlite3 under a fixed name, so without it a second run (e.g. after a kernel
# restart) fails with DuplicatedStudyError.
study = optuna.create_study(
    storage="sqlite:///db.sqlite3",
    study_name="xgboost_hpo",
    pruner=pruner,
    direction="maximize",
    load_if_exists=True,
)
# Run only what is still missing from the 100-trial budget, so a re-run resumes
# the search instead of adding another 100 trials (all stored trials count,
# whatever their state).
n_remaining_trials = max(0, 100 - len(study.trials))
if n_remaining_trials > 0:
    study.optimize(objective, n_trials=n_remaining_trials)

[I 2024-07-06 00:47:11,483] A new study created in RDB with name: xgboost_hpo


[0]	train-merror:0.21879+0.00206	test-merror:0.21835+0.00288
[200]	train-merror:0.18321+0.00049	test-merror:0.18512+0.00096
[400]	train-merror:0.17895+0.00023	test-merror:0.18126+0.00111


In [31]:
study.best_value

0.8343396325047702

In [32]:
study.best_params

{'learning_rate': 0.015498040868303089,
 'max_depth': 8,
 'min_child_weight': 69,
 'subsample': 0.8287052489166259,
 'colsample_bytree': 0.20944357313129097,
 'reg_lambda': 0.0913120193501681}

In [24]:
# `params` has to be defined here: no earlier cell creates it, so on a fresh
# kernel `{'XGB': params}` raised NameError. The tuned values are the
# study.best_params reported by the XGBoost Optuna study; objective / num_class /
# eval_metric match the fixed settings used in that search.
params = {
    'objective': 'multi:softprob',
    'num_class': 3,
    'eval_metric': 'merror',
    'learning_rate': 0.015498040868303089,
    'max_depth': 8,
    'min_child_weight': 69,
    'subsample': 0.8287052489166259,
    'colsample_bytree': 0.20944357313129097,
    'reg_lambda': 0.0913120193501681,
}

hyperparameters = {
    'XGB': params,
}

In [25]:
# Fit the XGBoost configuration through AutoGluon as an 8-fold bagged ensemble
# (one bagging repeat, no stacking levels); artifacts go to AutogluonModels/xgb_8fold_cv.
predictor = TabularPredictor(
    label=label,
    problem_type='multiclass',
    eval_metric='accuracy',
    verbosity=3,
    path='AutogluonModels/xgb_8fold_cv'
).fit(
    train_df, 
    hyperparameters=hyperparameters,
    num_bag_sets=1,
    num_bag_folds=8,
    num_stack_levels=0
)

No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
User Specified kwargs:
{'num_bag_folds': 8, 'num_bag_sets': 1, 'num_stack_levels': 0}
Full kwargs:
{'_feature_generator_kwargs': None,
 '_save_bag_folds': None,
 'ag_args': None,
 'ag_args_ensemble': None,
 'ag_args_fit': None,
 'auto_stack': False,
 'calibrate': 'auto',
 'ds_args': {'clean_up_fits': True,
             'detection_time_frac': 0.25,
             'holdout_data': None,
             'holdout_frac':

In [38]:
lb = predictor.leaderboard(extra_info=True)

Loading: AutogluonModels/xgb_8fold_cv/models/XGBoost_BAG_L1/info.pkl
Loading: AutogluonModels/xgb_8fold_cv/models/XGBoost_BAG_L1/model.pkl
Loading: AutogluonModels/xgb_8fold_cv/models/XGBoost_BAG_L1/S1F8/info.pkl
Loading: AutogluonModels/xgb_8fold_cv/models/XGBoost_BAG_L1/S1F8/model.pkl
Loading: AutogluonModels/xgb_8fold_cv/models/XGBoost_BAG_L1/S1F7/info.pkl
Loading: AutogluonModels/xgb_8fold_cv/models/XGBoost_BAG_L1/S1F7/model.pkl
Loading: AutogluonModels/xgb_8fold_cv/models/XGBoost_BAG_L1/S1F5/info.pkl
Loading: AutogluonModels/xgb_8fold_cv/models/XGBoost_BAG_L1/S1F5/model.pkl
Loading: AutogluonModels/xgb_8fold_cv/models/XGBoost_BAG_L1/S1F4/info.pkl
Loading: AutogluonModels/xgb_8fold_cv/models/XGBoost_BAG_L1/S1F4/model.pkl
Loading: AutogluonModels/xgb_8fold_cv/models/XGBoost_BAG_L1/S1F1/info.pkl
Loading: AutogluonModels/xgb_8fold_cv/models/XGBoost_BAG_L1/S1F1/model.pkl
Loading: AutogluonModels/xgb_8fold_cv/models/XGBoost_BAG_L1/S1F6/info.pkl
Loading: AutogluonModels/xgb_8fold_cv/mode

In [41]:
lb.iloc[0].child_hyperparameters

{'n_estimators': 10000,
 'learning_rate': 0.015498040868303089,
 'n_jobs': -1,
 'proc.max_category_levels': 100,
 'objective': 'multi:softprob',
 'booster': 'gbtree',
 'num_class': 3,
 'verbosity': 1,
 'eval_metric': 'merror',
 'max_depth': 8,
 'min_child_weight': 69,
 'subsample': 0.8287052489166259,
 'colsample_bytree': 0.20944357313129097,
 'reg_lambda': 0.0913120193501681}

In [42]:
lb.iloc[0].child_hyperparameters_fit

{'n_estimators': 1686}

In [95]:
xgb_pred_prob = predictor.predict_proba(test_df[ori_all_feats])

In [152]:
xgb_oof = read_pickle('AutogluonModels/xgb_8fold_cv/models/XGBoost_BAG_L1/utils/oof.pkl')

In [154]:
xgb_oof = xgb_oof['_oof_pred_proba']

### LightGBM

In [49]:
# Hyperparameters found by the Optuna search
params = {
    # 'num_threads': 32,
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_error',
    'feature_pre_filter': False,
    'lambda_l1': 5.316177163748547,
    'lambda_l2': 1.0682353345816337e-08,
    'num_leaves': 31,
    'feature_fraction': 0.4,
    'bagging_fraction': 1.0,
    'bagging_freq': 0,
    'min_child_samples': 20,
    'early_stopping_rounds': 400
}

In [50]:
hyperparameters = {
    'GBM': params,
}

In [51]:
# Fit the LightGBM configuration through AutoGluon as an 8-fold bagged ensemble
# (one bagging repeat, no stacking levels); artifacts go to AutogluonModels/lgb_8fold_cv.
lgb_predictor = TabularPredictor(
    label=label,
    problem_type='multiclass',
    eval_metric='accuracy',
    verbosity=2,
    path='AutogluonModels/lgb_8fold_cv',
).fit(
    train_df, 
    hyperparameters=hyperparameters,
    num_bag_sets=1,
    num_bag_folds=8,
    num_stack_levels=0
)

No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/lgb_8fold_cv"
AutoGluon Version:  1.1.0
Python Version:     3.8.0
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Sun Jan 19 18:21:42 CST 2020
CPU Count:          64
Memory Avail:       85.07 GB / 251.62 GB (33.8%)
Disk Space Avail:   81.77 GB / 879.22 GB (9.3%)
Train Data Rows:    76518
Train Data C

In [52]:
lb = lgb_predictor.leaderboard(extra_info=True)

In [53]:
lb

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order,...,hyperparameters,hyperparameters_fit,ag_args_fit,features,compile_time,child_hyperparameters,child_hyperparameters_fit,child_ag_args_fit,ancestors,descendants
0,LightGBM_BAG_L1,0.834065,accuracy,1.242113,11.652174,1.242113,11.652174,1,True,1,...,"{'use_orig_features': True, 'max_base_models':...",{},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...","[Scholarship_holder, Curricular_units_1st_sem_...",,"{'learning_rate': 0.05, 'objective': 'multicla...",{'num_boost_round': 224},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...",[],[WeightedEnsemble_L2]
1,WeightedEnsemble_L2,0.834065,accuracy,1.25044,11.665603,0.008328,0.013429,2,True,2,...,"{'use_orig_features': False, 'max_base_models'...",{},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...","[LightGBM_BAG_L1_1, LightGBM_BAG_L1_2, LightGB...",,"{'ensemble_size': 25, 'subsample_size': 1000000}",{'ensemble_size': 1},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...",[LightGBM_BAG_L1],[]


In [54]:
lb.iloc[0].child_hyperparameters

{'learning_rate': 0.05,
 'objective': 'multiclass',
 'num_class': 3,
 'metric': 'multi_error',
 'feature_pre_filter': False,
 'lambda_l1': 5.316177163748547,
 'lambda_l2': 1.0682353345816337e-08,
 'num_leaves': 31,
 'feature_fraction': 0.4,
 'bagging_fraction': 1.0,
 'bagging_freq': 0,
 'min_child_samples': 20,
 'early_stopping_rounds': 400}

In [55]:
lb.iloc[0].child_hyperparameters_fit

{'num_boost_round': 224}

In [97]:
lgb_pred_prob = lgb_predictor.predict_proba(test_df[ori_all_feats])

In [155]:
lgb_oof = read_pickle('AutogluonModels/lgb_8fold_cv/models/LightGBM_BAG_L1/utils/oof.pkl')

In [156]:
lgb_oof = lgb_oof['_oof_pred_proba']

### CatBoost

In [70]:
# CatBoost settings (presumably the result of a separate hyperparameter search; verify).
params = {
    # 'thread_count': 8,
    'objective': 'MultiClassOneVsAll',    
    'eval_metric': 'MultiClassOneVsAll',
    'bootstrap_type': 'Bernoulli',
    
    'colsample_bylevel': 0.40298480832888184,
    'learning_rate': 0.03608281349834408,
    # The two values below are kept as the raw floats and rounded to the
    # nearest integer (5 and 183 respectively).
    'max_depth': round(4.879164646163974),
    'min_data_in_leaf': round(182.6866437231196),
    'reg_lambda': 2.57671202752054,
    'subsample': 0.8710103634646746
    
}

In [71]:
hyperparameters = {
    'CAT': params,
}

In [72]:
# Fit the CatBoost configuration through AutoGluon as an 8-fold bagged ensemble
# (one bagging repeat, no stacking levels); artifacts go to AutogluonModels/cb_8fold_cv.
cb_predictor = TabularPredictor(
    label=label,
    problem_type='multiclass',
    eval_metric='accuracy',
    verbosity=2,
    path='AutogluonModels/cb_8fold_cv',
).fit(
    train_df, 
    hyperparameters=hyperparameters,
    num_bag_sets=1,
    num_bag_folds=8,
    num_stack_levels=0
)

No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/cb_8fold_cv"
AutoGluon Version:  1.1.0
Python Version:     3.8.0
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Sun Jan 19 18:21:42 CST 2020
CPU Count:          64
Memory Avail:       85.04 GB / 251.62 GB (33.8%)
Disk Space Avail:   81.73 GB / 879.22 GB (9.3%)
Train Data Rows:    76518
Train Data Co

In [73]:
lb = cb_predictor.leaderboard(extra_info=True)

In [74]:
lb

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order,...,hyperparameters,hyperparameters_fit,ag_args_fit,features,compile_time,child_hyperparameters,child_hyperparameters_fit,child_ag_args_fit,ancestors,descendants
0,CatBoost_BAG_L1,0.833673,accuracy,0.646154,787.784327,0.646154,787.784327,1,True,1,...,"{'use_orig_features': True, 'max_base_models':...",{},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...","[Scholarship_holder, Curricular_units_1st_sem_...",,"{'iterations': 10000, 'learning_rate': 0.03608...",{'iterations': 6814},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...",[],[WeightedEnsemble_L2]
1,WeightedEnsemble_L2,0.833673,accuracy,0.654347,787.796855,0.008193,0.012528,2,True,2,...,"{'use_orig_features': False, 'max_base_models'...",{},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...","[CatBoost_BAG_L1_1, CatBoost_BAG_L1_0, CatBoos...",,"{'ensemble_size': 25, 'subsample_size': 1000000}",{'ensemble_size': 1},"{'max_memory_usage_ratio': 1.0, 'max_time_limi...",[CatBoost_BAG_L1],[]


In [75]:
lb.iloc[0].child_hyperparameters

{'iterations': 10000,
 'learning_rate': 0.03608281349834408,
 'random_seed': 0,
 'allow_writing_files': False,
 'eval_metric': 'MultiClassOneVsAll',
 'objective': 'MultiClassOneVsAll',
 'bootstrap_type': 'Bernoulli',
 'colsample_bylevel': 0.40298480832888184,
 'max_depth': 5,
 'min_data_in_leaf': 183,
 'reg_lambda': 2.57671202752054,
 'subsample': 0.8710103634646746}

In [76]:
lb.iloc[0].child_hyperparameters_fit

{'iterations': 6814}

In [98]:
cb_pred_prob = cb_predictor.predict_proba(test_df[ori_all_feats])

In [157]:
cb_oof = read_pickle('AutogluonModels/cb_8fold_cv/models/CatBoost_BAG_L1/utils/oof.pkl')

In [158]:
cb_oof = cb_oof['_oof_pred_proba']

### NN

In [78]:
# Request AutoGluon's two neural-network model types; the empty dicts supply no
# overrides, so each model runs with its own default settings.
hyperparameters = {
    'NN_TORCH': {},
    'FASTAI': {},
}

In [None]:
# Fit the two neural-network models through AutoGluon as 8-fold bagged ensembles
# (one bagging repeat, no stacking levels); artifacts go to AutogluonModels/nn_8fold_cv.
# Uses the shared `label` variable, like the other predictor cells, instead of
# repeating the 'Target' literal.
nn_predictor = TabularPredictor(
    label=label,
    problem_type='multiclass',
    eval_metric='accuracy',
    verbosity=2,
    path='AutogluonModels/nn_8fold_cv',
).fit(
    train_df,
    hyperparameters=hyperparameters,
    num_bag_sets=1,
    num_bag_folds=8,
    num_stack_levels=0
)

No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/nn_8fold_cv"
AutoGluon Version:  1.1.0
Python Version:     3.8.0
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Sun Jan 19 18:21:42 CST 2020
CPU Count:          64
Memory Avail:       84.66 GB / 251.62 GB (33.6%)
Disk Space Avail:   81.64 GB / 879.22 GB (9.3%)
Train Data Rows:    76518
Train Data Co

In [80]:
lb = nn_predictor.leaderboard(extra_info=True)

In [81]:
lb

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order,...,hyperparameters,hyperparameters_fit,ag_args_fit,features,compile_time,child_hyperparameters,child_hyperparameters_fit,child_ag_args_fit,ancestors,descendants
0,WeightedEnsemble_L2,0.828825,accuracy,2.83194,316.603903,0.008604,0.525953,2,True,3,...,"{'use_orig_features': False, 'max_base_models': 25, 'max_base_models_per_type': 5, 'save_bag_folds': True}",{},"{'max_memory_usage_ratio': 1.0, 'max_time_limit_ratio': 1.0, 'max_time_limit': None, 'min_time_limit': 0, 'valid_raw_types': None, 'valid_special_types': None, 'ignored_type_group_special': None, 'ignored_type_group_raw': None, 'get_features_kwargs': None, 'get_features_kwargs_extra': None, 'predict_1_batch_size': None, 'temperature_scalar': None, 'drop_unique': False}","[NeuralNetTorch_BAG_L1_2, NeuralNetFastAI_BAG_L1_0, NeuralNetTorch_BAG_L1_0, NeuralNetFastAI_BAG_L1_1, NeuralNetTorch_BAG_L1_1, NeuralNetFastAI_BAG_L1_2]",,"{'ensemble_size': 25, 'subsample_size': 1000000}",{'ensemble_size': 9},"{'max_memory_usage_ratio': 1.0, 'max_time_limit_ratio': 1.0, 'max_time_limit': None, 'min_time_limit': 0, 'valid_raw_types': None, 'valid_special_types': None, 'ignored_type_group_special': None, 'ignored_type_group_raw': None, 'get_features_kwargs': None, 'get_features_kwargs_extra': None, 'predict_1_batch_size': None, 'temperature_scalar': None, 'drop_unique': False}","[NeuralNetTorch_BAG_L1, NeuralNetFastAI_BAG_L1]",[]
1,NeuralNetFastAI_BAG_L1,0.827191,accuracy,1.675669,135.942005,1.675669,135.942005,1,True,1,...,"{'use_orig_features': True, 'max_base_models': 25, 'max_base_models_per_type': 5, 'save_bag_folds': True}",{},"{'max_memory_usage_ratio': 1.0, 'max_time_limit_ratio': 1.0, 'max_time_limit': None, 'min_time_limit': 0, 'valid_raw_types': None, 'valid_special_types': None, 'ignored_type_group_special': None, 'ignored_type_group_raw': None, 'get_features_kwargs': None, 'get_features_kwargs_extra': None, 'predict_1_batch_size': None, 'temperature_scalar': None, 'drop_unique': False}","[Scholarship_holder, Curricular_units_1st_sem_credited, Curricular_units_2nd_sem_approved, Curricular_units_1st_sem_without_evaluations, Curricular_units_2nd_sem_without_evaluations, Educational_special_needs, Tuition_fees_up_to_date, Previous_qualification_grade, Debtor, Displaced, Curricular_units_1st_sem_enrolled, Application_mode, Application_order, Curricular_units_2nd_sem_credited, Curricular_units_1st_sem_evaluations, Curricular_units_2nd_sem_evaluations, Curricular_units_1st_sem_grade, Curricular_units_2nd_sem_grade, Nacionality, Course, GDP, Mother_s_qualification, Previous_qualif...",,"{'layers': None, 'emb_drop': 0.1, 'ps': 0.1, 'bs': 'auto', 'lr': 0.01, 'epochs': 'auto', 'early.stopping.min_delta': 0.0001, 'early.stopping.patience': 20, 'smoothing': 0.0}","{'epochs': 30, 'best_epoch': 9}","{'max_memory_usage_ratio': 1.0, 'max_time_limit_ratio': 1.0, 'max_time_limit': None, 'min_time_limit': 0, 'valid_raw_types': ['bool', 'int', 'float', 'category'], 'valid_special_types': None, 'ignored_type_group_special': ['text_ngram', 'text_as_category'], 'ignored_type_group_raw': None, 'get_features_kwargs': None, 'get_features_kwargs_extra': None, 'predict_1_batch_size': None, 'temperature_scalar': None}",[],[WeightedEnsemble_L2]
2,NeuralNetTorch_BAG_L1,0.826799,accuracy,1.147667,180.135945,1.147667,180.135945,1,True,2,...,"{'use_orig_features': True, 'max_base_models': 25, 'max_base_models_per_type': 5, 'save_bag_folds': True}",{},"{'max_memory_usage_ratio': 1.0, 'max_time_limit_ratio': 1.0, 'max_time_limit': None, 'min_time_limit': 0, 'valid_raw_types': None, 'valid_special_types': None, 'ignored_type_group_special': None, 'ignored_type_group_raw': None, 'get_features_kwargs': None, 'get_features_kwargs_extra': None, 'predict_1_batch_size': None, 'temperature_scalar': None, 'drop_unique': False}","[Scholarship_holder, Curricular_units_1st_sem_credited, Curricular_units_2nd_sem_approved, Curricular_units_1st_sem_without_evaluations, Curricular_units_2nd_sem_without_evaluations, Educational_special_needs, Tuition_fees_up_to_date, Previous_qualification_grade, Debtor, Displaced, Curricular_units_1st_sem_enrolled, Application_mode, Application_order, Curricular_units_2nd_sem_credited, Curricular_units_1st_sem_evaluations, Curricular_units_2nd_sem_evaluations, Curricular_units_1st_sem_grade, Curricular_units_2nd_sem_grade, Nacionality, Course, GDP, Mother_s_qualification, Previous_qualif...",,"{'num_epochs': 500, 'epochs_wo_improve': 20, 'activation': 'relu', 'embedding_size_factor': 1.0, 'embed_exponent': 0.56, 'max_embedding_dim': 100, 'y_range': None, 'y_range_extend': 0.05, 'dropout_prob': 0.1, 'optimizer': 'adam', 'learning_rate': 0.0003, 'weight_decay': 1e-06, 'proc.embed_min_categories': 4, 'proc.impute_strategy': 'median', 'proc.max_category_levels': 100, 'proc.skew_threshold': 0.99, 'use_ngram_features': False, 'num_layers': 4, 'hidden_size': 128, 'max_batch_size': 512, 'use_batchnorm': False, 'loss_function': 'auto'}","{'batch_size': 128, 'num_epochs': 12}","{'max_memory_usage_ratio': 1.0, 'max_time_limit_ratio': 1.0, 'max_time_limit': None, 'min_time_limit': 0, 'valid_raw_types': ['bool', 'int', 'float', 'category'], 'valid_special_types': None, 'ignored_type_group_special': 
['text_ngram', 'text_as_category'], 'ignored_type_group_raw': None, 'get_features_kwargs': None, 'get_features_kwargs_extra': None, 'predict_1_batch_size': None, 'temperature_scalar': None}",[],[WeightedEnsemble_L2]


In [99]:
nn1_pred_prob = nn_predictor.predict_proba(test_df[ori_all_feats], model='NeuralNetFastAI_BAG_L1')

In [100]:
nn2_pred_prob = nn_predictor.predict_proba(test_df[ori_all_feats], model='NeuralNetTorch_BAG_L1')

In [159]:
nn1_oof = read_pickle('AutogluonModels/nn_8fold_cv/models/NeuralNetFastAI_BAG_L1/utils/oof.pkl')

In [160]:
nn1_oof = nn1_oof['_oof_pred_proba']

In [161]:
nn2_oof = read_pickle('AutogluonModels/nn_8fold_cv/models/NeuralNetTorch_BAG_L1/utils/oof.pkl')

In [162]:
nn2_oof = nn2_oof['_oof_pred_proba']

## Ensemble

In [107]:
def save_sub(pred, fname, prob=True):
    """Write a submission CSV from predictions and echo its first lines.

    Args:
        pred: Predictions in the row order of ``../input/sample_submission.csv``.
            With ``prob=True``: a 2-D array-like or DataFrame of class
            probabilities whose column position equals the class code.
            With ``prob=False``: a 1-D array-like of class codes.
        fname: Path of the CSV file to write.
        prob: Whether `pred` holds probabilities (argmax is taken) or class codes.

    Raises:
        ValueError: If `pred` does not have one row per submission row, or if
            a predicted code is not a key of ``code2label``.
    """
    sub_df = pd.read_csv('../input/sample_submission.csv')
    # np.asarray drops any pandas index, so predictions are matched to rows by
    # position rather than aligned on index labels (which would yield NaN when
    # the prediction index differs from sub_df's), and plain arrays are accepted.
    codes = np.asarray(pred)
    if prob:
        codes = codes.argmax(axis=-1)

    labels = pd.Series(codes, index=sub_df.index).map(code2label)
    if labels.isna().any():
        # Fail before writing a file that would contain empty targets.
        raise ValueError('predictions contain class codes that are not in code2label')
    sub_df['Target'] = labels

    sub_df.to_csv(fname, index=False, header=True)

    # Echo the header and the first five rows as a sanity check; end='' because
    # every line read from the file already ends with a newline.
    with open(fname) as f:
        for i, line in enumerate(f):
            print(line, end='')
            if i == 5:
                break

### 手动做概率平均

In [110]:
# The first three models perform about equally well and clearly better than the last two,
# so they share most of the weight (the five weights sum to 1).
# plb = 0.83766
final_pred_prob = 0.3 * xgb_pred_prob + 0.3 * lgb_pred_prob + 0.3 * cb_pred_prob + 0.05 * nn1_pred_prob + 0.05 * nn2_pred_prob

In [248]:
# Give the manually weighted blend its own file name: the Optuna-weighted blend
# at the end of the notebook writes 'top1_sol_prob_weight_avg4.csv', which would
# otherwise overwrite this submission on a full run.
save_sub(final_pred_prob, 'top1_sol_prob_manual_avg.csv')

id,Target

76518,Dropout

76519,Graduate

76520,Graduate

76521,Graduate

76522,Enrolled



### Optuna搜索融合权重

In [254]:
import torch
import torch.nn.functional as F
import math

In [250]:
def ensemble_weight_objective(trail):
    """Optuna objective: negative log-likelihood of a softmax-weighted blend of the OOF predictions.

    Five raw weights are sampled from [-1, 1], turned into positive weights that
    sum to 1 (softmax), and used to average the five models' out-of-fold class
    probabilities. Returns the NLL of that blend against the training labels.
    """
    raw = [trail.suggest_float(key, -1.0, 1.0) for key in ('w1', 'w2', 'w3', 'w4', 'w5')]

    # Softmax: exponentiate, then divide by the total.
    e1, e2, e3, e4, e5 = (math.exp(value) for value in raw)
    total = e1 + e2 + e3 + e4 + e5
    weights = [e / total for e in (e1, e2, e3, e4, e5)]

    # Accumulate the weighted sum left to right, in model order.
    oof_sets = (xgb_oof, lgb_oof, cb_oof, nn1_oof, nn2_oof)
    avg_prob = weights[0] * oof_sets[0]
    for weight, oof_prob in zip(weights[1:], oof_sets[1:]):
        avg_prob = avg_prob + weight * oof_prob

    log_prob = torch.log(torch.tensor(avg_prob))
    target = torch.tensor(train_df[label])
    return F.nll_loss(log_prob, target).item()

In [251]:
# The objective reports no intermediate values, so this pruner never stops a
# trial; it is kept so the study is configured like the XGBoost one.
pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
# load_if_exists=True makes this cell re-runnable: the study is stored in
# db.sqlite3 under a fixed name, so without it a second run fails with
# DuplicatedStudyError.
study = optuna.create_study(
    storage="sqlite:///db.sqlite3",
    study_name="ensemble_weight4",
    pruner=pruner,
    direction="minimize",
    load_if_exists=True,
)
# Run only what is still missing from the 100-trial budget, so a re-run resumes
# the search instead of adding another 100 trials.
n_remaining_trials = max(0, 100 - len(study.trials))
if n_remaining_trials > 0:
    study.optimize(ensemble_weight_objective, n_trials=n_remaining_trials)

[I 2024-07-06 22:50:12,794] A new study created in RDB with name: ensemble_weight4
[I 2024-07-06 22:50:13,170] Trial 0 finished with value: 0.43111219739728907 and parameters: {'w1': 0.9414358306461403, 'w2': 0.13881671086437275, 'w3': -0.23846727531867717, 'w4': 0.7076566760628, 'w5': 0.031259003653632655}. Best is trial 0 with value: 0.43111219739728907.
[I 2024-07-06 22:50:13,527] Trial 1 finished with value: 0.42995514887892355 and parameters: {'w1': 0.9008025900234129, 'w2': 0.3140728621493165, 'w3': 0.10232540582706595, 'w4': 0.2129180598093341, 'w5': -0.38122395792485664}. Best is trial 1 with value: 0.42995514887892355.
[I 2024-07-06 22:50:13,892] Trial 2 finished with value: 0.4340189432662726 and parameters: {'w1': -0.22649841283440741, 'w2': 0.24963612251690792, 'w3': -0.9633608849717565, 'w4': 0.6028723410414962, 'w5': 0.3501297583107106}. Best is trial 1 with value: 0.42995514887892355.
[I 2024-07-06 22:50:14,249] Trial 3 finished with value: 0.4293405844369545 and paramet

In [241]:
study.best_value

0.4289618731613974

In [242]:
study.best_params

{'w1': 0.9984407185747,
 'w2': -0.53313679809071,
 'w3': 0.6690364025634744,
 'w4': -0.6247040574838945,
 'w5': -0.8968987126036836}

In [243]:
# Pull the five raw (pre-softmax) blend weights of the best trial.
best_params = study.best_params
w1, w2, w3, w4, w5 = (best_params[f'w{i}'] for i in range(1, 6))

In [244]:
# Softmax over the best trial's raw weights, the same transform the objective applies.
# The raw values are read from study.best_params rather than transformed in place,
# so re-running this cell cannot exponentiate weights that were already normalised.
e1, e2, e3, e4, e5 = (math.exp(study.best_params[f'w{i}']) for i in range(1, 6))
w_sum = e1 + e2 + e3 + e4 + e5
w1, w2, w3, w4, w5 = (e / w_sum for e in (e1, e2, e3, e4, e5))

In [246]:
w1, w2, w3, w4, w5

(0.4380026689442312,
 0.09469370142651651,
 0.3150781451268253,
 0.08640799686020062,
 0.06581748764222646)

In [247]:
# plb = 0.83746
final_pred_prob = w1 * xgb_pred_prob + w2 * lgb_pred_prob + w3 * cb_pred_prob + w4 * nn1_pred_prob + w5 * nn2_pred_prob

In [248]:
save_sub(final_pred_prob, 'top1_sol_prob_weight_avg4.csv')

id,Target

76518,Dropout

76519,Graduate

76520,Graduate

76521,Graduate

76522,Enrolled

