In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import roc_auc_score

In [2]:
sub1 = pd.read_csv("/kaggle/input/news-result/two_tower_valid.csv")
sub1.columns = ['dl', 'dl_label']

sub2 = pd.read_csv("/kaggle/input/news-result/xgb_valid.csv")
sub2.columns = ['xgb', 'xgb_label']

In [3]:
# Checking minor bugs
(sub1['dl_label'] != sub2['xgb_label']).sum()

0

In [4]:
oof_df = pd.DataFrame({
    'dl': sub1['dl'],
    'xgb' : sub2['xgb'],
})

In [5]:
oof_df

Unnamed: 0,dl,xgb
0,0.285397,0.209370
1,0.000006,0.000056
2,0.972774,0.983713
3,0.053409,0.027575
4,0.001673,0.682318
...,...,...
1168002,0.000006,0.000136
1168003,0.972384,0.987512
1168004,0.018462,0.041691
1168005,0.000123,0.002150


In [6]:
oof_df['dl'].corr(oof_df['xgb'])

0.978392985190617

In [7]:
### Power averaging
# power = 2
# preds['final_pred'] = (preds['dl'] ** power + preds['xgb'] ** power) / 2

### Simple averaging
# preds['final_pred'] = (preds['dl'] + preds['xgb']) / 2

In [8]:
# Evaluate oof preds
scores = {}
y = sub1['dl_label']
for col in oof_df.columns:
    scores[col] = roc_auc_score(y, oof_df[col])

# Sort scores
scores = {k: v for k, v in sorted(scores.items(), key=lambda item: item[1], reverse=True)}

# Sort oof_df and test_preds
oof_df = oof_df[list(scores.keys())]
# test_preds = test_preds[list(scores.keys())]

scores

{'dl': 0.988761471997079, 'xgb': 0.9882827450980752}

In [9]:
# Initialise
STOP = False
current_best_ensemble = oof_df.iloc[:,0]
# current_best_test_preds = test_preds.iloc[:,0]
MODELS = oof_df.iloc[:,1:]
weight_range = np.arange(0.01,0.51,0.01)   # or with negative weights: np.arange(-0.5,0.51,0.01)
history = [roc_auc_score(y, current_best_ensemble)]
i=0

# Hill climbing
while not STOP:
    i+=1
    potential_new_best_cv_score = roc_auc_score(y, current_best_ensemble)
    k_best, wgt_best = None, None
    for k in MODELS: # column name
        for wgt in weight_range:
            potential_ensemble = (1-wgt) * current_best_ensemble + wgt * MODELS[k]
            cv_score = roc_auc_score(y, potential_ensemble)
            if cv_score > potential_new_best_cv_score:
                potential_new_best_cv_score = cv_score
                k_best, wgt_best = k, wgt
            
    if k_best is not None:
        current_best_ensemble = (1-wgt_best) * current_best_ensemble + wgt_best * MODELS[k_best]
#         current_best_test_preds = (1-wgt_best) * current_best_test_preds + wgt_best * test_preds[k_best]
        MODELS.drop(k_best, axis=1, inplace=True)
        if MODELS.shape[1]==0:
            STOP = True
        print(f'Iteration: {i}, Model added: {k_best}, Best weight: {wgt_best:.2f}, Best AUC: {potential_new_best_cv_score:.5f}')
        history.append(potential_new_best_cv_score)
    else:
        STOP = True

Iteration: 1, Model added: xgb, Best weight: 0.37, Best AUC: 0.98946


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MODELS.drop(k_best, axis=1, inplace=True)


In [10]:
test_pred_dl = pd.read_csv('/kaggle/input/news-result/two_tower_test.csv')
test_pred_dl.columns = ['pred_dl', 'label']

test_pred_xgb = pd.read_csv('/kaggle/input/news-result/xgb_test.csv')
test_pred_xgb.columns = ['pred_xgb', 'label']

test_preds = pd.DataFrame({
    'dl': test_pred_dl['pred_dl'],
    'xgb' : test_pred_xgb['pred_xgb']
})

In [11]:
test_preds['final_pred'] = (test_preds['dl'] * (1-wgt_best) + test_preds['xgb'] * wgt_best)

In [12]:
test_preds

Unnamed: 0,dl,xgb,final_pred
0,0.964575,0.998601,0.977165
1,0.999998,0.999376,0.999768
2,0.000471,0.008775,0.003543
3,0.056739,0.071629,0.062248
4,0.895213,0.541142,0.764207
...,...,...,...
1728013,0.919966,0.969256,0.938203
1728014,0.000209,0.010082,0.003862
1728015,0.001585,0.014694,0.006435
1728016,0.049419,0.155538,0.088683


In [13]:
sub = pd.DataFrame({'pred' : test_preds['final_pred'].values})
compression_opts = dict(method='zip', archive_name='results.csv')  
sub.to_csv('results.zip', header=False, index=False, compression=compression_opts)

In [14]:
sub

Unnamed: 0,pred
0,0.977165
1,0.999768
2,0.003543
3,0.062248
4,0.764207
...,...
1728013,0.938203
1728014,0.003862
1728015,0.006435
1728016,0.088683
