In [None]:
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.base import clone
from scipy.special import logit
from scipy.linalg._misc import LinAlgWarning
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import optuna
import pickle
import glob
import json
import gc

warnings.filterwarnings('ignore')

In [None]:
# --- Input locations -------------------------------------------------------
# Competition files (train / test / sample submission).
train_path, test_path, sample_sub_path = (
    '/kaggle/input/playground-series-s4e11/train.csv',
    '/kaggle/input/playground-series-s4e11/test.csv',
    '/kaggle/input/playground-series-s4e11/sample_submission.csv',
)
# Extra labelled dataset that gets appended to the competition train set.
original_data_path = '/kaggle/input/depression-surveydataset-for-analysis/final_depression_dataset_1.csv'

# Directory that is globbed for `oof_pred_probs/*.pkl` and `test_pred_probs/*.pkl`.
oof_path = '/kaggle/input/playground-s04e11-autogluon'

# --- Run settings ----------------------------------------------------------
target = 'Depression'      # label column
n_folds, seed = 10, 42     # StratifiedKFold splits and its random_state

tune = True                # not referenced by any cell visible in this notebook

In [None]:
# Competition frames are indexed by their `id` column.
train = pd.read_csv(train_path, index_col='id')
test = pd.read_csv(test_path, index_col='id')

# The extra dataset stores the label as 'Yes'/'No'; recode it to 1/0 so it
# matches the competition label before the two sets are stacked.
original = pd.read_csv(original_data_path)
original['Depression'] = original['Depression'].map({'Yes': 1, 'No': 0})

# Stack competition rows followed by the extra rows; the index is rebuilt as
# 0..n-1, so positional and label-based row access coincide afterwards.
train = pd.concat((train, original), ignore_index=True)

# Feature frame and label series used by the fold splitter and the metrics.
y = train[target]
X = train.drop(columns=target)

In [None]:
def get_data(oof_path, test_path):
    """Load one model's pickled OOF / test predictions and score the OOF part per fold.

    Parameters
    ----------
    oof_path : str
        Path to a pickle holding the out-of-fold predictions. It is indexed with
        the validation row positions of ``y``, so it is assumed to be an array
        of positive-class probabilities aligned row-for-row with ``y``
        (TODO confirm against the producer notebook).
    test_path : str
        Path to a pickle holding the matching test-set predictions.

    Returns
    -------
    tuple
        ``(oof_pred_probs, test_pred_probs, scores)`` where ``scores`` is a list
        with one accuracy value per StratifiedKFold validation fold, computed by
        thresholding the OOF probabilities at ``> 0.5``.

    Notes
    -----
    Reads the notebook-level globals ``X``, ``y``, ``n_folds`` and ``seed``.
    ``pickle.load`` can execute arbitrary code, so only point this at trusted files.
    """
    # Context managers close the file handles deterministically.
    with open(oof_path, 'rb') as oof_file:
        oof_pred_probs = pickle.load(oof_file)
    with open(test_path, 'rb') as test_file:
        test_pred_probs = pickle.load(test_file)

    scores = []
    skf = StratifiedKFold(n_splits = n_folds, random_state = seed, shuffle = True)
    for _, val_idx in skf.split(X, y):
        # val_idx holds row positions, so select positionally rather than by label.
        y_val = y.iloc[val_idx]
        y_pred_probs = oof_pred_probs[val_idx]
        score = accuracy_score(y_val, (y_pred_probs > 0.5).astype(int))
        scores.append(score)

    return oof_pred_probs, test_pred_probs, scores

In [None]:
# glob.glob returns files in filesystem-dependent order, and the two lists are
# paired positionally further down. Sorting makes the order (and therefore the
# pairing) deterministic.
# NOTE(review): this still assumes the OOF and test pickles share the same
# model-name prefix so that both lists sort identically - confirm.
oof_pred_probs_files = sorted(glob.glob(f'{oof_path}/oof_pred_probs/*.pkl'))
test_pred_probs_files = sorted(glob.glob(f'{oof_path}/test_pred_probs/*.pkl'))

In [None]:
# Per-model containers, keyed by model name.
scores = {}
oof_pred_probs = {}
test_pred_probs = {}

# The two file lists are paired positionally; zip() would silently stop at the
# shorter list and mis-pair or drop models, so fail loudly instead.
if len(oof_pred_probs_files) != len(test_pred_probs_files):
    raise ValueError(
        f'Found {len(oof_pred_probs_files)} OOF files but '
        f'{len(test_pred_probs_files)} test files; they must match one-to-one'
    )

for oof_file_path, test_file_path in zip(oof_pred_probs_files, test_pred_probs_files):
    # Model name = OOF file name minus its last 28 characters.
    # NOTE(review): presumably a fixed-width suffix such as
    # `_oof_pred_probs_<score>.pkl` - verify against the producer notebook.
    model_name = oof_file_path.split('/')[-1][:-28]
    # Only models whose name contains 'L1' are kept.
    if 'L1' in model_name:
        oof_pred_probs[model_name], test_pred_probs[model_name], scores[model_name] = get_data(oof_file_path, test_file_path)

In [None]:
# 1/0

In [None]:
# Score every model's OOF predictions on their own and remember the best one;
# its position in `oof_pred_probs` seeds the hill-climbing search below.
true_y = y.copy()
best_score, best_index = 0, -1

for i, model in enumerate(oof_pred_probs):
    s = roc_auc_score(true_y, oof_pred_probs[model])
    print(f'AUC {s:.6f} {model}')
    if not s > best_score:
        continue
    # Strictly better than everything seen so far (first model wins ties).
    best_score, best_index, best_model = s, i, model

print()
print(f'Best single model is {best_model} with AUC = {best_score:0.6f}')

In [None]:
import cupy as cp, gc

def multiple_roc_auc_scores(actual, predicted):
    """Compute one ROC AUC per column of `predicted` in a single GPU pass.

    Uses the rank-sum form of AUC:
    ``(sum of ranks of positives - n_pos*(n_pos+1)/2) / (n_pos*n_neg)``.

    Parameters
    ----------
    actual : cupy array
        Labels; rows with ``actual == 1`` are treated as positives and
        ``cp.sum(actual)`` is used as their count, so the labels are
        presumably 0/1 (TODO confirm).
    predicted : cupy array, 2-D
        One column of scores per classifier, rows aligned with `actual`.

    Returns
    -------
    cupy array
        AUC of each column.

    Notes
    -----
    Ranks come from a double argsort, so tied scores receive distinct
    consecutive ranks rather than an averaged rank.
    """
    positives = cp.sum(actual)             # count of positive rows
    negatives = len(actual) - positives    # count of the remaining rows

    # argsort of an argsort turns each column into 0-based ranks; +1 makes them 1-based.
    sort_order = cp.argsort(predicted, axis=0)
    ranked = cp.argsort(sort_order, axis=0) + 1

    positive_rank_sum = cp.sum(ranked[actual == 1, :], axis=0)
    aucs = (positive_rank_sum - positives * (positives + 1) / 2) / (positives * negatives)
    return aucs

In [None]:
# One column per model; keep every OOF probability strictly inside (0, 1) so
# the log-odds transform applied later stays finite.
x_train = pd.DataFrame(oof_pred_probs).clip(lower=1e-15, upper=1-1e-15).values

In [None]:
USE_NEGATIVE_WGT = True   # allow candidate weights below zero
MAX_MODELS = 1000         # cap on the number of distinct models in the ensemble
TOL = 1e-7                # minimum AUC gain required to accept another blend

# Seed the search with the best single model found above. The search loop keeps
# its per-round winner in round_* names, so best_score / best_index are never
# overwritten here and this cell can be re-run without re-running the previous one.
indices = [best_index]
old_best_score = best_score
model_list = list(oof_pred_probs.keys())

#PREPARE/MOVE VARIABLES TO GPU FOR SPEED UP
x_train2 = cp.array( np.log( x_train/(1-x_train) ) ) #GPU LOGITS
best_ensemble = x_train2[:,best_index] # GPU
truth = cp.array( train.Depression.values ) # GPU
# Grid of candidate blend weights in steps of 0.01.
start = -0.50
if not USE_NEGATIVE_WGT: start = 0.01
ww = cp.arange(start,0.51,0.01) # GPU
nn = len(ww)

# BEGIN HILL CLIMBING
models = [best_index]
weights = []
metrics = [best_score]

for kk in range(10000):

    round_score = 0
    round_index = -1
    round_weight = 0

    # Current ensemble scaled by every (1 - w). It is identical for all
    # candidates in this round, so build it once; broadcasting replaces the
    # explicit cp.repeat copies.
    m1 = best_ensemble[:, cp.newaxis] * (1-ww) # GPU

    # TRY ADDING ONE MORE MODEL
    for k in range(len(model_list)):
        new_model = x_train2[:,k] # GPU
        m2 = new_model[:, cp.newaxis] * ww # GPU
        mm = m1+m2 # GPU, one column per candidate weight
        #mm = 1 / (1 + cp.exp(-mm)) # GPU (convert logits to probs - not needed for auc)
        new_aucs = multiple_roc_auc_scores(truth, mm)
        new_score = cp.max(new_aucs).item() # GPU -> CPU
        if new_score > round_score:
            round_score = new_score # CPU
            round_index = k # CPU
            ii = cp.argmax(new_aucs).item() # GPU -> CPU
            round_weight = ww[ii].item() # GPU -> CPU
            # Copy the winning column: a plain slice would be a view that keeps
            # the whole blend matrix alive after the `del` below.
            potential_ensemble = mm[:,ii].copy() # GPU
    del new_model, m1, m2, mm, new_aucs, new_score
    gc.collect()

    # STOPPING CRITERIA
    indices.append(round_index)
    indices = list(np.unique(indices))
    if len(indices)>MAX_MODELS:
        print(f'=> We reached {MAX_MODELS} models')
        indices = indices[:-1]
        break
    if round_score - old_best_score < TOL:
        print(f'=> We reached tolerance {TOL}')
        break

    # RECORD NEW RESULT
    print(kk,'New best AUC',round_score,f'adding "{model_list[round_index]}"','with weight',f'{round_weight:0.3f}')
    models.append(round_index)
    weights.append(round_weight)
    metrics.append(round_score)
    best_ensemble = potential_ensemble
    old_best_score = round_score

In [None]:
# def compute_metric_acc(p):
#     t = np.partition(p, -25567)[-25567]
#     p = (p >= t).astype(int)
#     acc = accuracy_score(train.Depression.values, p)
#     return acc
def compute_metric_auc(logit):
    """Return the ROC AUC of the given log-odds against the notebook-level labels `y`.

    The log-odds are first mapped to probabilities with the logistic function.
    """
    probs = 1 / (1 + np.exp(-logit))
    return roc_auc_score(y, probs)
    
def compute_metric_acc(logit):
    """Return the accuracy of the given log-odds against the notebook-level labels `y`.

    The log-odds are mapped to probabilities with the logistic function and a
    row is predicted positive when its probability is at least 0.5.
    """
    probs = 1 / (1 + np.exp(-logit))
    hard_labels = (probs >= 0.5).astype(int)
    return accuracy_score(y, hard_labels)

# Pull the final blended log-odds off the GPU once and score them on the CPU.
hill_climb_logits = best_ensemble.get()
acc = compute_metric_acc(hill_climb_logits)
auc = compute_metric_auc(hill_climb_logits)
print(f"Overall Hill climbing ACC = {acc:.3f}")
print(f"Overall Hill climbing AUC = {auc:.3f}")

In [None]:
# Unroll the sequential blends into one final weight per accepted step: each
# new blend scales everything accepted so far by (1 - w) and appends w.
wgt = np.array([1])
for w in weights:
    wgt = np.concatenate([wgt * (1 - w), np.array([w])])

# One record per accepted step; `t` accumulates the weights rounded to 3 decimals.
rows = []
t = 0
for m, w, s in zip(models, wgt, metrics):
    rows.append({'weight': w, 'model': model_list[m]})
    t += float(f'{w:.3f}')

# DISPLAY WEIGHT PER MODEL
# A model can be picked in several rounds, so its step weights are summed.
df = (
    pd.DataFrame(rows)
    .groupby('model')
    .agg('sum')
    .reset_index()
    .sort_values('weight', ascending=False)
    .reset_index(drop=True)
)
df

In [None]:
# SANITY CHECK: print the total of the per-model weights
print('Ensemble weights sum to', df['weight'].sum())

In [None]:
# COMBINE OOF PREDICTIONS (using weights from hill climbing)
# Rebuild the blend on the CPU from the per-model weight table and re-score it.
x_map = dict(zip(model_list, np.arange(len(model_list))))   # model name -> column
x_train3 = x_train2.get()                                    # GPU -> CPU logits

blend_names = list(df.model)
blend_weights = list(df.weight)
ensemble = x_train3[:, x_map[blend_names[0]]] * blend_weights[0]
for blend_name, blend_weight in zip(blend_names[1:], blend_weights[1:]):
    ensemble += x_train3[:, x_map[blend_name]] * blend_weight

m = compute_metric_acc(ensemble)
print(f'Overall Hill climbing ACC = {m:0.3f}')

In [None]:
# LOAD TEST PREDICTIONS
# Convert every model's test probabilities to log-odds, one column per model
# in `model_list` order (the same column order as the OOF matrix).
x_test = []
for f in model_list:
    # np.clip takes the array first, then the lower and upper bounds. With the
    # bounds passed first the lower bound was never applied, so a prediction of
    # exactly 0 produced log-odds of -inf.
    p = np.clip(test_pred_probs[f], 1e-15, 1-1e-15) # PREDS MUST BE 0<PROBS<1
    p = np.log( p/(1-p) ) # LOG ODDS
    x_test.append(p)
x_test = np.stack(x_test).T
print( x_test.shape )

# COMBINE TEST PREDICTIONS (using weights from hill climbing)
x_map = {x:y for x,y in zip(model_list,np.arange(len(model_list)))}
pred = x_test[:, x_map[df.model.iloc[0]] ] * df.weight.iloc[0]
for k in range(1,len(df)):
    pred += x_test[:, x_map[df.model.iloc[k]] ] * df.weight.iloc[k]

In [None]:
# Map the blended test log-odds back to probabilities (logistic function).
sub_pred = 1 / (np.exp(-pred) + 1)

In [None]:
# WRITE SUB TO CSV
# Use the path and label name from the config cell instead of repeating them,
# and assign the column by key rather than by attribute.
sub = pd.read_csv(sample_sub_path)
# Hard 0/1 labels: positive when the blended probability is at least 0.5.
# NOTE(review): assumes the test predictions are in sample-submission row order - confirm.
sub[target] = (sub_pred >= 0.5).astype(int)
print("Test shape", sub.shape )
print("Test target mean is", sub[target].mean())
sub.to_csv("submission.csv",index=False)
sub.head()