This code will give you a general idea of how to do a machine learning project using scikit-learn and optuna.

It performs the necessary preprocessing, tunes the models with optuna, combines the tuned models, and performs ensemble (voting).

You can submit the finished result without any problem.

Below is the version history, so please refer to it when using it.

## 1. Data and Library Load

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import os
import random
from tqdm.auto import tqdm

# ignore warning
import warnings
warnings.filterwarnings('ignore')

# ML
from sklearn.ensemble import RandomForestClassifier  # Bagging
from xgboost.sklearn import XGBClassifier            # GBM
from sklearn.linear_model import LogisticRegression  # LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC # SVM

# DL
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, Dense, ReLU, Softmax, Dropout
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# for checking multi-collinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

# KFold(CV), partial : for optuna
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from functools import partial
from imblearn.over_sampling import SMOTE

# AutoML framework
import optuna
from optuna.samplers import TPESampler

In [2]:
# set configs
# Global switches read by the cells below.

# Run the hyper-parameter search cells (Optuna studies and the SVM grid search).
is_tuning = True

if is_tuning:
    sampler = TPESampler(seed=42) # seeded TPE sampler: let the sampler itself choose the next trial (newly added)
    
# Standardize the features with StandardScaler.
is_scaling = True

# Project the (scaled) features with PCA.
is_pca = False

# Iteratively drop the feature with the highest VIF.
apply_vif = False

feature_selection = True # keep only the top-m features by RandomForest importance (original note: "importance goes away")
if feature_selection: 
    m = 20  # number of features kept
    
# NOTE(review): is_cuml is only referenced by commented-out code and is_debug
# is not referenced at all in the visible part of this file.
is_cuml = True
is_debug = True

sampling_method = 'hybrid' # one of 'hybrid', 'under' or 'over'

if is_tuning:
    n_trials=30 # changed from 50 to 30

# # import SVC
# if is_cuml:
#     from cuml.svm import SVC, LinearSVC
# else:
#     from sklearn.svm import SVC
    
# Keras model compile
# NOTE(review): no Keras model is built in the visible cells, so these three
# values appear unused here - confirm before removing.
learning_rate = 1e-2
batch_size = 32
epochs = 10

In [3]:
def seed_everything(seed: int = 42):
    """Seed every random source used by the notebook with the same value.

    Sets ``PYTHONHASHSEED`` in the environment and seeds Python's ``random``
    module, NumPy's global generator and TensorFlow.

    Args:
        seed: Seed value handed to each generator.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
        seed_fn(seed)

seed_everything()

In [4]:
def balance_logloss(y_true, y_pred):
    """Return the class-balanced binary log loss.

    Each class contributes the mean negative log-probability of its own rows,
    and the two class means are averaged, so both classes weigh the same
    regardless of how many rows they have.

    Args:
        y_true: 1-D integer labels (0 / non-zero); passed to ``np.bincount``.
        y_pred: 2-D array of per-row probabilities, column 0 for class 0 and
            column 1 for class 1.

    Returns:
        The balanced log loss as a float.
    """
    # Keep probabilities away from 0 and 1 so the logs stay finite, then
    # renormalise every row to sum to one.
    probs = np.clip(y_pred, 1e-15, 1 - 1e-15)
    probs /= np.sum(probs, axis=1)[:, None]

    class_counts = np.bincount(y_true)
    is_class0 = np.where(y_true == 0, 1, 0)
    is_class1 = np.where(y_true != 0, 1, 0)

    class0_loglik = np.sum(is_class0 * np.log(probs[:, 0]))
    class1_loglik = np.sum(is_class1 * np.log(probs[:, 1]))

    class0_term = -1 / class_counts[0] * class0_loglik
    class1_term = 1 / class_counts[1] * class1_loglik
    return (class0_term - class1_term) / 2

def b_logloss_keras(y_true, y_pred):
    """Evaluate ``balance_logloss`` on Keras tensors via ``tf.py_function``.

    Args:
        y_true: Two-column target tensor (presumably one-hot - confirm with
            the caller).
        y_pred: Predicted probabilities, passed through unchanged.

    Returns:
        The ``tf.float32`` result of ``balance_logloss``.
    """
    # Collapse the two target columns into one label per row:
    # column 1 multiplied by (1 - column 0).
    class1_col = y_true[:, 1]
    not_class0 = 1 - y_true[:, 0]
    labels = tf.cast(class1_col * not_class0, tf.int64)
    return tf.py_function(func=balance_logloss, inp=[labels, y_pred], Tout=tf.float32)

# Load the train / test / greeks CSVs from the Kaggle input directory.
# NOTE: the next cell rebinds the same three names from the working directory.
train = pd.read_csv('../input/icr-identify-age-related-conditions/train.csv')
test = pd.read_csv('../input/icr-identify-age-related-conditions/test.csv')
metadata = pd.read_csv('../input/icr-identify-age-related-conditions/greeks.csv')
print(train.shape, test.shape)

In [5]:
# Load the train / test / greeks CSVs from the current working directory.
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
metadata = pd.read_csv('./greeks.csv')
# Quick sanity check of the row / column counts.
print(train.shape, test.shape)

(617, 58) (5, 57)


## 2. Data Preprocessing

LabelEncoding -> KNN Imputation -> z-score outlier removal -> (optional) VIF filtering and/or (optional) feature selection -> class re-balancing -> feature scaling -> (optional) PCA

In [6]:
# Encode the categorical EJ column as integers. The fitted encoder stays in
# `lb` because the test-set cell applies the same mapping with lb.transform.
lb = LabelEncoder()
train.EJ = lb.fit_transform(train.EJ)  # A->0, B->1
# Drop the row identifier column.
train = train.drop(columns=["Id"])

In [7]:
# Fill missing feature values with a KNNImputer (default settings). The fitted
# imputer stays in `imp` because the test-set cell calls imp.transform.
imp = KNNImputer()
# Set the target aside so it is not part of the imputation.
labels = train["Class"]
train = train.drop(columns="Class")
data = imp.fit_transform(train)
# Wrap the imputed values back into a DataFrame with the original column names.
tmp = pd.DataFrame(columns=train.columns, data=data)
# Re-attach the target as the last column.
train = pd.concat([tmp, labels], axis=1)
train

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
0,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,4126.58731,...,7.298162,1.73855,0.094822,11.339138,72.611063,2003.810319,22.136229,69.834944,0.120343,1
1,0.145282,978.76416,85.200147,36.968889,8.138688,3.632190,0.025578,13.517790,1.229900,5496.92824,...,0.173229,0.49706,0.568932,9.292698,72.611063,27981.562750,29.135430,32.131996,21.978000,0
2,0.470030,2635.10654,85.200147,32.360553,8.138688,6.732840,0.025578,12.824570,1.229900,5135.78024,...,7.709560,0.97556,1.198821,37.077772,88.609437,13676.957810,28.022851,35.192676,0.196941,0
3,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.229900,4169.67738,...,6.122162,0.49706,0.284466,18.529584,82.416803,2094.262452,39.948656,90.493248,0.155829,0
4,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.054810,3.396778,102.151980,5728.73412,...,8.153058,48.50134,0.121914,16.408728,146.109943,8524.370502,45.381316,36.262628,0.096614,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,0.149555,3130.05946,123.763599,9.513984,13.020852,3.499305,0.077343,8.545512,2.804172,4157.68439,...,0.173229,1.26092,0.067730,8.967128,217.148554,8095.932828,24.640462,69.191944,21.978000,0
613,0.435846,5462.03438,85.200147,46.551007,15.973224,5.979825,0.025882,12.622906,3.777550,5654.07556,...,10.223150,1.24236,0.426699,35.896418,496.994214,3085.308063,29.648928,124.808872,0.145340,0
614,0.427300,2459.10720,130.138587,55.355778,10.005552,8.070549,0.025578,15.408390,1.229900,5888.87769,...,0.173229,0.49706,0.067730,19.962092,128.896894,6474.652866,26.166072,119.559420,21.978000,0
615,0.363205,1263.53524,85.200147,23.685856,8.138688,7.981959,0.025578,7.524588,1.229900,4517.86560,...,9.256996,0.78764,0.670527,24.594488,72.611063,1965.343176,25.116750,37.155112,0.184622,0


In [None]:
###############
### z-score ###
###############
# Remove class-0 rows that are outliers (|z| > 3) in any of the checked columns.
is_z = True
if is_z:
    # separate class 0, 1
    train_0 = train[train.Class == 0.0]
    train_1 = train[train.Class == 1.0]

    # Per-column statistics of the class-0 rows. The DataFrame methods are
    # called with an explicit axis because np.mean(df) / np.std(df) forward
    # axis=None to pandas, and what axis=None means has changed across pandas
    # versions. ddof=0 is the population std that np.std asks for.
    train_0_mean = train_0.mean(axis=0)
    train_0_std = train_0.std(axis=0, ddof=0)

    z = (train_0 - train_0_mean) / train_0_std

    # find outliers based on z-score
    # NOTE(review): columns[:-3] skips the last THREE columns, not only
    # "Class" - confirm that excluding the two features before it is intended.
    checked_cols = train.columns[:-3]
    outlier_rows = (z[checked_cols].abs() > 3).any(axis=1)
    drop_index_list = z.index[outlier_rows].tolist()

    # How to create a new z-score column with assign:
    # train = train.assign(z_score = lambda x : x.AB.sub(x.AB.mean()).div(x.AB.std()))
    # train[['AB', 'z_score']]

    print(f'train_0에서 z-score로 걸러내는 column의 수 : {len(drop_index_list)}\n')

    print('--------------------------------------------------')
    print('Class_0에 있는 outliars만 제거한 후 Class 0, 1의 수 :')
    print(train.drop(index=drop_index_list).Class.value_counts())

    train.drop(index=drop_index_list, inplace=True)
    print(train.info())

In [None]:
# check missing data: show every row that still contains at least one NaN
train[train.isnull().any(axis=1)]

In [None]:
def check_vif(df):
    """Compute the variance inflation factor of every column of ``df``.

    Args:
        df: DataFrame of numeric features.

    Returns:
        A tuple ``(vif_df, remove_col, top_vif)``: the per-feature VIF table
        sorted from highest to lowest, the name of the feature with the
        highest VIF, and that highest VIF value.
    """
    # Convert once: the VIF helper indexes columns by position, so a plain
    # array works whether or not it converts a DataFrame itself, and the
    # conversion is not repeated for every column.
    exog = np.asarray(df)
    vifs = [variance_inflation_factor(exog, i) for i in range(exog.shape[1])]
    vif_df = pd.DataFrame({"features": df.columns, "VIF": vifs})
    vif_df = vif_df.sort_values(by="VIF", ascending=False)
    remove_col = vif_df.iloc[0, 0]
    top_vif = vif_df.iloc[0, 1]
    return vif_df, remove_col, top_vif

In [None]:
# Repeatedly drop the feature with the highest VIF while that VIF is above 5.
if apply_vif:
    top_vif = 100

    while top_vif > 5:
        # Score the features only: if the target took part in the VIF ranking
        # it could be the column that gets removed, and the later cells need
        # train["Class"].
        vif_df, remove_col, top_vif = check_vif(train.drop(columns="Class"))
        print(remove_col, top_vif)
        if top_vif < 5:
            break
        train = train.drop(columns=remove_col)

    display(train)

In [None]:
# feature selection via Feature Importance
# Split the frame into the feature matrix and the target.
X = train.drop(columns=["Class"])
y = train['Class']

if feature_selection:
    # Fit a default RandomForest on all features only to rank them by importance.
    rf = RandomForestClassifier()
    rf.fit(X, y)
    # Accuracy on the same rows the forest was fit on (not a held-out score).
    print("Train ACC : %.4f" % accuracy_score(y, rf.predict(X)))
    fi_df = pd.DataFrame({'feature':X.columns, 'importance':rf.feature_importances_})
    # Names of the m features with the highest importance.
    selected_cols = fi_df.sort_values(by="importance", ascending=False)[:m]["feature"].values
    
    display(selected_cols)
    
    # Restrict the feature matrix to the selected columns.
    X = train[selected_cols]
    display(X)

## 3. Class Imbalance Handling and Feature Scaling

In [None]:
# class imbalance handling
## 1. undersampling
if sampling_method == 'under':
    c1 = train[train.Class == 1]
    c0 = train[train.Class == 0]
    print(c1.shape, c0.shape) # 108, 509 -> 108, 108
    # Randomly keep as many class-0 rows as there are class-1 rows.
    c0 = c0.sample(n=c1.shape[0], random_state=42) # 509 -> 108
    train = pd.concat([c0, c1])
    print(train.shape)

    # Rebuild X / y from the undersampled frame. The modelling cells read X
    # and y, not train, so without this step the undersampling has no effect.
    if feature_selection:
        X = train[selected_cols]
    else:
        X = train.drop("Class", axis=1)
    y = train.Class

In [None]:
## before oversampling
# df = train[selected_cols]
# df["Class"] = train["Class"]
# pd.pivot_table(index="Class", data=df)

In [None]:
## 2. oversampling -> SMOTE
if sampling_method == 'over':
    # selected_cols only exists when feature selection ran.
    if feature_selection:
        X = train[selected_cols]
    else:
        X = train.drop("Class", axis=1)
    y = train['Class']

    smote = SMOTE(k_neighbors=5, random_state=42)
    # fit_resample finds the minority class from y and balances the classes 1:1.
    X_resampled, y_resampled = smote.fit_resample(X, y)
    print(X_resampled.shape, y_resampled.shape)

    # The modelling cells read X and y, so hand them the resampled data.
    # X is copied before "Class" is attached to X_resampled below.
    X, y = X_resampled.copy(), y_resampled

    X_resampled["Class"] = y_resampled
    train = X_resampled

In [None]:
# After SMOTE
# df = X_resampled.copy()
# df["Class"] = y_resampled
# pd.pivot_table(index="Class", data=df)

In [None]:
# 3. hybrid approach: undersample class 0, then SMOTE class 1 up to the same size
## class0 : 509 -> 300
## class1 : 108 -> 300
if sampling_method == 'hybrid':
    N = 300
    c1 = train[train.Class == 1]
    c0 = train[train.Class == 0]
    print(c1.shape, c0.shape)
    # Seeded like the 'under' branch so the subset is reproducible. The size
    # is capped because the outlier removal can leave fewer than N class-0
    # rows, and sampling more rows than exist raises.
    c0 = c0.sample(n=min(N, len(c0)), random_state=42) # 509 -> 300
    train = pd.concat([c0, c1])
    print(train.shape)

    if feature_selection:
        X = train[selected_cols]
    else:
        X = train.drop("Class", axis=1)
    y = train.Class

    smote = SMOTE(k_neighbors=5, random_state=42)
    # The fit_resample function automatically finds the minority class by y and fits it 1:1.
    X, y = smote.fit_resample(X, y) # 300, 108 --> 300, 300
    print(X.shape, y.shape)
    display(X)
    display(y)

In [None]:
## This cell is no longer needed because the separate validation set was dropped.

# # to make OOF prediction
# from sklearn.model_selection import train_test_split

# #X = train.drop(columns=["Class"])
# X = train[selected_cols]
# y = train['Class']

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, stratify=y, random_state=42)
# print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

### feature scaling

- Use StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler

if is_scaling:
    # Fit the scaler on the training features. It stays in `scaler` because
    # the test-set cell calls scaler.transform.
    scaler = StandardScaler()
    data_ = scaler.fit_transform(X)
    # Wrap the scaled values back into a DataFrame with the same column names
    # (this also gives X a fresh default index).
    X = pd.DataFrame(data=data_, columns=X.columns)
    display(X)
    
#     data_ = scaler.fit_transform(X_train)
#     X_train = pd.DataFrame(data=data_, columns=X_train.columns)
#     data_ = scaler.transform(X_val)
#     X_val = pd.DataFrame(data=data_, columns=X_val.columns)
#     display(X_train)

In [None]:
if is_pca:
    from sklearn.decomposition import PCA
    
    # n_components=0.90 asks PCA for as many components as are needed to
    # explain 90% of the variance.
    pca = PCA(n_components=0.90, random_state=42)
    data_ = pca.fit_transform(X)
    # Name the projected columns PC1, PC2, ...
    X = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])
    display(X)

#     data_ = pca.fit_transform(X_train)
#     X_train = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])
#     data_ = pca.transform(X_val)
#     X_val = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])

#     display(X_train)

## 4. Fitting and Evaluation


- This section is no longer needed; the cell below only selects the evaluation metric.

In [None]:
# set metric
# Every CV loop below scores its predictions through this name.
evaluation_metric = balance_logloss
#evaluation_metric_keras = b_logloss_keras

## 5. (Super)Hyper-parameter Tuning and OOF prediction

Let's try hyper-parameter tuning using optuna, an AutoML framework.

Optuna defines a target function to optimize and then optimizes that function.

For each model, we define a separate objective function and then run Optuna on it.

In [None]:
# make oof predictions
# Submission template; its class_0 / class_1 columns are filled in at the end.
# NOTE(review): this is read from the Kaggle input directory, while the data
# cell above reads train/test from the working directory - confirm the path.
submission = pd.read_csv('../input/icr-identify-age-related-conditions/sample_submission.csv')
# Collects one test-set probability array per fold and model (appended by oof_preds).
final_preds = []
# rf_preds = pd.DataFrame(columns=submission.columns, data=np.zeros((train.shape[0], submission.shape[1])))
# svm_preds = pd.DataFrame(columns=submission.columns, data=np.zeros((train.shape[0], submission.shape[1])))
# xgb_preds = pd.DataFrame(columns=submission.columns, data=np.zeros((train.shape[0], submission.shape[1])))

In [None]:
# Apply the training-time preprocessing to the test set, in the same order:
# label-encode -> impute -> column selection -> scaling -> PCA.
# Only already-fitted transformers are used; nothing is re-fit on test data.
test.EJ = lb.transform(test.EJ)  # A->0, B->1
test = test.drop(columns=["Id"])

X_test = pd.DataFrame(columns=test.columns, data=imp.transform(test))

# Keep exactly the columns the training matrix X was built from. Feature
# selection takes precedence because X is train[selected_cols] whenever it is
# enabled, even if VIF filtering also ran.
if feature_selection:
    X_test = X_test[selected_cols]
elif apply_vif:
    X_test = X_test[train.columns.drop("Class")]

# `scaler` only exists when scaling was enabled for the training data.
if is_scaling:
    X_test = pd.DataFrame(columns=X_test.columns, data=scaler.transform(X_test))

# Project with the PCA fitted on the training data. Re-fitting on the test
# rows would produce components that do not match the ones the models saw.
if is_pca:
    data_ = pca.transform(X_test)
    X_test = pd.DataFrame(data=data_, columns=[f"PC{i}" for i in range(1, data_.shape[1]+1)])

X_test

In [None]:
def rf_optimizer(trial, X, y, K):
    """Optuna objective: mean K-fold loss of a RandomForest for one trial.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X: Feature DataFrame (rows are selected with ``.iloc``).
        y: Target Series aligned with ``X`` by position.
        K: Number of stratified folds.

    Returns:
        Mean of ``evaluation_metric`` over the K validation folds.
    """
    # define parameter to tune
    n_estimators = trial.suggest_categorical('n_estimators', [50, 100, 200])
    max_depth = trial.suggest_int('max_depth', 4, 10)
    max_features = trial.suggest_categorical('max_features', [0.6, 0.7, 0.8])

    # set model
    # random_state is fixed, as in the XGBoost and SVM objectives, so that two
    # trials with the same parameters get the same score.
    model = RandomForestClassifier(n_estimators=n_estimators,
                                   max_depth=max_depth,
                                   max_features=max_features,
                                   criterion='log_loss',
                                   class_weight='balanced',
                                   random_state=42
                                  )

    # K-Fold Cross validation
    folds = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)
    losses = []

    for train_idx, val_idx in folds.split(X, y):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]

        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]

        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)
        loss = evaluation_metric(y_val, preds)
        losses.append(loss)

    # return mean score of CV
    return np.mean(losses)

In [None]:
def xgb_optimizer(trial, X, y, K):
    """Optuna objective: mean K-fold loss of an XGBoost classifier for one trial.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        X: Feature DataFrame (rows are selected with ``.iloc``).
        y: Target Series aligned with ``X`` by position.
        K: Number of stratified folds.

    Returns:
        Mean of ``evaluation_metric`` over the K validation folds.
    """
    # Sampled search space; keys double as XGBClassifier keyword arguments.
    sampled = {
        'n_estimators': trial.suggest_categorical('n_estimators', [500, 1000, 2000]),
        'max_depth': trial.suggest_int('max_depth', 4, 10),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.5, 0.6, 0.7, 0.8]),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-2),
        'reg_lambda': trial.suggest_categorical('reg_lambda', [0.1, 0.5, 1, 2]),
    }

    # No scale_pos_weight: class imbalance is handled by the sampling step.
    model = XGBClassifier(random_state=42, **sampled)

    splitter = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)
    fold_losses = []
    for fit_rows, eval_rows in splitter.split(X, y):
        model.fit(X.iloc[fit_rows, :], y.iloc[fit_rows])
        fold_probs = model.predict_proba(X.iloc[eval_rows, :])
        fold_losses.append(evaluation_metric(y.iloc[eval_rows], fold_probs))

    return np.mean(fold_losses)

In [None]:
K = 4 # set K of K-Fold
# Bind the data and the fold count so Optuna only has to supply `trial`.
opt_func = partial(rf_optimizer, X=X, y=y, K=K)

if is_tuning:
    rf_study = optuna.create_study(direction="minimize", sampler=sampler) # the objective is a loss, so minimize it
    rf_study.optimize(opt_func, n_trials=n_trials)

In [None]:
# Manual grid search over C for the SVM, scored with K-fold cross validation.
K = 4 

if is_tuning:
    best_loss = 9999.0  # sentinel: any real loss below this replaces it
    best_C = 0
    kernel = 'linear'   # overwritten with 'rbf' when an SVC candidate improves best_loss
    folds = StratifiedKFold(n_splits=K, random_state=42, shuffle=True)

#     # for Linear SVM
#     for C in tqdm([1, 2, 5, 10, 100]):
#         losses = []
#         l_svm = LinearSVC(C=C, probability=True) ## cuml version. (faster model)

#         for train_idx, val_idx in folds.split(X, y):
#             X_train = X.iloc[train_idx, :]
#             y_train = y.iloc[train_idx]
#             X_val = X.iloc[val_idx, :]
#             y_val = y.iloc[val_idx]

#             l_svm.fit(X_train, y_train)
#             preds = l_svm.predict_proba(X_val).values
#             loss = evaluation_metric(y_val, preds)
#             losses.append(loss)

#         avg_loss = np.mean(losses)
#         if avg_loss < best_loss:
#             best_loss = avg_loss
#             best_C = C

    # for SVM with RBF kernel.
    for C in tqdm([1, 2, 5, 10, 100]):
        losses = []
        r_svm = SVC(C=C, probability=True) ## SVC imported from sklearn.svm, kernel left at its default

        for train_idx, val_idx in folds.split(X, y):
            X_train = X.iloc[train_idx, :]
            y_train = y.iloc[train_idx]
            X_val = X.iloc[val_idx, :]
            y_val = y.iloc[val_idx]

            r_svm.fit(X_train, y_train)
            preds = r_svm.predict_proba(X_val)
            loss = evaluation_metric(y_val, preds)
            losses.append(loss)

        # Keep the C with the lowest mean CV loss.
        avg_loss = np.mean(losses)
        if avg_loss < best_loss:
            best_loss = avg_loss
            best_C = C
            kernel = 'rbf'

    # NOTE: after the loop r_svm is the LAST candidate (C=100) fitted on the
    # last fold, not necessarily the best one, and X_val / y_val hold that
    # last fold's validation rows.
    print("SVM(%s) log loss : %.4f" % (kernel, best_loss))

In [None]:
def svm_optimizer(trial, X, y, K):
    """Optuna objective: mean K-fold loss of a probabilistic SVC for one trial.

    Args:
        trial: Optuna trial used to sample ``C`` and ``kernel``.
        X: Feature DataFrame (rows are selected with ``.iloc``).
        y: Target Series aligned with ``X`` by position.
        K: Number of stratified folds.

    Returns:
        Mean of ``evaluation_metric`` over the K validation folds.
    """
    sampled = {
        'C': trial.suggest_float('C', 0.45, 0.55),
        'kernel': trial.suggest_categorical('kernel', ['rbf']),
    }

    # probability=True is required for predict_proba.
    model = SVC(probability=True, random_state=42, **sampled)

    splitter = StratifiedKFold(n_splits=K, random_state=42, shuffle=True)
    fold_losses = []
    for fit_rows, eval_rows in splitter.split(X, y):
        model.fit(X.iloc[fit_rows, :], y.iloc[fit_rows])
        fold_probs = model.predict_proba(X.iloc[eval_rows, :])
        fold_losses.append(evaluation_metric(y.iloc[eval_rows], fold_probs))

    return np.mean(fold_losses)

In [None]:
K = 4
# Bind the data and the fold count so Optuna only has to supply `trial`.
opt_func = partial(svm_optimizer, X=X, y=y, K=K)

if is_tuning:
    # Reuses the same TPESampler instance as the other studies.
    svm_study = optuna.create_study(direction="minimize", sampler=sampler)
    svm_study.optimize(opt_func, n_trials=n_trials)

In [None]:
K = 4
# Bind the data and the fold count so Optuna only has to supply `trial`.
opt_func = partial(xgb_optimizer, X=X, y=y, K=K)

if is_tuning:
    # Reuses the same TPESampler instance as the other studies.
    xgb_study = optuna.create_study(direction="minimize", sampler=sampler)
    xgb_study.optimize(opt_func, n_trials=n_trials)

In [None]:
# visualize experiment logs
def display_experiment_log(study):
    display(study.trials_dataframe())
    print("Best Score: %.4f" % study.best_value)
    print("Best params: ", study.best_trial.params)
    history = study.trials_dataframe()
    display(history[history.value == study.best_value])
    optuna.visualization.plot_optimization_history(study).show()
    optuna.visualization.plot_param_importances(study).show()

In [None]:
# Trial log and plots of the RandomForest study.
if is_tuning:
    display_experiment_log(rf_study)

In [None]:
# Trial log and plots of the XGBoost study.
if is_tuning:
    display_experiment_log(xgb_study)

## 6. Test Prediction and Make Submission

In [None]:
# Finalize Models
if is_tuning:
    rf_best_params = rf_study.best_params
    xgb_best_params = xgb_study.best_params

    # best_params only holds the values Optuna sampled. The fixed settings the
    # objectives used are repeated here, so the final models match the
    # configuration that was actually scored during tuning.
    best_rf = RandomForestClassifier(criterion='log_loss',
                                     class_weight='balanced',
                                     **rf_best_params)
    best_xgb = XGBClassifier(random_state=42, **xgb_best_params)
    # NOTE: r_svm is the last SVC fitted by the grid-search cell, not
    # necessarily the one with best_C. oof_preds builds its own SVC from
    # best_C when called with svm=True.
    best_svm = r_svm

### Need to validate OOF prediction score.

- OOF score is not correlated with LB score.

In [None]:
# Make KFold OOF prediction
def oof_preds(best_model, svm=False):
    """Cross-validate a model on the global X / y and collect test predictions.

    For every stratified fold the model is refit on the training part and
    scored with ``evaluation_metric`` on the held-out part. Its
    ``predict_proba`` output for ``X_test`` is appended to the global
    ``final_preds`` list.

    Reads the module-level names ``X``, ``y``, ``K``, ``X_test``, ``best_C``,
    ``kernel`` and ``evaluation_metric``.

    Args:
        best_model: Estimator to evaluate. Ignored when ``svm`` is true.
        svm: When true, a fresh ``SVC`` built from the grid-search result
            (``best_C``, ``kernel``) is evaluated instead of ``best_model``.

    Returns:
        Mean of the per-fold losses.
    """
    # call global variable
    global final_preds

    if svm:
        # SVC covers both kernels. LinearSVC is not imported by any active
        # code in this file, and SVC(kernel='linear') supports
        # probability=True, which predict_proba needs.
        model = SVC(C=best_C, kernel=kernel, probability=True)
    else:
        model = best_model

    # make KFold
    folds = StratifiedKFold(n_splits=K, random_state=42, shuffle=True)
    losses = []

    for i, (train_idx, val_idx) in enumerate(folds.split(X, y)):
        X_train = X.iloc[train_idx, :]
        y_train = y.iloc[train_idx]
        X_val = X.iloc[val_idx, :]
        y_val = y.iloc[val_idx]

        print(f"========== Fold {i+1} ==========")
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)
        # One test-set prediction per fold; they are averaged later.
        test_preds = model.predict_proba(X_test)
        final_preds.append(test_preds)
        loss = evaluation_metric(y_val, preds)
        print(f"Loss : {loss:.4f}")
        losses.append(loss)

    avg_loss = np.mean(losses)
    return avg_loss

In [None]:
# Run the K-fold evaluation for each model. Every call also appends K test-set
# predictions to final_preds, in the order RF, XGBoost, SVM.
print("Random Forest")
rf_loss = oof_preds(best_model=best_rf, svm=False)
print("Avg Loss : %.4f" % rf_loss)
print("\nXGBoost")
xgb_loss = oof_preds(best_model=best_xgb, svm=False)
print("Avg Loss : %.4f" % xgb_loss)
print("\nSupport Vector Machine")
svm_loss = oof_preds(best_model=best_svm, svm=True)
print("Avg Loss : %.4f" % svm_loss)
# NOTE: the combined figures below are arithmetic means of the individual
# models' average losses, not the loss of blended predictions.
print("\nRF + SVM")
print("Avg Loss : %.4f" % np.mean([rf_loss, svm_loss]))
print("\nRF + XGB")
print("Avg Loss : %.4f" % np.mean([rf_loss, xgb_loss]))
print("\nSVM + XGB")
print("Avg Loss : %.4f" % np.mean([xgb_loss, svm_loss]))
print("\nTotal logloss : %.4f" % np.mean([rf_loss, xgb_loss, svm_loss]))

In [None]:
# Peek at the RF class probabilities. X_val is the module-level variable left
# over from the last fold of the SVM grid-search cell.
pd.DataFrame(columns=[0, 1], data=best_rf.predict_proba(X_val)).head()

In [None]:
# Same check for the SVM (best_svm is the last SVC fitted by the grid-search cell).
best_svm.predict_proba(X_val)

In [None]:
# remove SVM KFold predictions: oof_preds appended K of them last.
# Uses K rather than a literal so this stays correct if the fold count changes.
# NOTE: re-running this cell strips another K entries each time.
final_preds = final_preds[:-K]

In [None]:
# Average the remaining per-fold test predictions element-wise and write the
# result into the two probability columns of the submission.
submission[['class_0', 'class_1']] = np.mean(final_preds, axis=0)
submission

In [None]:
# #voting_weights = [0.1, 0.1, 0.25, 0.25, 0.3]
# #voting_weights = [0.2, 0.2, 0.2, 0.2, 0.2]
# #voting_weights = [0.25, 0.25, 0.25, 0.25]
# #voting_weights = [0.5, 0.5]
# voting_weights = [0.35, 0.35, 0.3]
# submission['class_0'] = voting_weights[0]*preds_rf[:, 0] + voting_weights[1]*preds_lr[:, 0] + voting_weights[2]*preds_xgb[:, 0] + voting_weights[3]*preds_nn[:, 0] + voting_weights[4]*preds_svm[:, 0]
# submission['class_1'] = voting_weights[0]*preds_rf[:, 1] + voting_weights[1]*preds_lr[:, 1] + voting_weights[2]*preds_xgb[:, 1] + voting_weights[3]*preds_nn[:, 1] + voting_weights[4]*preds_svm[:, 1]
# submission['class_0'] = voting_weights[0]*preds_rf[:, 0] + voting_weights[1]*preds_lr[:, 0] + voting_weights[2]*preds_xgb[:, 0] + voting_weights[3]*preds_svm[:, 0]
# submission['class_1'] = voting_weights[0]*preds_rf[:, 1] + voting_weights[1]*preds_lr[:, 1] + voting_weights[2]*preds_xgb[:, 1] + voting_weights[3]*preds_svm[:, 1]
# submission['class_0'] = voting_weights[0]*preds_rf[:, 0] + voting_weights[1]*preds_xgb[:, 0] + voting_weights[2]*preds_svm[:, 0]
# submission['class_1'] = voting_weights[0]*preds_rf[:, 1] + voting_weights[1]*preds_xgb[:, 1] + voting_weights[2]*preds_svm[:, 1]
# # submission

In [None]:
# Write the submission file without the index column.
submission.to_csv("submission.csv", index=False)