In [31]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import optuna
import random
from tqdm.notebook import tqdm

from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.feature_selection import RFECV, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from concurrent.futures import ThreadPoolExecutor

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from pprint import pprint
import os

pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

experiment_name = 'lgbm'

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
TARGET = 'is_legendary'
train.shape

(700, 296)

In [4]:
if TARGET in list(train.columns):
    print(f'The target {TARGET} is the training data')

The target is_legendary is the training data


In [5]:
X = train.drop(TARGET, axis=1)
y = train[TARGET]

n_splits = 10
sk10 = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=5)

In [6]:
models = [
    LGBMClassifier(n_jobs=-1, random_state=5),
    XGBClassifier(random_state=5),
    # CatBoostClassifier(random_state=5, verbose=False),
    RandomForestClassifier(random_state=5),
    ]

- Feature Importances

In [7]:
feat_importance_features = {}

for model in models:
    model_name = model.__class__.__name__
    # Initialize array to store feature importances
    feature_importances = np.zeros(X.shape[1])

    # Loop through each fold and calculate the feature importances
    for train_index, test_index in sk10.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)

        # Get the feature importances and them to the total
        feature_importances += model.feature_importances_

    feature_importances /= n_splits

    feature_importances_dict = dict(zip(X.columns, feature_importances))

    df = pd.DataFrame.from_dict(feature_importances_dict, orient='index')

    # Resetting index with a name for the column
    df = df.reset_index().rename(columns={'index': 'Feature', 0: 'Avg_Feat_Importance'})
    df.sort_values(by='Avg_Feat_Importance', ascending=False, inplace=True)

    # Save to CSV
    df.to_csv(f'{model_name}_feature_importances.csv')

    fi_threshold = 0

    fi_feats = df[df['Avg_Feat_Importance'] > fi_threshold]['Feature'].tolist()

    feat_importance_features[model_name] = fi_feats

- RFECV

In [8]:
rfe_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)

# Initialize empty dictionary for RFECV features
rfecv_features = {}

for alg in models:
    # set name
    MLA_name = alg.__class__.__name__
		
    features = feat_importance_features[MLA_name]

    # incase there is no feature that had importance, go to the next model
    if len(features) == 0:
        continue
	
    X_rfecv = X[features]

    try:
        print(f'Starting with {MLA_name}')
        # Create the RFECV object and rank each feature
        selector = RFECV(alg, cv=sk10, step=1, scoring='accuracy', verbose=2)
        selector = selector.fit(X_rfecv, y)

        selected_features = list(X_rfecv.columns[selector.support_])

        rfecv_features[MLA_name] = selected_features

        print(f'Done with {MLA_name}', end='\n\n')
    
    except ValueError:
        rfecv_features[MLA_name] = features
        print(f'{MLA_name} does not have coef_ or feature_importances_', end='\n\n')

Starting with LGBMClassifier
Fitting estimator with 51 features.
Fitting estimator with 50 features.
Fitting estimator with 49 features.
Fitting estimator with 48 features.
Fitting estimator with 47 features.
Fitting estimator with 46 features.
Fitting estimator with 45 features.
Fitting estimator with 44 features.
Fitting estimator with 43 features.
Fitting estimator with 42 features.
Fitting estimator with 41 features.
Fitting estimator with 40 features.
Fitting estimator with 39 features.
Fitting estimator with 38 features.
Fitting estimator with 37 features.
Fitting estimator with 36 features.
Fitting estimator with 35 features.
Fitting estimator with 34 features.
Fitting estimator with 33 features.
Fitting estimator with 32 features.
Fitting estimator with 31 features.
Fitting estimator with 30 features.
Fitting estimator with 29 features.
Fitting estimator with 28 features.
Fitting estimator with 27 features.
Fitting estimator with 26 features.
Fitting estimator with 25 features.

In [9]:
with open('rfecv_features_accuracy.txt', mode='w') as f:
    pprint(rfecv_features, stream=f)

- SFS

In [22]:
# Initialize empty dictionary for SFS features
sfs_features = {}

for alg in models:
    # set name
    MLA_name = alg.__class__.__name__

    try:
            
        features = rfecv_features[MLA_name]
        # features = feat_importance_features[MLA_name]

        # incase there is no feature that had importance, go to the next model
        if len(features) == 0:
            continue
        
        X_sfs = X[features]

        print(f'Running backward feature selection with {MLA_name}')

        sfs = SFS(alg,
            k_features='best',
            forward=False,
            floating=False,
            scoring='accuracy',
            verbose=2,
            n_jobs=-1,
            cv=sk10)
        
        sfs = sfs.fit(X_sfs, y)

        # Get the selected features index
        selected_sfs_idx = list(sfs.k_feature_idx_)

        # Get the feature names
        selected_sfs_feats = X_sfs.columns[selected_sfs_idx]

        sfs_features[MLA_name] = list(selected_sfs_feats)

        print(f'Done with {MLA_name}', end='\n\n')

    except KeyError:
        print(f'{MLA_name} not in the dictionary.')

Running backward feature selection with LGBMClassifier


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    4.9s remaining:    2.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.2s finished

[2024-03-01 15:36:23] Features: 9/1 -- score: 0.9928571428571429[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:    0.7s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    1.0s finished

[2024-03-01 15:36:24] Features: 8/1 -- score: 0.9957142857142858[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:    0.8s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:    0.9s finished

[2024-03-01 15:36:25] F

Done with LGBMClassifier

Running backward feature selection with XGBClassifier


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of  11 | elapsed:    1.9s remaining:    8.7s
[Parallel(n_jobs=-1)]: Done   8 out of  11 | elapsed:    2.4s remaining:    0.8s
[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:    2.8s finished

[2024-03-01 15:36:32] Features: 10/1 -- score: 0.9942857142857143[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    1.1s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.6s finished

[2024-03-01 15:36:33] Features: 9/1 -- score: 0.9957142857142858[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:    0.8s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    1.1s finished

[2024-03-01 15:36:35] 

Done with XGBClassifier

Running backward feature selection with RandomForestClassifier


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Done with RandomForestClassifier



[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:    1.7s finished

[2024-03-01 15:36:42] Features: 1/1 -- score: 0.9885714285714287

In [None]:
with open('sfs_features_accuracy.txt', mode='w') as f:
    pprint(sfs_features, stream=f)

In [10]:
def evaluate_models_roc(models, X, y, important_features, cv_split, experiment_name):
    # Create a DataFrame to store comparison results
    MLA_compare = pd.DataFrame(columns=['MLA Name', 
                                        'MLA Parameters', 
                                        'MLA Train ROC', 
                                        'MLA Test ROC', 
                                        'MLA Test ROC Std',
                                        'MLA Time'])
    
    def evaluate_model(alg, idx):
        MLA_name = alg.__class__.__name__
        features = important_features.get(MLA_name, [])

        # Check if the list of important features is empty
        if len(features) == 0:
            # If empty, return results with zero values
            print(f'Skipping {MLA_name} due to no important features.')
            return {
                'MLA Name': MLA_name,
                'MLA Parameters': str(alg.get_params()),
                'MLA Train ROC': 0,
                'MLA Test ROC': 0,
                'MLA Test ROC Std': 0,
                'MLA Time': "0 min 0.00 sec",
            }

        # Perform cross-validation
        roc_scorer = make_scorer(roc_auc_score, greater_is_better=True, needs_proba=True)
        
        cv_results = cross_validate(alg, X[features], y, 
                                    cv=cv_split, 
                                    scoring={'ROC': roc_scorer}, 
                                    return_train_score=True, 
                                    n_jobs=-1)

        # Format time
        mean_fit_time = cv_results['fit_time'].mean()
        minutes, seconds = divmod(mean_fit_time, 60)

        # Calculate standard deviation
        std_test_roc = cv_results['test_ROC'].std() if 'test_ROC' in cv_results else 0


        print(f'Done with {MLA_name}.')
        
        # Populate results
        return {
            'MLA Name': MLA_name,
            'MLA Parameters': str(alg.get_params()),
            'MLA Train ROC': (cv_results['train_ROC'].mean()) if 'train_ROC' in cv_results else 0,
            'MLA Test ROC': (cv_results['test_ROC'].mean()) if 'test_ROC' in cv_results else 0,
            'MLA Test ROC Std': std_test_roc,
            'MLA Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

    results_list = []

    # Use ThreadPoolExecutor for parallel execution
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(evaluate_model, alg, idx) for idx, alg in enumerate(models)]
        for future in futures:
            result = future.result()
            if result:
                results_list.append(result)

    # Create a DataFrame from the list of dictionaries
    MLA_compare = pd.DataFrame(results_list)

    # Sort and save results
    MLA_compare.sort_values(by=['MLA Test ROC'], ascending=True, inplace=True)
    MLA_compare.to_csv(f'{experiment_name}_results.csv', index=False)

    return MLA_compare

In [11]:
def evaluate_models_accuracy(models, X, y, important_features, cv_split, experiment_name):
    # Create a DataFrame to store comparison results
    MLA_compare = pd.DataFrame(columns=['MLA Name', 
                                        'MLA Parameters', 
                                        'MLA Train Accuracy', 
                                        'MLA Test Accuracy', 
                                        'MLA Test Accuracy Std',
                                        'MLA Time'])
    
    def evaluate_model(alg, idx):
        MLA_name = alg.__class__.__name__
        features = important_features.get(MLA_name, [])

        # Check if the list of important features is empty
        if len(features) == 0:
            # If empty, return results with zero values
            print(f'Skipping {MLA_name} due to no important features.')
            return {
                'MLA Name': MLA_name,
                'MLA Parameters': str(alg.get_params()),
                'MLA Train Accuracy': 0,
                'MLA Test Accuracy': 0,
                'MLA Test Accuracy Std': 0,
                'MLA Time': "0 min 0.00 sec",
            }

        # Perform cross-validation
        accuracy_scorer = make_scorer(accuracy_score, greater_is_better=True, needs_proba=False)
        
        cv_results = cross_validate(alg, X[features], y, 
                                    cv=cv_split, 
                                    scoring={'Accuracy': accuracy_scorer}, 
                                    return_train_score=True, 
                                    n_jobs=-1)

        # Format time
        mean_fit_time = cv_results['fit_time'].mean()
        minutes, seconds = divmod(mean_fit_time, 60)

        # Calculate standard deviation
        std_test_accuracy = cv_results['test_Accuracy'].std() if 'test_Accuracy' in cv_results else 0


        print(f'Done with {MLA_name}.')
        
        # Populate results
        return {
            'MLA Name': MLA_name,
            'MLA Parameters': str(alg.get_params()),
            'MLA Train Accuracy': (cv_results['train_Accuracy'].mean()) if 'train_Accuracy' in cv_results else 0,
            'MLA Test Accuracy': (cv_results['test_Accuracy'].mean()) if 'test_Accuracy' in cv_results else 0,
            'MLA Test Accuracy Std': std_test_accuracy,
            'MLA Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

    results_list = []

    # Use ThreadPoolExecutor for parallel execution
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(evaluate_model, alg, idx) for idx, alg in enumerate(models)]
        for future in futures:
            result = future.result()
            if result:
                results_list.append(result)

    # Create a DataFrame from the list of dictionaries
    MLA_compare = pd.DataFrame(results_list)

    # Sort and save results
    MLA_compare.sort_values(by=['MLA Test Accuracy'], ascending=True, inplace=True)
    MLA_compare.to_csv(f'{experiment_name}_results.csv', index=False)

    return MLA_compare

In [12]:
baseline_features = {}

for model in models:
    model_name = model.__class__.__name__

    baseline_features[model_name] = list(X.columns)

In [13]:
baseline_models = evaluate_models_roc(models, X, y, baseline_features, sk10, f'{experiment_name}_roc')
baseline_models

Done with XGBClassifier.
Done with RandomForestClassifier.
Done with LGBMClassifier.


Unnamed: 0,MLA Name,MLA Parameters,MLA Train ROC,MLA Test ROC,MLA Test ROC Std,MLA Time
1,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...",1.0,0.998698,0.002668,0 min 1.55 sec
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",1.0,0.999479,0.001563,0 min 0.20 sec
2,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",1.0,1.0,0.0,0 min 0.45 sec


In [14]:
baseline_models = evaluate_models_accuracy(models, X, y, baseline_features, sk10, f'{experiment_name}_accuracy')
baseline_models

Done with RandomForestClassifier.
Done with LGBMClassifier.
Done with XGBClassifier.


Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy,MLA Test Accuracy,MLA Test Accuracy Std,MLA Time
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",1.0,0.992857,0.009583,0 min 0.20 sec
1,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...",1.0,0.992857,0.009583,0 min 1.13 sec
2,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",1.0,0.994286,0.006999,0 min 0.52 sec


In [15]:
# Set seeds for reproducibility
np.random.seed(42)
random.seed(42)

feat_importance_models = evaluate_models_roc(models, X, y, feat_importance_features, sk10, f'{experiment_name}_featimp_roc')
feat_importance_models

Done with LGBMClassifier.
Done with XGBClassifier.
Done with RandomForestClassifier.


Unnamed: 0,MLA Name,MLA Parameters,MLA Train ROC,MLA Test ROC,MLA Test ROC Std,MLA Time
1,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...",1.0,0.998698,0.002668,0 min 0.20 sec
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",1.0,0.999479,0.001563,0 min 0.15 sec
2,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",1.0,0.999479,0.001562,0 min 0.48 sec


In [16]:
# Set seeds for reproducibility
np.random.seed(42)
random.seed(42)

feat_importance_models = evaluate_models_accuracy(models, X, y, feat_importance_features, sk10, f'{experiment_name}_featimp_accuracy')
feat_importance_models

Done with XGBClassifier.
Done with LGBMClassifier.
Done with RandomForestClassifier.


Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy,MLA Test Accuracy,MLA Test Accuracy Std,MLA Time
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",1.0,0.992857,0.009583,0 min 0.18 sec
1,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...",1.0,0.992857,0.009583,0 min 0.23 sec
2,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",1.0,0.994286,0.009476,0 min 0.49 sec


In [17]:
# Set seeds for reproducibility
np.random.seed(42)
random.seed(42)

rfecv_models = evaluate_models_accuracy(models, X, y, rfecv_features, sk10, f'{experiment_name}_rfecv_accuracy')
rfecv_models

Done with XGBClassifier.
Done with LGBMClassifier.
Done with RandomForestClassifier.


Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy,MLA Test Accuracy,MLA Test Accuracy Std,MLA Time
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",1.0,0.991429,0.009476,0 min 0.13 sec
1,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...",0.99873,0.991429,0.009476,0 min 0.09 sec
2,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.997143,0.997143,0.005714,0 min 0.38 sec


In [23]:
# Set seeds for reproducibility
np.random.seed(42)
random.seed(42)

sfs_models = evaluate_models_accuracy(models, X, y, sfs_features, sk10, f'{experiment_name}_sfs_accuracy')
sfs_models

Done with RandomForestClassifier.
Done with LGBMClassifier.
Done with XGBClassifier.


Unnamed: 0,MLA Name,MLA Parameters,MLA Train Accuracy,MLA Test Accuracy,MLA Test Accuracy Std,MLA Time
1,XGBClassifier,"{'objective': 'binary:logistic', 'use_label_en...",0.997778,0.995714,0.006547,0 min 0.06 sec
0,LGBMClassifier,"{'boosting_type': 'gbdt', 'class_weight': None...",1.0,0.997143,0.005714,0 min 0.06 sec
2,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.997143,0.997143,0.005714,0 min 0.25 sec


# Final Models

In [59]:
model1 = LGBMClassifier(n_jobs=-1, random_state=5)
model2 = XGBClassifier(random_state=5)
model3 = RandomForestClassifier(random_state=5)

In [60]:
lgbm_feats = ['weight_kg', 'base_egg_steps', 'capture_rate', 'attack', 'sp_attack', 'hp']
randomforest_feats = ['base_egg_steps', 'capture_rate']
xgb_feats = ['capture_rate', 'base_total', 'percentage_male', 'against_ghost', 'against_fairy', 'type2_fairy', 'against_fight', 'generation', 'experience_growth']

# Ensembling

In [69]:
model1_results, model2_results, model3_results, y_test_list = [], [], [], []

X_lgbm = X[lgbm_feats]
X_rf = X[randomforest_feats]
X_xgb = X[xgb_feats]

for i, (train_index, test_index) in enumerate(sk10.split(X, y)):
    X_train_lgbm, X_test_lgbm = X_lgbm.iloc[train_index], X_lgbm.iloc[test_index]
    X_train_rf, X_test_rf = X_rf.iloc[train_index], X_rf.iloc[test_index]
    X_train_xgb, X_test_xgb = X_xgb.iloc[train_index], X_xgb.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model1.fit(X_train_lgbm, y_train)
    model1_results.append(model1.predict(X_test_lgbm))

    model2.fit(X_train_xgb, y_train)
    model2_results.append(model2.predict(X_test_xgb))

    model3.fit(X_train_rf, y_train)
    model3_results.append(model3.predict(X_test_rf))

    y_test_list.append(y_test)

    print(f'Done with fold {i+1}.')

Done with fold 1.
Done with fold 2.
Done with fold 3.
Done with fold 4.
Done with fold 5.
Done with fold 6.
Done with fold 7.
Done with fold 8.
Done with fold 9.
Done with fold 10.


In [70]:
model1_results[8]

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0])

In [73]:
model1_weights, model2_weights, model3_weights, scores = [], [], [], []

for i in tqdm(range(20000)):
    weight_1 = np.random.random_sample(size=1)[0]
    weight_2 = np.random.random_sample(size=1)[0]
    weight_3 = np.random.random_sample(size=1)[0]

    model1_weights.append(weight_1)
    model2_weights.append(weight_2)
    model3_weights.append(weight_3)

    scores_in = []

    for j in range(10):
        weighted_pred = weight_1 * model1_results[j] + weight_2 * model2_results[j] + weight_3 * model3_results[j]
        # scores_in.append(roc_auc_score(y_test_list[j], weighted_pred))
        scores_in.append(accuracy_score(y_test_list[j], weighted_pred.astype(int)))

    scores.append(np.mean(scores_in))

  0%|          | 0/20000 [00:00<?, ?it/s]

In [74]:
results_df = pd.DataFrame()
results_df['model_1'] = model1_weights
results_df['model_2'] = model2_weights
results_df['model_3'] = model3_weights
results_df['score'] = scores
results_df = results_df.sort_values(by='score', ascending=False).reset_index(drop=True)
results_df.head(10)

Unnamed: 0,model_1,model_2,model_3,score
0,0.852266,0.132082,0.206277,0.998571
1,0.398644,0.434911,0.669488,0.998571
2,0.2691,0.186554,0.774207,0.998571
3,0.686079,0.095665,0.947099,0.998571
4,0.19931,0.453358,0.936516,0.998571
5,0.591295,0.378306,0.722578,0.998571
6,0.373842,0.010586,0.953921,0.998571
7,0.420709,0.424388,0.826683,0.998571
8,0.811328,0.07312,0.568553,0.998571
9,0.421955,0.325532,0.884591,0.998571


# Get the test data submission (Single Model)

In [None]:
model1 = model1.fit(X, y)

In [None]:
prediction = (model1.predict(test)).astype(int)
prediction_df = pd.DataFrame(prediction)

In [None]:
submission = pd.read_csv('sample_submission_updated.csv')
submission.loc[:, 'is_legendary'] = prediction_df.values
submission.head()

In [None]:
submission.to_csv('submission_0.999479_roc.csv', index=False)

# Get the test data submission (Ensemble Models)

In [75]:
model1 = model1.fit(X_lgbm, y)
model2 = model2.fit(X_xgb, y)
model3 = model3.fit(X_rf, y)

In [76]:
ensemble_pred = (
                results_df['model_1'][0] * model1.predict(test[lgbm_feats]) +
                results_df['model_2'][0] * model2.predict(test[xgb_feats]) +
                results_df['model_3'][0] * model3.predict(test[randomforest_feats])
                 )

ensemble_df = pd.DataFrame(ensemble_pred)
# If all models predict 0, instead of getting NaN, fill in 0
ensemble_df = ensemble_df.div(ensemble_df.sum(axis=1), axis=0).fillna(0)

In [77]:
ensemble_df

Unnamed: 0,0
0,0.0
1,1.0
2,0.0
3,0.0
4,0.0
...,...
96,0.0
97,1.0
98,0.0
99,0.0


In [78]:
submission = pd.read_csv('sample_submission_updated.csv')
submission.loc[:, 'is_legendary'] = ensemble_df.values
submission.head()

Unnamed: 0,ID,is_legendary
0,440,0.0
1,486,1.0
2,130,0.0
3,385,0.0
4,568,0.0


In [58]:
submission.to_csv('submission_0.998571_accuracy.csv', index=False)