STEPS

~~1. Because the dataset is more than 1 million rows, run a train_test_split to get the R2 test score and get the LB result for the best model~~

~~2. Get a smaller form of the dataset (about 150,000 rows) and get both a train_test_split and 5CV scores then get the LB results for the best model~~

~~3. If there is no difference between 1 and 2 then perform SFS on the dataset from 2~~

4. Stack the models using either LinearRegression or Ridge
5. Hyperparameter tuning on each model 5 times
6. Stack all the 54 models
7. Perform feature engineering using OpenFE on the dataset from 2
8. Perform feature selection using FFS in OpenFE on 7 for each model
9. Hyperparameter-tune each model from step 8, 5 times
10. Stack all the 54 models from 9
11. Stack all 108 models from 5 and 9

In [25]:
import warnings
warnings.filterwarnings('ignore')

from catboost import CatBoostRegressor
from concurrent.futures import ThreadPoolExecutor
from lightgbm import LGBMRegressor

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import numpy as np
from openfe import OpenFE, tree_to_formula, transform, TwoStageFeatureSelector, ForwardFeatureSelector
import os
import pandas as pd
from pprint import pprint

import random

from sklearn.ensemble import ExtraTreesRegressor, HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.inspection import permutation_importance
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import make_scorer, r2_score
from sklearn.model_selection import cross_validate, KFold, train_test_split, StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import time
from tqdm.notebook import tqdm

from xgboost import XGBRegressor

pd.set_option('display.max_columns', None)

experiment_name = 'openfe_smaller_data'

In [2]:
# NOTE(review): `train_big` is still referenced by the single-model prediction
# cells further down, but its definition is commented out here.
# train_big = pd.read_csv('train.csv')

# Read both files and discard the `id` column, which is not used as a feature.
train = pd.read_csv('train.csv').drop(columns='id')
test = pd.read_csv('test.csv').drop(columns='id')

In [3]:
train.shape, test.shape

((1117957, 21), (745305, 20))

In [4]:
features_list = ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
       'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
       'Siltation', 'AgriculturalPractices', 'Encroachments',
       'IneffectiveDisasterPreparedness', 'DrainageSystems',
       'CoastalVulnerability', 'Landslides', 'Watersheds',
       'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
       'InadequatePlanning', 'PoliticalFactors']

In [5]:
TARGET = 'FloodProbability'

In [6]:
# Carve a 10% subset out of `train`, stratified on the target values, so that
# the rest of the notebook can run on a much smaller frame.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.1, random_state=5)

# Five splits are generated but only the last one is kept. This is what the
# previous loop did implicitly by overwriting its result on every iteration,
# so the selected rows are unchanged.
_, test_indx = list(sss.split(train, train[TARGET]))[-1]

# `test_indx` holds *positions*. `DataFrame.drop` expects *labels*, so the
# positions are translated through `train.index`; this stays correct even if
# `train` does not carry a default RangeIndex.
valid_train = train.iloc[test_indx]
train_train = train.drop(index=train.index[test_indx])

len(valid_train), len(train_train)

(111796, 1006161)

In [None]:
# train_big['Sum_All_w_Intercept'] = (train_big[features_list].sum(axis=1) * 0.0056) - 0.0533
# train['Sum_All_w_Intercept'] = (train[features_list].sum(axis=1) * 0.0056) - 0.0533
# test['Sum_All_w_Intercept'] = (test[features_list].sum(axis=1) * 0.0056) - 0.0533

# train_big['Sum_All'] = train_big[features_list].sum(axis=1)
# train['Sum_All'] = train[features_list].sum(axis=1)
# test['Sum_All'] = test[features_list].sum(axis=1)

# train_big['Sum_Special'] = (train_big[features_list].sum(axis=1).isin(np.arange(72, 76))).astype(int)
# train['Sum_Special'] = (train[features_list].sum(axis=1).isin(np.arange(72, 76))).astype(int)
# test['Sum_Special'] = (test[features_list].sum(axis=1).isin(np.arange(72, 76))).astype(int)

In [7]:
train.sample(3)

Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,IneffectiveDisasterPreparedness,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
90092,4,6,5,5,5,3,9,3,4,6,4,4,5,5,7,8,4,4,3,3,0.485
933447,3,5,9,8,3,8,5,5,9,5,4,9,8,5,7,5,4,5,4,5,0.59
845222,6,5,2,6,6,8,2,7,3,6,2,6,4,3,6,2,1,4,3,8,0.46


In [8]:
valid_train.sample(3)

Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,IneffectiveDisasterPreparedness,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
728225,6,5,2,6,4,2,7,6,4,4,3,5,3,8,5,6,2,4,8,6,0.5
1014452,8,5,7,6,2,8,6,5,4,4,6,3,8,7,3,4,6,5,3,5,0.535
547614,7,5,7,9,7,7,4,4,3,3,6,5,5,2,7,8,6,4,3,4,0.545


In [9]:
def get_percentage_of_unique(df, target_variable):
    """Return how often each distinct value of a column occurs, in percent.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``target_variable``.
    target_variable : str
        Name of the column whose values are counted.

    Returns
    -------
    pandas.Series
        Indexed by the distinct values, ordered as ``value_counts`` returns
        them; each entry is ``count / len(df) * 100``.
    """
    # Divide by the total number of rows (not only the counted ones), then scale to percent
    return df[target_variable].value_counts().div(len(df)).mul(100)

# Show the target distribution of the full data, the remainder and the sampled
# subset. Printing in a loop (instead of building a tuple of print() calls)
# keeps the cell from echoing a meaningless `(None, None, None)`.
for frame in (train, train_train, valid_train):
    print(get_percentage_of_unique(frame, TARGET))

FloodProbability
0.490    3.874925
0.495    3.783866
0.520    3.691197
0.485    3.681358
0.505    3.678675
           ...   
0.700    0.002862
0.725    0.002594
0.715    0.002326
0.710    0.002057
0.285    0.001789
Name: count, Length: 83, dtype: float64
FloodProbability
0.490    3.874927
0.495    3.783887
0.520    3.691159
0.485    3.681319
0.505    3.678636
           ...   
0.700    0.002882
0.725    0.002584
0.715    0.002286
0.710    0.002087
0.285    0.001789
Name: count, Length: 83, dtype: float64
FloodProbability
0.490    3.874915
0.495    3.783677
0.520    3.691545
0.485    3.681706
0.505    3.679023
           ...   
0.725    0.002683
0.715    0.002683
0.700    0.002683
0.710    0.001789
0.285    0.001789
Name: count, Length: 83, dtype: float64


(None, None, None)

In [10]:
# All modelling below uses the stratified subset: target in y, everything else in X.
y = valid_train[TARGET]
X = valid_train.drop(columns=[TARGET])

# Shuffled K-fold splitters with a fixed seed, shared by the later cells.
k3, k5, k10 = (KFold(n_splits=n, shuffle=True, random_state=5) for n in (3, 5, 10))

- Define Models list

In [11]:
# Define pipelines
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_neighbors=50))
])

ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    # ('nystroem', Nystroem(n_components=500, random_state=5)),
    ('ridge', Ridge())
])

linear_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', LinearRegression()),
])

# Manually set pipeline names
knn_pipeline.name = 'KNN'
ridge_pipeline.name = 'Nystroem Ridge'
linear_pipeline.name = 'LR Pipeline'

In [12]:
# Candidate base estimators used by every evaluation / selection / stacking cell.
# The three pipelines carry a custom `.name`; all other entries are labelled by
# their class name wherever a model label is needed.
models = [
    CatBoostRegressor(random_state=5, verbose=False, early_stopping_rounds=100),
    ExtraTreesRegressor(random_state=5),
    HistGradientBoostingRegressor(random_state=5),
    LinearRegression(),
    linear_pipeline,
    LGBMRegressor(random_state=5, n_jobs=-1),
    RandomForestRegressor(random_state=5),
    knn_pipeline,
    ridge_pipeline,
    XGBRegressor(random_state=5),
]

- Create custom evaluation function

In [None]:
# def round_to_nearest_005(x):
#     return round(round(x / 0.005) * 0.005, 3)

In [13]:
def evaluate_models_cv(models, X, y, important_features, cv_split, experiment_name):
    """Cross-validate each model on its own feature subset and rank the results.

    Parameters
    ----------
    models : list
        Estimators to evaluate. An estimator may carry a custom ``name``
        attribute; otherwise its class name is used as its label.
    X : pandas.DataFrame
        Feature matrix; it is indexed by column name with each model's features.
    y : array-like
        Target values.
    important_features : dict
        Maps a model label to the list of column names used for that model.
        A model whose label is missing or maps to an empty list is skipped and
        reported with zero scores.
    cv_split : int or cross-validation splitter
        Passed to ``cross_validate`` as ``cv``.
    experiment_name : str
        Prefix of the output file ``{experiment_name}_results.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per model, sorted by mean test R2 (best first). The same table
        is written to ``{experiment_name}_results.csv``.
    """
    result_columns = ['MLA Name',
                      'MLA Parameters',
                      'MLA Train R2',
                      'MLA Test R2',
                      'MLA Test R2 Std',
                      'MLA Time']

    def evaluate_model(alg):
        """Score one estimator and return its result row as a dict."""
        MLA_name = getattr(alg, 'name', alg.__class__.__name__)
        features = important_features.get(MLA_name, [])

        # Nothing to train on: report a placeholder row instead of failing
        if len(features) == 0:
            print(f'Skipping {MLA_name} due to no important features.')
            return {
                'MLA Name': MLA_name,
                'MLA Parameters': str(alg.get_params()),
                'MLA Train R2': 0,
                'MLA Test R2': 0,
                'MLA Test R2 Std': 0,
                'MLA Time': "0 min 0.00 sec",
            }

        cv_results = cross_validate(alg,
                                    X[features],
                                    y, cv=cv_split,
                                    scoring='r2',
                                    return_train_score=True,
                                    n_jobs=-1)

        # The reported time is the mean fit time per fold
        minutes, seconds = divmod(cv_results['fit_time'].mean(), 60)

        result = {
            'MLA Name': MLA_name,
            'MLA Parameters': str(alg.get_params()),
            'MLA Train R2': cv_results['train_score'].mean(),
            'MLA Test R2': cv_results['test_score'].mean(),
            'MLA Test R2 Std': cv_results['test_score'].std(),
            'MLA Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

        print(f'Done with {MLA_name}.')
        return result

    # Models are evaluated concurrently; note that every task additionally runs
    # cross_validate with n_jobs=-1.
    with ThreadPoolExecutor(max_workers=50) as executor:
        futures = [executor.submit(evaluate_model, alg) for alg in tqdm(models, desc='Models')]
        results_list = [future.result()
                        for future in tqdm(futures, total=len(futures), desc='Progress')]

    # Supplying the column list keeps the frame well-formed (and sortable) even
    # when `models` is empty; without it sort_values raised a KeyError.
    MLA_compare = pd.DataFrame(results_list, columns=result_columns)
    MLA_compare = MLA_compare.sort_values(by=['MLA Test R2'], ascending=False)
    MLA_compare.to_csv(f'{experiment_name}_results.csv', index=False)

    return MLA_compare

In [14]:
def evaluate_models_test_train(models, X, y, important_features, cv_split, experiment_name):
    """Score each model on a single stratified 90/10 hold-out split and rank the results.

    Parameters
    ----------
    models : list
        Estimators to evaluate. An estimator may carry a custom ``name``
        attribute; otherwise its class name is used as its label. Each
        evaluated estimator is fitted in place.
    X : pandas.DataFrame
        Feature matrix; it is indexed by column name with each model's features.
    y : pandas.Series or array-like
        Target values; also used as the stratification labels of the split.
    important_features : dict
        Maps a model label to the list of column names used for that model.
        A model whose label is missing or maps to an empty list is skipped and
        reported with zero scores.
    cv_split : object
        Not used by this function; kept so the signature matches
        ``evaluate_models_cv``.
    experiment_name : str
        Prefix of the output file ``{experiment_name}_results.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per model, sorted by hold-out R2 (best first). The same table
        is written to ``{experiment_name}_results.csv``.
    """
    result_columns = ['MLA Name',
                      'MLA Parameters',
                      'MLA Train R2',
                      'MLA Test R2',
                      'MLA Time']

    def evaluate_model(alg):
        """Fit one estimator on the training part and return its result row as a dict."""
        MLA_name = getattr(alg, 'name', alg.__class__.__name__)
        features = important_features.get(MLA_name, [])

        # Nothing to train on: report a placeholder row instead of failing
        if len(features) == 0:
            print(f'Skipping {MLA_name} due to no important features.')
            return {
                'MLA Name': MLA_name,
                'MLA Parameters': str(alg.get_params()),
                'MLA Train R2': 0,
                'MLA Test R2': 0,
                'MLA Time': "0 min 0.00 sec",
            }

        X_train, X_test, y_train, y_test = train_test_split(X[features],
                                                            y,
                                                            test_size=0.1,
                                                            stratify=y,
                                                            shuffle=True,
                                                            random_state=5)

        # Only the fit is timed, not the predictions below
        start_time = time.time()
        alg.fit(X_train, y_train)
        elapsed = time.time() - start_time

        train_score = r2_score(y_train, alg.predict(X_train))
        test_score = r2_score(y_test, alg.predict(X_test))

        # Same "<m> min <s> sec" format as the skipped-model row above and as
        # evaluate_models_cv, so the 'MLA Time' column is uniform.
        minutes, seconds = divmod(elapsed, 60)

        result = {
            'MLA Name': MLA_name,
            'MLA Parameters': str(alg.get_params()),
            'MLA Train R2': train_score,
            'MLA Test R2': test_score,
            'MLA Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

        print(f'Done with {MLA_name}.')
        return result

    with ThreadPoolExecutor(max_workers=50) as executor:
        futures = [executor.submit(evaluate_model, alg) for alg in tqdm(models, desc='Models')]
        results_list = [future.result()
                        for future in tqdm(futures, total=len(futures), desc='Progress')]

    # Supplying the column list keeps the frame well-formed (and sortable) even
    # when `models` is empty; without it sort_values raised a KeyError.
    MLA_compare = pd.DataFrame(results_list, columns=result_columns)
    MLA_compare = MLA_compare.sort_values(by=['MLA Test R2'], ascending=False)
    MLA_compare.to_csv(f'{experiment_name}_results.csv', index=False)

    return MLA_compare

In [15]:
# Baseline feature sets: every model gets its own copy of the full column list of X,
# keyed by the model's custom `.name` when present and by its class name otherwise.
baseline_features = {
    getattr(model, 'name', model.__class__.__name__): list(X.columns)
    for model in models
}

In [None]:
%%time

baseline_models = evaluate_models_cv(models, X, y, baseline_features, k10, f'{experiment_name}')
baseline_models

# Raw train Linear Regression score - 0.845309
# Raw train Linear Regression 5CV score - 0.844941 (0.000751)
# Raw train Linear Regression 10CV score - 0.844941 (0.000817) 32 seconds runtime
# SSS train Linear Regression 10CV score - 0.844951 (0.002986) 2 seconds runtime

# SSS DATA 10CV RESULTS (13 minutes total runtime)
# Nystroem Ridge,0.844950,0.00298,0 min 0.16 sec
# LinearRegression,0.844950,0.00298,0 min 0.23 sec
# LR Pipeline,0.844950,0.00298,0 min 0.30 sec
# CatBoostRegressor,0.844612,0.00274,0 min 57.84 sec
# XGBRegressor,0.786724,0.00345,0 min 30.13 sec
# LGBMRegressor,0.760541,0.00324,0 min 3.84 sec
# HistGradientBoostingRegressor,0.760066,0.00357,0 min 5.60 sec
# KNN,0.647916,0.00289,0 min 0.14 sec
# ExtraTreesRegressor,0.615353,0.00318,2 min 32.38 sec
# RandomForestRegressor,0.613513,0.00339,3 min 5.19 sec

- Feature Importances

In [20]:
# Generate a random feature for X
np.random.seed(5)
X['random_control_feature'] = np.round(np.random.uniform(1, 20, X.shape[0]), 0)
X.shape

(111796, 21)

In [21]:
X.sample(3)

Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,IneffectiveDisasterPreparedness,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,random_control_feature
578354,5,8,6,6,3,1,4,3,4,2,6,3,5,5,8,5,1,3,5,3,5.0
881804,4,5,5,6,4,5,4,5,4,5,8,7,5,8,6,4,5,3,3,4,4.0
262732,5,2,5,3,7,2,2,6,6,5,6,11,3,4,2,6,6,9,3,3,6.0


In [23]:
control_feature = 'random_control_feature'
feat_importance_features = {}

for model in models:
    model_name = getattr(model, 'name', model.__class__.__name__)

    # Importances are summed over the folds and averaged afterwards
    feature_importances = np.zeros(X.shape[1])
    has_importances = True

    for train_index, _ in k3.split(X, y):
        model.fit(X.iloc[train_index], y.iloc[train_index])

        # Only the attribute lookup is allowed to fail quietly; an error raised
        # inside fit() now surfaces instead of being reported as a missing attribute.
        fold_importances = getattr(model, 'feature_importances_', None)
        if fold_importances is None:
            has_importances = False
            break

        feature_importances += fold_importances

    if not has_importances:
        # No importances to rank by: keep every real feature (the control column
        # is an artefact of this analysis, not a feature to model with).
        feat_importance_features[model_name] = [col for col in X.columns if col != control_feature]
        print(f'{model_name} does not have feature_importances_')
        continue

    # Average over however many folds the splitter actually produced
    feature_importances /= k3.get_n_splits()

    df = pd.DataFrame({'Feature': X.columns, 'Avg_Feat_Importance': feature_importances})
    df = df.sort_values(by='Avg_Feat_Importance', ascending=False)

    # Save to CSV
    df.to_csv(f'{model_name}_feature_importances.csv')

    # A feature is kept only if it beats both zero and the random control
    # feature (when that column is present); the control itself never passes
    # the strict comparison.
    control_importance = df.loc[df['Feature'] == control_feature, 'Avg_Feat_Importance']
    fi_threshold = max(0, control_importance.iloc[0]) if not control_importance.empty else 0

    fi_feats = df[df['Avg_Feat_Importance'] > fi_threshold]['Feature'].tolist()

    feat_importance_features[model_name] = fi_feats
    print(f'Done with {model_name}')

Done with CatBoostRegressor
Done with ExtraTreesRegressor
HistGradientBoostingRegressor does not have feature_importances_
LinearRegression does not have feature_importances_
LR Pipeline does not have feature_importances_
Done with LGBMRegressor
Done with RandomForestRegressor
KNN does not have feature_importances_
Nystroem Ridge does not have feature_importances_
Done with XGBRegressor


In [24]:
with open('featimp_features.txt', mode='w') as f:
    pprint(feat_importance_features, stream=f)

*For the models that expose `feature_importances_`, every feature has an importance above zero and ranks above the random control feature.*

*runtime is 7 minutes for feature importances*

- Permutation Importance

In [None]:
%%time

perm_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=5)

perm_importances = {model.__class__.__name__: [] for model in models}

for i, (train_idx, test_idx) in enumerate(perm_cv.split(X, y)):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    for model in models:
        if hasattr(model, 'name'):
            model_name = model.name
        else:
            model_name = model.__class__.__name__

        model.fit(X_train, y_train)
        # Calculate permutation importance
        result = permutation_importance(model, X_test, y_test, n_repeats=3, random_state=5, n_jobs=-1, scoring='r2')
        perm_importances[model_name].append(result.importances_mean)
        print(f'Done with {model_name}.')
    
    print(f'Done with Fold {i+1}', end='\n\n')

In [None]:
%%time

# Average importances across folds and export to CSV
for model_name, importances in perm_importances.items():
    avg_importance = np.mean(importances, axis=0)
    importance_df = pd.DataFrame({'Feature': df_reduced_spear.columns, 'Importance': avg_importance})
    importance_df.sort_values(by='Importance', ascending=False, inplace=True)
    # Export to CSV
    importance_df.to_csv(f'.\permutation_importances\{model_name}_permutation_importance.csv', index=False)

print('Done with Permuation Importances', end='\n\n')

In [None]:
directory = 'permutation_importances'

# Initialize a dictionary for the features
perm_important_features = {}

for model in models:
    # Use the custom `.name` when present, like the evaluation helpers do when
    # they look a model up in a feature dictionary. With the bare class name all
    # three pipelines collapsed to the single key 'Pipeline'.
    model_name = getattr(model, 'name', model.__class__.__name__)
    csv_path = os.path.join(directory, f'{model_name}_permutation_importance.csv')

    if not os.path.exists(csv_path):
        print(f'CSV file for {model_name} not found.')
        continue

    df = pd.read_csv(csv_path)

    # Importance of the random control feature, or 0 when it is not in the file
    control_rows = df.loc[df['Feature'] == 'random_control_feature', 'Importance']
    random_feature_importance = control_rows.iloc[0] if not control_rows.empty else 0

    # A feature must beat both zero and the random control feature
    threshold = max(0, random_feature_importance)
    important_feats_filtered = set(df.loc[df['Importance'] > threshold, 'Feature'])

    # Keep the order of the predefined features_list (this also drops any
    # column that is not one of the original features)
    perm_important_features[model_name] = [feat for feat in features_list if feat in important_feats_filtered]

print('Done getting important features dictionary')

In [None]:
# Remove the control column now that the importance analysis is finished.
# Rebinding with errors='ignore' makes the cell safe to run more than once
# (the in-place drop raised a KeyError on the second run).
X = X.drop(columns='random_control_feature', errors='ignore')

- SFS

In [None]:
%%time

# Initialize empty dictionary for SFS features
sfs_features = {}

for model in models:
    # set name
    if hasattr(model, 'name'):
        model_name = model.name
    else:
        model_name = model.__class__.__name__

    try:
        features = baseline_features[model_name]

        # incase there is no feature that had importance, go to the next model
        if len(features) == 0:
            continue
        
        X_sfs = X[features]

        print(f'Running backward feature selection with {model_name}')

        sfs = SFS(model,
            k_features='best',
            forward=False,
            floating=True,
            scoring='r2',
            verbose=2,
            n_jobs=-1,
            cv=None)
        
        sfs = sfs.fit(X_sfs, y)

        # Get the selected features index
        selected_sfs_idx = list(sfs.k_feature_idx_)

        # Get the feature names
        selected_sfs_feats = X_sfs.columns[selected_sfs_idx]

        selected_features = list(selected_sfs_feats)

        # # Reorder selected_features based on the predefined features_list
        # selected_features_ordered = [feat for feat in features_list if feat in selected_features]

        sfs_features[model_name] = selected_features

        print(f'Done with {model_name}', end='\n\n')

    except KeyError:
        print(f'{model_name} not in the dictionary.')

In [None]:
with open('sfs_features_lgbm.txt', mode='w') as f:
    pprint(sfs_features, stream=f)

In [None]:
sfs_features = {'CatBoostRegressor': ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors', 'Sum_All_w_Intercept', 'Sum_All', 'Sum_Special'],
'ExtraTreesRegressor': ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors', 'Sum_All'],
'HistGradientBoostingRegressor': ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors', 'Sum_All'],
'LinearRegression': ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'InadequatePlanning', 'PoliticalFactors', 'Sum_All_w_Intercept', 'Sum_Special'],
'LGBMRegressor': ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors', 'Sum_All'],
'RandomForestRegressor': ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors', 'Sum_All_w_Intercept', 'Sum_All', 'Sum_Special'],
'KNN': ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors', 'Sum_All_w_Intercept', 'Sum_All', 'Sum_Special'],
'Nystroem Ridge': ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'InadequatePlanning', 'PoliticalFactors', 'Sum_All_w_Intercept', 'Sum_Special'],
'XGBRegressor': ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors', 'Sum_All'],}

In [None]:
%%time

# Set seeds for reproducibility
np.random.seed(42)
random.seed(42)

# Re-score every model with 3-fold CV on its SFS-selected features.
# NOTE(review): the hard-coded `sfs_features` lists 'Sum_All_w_Intercept',
# 'Sum_All' and 'Sum_Special', but the cell that creates those columns is
# commented out; evaluate_models_cv indexes X[features], so confirm X actually
# contains them before running. 'LR Pipeline' has no entry in that dictionary
# and will therefore be reported as skipped.
sfs_models = evaluate_models_cv(models, X, y, sfs_features, k3, f'{experiment_name}_sfs')
sfs_models

- Single Model Prediction

In [None]:
# model = LinearRegression()
# model = LGBMRegressor(random_state=5, n_jobs=-1)
model = ExtraTreesRegressor(random_state=5)

features = sfs_features['ExtraTreesRegressor']
features

In [None]:
train_big[train_big['Sum_All'].isna()]

In [None]:
%%time

# Fit the chosen single model on `train_big` using the selected features.
# NOTE(review): `train_big` only appears in commented-out code in this notebook
# (its read_csv line is disabled) — confirm it is defined before running.
model.fit(train_big[features], train_big[TARGET])

In [None]:
pred = model.predict(test[features])
pred

In [None]:
pred_df = pd.DataFrame(pred, columns=[TARGET])
pred_df.head()

In [None]:
# pred_df['FloodProbability_rounded'] = pred_df['FloodProbability'].apply(round_to_nearest_005)
# pred_df.head()

In [None]:
submission = pd.read_csv('sample_submission.csv')
submission_df = pd.concat([submission['id'], pred_df], axis=1)
submission_df.columns = ['id', 'FloodProbability']
submission_df.head()

In [None]:
submission_df.to_csv('submission_extrat_0.871479_3cv.csv', index=False)

- Get Stacking score

In [None]:
meta_model = LinearRegression()

In [None]:
%%time

meta_scores = []

for i, (train_idx, meta_idx) in enumerate(k5.split(X)):
    print(f'Fold {i + 1}')
    X_train, X_meta = X.iloc[train_idx], X.iloc[meta_idx]
    y_train, y_meta = y.iloc[train_idx], y.iloc[meta_idx]

    print(X_train.shape, X_meta.shape, y_train.shape, y_meta.shape)
    meta_features_fold = np.zeros((X_meta.shape[0], len(models)))
    # meta_test_features = np.zeros((y.shape[0], len(models)))
    # meta_targets = np.zeros(y.shape[0])

    for i, model in enumerate(models):
        model_name = model.__class__.__name__ if not hasattr(model, 'name') else model.name
        print(f'Starting {model_name}')
        model_features = sfs_features[model_name]
        # model_features = baseline_features[model_name]

        # Fit model on the selected features
        model.fit(X_train[model_features], y_train)
        preds = model.predict(X_meta[model_features])
        meta_features_fold[:, i] = preds

    # Train the meta-model on the predictions from the base models
    meta_model.fit(meta_features_fold, y_meta)
    
    # Predict using the meta-model
    final_preds = meta_model.predict(meta_features_fold)
    
    # Calculate r2_score for the current fold
    current_fold_r2_score = r2_score(y_meta, final_preds)
    meta_scores.append(current_fold_r2_score)

In [None]:
# Average the per-fold stacking R2 scores collected in `meta_scores`
average_r2 = np.mean(meta_scores)
average_r2

# 0.843288

# 0.84242 - Ridge stack
# 0.845440748488788 - LR stack

# 0.8768246026677646 - Features w/ LR stack

- Get stacking submission

In [None]:
%%time

# Retrain base models on all data
all_base_model_predictions = []

for model in models:
    model_name = model.__class__.__name__ if not hasattr(model, 'name') else model.name
    print(f'Starting {model_name}')
    model_features = sfs_features[model_name]

    model.fit(X[model_features], y)
    preds = model.predict(test[model_features])
    all_base_model_predictions.append(preds.reshape(-1, 1))

# Stack predictions for the meta model
X_new_meta = np.hstack(all_base_model_predictions)

# Use the meta model to make final predictions
final_predictions = meta_model.predict(X_new_meta)

In [None]:
# Labels of the base models in the same order as `models`
# (custom `.name` when present, class name otherwise).
model_names = [getattr(model, 'name', model.__class__.__name__) for model in models]
model_names

In [None]:
# Report the fitted meta-model: one coefficient per base model (labelled with
# `model_names`), the sum of the coefficients, the intercept, and the mean
# stacking R2 computed earlier.
print('Ensemble weights')
weights = pd.Series(meta_model.coef_, index=model_names)
print(weights)
print(f'Weights total: {weights.sum()}')
print(f'Intercept: {meta_model.intercept_}', end='\n\n')
print(f"Average Stacking R2 across all folds: {average_r2:.5f}")

In [None]:
final_predictions_df = pd.DataFrame(final_predictions, columns=[TARGET])
final_predictions_df.head()

In [None]:
submission = pd.read_csv('sample_submission.csv')
submission_df = pd.concat([submission['id'], final_predictions_df], axis=1)
submission_df.head()

In [None]:
submission_df.to_csv('submission_lr_stacking_0.87682.csv', index=False)

### OpenFE

Possible features
1. (Addition of all features * 0.0056) - 0.0533
2. Addition of human activities - ['TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors']
3. Addition of natural occurrences - ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors']

In [None]:
['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors']

In [None]:
%%time

ofe = OpenFE()
ofe.fit(
    data=X,
    label=y,
    # n_data_blocks=2,
    # feature_boosting=True,
    task='regression',
    # stage2_metric='permutation',
    # metric='rmse', 
    n_jobs=4,
)

In [None]:
train_x, test_x = transform(X, test, ofe.new_features_list, n_jobs=4 )

In [None]:
# Every model is evaluated on the full column list of the OpenFE-transformed
# training frame, keyed by the model's custom `.name` or its class name.
baseline_openfe_features = {
    getattr(model, 'name', model.__class__.__name__): list(train_x.columns)
    for model in models
}

In [None]:
%%time

baseline_openfe_models = evaluate_models_cv(models, X, y, baseline_openfe_features, k5, f'{experiment_name}_openfe')
baseline_openfe_models