STEPS

~~ 1. Because the dataset is more than 1 million rows, run a train_test_split to get the R2 test score and get the LB result for the best model~~

~~ 2. Get a smaller form of the dataset (about 150,000 rows) and get both a train_test_split and 5CV scores then get the LB results for the best model~~

~~ 3. If there is no difference between 1 and 2 then perform SFS on the dataset from 2~~

4. Stack the models using either LinearRegression or Ridge
5. Hyperparameter tuning on each model 5 times
6. Stack all the 54 models
7. Perform feature engineering using OpenFE on the dataset from 2
8. Perform feature selection using FFS in OpenFE on 7 for each model
9. Hyperparameter tune each model from 8 5 times
10. Stack all the 54 models from 9
11. Stack all 108 models from 5 and 9

In [1]:
import warnings
warnings.filterwarnings('ignore')

from catboost import CatBoostRegressor
from concurrent.futures import ThreadPoolExecutor
from lightgbm import LGBMRegressor

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import numpy as np
from openfe import OpenFE, tree_to_formula, transform, TwoStageFeatureSelector, ForwardFeatureSelector
import pandas as pd
from pprint import pprint

import random

from sklearn.ensemble import ExtraTreesRegressor, HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import make_scorer, r2_score
from sklearn.model_selection import cross_validate, KFold, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import time

from xgboost import XGBRegressor

pd.set_option('display.max_columns', None)

experiment_name = 'openfe_smaller_data'

In [2]:
# train = pd.read_csv('train.csv')
train = pd.read_csv('smaller_train.csv')
test = pd.read_csv('test.csv')

train.drop('id', axis=1, inplace=True)
test.drop('id', axis=1, inplace=True)

In [3]:
train.shape, test.shape

((149960, 21), (745305, 20))

In [4]:
features_list = ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
       'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
       'Siltation', 'AgriculturalPractices', 'Encroachments',
       'IneffectiveDisasterPreparedness', 'DrainageSystems',
       'CoastalVulnerability', 'Landslides', 'Watersheds',
       'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
       'InadequatePlanning', 'PoliticalFactors']

In [5]:
TARGET = 'FloodProbability'

In [6]:
train['Sum_All_w_Intercept'] = (train[features_list].sum(axis=1) * 0.0056) - 0.0533
test['Sum_All_w_Intercept'] = (test[features_list].sum(axis=1) * 0.0056) - 0.0533

train['Sum_All'] = train[features_list].sum(axis=1)
test['Sum_All'] = test[features_list].sum(axis=1)

train['Sum_Special'] = (train[features_list].sum(axis=1).isin(np.arange(72, 76))).astype(int)
test['Sum_Special'] = (test[features_list].sum(axis=1).isin(np.arange(72, 76))).astype(int)

In [7]:
train.sample(3)

Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,IneffectiveDisasterPreparedness,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability,Sum_All_w_Intercept,Sum_All,Sum_Special
93203,1,7,3,3,7,4,3,5,8,8,2,5,5,2,6,6,6,4,4,7,0.465,0.4843,96,0
81681,6,5,5,6,3,7,7,4,5,4,8,7,7,5,5,7,5,4,3,4,0.545,0.5459,107,0
74100,6,4,2,4,8,8,5,5,7,6,5,4,5,4,3,7,6,3,5,6,0.525,0.5235,103,0


In [8]:
X = train.drop([TARGET], axis=1)
y = train[TARGET]

n_splits = 5
k5 = KFold(n_splits=n_splits, shuffle=True, random_state=5)

- Define Models list

In [9]:
# Define pipelines
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor(n_neighbors=50))
])

ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    # ('nystroem', Nystroem(n_components=500, random_state=5)),
    ('ridge', Ridge())
])

linear_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', LinearRegression()),
])

# Manually set pipeline names
knn_pipeline.name = 'KNN'
ridge_pipeline.name = 'Nystroem Ridge'
linear_pipeline.name = 'LR Pipeline'

In [10]:
models = [
    # CatBoostRegressor(random_state=5, verbose=False, early_stopping_rounds=100),
    CatBoostRegressor(random_state=5, verbose=False),
    ExtraTreesRegressor(random_state=5),
    HistGradientBoostingRegressor(random_state=5),
    LinearRegression(),
    # linear_pipeline,
    LGBMRegressor(random_state=5, n_jobs=-1),
    RandomForestRegressor(random_state=5),
    knn_pipeline,
    ridge_pipeline,
    XGBRegressor(random_state=5),
]

- Create custom evaluation function

In [11]:
def evaluate_models_cv(models, X, y, important_features, cv_split, experiment_name):
    MLA_compare = pd.DataFrame(columns=['MLA Name', 
                                        'MLA Parameters', 
                                        'MLA Train R2', 
                                        'MLA Test R2', 
                                        'MLA Test R2 Std', 
                                        'MLA Time'])
    
    def evaluate_model(alg, idx):
        if hasattr(alg, 'name'):
            MLA_name = alg.name
        else:
            MLA_name = alg.__class__.__name__
        features = important_features.get(MLA_name, [])

        # Check if the list of important features is empty
        if len(features) == 0:
            # If empty, return results with zero values
            print(f'Skipping {MLA_name} due to no important features.')
            return {
                'MLA Name': MLA_name,
                'MLA Parameters': str(alg.get_params()),
                'MLA Train R2': 0,
                'MLA Test R2': 0,
                'MLA Test R2 Std': 0,
                'MLA Time': "0 min 0.00 sec",
            }
        
        cv_results = cross_validate(alg, 
                                    X[features], 
                                    y, cv=cv_split, 
                                    scoring='r2',
                                    return_train_score=True, 
                                    n_jobs=-1)

        # Time formatting
        mean_fit_time = cv_results['fit_time'].mean()
        minutes, seconds = divmod(mean_fit_time, 60)

        # Results population
        result = {
            'MLA Name': MLA_name,
            'MLA Parameters': str(alg.get_params()),
            'MLA Train R2': cv_results['train_score'].mean(),
            'MLA Test R2': cv_results['test_score'].mean(),
            'MLA Test R2 Std': cv_results['test_score'].std(),
            'MLA Time': f"{int(minutes)} min {seconds:.2f} sec",
        }

        print(f'Done with {MLA_name}.')
        return result

    results_list = []

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(evaluate_model, alg, idx) for idx, alg in enumerate(models)]
        for future in futures:
            result = future.result()
            results_list.append(result)

    MLA_compare = pd.DataFrame(results_list)

    MLA_compare.sort_values(by=['MLA Test R2'], ascending=False, inplace=True)
    MLA_compare.to_csv(f'{experiment_name}_results.csv', index=False)

    return MLA_compare

In [12]:
def evaluate_models_test_train(models, X, y, important_features, cv_split, experiment_name):
    MLA_compare = pd.DataFrame(columns=['MLA Name', 
                                        'MLA Parameters', 
                                        'MLA Train R2', 
                                        'MLA Test R2', 
                                        'MLA Time'])
    
    def evaluate_model(alg, idx):
        if hasattr(alg, 'name'):
            MLA_name = alg.name
        else:
            MLA_name = alg.__class__.__name__

        features = important_features.get(MLA_name, [])

        # Check if the list of important features is empty
        if len(features) == 0:
            # If empty, return results with zero values
            print(f'Skipping {MLA_name} due to no important features.')
            return {
                'MLA Name': MLA_name,
                'MLA Parameters': str(alg.get_params()),
                'MLA Train R2': 0,
                'MLA Test R2': 0,
                'MLA Time': "0 min 0.00 sec",
            }

        X_train, X_test, y_train, y_test = train_test_split(X[features],
                                                            y,
                                                            test_size=0.1,
                                                            stratify=y,
                                                            shuffle=True,
                                                            random_state=5)

        start_time = time.time()
        alg.fit(X_train, y_train)
        end_time = time.time()

        # Evaluate the model
        train_score = r2_score(y_train, alg.predict(X_train))
        test_score = r2_score(y_test, alg.predict(X_test))

        # Results population
        result = {
            'MLA Name': MLA_name,
            'MLA Parameters': str(alg.get_params()),
            'MLA Train R2': train_score,
            'MLA Test R2': test_score,
            'MLA Time': f'{(end_time - start_time) / 60:.2f} min',
        }

        print(f'Done with {MLA_name}.')
        return result

    results_list = []

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(evaluate_model, alg, idx) for idx, alg in enumerate(models)]
        for future in futures:
            result = future.result()
            results_list.append(result)

    MLA_compare = pd.DataFrame(results_list)

    MLA_compare.sort_values(by=['MLA Test R2'], ascending=False, inplace=True)
    MLA_compare.to_csv(f'{experiment_name}_results.csv', index=False)

    return MLA_compare

In [13]:
baseline_features = {}

for model in models:
    if hasattr(model, 'name'):
        model_name = model.name
    else:
        model_name = model.__class__.__name__

    baseline_features[model_name] = list(X.columns)

In [14]:
%%time

baseline_models = evaluate_models_test_train(models, X, y, baseline_features, k5, f'{experiment_name}')
baseline_models

# Raw train Linear Regression score - 0.845309
# Raw train Linear Regression 5CV score - 0.844941 (0.000751)
# Smaller train Linear Regression train_test score - 0.843288
# Smaller train Linear Regression 5CV score - 0.843459 (0.001384)
# Smaller train Linear Regression w/ kaggle discussion features (Sum_All_w_Intercept and Sum_Special) score - 0.849927
# Smaller train Linear Regression w/ default OpenFE 5CV score - 

# 0.864707 LGBM - Sum_All_w_Intercept and Sum_Special
# 0.864829 LGBM - Sum_All ONLY

Done with LinearRegression.
Done with Nystroem Ridge.
Done with CatBoostRegressor.
Done with KNN.
Done with XGBRegressor.
Done with LGBMRegressor.
Done with HistGradientBoostingRegressor.
Done with ExtraTreesRegressor.
Done with RandomForestRegressor.
CPU times: total: 11min 15s
Wall time: 4min 46s


Unnamed: 0,MLA Name,MLA Parameters,MLA Train R2,MLA Test R2,MLA Time
1,ExtraTreesRegressor,"{'bootstrap': False, 'ccp_alpha': 0.0, 'criter...",1.0,0.876312,3.66 min
5,RandomForestRegressor,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",0.98256,0.8739,4.66 min
0,CatBoostRegressor,"{'loss_function': 'RMSE', 'verbose': False, 'r...",0.874549,0.865113,1.18 min
4,LGBMRegressor,"{'boosting_type': 'gbdt', 'class_weight': None...",0.869311,0.864707,1.52 min
2,HistGradientBoostingRegressor,"{'categorical_features': None, 'early_stopping...",0.867356,0.864518,1.55 min
8,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.878386,0.864389,1.47 min
7,Nystroem Ridge,"{'memory': None, 'steps': [('scaler', Standard...",0.849782,0.849928,0.00 min
3,LinearRegression,"{'copy_X': True, 'fit_intercept': True, 'n_job...",0.849781,0.849927,0.01 min
6,KNN,"{'memory': None, 'steps': [('scaler', Standard...",0.813004,0.803755,0.01 min


- SFS

In [None]:
%%time

# Initialize empty dictionary for SFS features
sfs_features = {}

for model in models:
    # set name
    if hasattr(model, 'name'):
        model_name = model.name
    else:
        model_name = model.__class__.__name__

    try:
        features = baseline_features[model_name]

        # incase there is no feature that had importance, go to the next model
        if len(features) == 0:
            continue
        
        X_sfs = X[features]

        print(f'Running backward feature selection with {model_name}')

        sfs = SFS(model,
            k_features='best',
            forward=False,
            floating=True,
            scoring='r2',
            verbose=2,
            n_jobs=7,
            cv=None)
        
        sfs = sfs.fit(X_sfs, y)

        # Get the selected features index
        selected_sfs_idx = list(sfs.k_feature_idx_)

        # Get the feature names
        selected_sfs_feats = X_sfs.columns[selected_sfs_idx]

        selected_features = list(selected_sfs_feats)

        # # Reorder selected_features based on the predefined features_list
        # selected_features_ordered = [feat for feat in features_list if feat in selected_features]

        sfs_features[model_name] = selected_features

        print(f'Done with {model_name}', end='\n\n')

    except KeyError:
        print(f'{model_name} not in the dictionary.')

In [None]:
with open('sfs_features_lgbm.txt', mode='w') as f:
    pprint(sfs_features, stream=f)

In [17]:
sfs_features = {'CatBoostRegressor': ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors', 'Sum_All_w_Intercept', 'Sum_All', 'Sum_Special'],
'ExtraTreesRegressor': ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors', 'Sum_All'],
'HistGradientBoostingRegressor': ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors', 'Sum_All'],
'LinearRegression': ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'InadequatePlanning', 'PoliticalFactors', 'Sum_All_w_Intercept', 'Sum_Special'],
'LGBMRegressor': ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors', 'Sum_All'],
'RandomForestRegressor': ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors', 'Sum_All_w_Intercept', 'Sum_All', 'Sum_Special'],
'KNN': ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors', 'Sum_All_w_Intercept', 'Sum_All', 'Sum_Special'],
'Nystroem Ridge': ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'InadequatePlanning', 'PoliticalFactors', 'Sum_All_w_Intercept', 'Sum_Special'],
'XGBRegressor': ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors', 'Sum_All'],}

In [18]:
%%time

# Set seeds for reproducibility
np.random.seed(42)
random.seed(42)

sfs_models = evaluate_models_test_train(models, X, y, sfs_features, k5, f'{experiment_name}_sfs')
sfs_models

Done with Nystroem Ridge.
Done with LinearRegression.
Done with KNN.
Done with XGBRegressor.
Done with LGBMRegressor.
Done with CatBoostRegressor.
Done with HistGradientBoostingRegressor.
Done with ExtraTreesRegressor.
Done with RandomForestRegressor.
CPU times: total: 13min 19s
Wall time: 5min 35s


Unnamed: 0,MLA Name,MLA Parameters,MLA Train R2,MLA Test R2,MLA Time
1,ExtraTreesRegressor,"{'bootstrap': False, 'ccp_alpha': 0.0, 'criter...",1.0,0.876849,4.20 min
5,RandomForestRegressor,"{'bootstrap': True, 'ccp_alpha': 0.0, 'criteri...",0.98256,0.8739,5.46 min
0,CatBoostRegressor,"{'loss_function': 'RMSE', 'verbose': False, 'r...",0.874549,0.865113,2.15 min
4,LGBMRegressor,"{'boosting_type': 'gbdt', 'class_weight': None...",0.869327,0.864829,2.03 min
8,XGBRegressor,"{'objective': 'reg:squarederror', 'base_score'...",0.87817,0.864683,1.80 min
2,HistGradientBoostingRegressor,"{'categorical_features': None, 'early_stopping...",0.868183,0.864578,2.17 min
3,LinearRegression,"{'copy_X': True, 'fit_intercept': True, 'n_job...",0.849782,0.849928,0.01 min
7,Nystroem Ridge,"{'memory': None, 'steps': [('scaler', Standard...",0.849782,0.849927,0.01 min
6,KNN,"{'memory': None, 'steps': [('scaler', Standard...",0.813004,0.803755,0.01 min


- Single Model Prediction

In [25]:
# model = LinearRegression()
# model = LGBMRegressor(random_state=5, n_jobs=-1)
model = ExtraTreesRegressor(random_state=5)

features = sfs_features['ExtraTreesRegressor']
features

['MonsoonIntensity',
 'TopographyDrainage',
 'RiverManagement',
 'Deforestation',
 'Urbanization',
 'ClimateChange',
 'DamsQuality',
 'Siltation',
 'AgriculturalPractices',
 'Encroachments',
 'IneffectiveDisasterPreparedness',
 'DrainageSystems',
 'CoastalVulnerability',
 'Landslides',
 'Watersheds',
 'DeterioratingInfrastructure',
 'PopulationScore',
 'WetlandLoss',
 'InadequatePlanning',
 'PoliticalFactors',
 'Sum_All']

In [26]:
model.fit(X[features], y)

In [27]:
pred = model.predict(test[features])
pred

array([0.57865, 0.45115, 0.44995, ..., 0.6227 , 0.5486 , 0.53625])

In [28]:
pred_df = pd.DataFrame(pred, columns=[TARGET])
pred_df.head()

Unnamed: 0,FloodProbability
0,0.57865
1,0.45115
2,0.44995
3,0.46515
4,0.47535


In [29]:
submission = pd.read_csv('sample_submission.csv')
submission_df = pd.concat([submission['id'], pred_df], axis=1)
submission_df.head()

Unnamed: 0,id,FloodProbability
0,1117957,0.57865
1,1117958,0.45115
2,1117959,0.44995
3,1117960,0.46515
4,1117961,0.47535


In [30]:
submission_df.to_csv('submission_extrat_small_data_0.876849.csv', index=False)

- Get Stacking score

In [31]:
meta_model = LinearRegression()

In [32]:
%%time

meta_scores = []

for i, (train_idx, meta_idx) in enumerate(k5.split(X)):
    print(f'Fold {i + 1}')
    X_train, X_meta = X.iloc[train_idx], X.iloc[meta_idx]
    y_train, y_meta = y.iloc[train_idx], y.iloc[meta_idx]

    print(X_train.shape, X_meta.shape, y_train.shape, y_meta.shape)
    meta_features_fold = np.zeros((X_meta.shape[0], len(models)))
    # meta_test_features = np.zeros((y.shape[0], len(models)))
    # meta_targets = np.zeros(y.shape[0])

    for i, model in enumerate(models):
        model_name = model.__class__.__name__ if not hasattr(model, 'name') else model.name
        print(f'Starting {model_name}')
        model_features = sfs_features[model_name]
        # model_features = baseline_features[model_name]

        # Fit model on the selected features
        model.fit(X_train[model_features], y_train)
        preds = model.predict(X_meta[model_features])
        meta_features_fold[:, i] = preds

    # Train the meta-model on the predictions from the base models
    meta_model.fit(meta_features_fold, y_meta)
    
    # Predict using the meta-model
    final_preds = meta_model.predict(meta_features_fold)
    
    # Calculate r2_score for the current fold
    current_fold_r2_score = r2_score(y_meta, final_preds)
    meta_scores.append(current_fold_r2_score)

Fold 1
(119968, 23) (29992, 23) (119968,) (29992,)
Starting CatBoostRegressor
Starting ExtraTreesRegressor
Starting HistGradientBoostingRegressor
Starting LinearRegression
Starting LGBMRegressor
Starting RandomForestRegressor
Starting KNN
Starting Nystroem Ridge
Starting XGBRegressor
Fold 2
(119968, 23) (29992, 23) (119968,) (29992,)
Starting CatBoostRegressor
Starting ExtraTreesRegressor
Starting HistGradientBoostingRegressor
Starting LinearRegression
Starting LGBMRegressor
Starting RandomForestRegressor
Starting KNN
Starting Nystroem Ridge
Starting XGBRegressor
Fold 3
(119968, 23) (29992, 23) (119968,) (29992,)
Starting CatBoostRegressor
Starting ExtraTreesRegressor
Starting HistGradientBoostingRegressor
Starting LinearRegression
Starting LGBMRegressor
Starting RandomForestRegressor
Starting KNN
Starting Nystroem Ridge
Starting XGBRegressor
Fold 4
(119968, 23) (29992, 23) (119968,) (29992,)
Starting CatBoostRegressor
Starting ExtraTreesRegressor
Starting HistGradientBoostingRegressor

In [33]:
# Calculate the average RMSLE across all folds
average_r2 = np.mean(meta_scores)
average_r2

# 0.843288

# 0.84242 - Ridge stack
# 0.845440748488788 - LR stack

# 0.8768246026677646 - Features w/ LR stack

0.8768246026677646

- Get stacking submission

In [35]:
%%time

# Retrain base models on all data
all_base_model_predictions = []

for model in models:
    model_name = model.__class__.__name__ if not hasattr(model, 'name') else model.name
    print(f'Starting {model_name}')
    model_features = sfs_features[model_name]

    model.fit(X[model_features], y)
    preds = model.predict(test[model_features])
    all_base_model_predictions.append(preds.reshape(-1, 1))

# Stack predictions for the meta model
X_new_meta = np.hstack(all_base_model_predictions)

# Use the meta model to make final predictions
final_predictions = meta_model.predict(X_new_meta)

Starting CatBoostRegressor
Starting ExtraTreesRegressor
Starting HistGradientBoostingRegressor
Starting LinearRegression
Starting LGBMRegressor
Starting RandomForestRegressor
Starting KNN
Starting Nystroem Ridge
Starting XGBRegressor
CPU times: total: 41min 59s
Wall time: 23min 12s


In [36]:
model_names = []
for model in models:
    model_name = model.__class__.__name__ if not hasattr(model, 'name') else model.name
    model_names.append(model_name)
model_names

['CatBoostRegressor',
 'ExtraTreesRegressor',
 'HistGradientBoostingRegressor',
 'LinearRegression',
 'LGBMRegressor',
 'RandomForestRegressor',
 'KNN',
 'Nystroem Ridge',
 'XGBRegressor']

In [37]:
print('Ensemble weights')
weights = pd.Series(meta_model.coef_, index=model_names)
print(weights)
print(f'Weights total: {weights.sum()}')
print(f'Intercept: {meta_model.intercept_}', end='\n\n')
print(f"Average Stacking RMSLE across all folds: {average_r2:.5f}")

Ensemble weights
CatBoostRegressor                 0.180837
ExtraTreesRegressor               0.768871
HistGradientBoostingRegressor    -0.289552
LinearRegression                 85.901788
LGBMRegressor                     0.381702
RandomForestRegressor             0.044886
KNN                               0.015027
Nystroem Ridge                  -85.943893
XGBRegressor                     -0.061552
dtype: float64
Weights total: 0.9981135550586854
Intercept: 0.0006111252766685116

Average Stacking RMSLE across all folds: 0.87682


In [38]:
final_predictions_df = pd.DataFrame(final_predictions, columns=[TARGET])
final_predictions_df.head()

Unnamed: 0,FloodProbability
0,0.577797
1,0.451557
2,0.450474
3,0.466101
4,0.475134


In [39]:
submission = pd.read_csv('sample_submission.csv')
submission_df = pd.concat([submission['id'], final_predictions_df], axis=1)
submission_df.head()

Unnamed: 0,id,FloodProbability
0,1117957,0.577797
1,1117958,0.451557
2,1117959,0.450474
3,1117960,0.466101
4,1117961,0.475134


In [40]:
submission_df.to_csv('submission_lr_stacking_0.87682.csv', index=False)

### OpenFE

Possible features
1. (Addition of all features * 0.0056) - 0.0533
2. Addition of human activities - ['TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors']
3. Addition of natural occurences - ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors']

In [None]:
['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors']

In [None]:
%%time

ofe = OpenFE()
ofe.fit(
    data=X,
    label=y,
    # n_data_blocks=2,
    # feature_boosting=True,
    task='regression',
    # stage2_metric='permutation',
    # metric='rmse', 
    n_jobs=4,
)

In [None]:
train_x, test_x = transform(X, test, ofe.new_features_list, n_jobs=4 )

In [None]:
baseline_openfe_features = {}

for model in models:
    if hasattr(model, 'name'):
        model_name = model.name
    else:
        model_name = model.__class__.__name__

    baseline_openfe_features[model_name] = list(train_x.columns)

In [None]:
%%time

baseline_openfe_models = evaluate_models_cv(models, X, y, baseline_openfe_features, k5, f'{experiment_name}_openfe')
baseline_openfe_models