In [20]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.ensemble import VotingRegressor, StackingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Never truncate columns when displaying wide frames.
pd.options.display.max_columns = None

In [2]:
# Load the training data. index_col=False tells pandas not to use the first
# column as the index; the comma separator and first-row header are the defaults.
df = pd.read_csv('./train.csv', index_col=False)
df.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,IneffectiveDisasterPreparedness,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
0,0,5,8,5,8,6,4,4,3,3,4,2,5,3,3,5,4,7,5,7,3,0.445
1,1,6,7,4,4,8,8,3,5,4,6,9,7,2,0,3,5,3,3,4,3,0.45
2,2,6,5,6,7,3,7,1,5,4,5,6,7,3,7,5,6,8,2,3,3,0.53
3,3,3,4,6,5,4,8,4,7,6,8,5,2,4,7,4,4,6,5,7,5,0.535
4,4,5,3,2,6,4,4,3,3,3,3,5,2,2,6,6,4,1,2,3,5,0.415


## Data cleaning

In [3]:
def prerpocessing(df):
    """Remove the ``id`` column from ``df`` in place.

    The frame passed in is mutated directly and nothing is returned, so pass
    a copy if the original must stay intact.

    Args:
        df: DataFrame that contains an ``id`` column.

    Raises:
        KeyError: If ``df`` has no ``id`` column (e.g. on a second call).
    """
    df.drop(columns='id', inplace=True)

# Clean a deep copy so the raw `df` loaded earlier keeps its `id` column
# and this cell can be re-run safely.
df_cleaned = df.copy(deep=True)
prerpocessing(df_cleaned)

## Training

In [4]:
# Target is the FloodProbability column; every remaining column is a feature.
y = df_cleaned['FloodProbability']
X = df_cleaned.drop(columns='FloodProbability')

In [5]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [6]:
def train_eval_pipeline(pipeline: Pipeline):
    """
    Fit an estimator on the training split and report its R2 on both splits.

    The data is read from the notebook-level variables ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``. The two R2 scores are printed, not returned.

    Args:
        pipeline (Pipeline): Estimator to train. Only ``fit`` and ``predict``
            are used; the notebook also passes ensemble regressors here.
    Returns:
        fit_model: The object returned by ``pipeline.fit``
        train_preds: Predictions on the train set
        test_preds: Predictions on the test set
    """
    fit_model = pipeline.fit(X_train, y_train)

    # Predict on both splits before scoring either of them.
    train_preds = fit_model.predict(X_train)
    test_preds = fit_model.predict(X_test)

    train_r2 = r2_score(y_train, train_preds)
    print(f"Train R2 score: {train_r2}")

    test_r2 = r2_score(y_test, test_preds)
    print(f"Test R2 score: {test_r2}")

    return fit_model, train_preds, test_preds
    

In [7]:
# CatBoost regressor on standardized features; verbose=0 silences the training log.
catboost_pipeline = make_pipeline(
    StandardScaler(),
    CatBoostRegressor(
        iterations=1000,
        learning_rate=0.1,
        depth=10,
        subsample=1,
        colsample_bylevel=1,
        min_data_in_leaf=100,
        verbose=0,
    ),
)

cb_model, cb_train_preds, cb_test_preds = train_eval_pipeline(catboost_pipeline)

Train R2 score: 0.8614577584219719
Test R2 score: 0.8483489951069415


In [8]:
# Linear-regression baseline on standardized features.
lr_pipeline = make_pipeline(
    StandardScaler(),
    LinearRegression(),
)

lr_model, lr_train_preds, lr_test_preds = train_eval_pipeline(lr_pipeline)

Train R2 score: 0.8446206036673864
Test R2 score: 0.8457091621175303


In [9]:
# XGBoost regressor on standardized features, seeded for repeatability.
xgb_pipeline = make_pipeline(
    StandardScaler(),
    XGBRegressor(
        random_state=0,
        max_depth=5,
        learning_rate=0.3,
        subsample=1,
        min_child_weight=0,
        max_leaves=0,
    ),
)

xgb_model, xgb_train_preds, xgb_test_preds = train_eval_pipeline(xgb_pipeline)

Train R2 score: 0.8186582891594377
Test R2 score: 0.8144888569342431


In [10]:
# LightGBM regressor (shallow gradient-boosted trees) on standardized features.
lgbm_pipeline = make_pipeline(
    StandardScaler(),
    LGBMRegressor(
        random_state=0,
        max_depth=3,
        learning_rate=0.1,
        n_estimators=500,
        boosting_type='gbdt',
    ),
)

lgbm_model, lgbm_train_preds, lgbm_test_preds = train_eval_pipeline(lgbm_pipeline)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022609 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 365
[LightGBM] [Info] Number of data points in the train set: 782569, number of used features: 20
[LightGBM] [Info] Start training from score 0.504469
Train R2 score: 0.8386101908500276
Test R2 score: 0.8382308332921752


### Voting

In [13]:
# Named base pipelines; the same list is reused for the stacking ensemble.
pipelines = [
    ('lr', lr_pipeline),
    ('catboost', catboost_pipeline),
    ('xgb', xgb_pipeline),
    ('lgbm', lgbm_pipeline),
]
# No weights are passed, so every base model contributes equally to the average.
ensemble = VotingRegressor(estimators=pipelines)

In [14]:
# Fit the voting ensemble on the training split and print its train/test R2.
ens_model, ens_train_preds, ens_test_preds = train_eval_pipeline(ensemble)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023069 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 365
[LightGBM] [Info] Number of data points in the train set: 782569, number of used features: 20
[LightGBM] [Info] Start training from score 0.504469
Train R2 score: 0.8484883703996753
Test R2 score: 0.8442679757903377


### Stacking

In [21]:
# Stack the four base pipelines; a CatBoost pipeline is the final estimator
# that learns from the base models' predictions.
stack = StackingRegressor(
    estimators=pipelines,
    final_estimator=catboost_pipeline,
)

In [22]:
# Fit the stacking ensemble on the training split and print its train/test R2.
# NOTE(review): this is the slowest cell — the LightGBM log shows extra fits on
# 626055-row subsets in addition to the full 782569-row fit.
stack_model, stack_train_preds, stack_test_preds = train_eval_pipeline(stack)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.022538 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 365
[LightGBM] [Info] Number of data points in the train set: 782569, number of used features: 20
[LightGBM] [Info] Start training from score 0.504469
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017790 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 366
[LightGBM] [Info] Number of data points in the train set: 626055, number of used features: 20
[LightGBM] [Info] Start training from score 0.504550
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018768 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug

## Create submission

In [23]:
# Load the Kaggle test set (comma separator and first-row header are the defaults).
test_df = pd.read_csv('./test.csv')
test_df.head()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,IneffectiveDisasterPreparedness,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors
0,1117957,4,6,3,5,6,7,8,7,8,4,8,5,7,5,6,3,6,4,4,5
1,1117958,4,4,2,9,5,5,4,7,5,4,2,4,7,4,5,1,7,4,4,3
2,1117959,1,3,6,5,7,2,4,6,4,2,7,9,2,5,5,2,3,6,8,3
3,1117960,2,4,4,6,4,5,4,3,4,4,7,8,4,6,7,6,4,2,4,4
4,1117961,6,3,2,4,6,4,5,5,3,7,4,3,2,6,4,6,8,4,5,5


In [24]:
# Save the ids for the submission file BEFORE they are dropped from the features.
test_ids = test_df['id']
# Drops `id` from test_df in place. Re-running this cell without reloading
# test_df fails with KeyError, because the `id` column is already gone.
prerpocessing(test_df)

### Voting

This reached a score of 0.8432 on Kaggle (the same as combining the models with equal weights of 0.25).

In [17]:
# Predict on the test features (id already dropped) with the fitted voting ensemble.
ens_preds = ens_model.predict(test_df)

### Stacking

In [25]:
# Predict on the test features (id already dropped) with the fitted stacking ensemble.
stack_preds = stack_model.predict(test_df)

This reached a score of 0.8646 on Kaggle.

In [27]:
# Build the submission column by column so each column keeps its own dtype.
# Stacking the two arrays as rows and transposing would create a 2-row frame
# with one column per test row, and would coerce the integer ids to float
# (written to the CSV as e.g. 1117957.0).
submission_df = pd.DataFrame({
    'id': test_ids.values,
    'FloodProbability': stack_preds,
})

In [28]:
# Write the submission; index=False keeps the row index out of the CSV so only
# the `id` and `FloodProbability` columns are written.
submission_df.to_csv('./submission.csv', index=False)