In [None]:
import numpy as np 
import pandas as pd 
import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [None]:
# Load the competition training file into a DataFrame.
flood_dataset = pd.read_csv("/kaggle/input/playground-series-s4e5/train.csv")

In [None]:
# Summary statistics, transposed so that each original column is shown as a row.
flood_dataset.describe().T

# **Data Preprocessing: row-wise (horizontal) statistics across the feature columns**

In [None]:
def add_features(df):
    """Append five row-wise summary columns to ``df`` and return it.

    Every statistic is computed across the columns ``df`` has at the moment
    of the call, so the columns added here never feed into one another.
    ``df`` is modified in place and the very same object is returned.

    Columns added, in this order:
        std_features    -- row standard deviation
        mean_features   -- row mean multiplied by 0.1
        median_features -- row median multiplied by 0.1
        max_features    -- row maximum
        min_features    -- row minimum
    """
    source_columns = list(df.columns)
    # Selecting with a list yields a separate frame, so adding columns to
    # ``df`` below does not change what the statistics are computed over.
    base = df[source_columns]

    row_statistics = (
        ('std_features', lambda block: block.std(axis=1)),
        ('mean_features', lambda block: 0.1 * block.mean(axis=1)),
        ('median_features', lambda block: 0.1 * block.median(axis=1)),
        ('max_features', lambda block: block.max(axis=1)),
        ('min_features', lambda block: block.min(axis=1)),
    )
    for column_name, compute in row_statistics:
        df[column_name] = compute(base)
    return df

In [None]:
# Independent working frame without the 'id' column; the raw `flood_dataset`
# frame is left untouched.
flood_dataset_df = flood_dataset.drop(columns=['id'])

In [None]:
# `df` keeps referring to the frame that still holds the target column;
# `df1` is a new frame containing only the predictor columns.
df = flood_dataset_df
df1 = df.drop(columns=["FloodProbability"])

In [None]:
# Keep the target aside so it can be re-attached after feature engineering.
y = df["FloodProbability"]
# Row-wise statistics are computed on the predictors only (the target is not in df1).
flood_dataset_df = add_features(df1)

In [None]:
def add_more_features(flood_dataset_df):
    """Append five hand-built composite columns and return the same frame.

    Each composite is a signed combination of existing columns:
    ``weather``, ``structure``, ``soil_infrastructure`` and ``preparedness``
    are signed sums, ``population`` is a negated product. The frame passed in
    is modified in place.
    """
    data = flood_dataset_df  # alias only; writes land on the caller's frame

    data['weather'] = -data['MonsoonIntensity'] - data['ClimateChange']

    data['structure'] = (
        data['TopographyDrainage']
        + data['RiverManagement']
        - data['Deforestation']
        + data['DamsQuality']
        + data['Encroachments']
        + data['DrainageSystems']
        - data['Watersheds']
        - data['DeterioratingInfrastructure']
        + data['WetlandLoss']
    )

    data['population'] = -data['Urbanization'] * data['PopulationScore']

    data['soil_infrastructure'] = (
        -data['Siltation']
        + data['AgriculturalPractices']
        - data['Landslides']
    )

    data['preparedness'] = (
        -data['IneffectiveDisasterPreparedness']
        + data['CoastalVulnerability']
        - data['InadequatePlanning']
        - data['PoliticalFactors']
    )
    return data

In [None]:
# Add the hand-built composite columns (weather, structure, population, ...) to the training features.
flood_dataset_df = add_more_features(flood_dataset_df)

In [None]:
# Re-attach the target so features and label live in one table again.
flood_dataset_df["FloodProbability"]=y

In [None]:
# Check column dtypes and non-null counts after feature engineering.
flood_dataset_df.info()

In [None]:
# Preview the first five engineered rows.
flood_dataset_df.head(5)

In [None]:
# Save the engineered training table. The path is relative to the current
# working directory; later cells read it back from /kaggle/working, so this
# assumes the notebook runs with that directory as its working directory.
flood_dataset_df.to_csv('train_without_preprocessing.csv', index=False)

In [None]:
# Unbind the name before it is reused for the test set (presumably also to free memory).
del flood_dataset_df

# **Apply the same feature engineering to the test dataset**

In [None]:
# NOTE: `flood_dataset` is rebound here from the training data to the test data.
flood_dataset = pd.read_csv("/kaggle/input/playground-series-s4e5/test.csv")

# Independent working frame without the 'id' column. The test file has no
# target column, so nothing else needs to be removed before feature engineering.
df = flood_dataset.drop(columns=['id'])

# Both helpers modify the frame in place and return it, so `df` and
# `flood_dataset_df` end up referring to the same engineered frame.
flood_dataset_df = add_more_features(add_features(df))

flood_dataset_df.info()

In [None]:
# Save the engineered test features (path relative to the current working directory).
flood_dataset_df.to_csv('test_data.csv', index=False)

# **Data Preprocessing: column-wise (vertical) features, with each column scaled by its own mean**

In [None]:
# Reload the engineered training table written earlier in this notebook.
flood_dataset_df = pd.read_csv("/kaggle/working/train_without_preprocessing.csv")

In [None]:
# List the available column names before building the per-column ratio features.
flood_dataset_df.columns

In [None]:
def add_manipulated_features(df):
    """Append an ``all_row_<column>`` ratio for each listed column and return ``df``.

    Each new column is the source column divided by that column's mean, where
    the mean is taken over all rows of the frame passed in. ``df`` is modified
    in place; the new columns are appended in the order listed below.
    """
    ratio_sources = (
        'MonsoonIntensity',
        'TopographyDrainage',
        'RiverManagement',
        'Deforestation',
        'Urbanization',
        'ClimateChange',
        'DamsQuality',
        'Siltation',
        'AgriculturalPractices',
        'Encroachments',
        'IneffectiveDisasterPreparedness',
        'DrainageSystems',
        'CoastalVulnerability',
        'Landslides',
        'Watersheds',
        'DeterioratingInfrastructure',
        'PopulationScore',
        'WetlandLoss',
        'InadequatePlanning',
        'PoliticalFactors',
        'weather',
        'structure',
        'population',
        'soil_infrastructure',
        'preparedness',
    )
    for source in ratio_sources:
        values = df[source]
        df[f'all_row_{source}'] = values / values.mean()
    return df

In [None]:
# Add the mean-ratio (`all_row_*`) columns to the training table; means are taken over the training rows.
flood_dataset_df = add_manipulated_features(flood_dataset_df)

In [None]:
# Preview the first five rows, now including the `all_row_*` columns.
flood_dataset_df.head(5)

In [None]:
# Save the training table with the mean-ratio columns, then unbind the name
# (presumably to free memory before the test set is processed).
flood_dataset_df.to_csv('train_preprocessing.csv', index=False)
del flood_dataset_df

In [None]:
# Build and save the mean-ratio columns for the engineered test features.
# NOTE(review): add_manipulated_features divides by the means of the frame it
# is given, so these test columns are scaled by test-set means while the
# training file was scaled by training-set means — confirm this is intended.
flood_dataset_df = pd.read_csv("/kaggle/working/test_data.csv")
flood_dataset_df = add_manipulated_features(flood_dataset_df)
flood_dataset_df.to_csv('test_data_featured.csv', index=False)

In [None]:
# Unbind the name before it is reused for the modelling data.
del flood_dataset_df

In [None]:
# Load the training table that was saved BEFORE the mean-ratio columns were added.
# NOTE(review): the models trained below therefore never see the `all_row_*`
# columns written to train_preprocessing.csv — confirm this is intended.
flood_dataset_df = pd.read_csv("/kaggle/working/train_without_preprocessing.csv")

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Split the engineered training table into predictors and target.
dataX = flood_dataset_df.drop(['FloodProbability'], axis = 1)
datay = flood_dataset_df['FloodProbability']

# Hold out 30% of the rows for validation; the fixed random_state makes the
# split reproducible, and the same split is reused by later model cells.
X_train, X_test, Y_train, Y_test = train_test_split(dataX, datay, test_size = 0.3, random_state = 42)

from xgboost import XGBRegressor
xgb = XGBRegressor(learning_rate=0.01, n_estimators=3460,
                                     max_depth=3, min_child_weight=0,
                                     gamma=0, subsample=0.7,
                                     colsample_bytree=0.7,
                                     objective='reg:squarederror', nthread=-1,
                                     scale_pos_weight=1, seed=27,
                                     reg_alpha=0.00006)
xgb_model = xgb.fit(X_train , Y_train)

y_pred_xgboost = xgb_model.predict(X_test)

# RMSE on the original target scale. The earlier version took np.log of both
# the targets and the predictions first, so the value printed as "RMSE" was
# really an RMSE of logs, and it would turn into NaN for any non-positive
# prediction.
xgboost_rmse_calculator = np.sqrt(mean_squared_error(Y_test, y_pred_xgboost))
xgboost_r2_metric = r2_score(Y_test, y_pred_xgboost)

print(f"Extreme Gradient Boosting RMSE Metric: {xgboost_rmse_calculator}")
print(f"Extreme Gradient Boosting R-squared Metric: {xgboost_r2_metric}")

In [None]:
from lightgbm import LGBMRegressor

# Hyper-parameters for the LightGBM regressor.
# NOTE(review): LightGBM applies row subsampling only when `subsample_freq`
# is greater than 0 (its default is 0), so `subsample` is presumably inactive
# with this configuration — confirm before tuning it.
lgb_params = {
    'boosting_type': 'gbdt', 
    'n_estimators':2000, 
    'learning_rate' :  0.012,
    'num_leaves' : 250, 
    'subsample_for_bin': 165700, 
    'min_child_samples': 114, 
    'reg_alpha': 2.075e-06, 
    'reg_lambda': 3.839e-07, 
    'colsample_bytree': 0.9634,
    'subsample': 0.9592, 
    'max_depth': 10,
    'random_state':0,
    'verbosity':-1}

# The dict above is named `lgb_params`; the earlier code unpacked the
# undefined name `lgbm_params`, which raised NameError on a fresh kernel.
lgbm = LGBMRegressor(**lgb_params)

# Train on the same train/validation split used for the XGBoost model.
lgbm_model = lgbm.fit(X_train , Y_train)

y_pred_lgbm = lgbm_model.predict(X_test)

# RMSE on the original target scale. Taking np.log of targets and predictions
# first would report an RMSE of logs under the "RMSE" label and would yield
# NaN for any non-positive prediction.
lgbm_rmse_calculator = np.sqrt(mean_squared_error(Y_test, y_pred_lgbm))
lgbm_r2_metric = r2_score(Y_test, y_pred_lgbm)

print(f"Light Gradient Boosting RMSE Metric: {lgbm_rmse_calculator}")
print(f"Light Gradient Boosting R-squared Metric: {lgbm_r2_metric}")

In [None]:
# Engineered test features written earlier in this notebook.
testddata = pd.read_csv("/kaggle/working/test_data.csv")

# The raw test file is read again only to recover the 'id' column, which was
# dropped before feature engineering.
input_test = pd.read_csv('/kaggle/input/playground-series-s4e5/test.csv')
id_no = input_test['id']

# Predict with the fitted LightGBM model.
test_pred_lgbm = lgbm_model.predict(testddata)

# Two-column submission table: the ids next to the predicted probabilities.
submission = input_test[['id']].assign(FloodProbability=test_pred_lgbm)

print(submission.head())
submission.to_csv('/kaggle/working/submission.csv', index=False)