In [1]:
import numpy as np 
import pandas as pd 
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/playground-series-s4e5/sample_submission.csv
/kaggle/input/playground-series-s4e5/train.csv
/kaggle/input/playground-series-s4e5/test.csv


In [2]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [3]:
flood_dataset = pd.read_csv("/kaggle/input/playground-series-s4e5/train.csv")

In [4]:
flood_dataset.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,1117957.0,558978.0,322726.531784,0.0,279489.0,558978.0,838467.0,1117956.0
MonsoonIntensity,1117957.0,4.92145,2.056387,0.0,3.0,5.0,6.0,16.0
TopographyDrainage,1117957.0,4.926671,2.093879,0.0,3.0,5.0,6.0,18.0
RiverManagement,1117957.0,4.955322,2.072186,0.0,4.0,5.0,6.0,16.0
Deforestation,1117957.0,4.94224,2.051689,0.0,4.0,5.0,6.0,17.0
Urbanization,1117957.0,4.942517,2.083391,0.0,3.0,5.0,6.0,17.0
ClimateChange,1117957.0,4.934093,2.057742,0.0,3.0,5.0,6.0,17.0
DamsQuality,1117957.0,4.955878,2.083063,0.0,4.0,5.0,6.0,16.0
Siltation,1117957.0,4.927791,2.065992,0.0,3.0,5.0,6.0,16.0
AgriculturalPractices,1117957.0,4.942619,2.068545,0.0,3.0,5.0,6.0,16.0


# **Data Preprocessing: (for each columns in Horizontal ways)**

In [5]:
def add_features(df):
    addfeatures = df.columns.tolist()
    df['std_features'] = df[addfeatures].std(axis=1)
    df['mean_features'] = 0.1*df[addfeatures].mean(axis=1)
    df['median_features'] = 0.1*df[addfeatures].median(axis=1)
    df['max_features'] = df[addfeatures].max(axis=1)
    df['min_features'] = df[addfeatures].min(axis=1)    
    return df

In [6]:
flood_dataset_df=flood_dataset.copy()
flood_dataset_df.drop(['id'], axis = 1, inplace = True)

In [7]:
df = flood_dataset_df
df1 = df.drop(["FloodProbability"], axis=1)

In [8]:
flood_dataset_df = add_features(df1)
y = df["FloodProbability"]

In [9]:
def add_more_features(flood_dataset_df):
    flood_dataset_df['weather'] = - flood_dataset_df['MonsoonIntensity'] - flood_dataset_df['ClimateChange']
    flood_dataset_df['structure'] = flood_dataset_df['TopographyDrainage'] + flood_dataset_df['RiverManagement'] - flood_dataset_df['Deforestation'] + flood_dataset_df['DamsQuality'] + flood_dataset_df['Encroachments'] + flood_dataset_df['DrainageSystems'] - flood_dataset_df['Watersheds'] - flood_dataset_df['DeterioratingInfrastructure'] + flood_dataset_df['WetlandLoss'] 
    flood_dataset_df['population'] = - flood_dataset_df['Urbanization'] * flood_dataset_df['PopulationScore']
    flood_dataset_df['soil_infrastructure'] = - flood_dataset_df['Siltation'] + flood_dataset_df['AgriculturalPractices'] - flood_dataset_df['Landslides']
    flood_dataset_df['preparedness'] = - flood_dataset_df['IneffectiveDisasterPreparedness'] + flood_dataset_df['CoastalVulnerability'] - flood_dataset_df['InadequatePlanning'] - flood_dataset_df['PoliticalFactors'] 
    return flood_dataset_df

In [10]:
flood_dataset_df = add_more_features(flood_dataset_df)

In [11]:
flood_dataset_df["FloodProbability"]=y

In [12]:
flood_dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1117957 entries, 0 to 1117956
Data columns (total 31 columns):
 #   Column                           Non-Null Count    Dtype  
---  ------                           --------------    -----  
 0   MonsoonIntensity                 1117957 non-null  int64  
 1   TopographyDrainage               1117957 non-null  int64  
 2   RiverManagement                  1117957 non-null  int64  
 3   Deforestation                    1117957 non-null  int64  
 4   Urbanization                     1117957 non-null  int64  
 5   ClimateChange                    1117957 non-null  int64  
 6   DamsQuality                      1117957 non-null  int64  
 7   Siltation                        1117957 non-null  int64  
 8   AgriculturalPractices            1117957 non-null  int64  
 9   Encroachments                    1117957 non-null  int64  
 10  IneffectiveDisasterPreparedness  1117957 non-null  int64  
 11  DrainageSystems                  1117957 non-null 

In [13]:
flood_dataset_df.head(5)

Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,...,mean_features,median_features,max_features,min_features,weather,structure,population,soil_infrastructure,preparedness,FloodProbability
0,5,8,5,8,6,4,4,3,3,4,...,0.47,0.45,8,2,-9,14,-42,-3,-9,0.445
1,6,7,4,4,8,8,3,5,4,6,...,0.47,0.4,9,0,-14,18,-24,-1,-14,0.45
2,6,5,6,7,3,7,1,5,4,5,...,0.495,0.5,8,1,-13,8,-24,-8,-9,0.53
3,3,4,6,5,4,8,4,7,6,8,...,0.52,0.5,8,2,-11,16,-24,-8,-13,0.535
4,5,3,2,6,4,4,3,3,3,3,...,0.36,0.3,6,1,-9,-1,-4,-6,-11,0.415


In [14]:
flood_dataset_df.to_csv('train_without_preprocessing.csv', index=False)

In [15]:
del flood_dataset_df

# **For Test Dataset!!**

In [16]:
flood_dataset = pd.read_csv("/kaggle/input/playground-series-s4e5/test.csv")

flood_dataset_df=flood_dataset.copy()
flood_dataset_df.drop(['id'], axis = 1, inplace = True)

df = flood_dataset_df
flood_dataset_df = add_features(df)

flood_dataset_df = add_more_features(flood_dataset_df)

flood_dataset_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 745305 entries, 0 to 745304
Data columns (total 30 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   MonsoonIntensity                 745305 non-null  int64  
 1   TopographyDrainage               745305 non-null  int64  
 2   RiverManagement                  745305 non-null  int64  
 3   Deforestation                    745305 non-null  int64  
 4   Urbanization                     745305 non-null  int64  
 5   ClimateChange                    745305 non-null  int64  
 6   DamsQuality                      745305 non-null  int64  
 7   Siltation                        745305 non-null  int64  
 8   AgriculturalPractices            745305 non-null  int64  
 9   Encroachments                    745305 non-null  int64  
 10  IneffectiveDisasterPreparedness  745305 non-null  int64  
 11  DrainageSystems                  745305 non-null  int64  
 12  Co

In [17]:
flood_dataset_df.to_csv('test_data.csv', index=False)

# **Data Preprocessing: (add more features for each subjective columns in vertical ways)**

In [18]:
flood_dataset_df = pd.read_csv("/kaggle/working/train_without_preprocessing.csv")

In [19]:
flood_dataset_df.columns

Index(['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
       'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
       'Siltation', 'AgriculturalPractices', 'Encroachments',
       'IneffectiveDisasterPreparedness', 'DrainageSystems',
       'CoastalVulnerability', 'Landslides', 'Watersheds',
       'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
       'InadequatePlanning', 'PoliticalFactors', 'std_features',
       'mean_features', 'median_features', 'max_features', 'min_features',
       'weather', 'structure', 'population', 'soil_infrastructure',
       'preparedness', 'FloodProbability'],
      dtype='object')

In [20]:
def add_manipulated_features(df):    
    df['all_row_MonsoonIntensity'] = df['MonsoonIntensity'] / df['MonsoonIntensity'].mean()   
    df['all_row_TopographyDrainage'] = df['TopographyDrainage'] / df['TopographyDrainage'].mean() 
    df['all_row_RiverManagement'] = df['RiverManagement'] / df['RiverManagement'].mean() 
    df['all_row_Deforestation'] = df['Deforestation'] / df['Deforestation'].mean() 
    df['all_row_Urbanization'] = df['Urbanization'] / df['Urbanization'].mean() 
    df['all_row_ClimateChange'] = df['ClimateChange'] / df['ClimateChange'].mean() 
    df['all_row_DamsQuality'] = df['DamsQuality'] / df['DamsQuality'].mean() 
    df['all_row_Siltation'] = df['Siltation'] / df['Siltation'].mean() 
    df['all_row_AgriculturalPractices'] = df['AgriculturalPractices'] / df['AgriculturalPractices'].mean() 
    df['all_row_Encroachments'] = df['Encroachments'] / df['Encroachments'].mean() 
    df['all_row_IneffectiveDisasterPreparedness'] = df['IneffectiveDisasterPreparedness'] / df['IneffectiveDisasterPreparedness'].mean()
    df['all_row_DrainageSystems'] = df['DrainageSystems'] / df['DrainageSystems'].mean() 
    df['all_row_CoastalVulnerability'] = df['CoastalVulnerability'] / df['CoastalVulnerability'].mean() 
    df['all_row_Landslides'] = df['Landslides'] / df['Landslides'].mean() 
    df['all_row_Watersheds'] = df['Watersheds'] / df['Watersheds'].mean() 
    df['all_row_DeterioratingInfrastructure'] = df['DeterioratingInfrastructure'] / df['DeterioratingInfrastructure'].mean() 
    df['all_row_PopulationScore'] = df['PopulationScore'] / df['PopulationScore'].mean() 
    df['all_row_WetlandLoss'] = df['WetlandLoss'] / df['WetlandLoss'].mean()
    df['all_row_InadequatePlanning'] = df['InadequatePlanning'] / df['InadequatePlanning'].mean() 
    df['all_row_PoliticalFactors'] = df['PoliticalFactors'] / df['PoliticalFactors'].mean() 
    df['all_row_weather'] = df['weather'] / df['weather'].mean() 
    df['all_row_structure'] = df['structure'] / df['structure'].mean()
    df['all_row_population'] = df['population'] / df['population'].mean() 
    df['all_row_soil_infrastructure'] = df['soil_infrastructure'] / df['soil_infrastructure'].mean() 
    df['all_row_preparedness'] = df['preparedness'] / df['preparedness'].mean() 
    
    df['ClimateImpact'] = df['MonsoonIntensity'] + df['ClimateChange']
    df['AnthropogenicPressure'] = df['Deforestation'] + df['Urbanization'] + df['AgriculturalPractices'] + df['Encroachments']
    df['InfrastructureQuality'] = df['DamsQuality'] + df['DrainageSystems'] + df['DeterioratingInfrastructure']
    df['CoastalVulnerabilityTotal'] = df['CoastalVulnerability'] + df['Landslides']
    df['PreventiveMeasuresEfficiency'] = df['RiverManagement'] + df['IneffectiveDisasterPreparedness'] + df['InadequatePlanning']
    df['EcosystemImpact'] = df['WetlandLoss'] + df['Watersheds']
    df['SocioPoliticalContext'] = df['PopulationScore'] * df['PoliticalFactors']


    df['FloodVulnerabilityIndex'] = (df['AnthropogenicPressure'] + df['InfrastructureQuality'] +
                                     df['CoastalVulnerabilityTotal'] + df['PreventiveMeasuresEfficiency']) / 4
    
    df['PopulationDensityImpact'] = df['PopulationScore'] * (df['Urbanization'] + df['Encroachments'])
    
    df['DeforestationUrbanizationRatio'] = df['Deforestation'] / df['Urbanization']
    
    df['AgriculturalEncroachmentImpact'] = df['AgriculturalPractices'] * df['Encroachments']
    
    df['DamDrainageInteraction'] = df['DamsQuality'] * df['DrainageSystems']
    
    df['LandslideSiltationInteraction'] = df['Landslides'] * df['Siltation']
    
    df['WatershedWetlandRatio'] = df['Watersheds'] / df['WetlandLoss']
    
    df['PoliticalPreparednessInteraction'] = df['PoliticalFactors'] * df['IneffectiveDisasterPreparedness']
    
    
    df['TopographyDrainageSiltation'] = df['TopographyDrainage'] + df['Siltation']
    
    df['ClimateAnthropogenicInteraction'] = df['ClimateImpact'] * df['AnthropogenicPressure']
    
    df['InfrastructurePreventionInteraction'] = df['InfrastructureQuality'] * df['PreventiveMeasuresEfficiency']
    
    df['CoastalEcosystemInteraction'] = df['CoastalVulnerabilityTotal'] * df['EcosystemImpact']

    
    return df

In [21]:
flood_dataset_df = add_manipulated_features(flood_dataset_df)

In [22]:
flood_dataset_df.head(5)

Unnamed: 0,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,Encroachments,...,DeforestationUrbanizationRatio,AgriculturalEncroachmentImpact,DamDrainageInteraction,LandslideSiltationInteraction,WatershedWetlandRatio,PoliticalPreparednessInteraction,TopographyDrainageSiltation,ClimateAnthropogenicInteraction,InfrastructurePreventionInteraction,CoastalEcosystemInteraction
0,5,8,5,8,6,4,4,3,3,4,...,1.333333,12,20,9,1.0,6,11,189,182,60
1,6,7,4,4,8,8,3,5,4,6,...,0.5,24,21,0,1.0,27,12,308,255,12
2,6,5,6,7,3,7,1,5,4,5,...,2.333333,20,7,35,2.5,18,10,247,210,70
3,3,4,6,5,4,8,4,7,6,8,...,1.25,48,8,49,0.8,25,11,253,180,99
4,5,3,2,6,4,4,3,3,3,3,...,1.5,9,6,18,3.0,25,6,144,90,64


In [23]:
flood_dataset_df.to_csv('train_preprocessing.csv', index=False)
del flood_dataset_df

In [24]:
flood_dataset_df = pd.read_csv("/kaggle/working/test_data.csv")
flood_dataset_df = add_manipulated_features(flood_dataset_df)
flood_dataset_df.to_csv('test_data_featured.csv', index=False)

In [25]:
del flood_dataset_df

In [26]:
flood_dataset_df = pd.read_csv("/kaggle/working/train_preprocessing.csv")

In [27]:
flood_dataset_df.shape

(1117957, 75)

In [28]:
flood_dataset_test = pd.read_csv("/kaggle/working/test_data_featured.csv")

In [29]:
flood_dataset_test.shape

(745305, 74)

In [30]:
dataX = flood_dataset_df.drop(['FloodProbability'], axis = 1)
datay = flood_dataset_df['FloodProbability']

In [31]:
X_train, X_test, Y_train, Y_test = train_test_split(dataX, datay, test_size = 0.3, random_state = 42)

In [32]:
from lightgbm import LGBMRegressor
            
lgb_params = {
    'boosting_type': 'gbdt', 
    'n_estimators':2000, 
    'learning_rate' :  0.012,
    'num_leaves' : 250, 
    'subsample_for_bin': 165700, 
    'min_child_samples': 114, 
    'reg_alpha': 2.075e-06, 
    'reg_lambda': 3.839e-07, 
    'colsample_bytree': 0.9634,
    'subsample': 0.9592, 
    'max_depth': 10,
    'random_state':0,
    'verbosity':-1}

lgbm = LGBMRegressor(**lgb_params)

lgbm_model = lgbm.fit(X_train , Y_train)

y_pred_lgbm = lgbm_model.predict(X_test)

lgbm_rmse_calculator = np.sqrt(mean_squared_error(np.log(Y_test), np.log(y_pred_lgbm)))
lgbm_r2_metric = r2_score(Y_test, y_pred_lgbm)

print(f"Light Gradient Boosting RMSE Metric: {lgbm_rmse_calculator}")
print(f"Light Gradient Boosting R-squared Metric: {lgbm_r2_metric}")

Light Gradient Boosting RMSE Metric: 0.03744476510470013
Light Gradient Boosting R-squared Metric: 0.8689446496322025


In [33]:
testddata = pd.read_csv("/kaggle/working/test_data_featured.csv")

input_test = pd.read_csv('/kaggle/input/playground-series-s4e5/test.csv')

id_no = input_test['id']
test_pred_lgbm = lgbm_model.predict(testddata)

submission = pd.DataFrame({
    'id': id_no,
    'FloodProbability': test_pred_lgbm
})

print(submission.head())
submission.to_csv('/kaggle/working/submission.csv', index=False)

        id  FloodProbability
0  1117957          0.577754
1  1117958          0.458490
2  1117959          0.452747
3  1117960          0.465340
4  1117961          0.465006
