In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error

from sklearn.ensemble import VotingRegressor, BaggingRegressor, StackingRegressor
from sklearn.linear_model import BayesianRidge, SGDRegressor, Ridge
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import make_pipeline
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sub = pd.read_csv('결과/submission_ensemble.csv')
target = 'FloodProbability'


In [3]:
def feature_engineering(df):
    


    BASE_FEATURES = X_test.columns
    
    df['CombinedUrbanImpact'] = df['Urbanization'] * df['PopulationScore']
    df['EnvironmentalDegradation'] = df['Deforestation'] + df['Siltation'] + df['WetlandLoss']
    df['InfrastructureVulnerability'] = df['DeterioratingInfrastructure'] + df['DrainageSystems'] + df['DamsQuality']
    df['NaturalDisasterRisk'] = df['MonsoonIntensity'] + df['ClimateChange'] + df['Landslides'] + df['CoastalVulnerability']
    df['ManagementEffectiveness'] = df['RiverManagement'] + df['AgriculturalPractices'] + df['Encroachments'] + df['InadequatePlanning'] + df['PoliticalFactors']
    df['Infrastructure_Risk'] = df['DamsQuality'] * df['DrainageSystems']
    df['wet_Risk'] = df['WetlandLoss'] * df['Encroachments']

    df['sum'] = df[BASE_FEATURES].sum(axis=1)
    df['mean'] = df[BASE_FEATURES].mean(axis=1)
    df['std'] = df[BASE_FEATURES].std(axis=1)
    df['max'] = df[BASE_FEATURES].max(axis=1)
    df['min'] = df[BASE_FEATURES].min(axis=1)
    df['median'] = df[BASE_FEATURES].median(axis=1)

    return df

    
    # df['ClimateAnthropogenicInteraction'] = (df['MonsoonIntensity'] + df['ClimateChange']) * (
    #     df['Deforestation'] + df['Urbanization'] + df['AgriculturalPractices'] + df['Encroachments'])
    # df['InfrastructurePreventionInteraction'] = (df['DamsQuality'] + df['DrainageSystems'] + df['DeterioratingInfrastructure']) * (
    #     df['RiverManagement'] + df['IneffectiveDisasterPreparedness'] + df['InadequatePlanning'])
    
    # # 기본 통계량 추가
    # df['sum'] = df.sum(axis=1)
    # df['mean'] = df.mean(axis=1)
    # df['std'] = df.std(axis=1)
    # df['median'] = df.median(axis=1)
    # df['max'] = df.max(axis=1)
    # df['min'] = df.min(axis=1)
    
    # return df

X_train = feature_engineering(X_train)
X_test = feature_engineering(X_test)

In [10]:
# Polynomial Features 생성
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2, interaction_only=True)
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

In [11]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_poly)
X_test_scaled = scaler.transform(X_test_poly)

In [12]:
# 타겟 변수 분리

num_cols = ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors']
target = 'FloodProbability'
X_train = X_train_scaled.drop(columns=[target,'id'])
y_train = X_train_scaled[target]
X_test = X_test_scaled.drop(columns=['id'])

In [12]:
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

# lgbm_params = {
#     'num_leaves': 183,
#     'learning_rate': 0.01183688880802108,
#     'n_estimators': 577,
#     'subsample_for_bin': 165697,
#     'min_child_samples': 114,
#     'reg_alpha': 2.075080888948164e-06,
#     'reg_lambda': 3.838938366471552e-07,
#     'colsample_bytree': 0.9634044234652241,
#     'subsample': 0.9592138618622019,
#     'max_depth': 9
# }

# 모델 학습
gb_model = GradientBoostingRegressor()
xgb_model = xgb.XGBRegressor()
lgb_model = lgb.LGBMRegressor()
cat_model = CatBoostRegressor()
# lgb_model = lgb.LGBMRegressor(**lgbm_params)
# cat_model = CatBoostRegressor(depth=8, learning_rate=0.011277016304363601)


# 각각의 모델 학습
gb_model.fit(X_train, y_train)
xgb_model.fit(X_train, y_train)
lgb_model.fit(X_train, y_train)
cat_model.fit(X_train, y_train)

# 예측
gb_preds = gb_model.predict(X_test)
xgb_preds = xgb_model.predict(X_test)
lgb_preds = lgb_model.predict(X_test)
cat_preds = cat_model.predict(X_test)

# 앙상블
final_preds = (gb_preds + xgb_preds + lgb_preds + cat_preds) / 4

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.098780 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1869
[LightGBM] [Info] Number of data points in the train set: 1117957, number of used features: 34
[LightGBM] [Info] Start training from score 0.504480
Learning rate set to 0.124117
0:	learn: 0.0458489	total: 76.5ms	remaining: 1m 16s
1:	learn: 0.0413605	total: 153ms	remaining: 1m 16s
2:	learn: 0.0375364	total: 231ms	remaining: 1m 16s
3:	learn: 0.0342574	total: 310ms	remaining: 1m 17s
4:	learn: 0.0315188	total: 386ms	remaining: 1m 16s
5:	learn: 0.0291980	total: 463ms	remaining: 1m 16s
6:	learn: 0.0272629	total: 537ms	remaining: 1m 16s
7:	learn: 0.0256435	total: 614ms	remaining: 1m 16s
8:	learn: 0.0243146	total: 692ms	remaining: 1m 16s
9:	learn: 0.0232255	total: 767ms	remaining: 1m 15s
10:	learn: 0.0223522	total: 842ms	remaining: 1m 15s
11:	learn: 0.0216496	total: 921ms	remaining: 1m 15s
12:	learn: 

In [14]:
final_preds

array([0.57704133, 0.45314787, 0.45075744, ..., 0.61966622, 0.54906807,
       0.5263063 ])

In [7]:
from joblib import Parallel, delayed

# 함수 정의
def train_model(model, X_train, y_train):
    model.fit(X_train, y_train)
    return model

# 모델 학습
models = [
    GradientBoostingRegressor(n_estimators=1000, max_depth=8, random_state=35),
    xgb.XGBRegressor(n_estimators=1000, max_depth=8, random_state=35),
    lgb.LGBMRegressor(n_estimators=577, max_depth=9, learning_rate=0.01183688880802108, random_state=35),
    CatBoostRegressor(iterations=1000, depth=8, learning_rate=0.011277016304363601, random_seed=35, verbose=0)
]

# 병렬 처리
trained_models = Parallel(n_jobs=-1)(delayed(train_model)(model, X_train, y_train) for model in models)

# 예측
gb_preds = trained_models[0].predict(X_test)
xgb_preds = trained_models[1].predict(X_test)
lgb_preds = trained_models[2].predict(X_test)
cat_preds = trained_models[3].predict(X_test)

# 앙상블
final_preds2 = (gb_preds + xgb_preds + lgb_preds + cat_preds) / 4



In [14]:
final_preds2

array([0.57350564, 0.45970278, 0.45691026, ..., 0.61461393, 0.55187991,
       0.52135426])

In [15]:
submission = pd.DataFrame({'id': test['id'], 'FloodProbability': final_preds.flatten()})

submission

# CSV 파일로 저장
submission.to_csv('submission_4ensem6.csv', index=False)

In [None]:
# 각 모델의 하이퍼파라미터 그리드를 정의
param_grid_gb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5]
}

param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'min_samples_split': [2, 5]
}

param_grid_xgb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5]
}

param_grid_lgb = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1],
    'max_depth': [3, 5]
}

param_grid_cat = {
    'iterations': [100, 200],
    'learning_rate': [0.01, 0.1],
    'depth': [3, 5]
}

In [98]:
train = pd.read_csv('train.csv')

def simplified_getFeats(df):
    num_cols = ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors']

    # scaler = StandardScaler().fit(df[num_cols])
    # df[num_cols] = scaler.transform(df[num_cols])  # Scale early

    BASE_FEATURES = test.columns
    df = train.copy()
    df['CombinedUrbanImpact'] = df['Urbanization'] * df['PopulationScore']
    df['EnvironmentalDegradation'] = df['Deforestation'] + df['Siltation'] + df['WetlandLoss']
    df['InfrastructureVulnerability'] = df['DeterioratingInfrastructure'] + df['DrainageSystems'] + df['DamsQuality']
    df['NaturalDisasterRisk'] = df['MonsoonIntensity'] + df['ClimateChange'] + df['Landslides'] + df['CoastalVulnerability']
    df['ManagementEffectiveness'] = df['RiverManagement'] + df['AgriculturalPractices'] + df['Encroachments'] + df['InadequatePlanning'] + df['PoliticalFactors']
    df['Infrastructure_Risk'] = df['DamsQuality'] * df['DrainageSystems']
    df['wet_Risk'] = df['WetlandLoss'] * df['Encroachments']
    df['total'] = df[BASE_FEATURES].sum(axis=1)
    df['mean'] = df[BASE_FEATURES].mean(axis=1)
    df['std'] = df[BASE_FEATURES].std(axis=1)
    # df['max'] = df[BASE_FEATURES].max(axis=1)
    # df['min'] = df[BASE_FEATURES].min(axis=1)

    return df




train_simp = simplified_getFeats(train)
test_simp = simplified_getFeats(test)

train_a = train_simp.drop(columns = num_cols)
# train_a = train_a.drop(columns=['FloodProbability', 'id'])

test_a = test_simp.drop(columns = num_cols)

train_a.head()

Unnamed: 0,id,FloodProbability,CombinedUrbanImpact,EnvironmentalDegradation,InfrastructureVulnerability,NaturalDisasterRisk,ManagementEffectiveness,Infrastructure_Risk,wet_Risk,total,mean,std
0,0,0.445,42,16,13,15,22,20,20,94,4.47619,1.990453
1,1,0.45,24,12,15,16,21,21,18,95,4.52381,2.379476
2,2,0.53,24,14,14,23,21,7,10,101,4.809524,1.990453
3,3,0.535,24,17,10,22,32,8,40,107,5.095238,1.670472
4,4,0.415,4,11,9,17,16,6,6,76,3.619048,1.465476


In [87]:
train.describe()

Unnamed: 0,id,MonsoonIntensity,TopographyDrainage,RiverManagement,Deforestation,Urbanization,ClimateChange,DamsQuality,Siltation,AgriculturalPractices,...,DrainageSystems,CoastalVulnerability,Landslides,Watersheds,DeterioratingInfrastructure,PopulationScore,WetlandLoss,InadequatePlanning,PoliticalFactors,FloodProbability
count,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0,...,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0,1117957.0
mean,558978.0,4.92145,4.926671,4.955322,4.94224,4.942517,4.934093,4.955878,4.927791,4.942619,...,4.946893,4.953999,4.931376,4.929032,4.925907,4.92752,4.950859,4.940587,4.939004,0.5044803
std,322726.5,2.056387,2.093879,2.072186,2.051689,2.083391,2.057742,2.083063,2.065992,2.068545,...,2.072333,2.088899,2.078287,2.082395,2.064813,2.074176,2.068696,2.081123,2.09035,0.0510261
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285
25%,279489.0,3.0,3.0,4.0,4.0,3.0,3.0,4.0,3.0,3.0,...,4.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,0.47
50%,558978.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,0.505
75%,838467.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,...,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,6.0,0.54
max,1117956.0,16.0,18.0,16.0,17.0,17.0,17.0,16.0,16.0,16.0,...,17.0,17.0,16.0,16.0,17.0,18.0,19.0,16.0,16.0,0.725


In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# 다시 feature engineering
train = pd.read_csv('train.csv')

def simplified_getFeats(df):
    num_cols = ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement', 'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality', 'Siltation', 'AgriculturalPractices', 'Encroachments', 'IneffectiveDisasterPreparedness', 'DrainageSystems', 'CoastalVulnerability', 'Landslides', 'Watersheds', 'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss', 'InadequatePlanning', 'PoliticalFactors']

    scaler = StandardScaler().fit(df[num_cols])
    df[num_cols] = scaler.transform(df[num_cols])  # Scale early

    # Basic Stats
    df['mean'] = df[num_cols].mean(axis=1)
    df['std'] = df[num_cols].std(axis=1)
    df['max'] = df[num_cols].max(axis=1)
    df['min'] = df[num_cols].min(axis=1)

    # Interaction Features (Simplified)
    df['Climate_Risk'] = df['MonsoonIntensity'] * df['ClimateChange']
    df['Infrastructure_Risk'] = df['DamsQuality'] * df['DrainageSystems']
    df['wet_Risk'] = df['WetlandLoss'] * df['Encroachments']

    return df

train_simp = simplified_getFeats(train)

X = train_simp.drop(columns=['FloodProbability', 'id'])
y = train_simp['FloodProbability']



# 훈련, 검증, 테스트 데이터 나누기
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=35)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=35)

In [11]:
cat_params = {
    'n_estimators':8000,
    'random_state':0,
    'learning_rate': 0.011277016304363601, 
    'depth': 8, 
    'subsample': 0.8675506657380021, 
    'colsample_bylevel': 0.7183884158632279, 
    'min_data_in_leaf': 98,
    'bootstrap_type': 'Bernoulli'
}

model_cat = CatBoostRegressor(**cat_params)
model_cat.fit(X_train,y_train)
y_pred_cat = model_cat.predict(X_val)

print('Catboost r2 score', r2_score(y_val,y_pred_cat) )
print('Catboost MSE', mean_squared_error(y_val, y_pred_cat))


# Catboost r2 score 0.8672054220168302
# Catboost MSE 0.0003471241268770488

# Catboost r2 score 0.8668518127683409
# Catboost MSE 0.00034804845905613

# Catboost r2 score 0.8656126021489724
# Catboost MSE 0.00035128774721682367

# Catboost r2 score 0.8654701928437349
# Catboost MSE 0.0003516600041755826

# Catboost r2 score 0.8655641638038795
# Catboost MSE 0.00035141436472262177

0:	learn: 0.0505374	total: 74.4ms	remaining: 12m 24s
1:	learn: 0.0500696	total: 150ms	remaining: 12m 31s
2:	learn: 0.0496010	total: 225ms	remaining: 12m 28s
3:	learn: 0.0491510	total: 299ms	remaining: 12m 27s
4:	learn: 0.0487072	total: 378ms	remaining: 12m 34s
5:	learn: 0.0482586	total: 455ms	remaining: 12m 37s
6:	learn: 0.0478171	total: 541ms	remaining: 12m 52s
7:	learn: 0.0473771	total: 614ms	remaining: 12m 47s
8:	learn: 0.0469610	total: 686ms	remaining: 12m 41s
9:	learn: 0.0465514	total: 764ms	remaining: 12m 43s
10:	learn: 0.0461403	total: 842ms	remaining: 12m 44s
11:	learn: 0.0457333	total: 922ms	remaining: 12m 47s
12:	learn: 0.0453248	total: 993ms	remaining: 12m 43s
13:	learn: 0.0449224	total: 1.07s	remaining: 12m 42s
14:	learn: 0.0445234	total: 1.15s	remaining: 12m 42s
15:	learn: 0.0441316	total: 1.22s	remaining: 12m 40s
16:	learn: 0.0437591	total: 1.29s	remaining: 12m 40s
17:	learn: 0.0433818	total: 1.38s	remaining: 12m 43s
18:	learn: 0.0430277	total: 1.45s	remaining: 12m 41s
19

In [12]:
sub1 = pd.read_csv('결과/submission_ann.csv')
sub2 = pd.read_csv('결과/submission_ensemble.csv')
sub3 = pd.read_csv('결과/submission_engi_lgb_para.csv')
sub4 = pd.read_csv('결과/submission_stacking.csv')


In [13]:
# 예측값 평균 내기
ensemble_sub = sub1.copy()
ensemble_sub['FloodProbability'] = (sub1['FloodProbability'] + sub2['FloodProbability'] + sub3['FloodProbability'] + sub4['FloodProbability']) / 4

# 앙상블 결과 저장
ensemble_sub.to_csv('submission_ensemble4.csv', index=False)

In [14]:
ensemble_sub.head()

Unnamed: 0,id,FloodProbability
0,1117957,0.577256
1,1117958,0.453612
2,1117959,0.450856
3,1117960,0.468728
4,1117961,0.468857
