## 학습용 데이터 만들기
 - 1. 침수데이터 결측치 해결
 - 2. 침수데이터와 비침수 데이터 합치기
 - 3. 필요한 데이터만 뽑아내기
 - 4. 가중치 주기
 - 5. 학습용 데이터 저장

In [None]:
# A weighted dataset has already been built, but to show how the training data is produced this notebook starts from the data before weighting.

In [154]:
import pandas as pd
import numpy as np

In [155]:
# Load the table that is later joined onto both rain tables through its ZONE column.
# NOTE(review): this path starts with '../Data/' while the two rain CSVs are read from
# '../data/'; on a case-sensitive filesystem these are different directories - confirm.
sig_info = pd.read_csv('../Data/processing_data/2nd/sig_info.csv')

In [156]:
# Load the flood-event rain table; the cells below read its ZONE, SEQ, HOUR_RAINFALL
# and FLUD_DPWT columns.
BF_R_P = pd.read_csv('../data/processing_data/2nd/busan_flood_geo_rain.csv')

In [157]:
# Attach the sig_info columns to every flood row, keeping only ZONE values present in both tables.
BF_R_P = BF_R_P.merge(sig_info, on='ZONE', how='inner')

In [158]:
# Start with an all-missing FLOOD label; only rows labelled by the imputation loop get 0/1.
BF_R_P = BF_R_P.assign(FLOOD=np.nan)

In [159]:
# Distinct event identifiers, in order of first appearance.
SEQ_list = pd.unique(BF_R_P['SEQ'])

In [160]:
# Fill in the missing flood depth ("FLUD_DPWT") and derive the flood label ("FLOOD")
# for every event (SEQ).
#
# For each event the row with the heaviest hourly rainfall is the end of a ramp, and
# the closest earlier row whose rainfall is below 5 is its start. The depth is set to
# 0 at the start and interpolated linearly up to the depth stored on the end row.
# Intermediate rows are labelled FLOOD = 1 once the interpolated depth reaches 0.2.
#
# The walk relies on BF_R_P having consecutive integer row labels (it does
# `start_idx -= 1` and looks the result up with `.loc`).
for seq in SEQ_list:
    # Hourly rainfall of the rows that belong to this event (computed once).
    event_rain = BF_R_P.loc[BF_R_P["SEQ"] == seq, "HOUR_RAINFALL"]

    # The row with the event's maximum hourly rainfall is the end of the ramp.
    end_idx = event_rain.idxmax()

    # Smallest row label of the event: the backward walk must never go below it,
    # otherwise it would overwrite rows of another event or look up a label that
    # does not exist (KeyError).
    first_idx = event_rain.index.min()

    # The start begins at the end row and is moved backwards one row at a time.
    start_idx = end_idx

    # Depth recorded on the peak-rainfall row; used as the top of the ramp.
    FLUD_DPWT_max = BF_R_P.loc[end_idx, "FLUD_DPWT"]

    # Rainfall of the row currently inspected; starts at the event maximum.
    time_rain = event_rain.max()

    # Walk back until a row with rainfall below 5 is found, or until the first row
    # of the event is reached.
    while time_rain >= 5 and start_idx > first_idx:
        start_idx -= 1
        time_rain = BF_R_P.loc[start_idx, "HOUR_RAINFALL"]

    # Number of steps between the zero-depth row and the peak row.
    n = end_idx - start_idx

    # The ramp starts from a depth of 0.
    # NOTE: when start_idx == end_idx this also zeroes the depth of the peak row,
    # exactly as the original code did.
    BF_R_P.loc[start_idx, "FLUD_DPWT"] = 0

    # The two ends of the ramp have fixed labels: not flooded at the start, flooded
    # at the peak. If both are the same row, the later assignment (1) wins.
    BF_R_P.loc[start_idx, "FLOOD"] = 0
    BF_R_P.loc[end_idx, "FLOOD"] = 1

    # Linear interpolation of the depth for the rows strictly between the two ends.
    for i in range(1, n):
        FLUD_DPWT = (FLUD_DPWT_max / n) * i
        BF_R_P.loc[start_idx + i, "FLUD_DPWT"] = FLUD_DPWT
        # An interpolated depth of at least 0.2 counts as flooded.
        if FLUD_DPWT >= 0.2:
            BF_R_P.loc[start_idx + i, "FLOOD"] = 1
        else:
            BF_R_P.loc[start_idx + i, "FLOOD"] = 0


In [161]:
# Load the non-flood rain table; it is joined with sig_info on ZONE in the next cell.
BUnF = pd.read_csv('../data/processing_data/2nd/busan_unflood_geo_rain.csv')

In [162]:
# Attach the sig_info columns to every non-flood row, keeping only ZONE values present in both tables.
BUnF = BUnF.merge(sig_info, on='ZONE', how='inner')

In [163]:
# Keep only the rows that received a FLOOD label (0 or 1); rows still NaN are dropped.
Flood_Exist = BF_R_P.loc[BF_R_P['FLOOD'].ge(0)]

In [164]:
# Pick the non-flood rows whose row labels equal the labels of the flooded (FLOOD == 1) rows.
# NOTE(review): this matches the two tables purely by row label, so it presumably relies on
# BUnF having at least those labels - confirm that this pairing is intended.
Unf_match = BUnF.loc[Flood_Exist.index[(Flood_Exist['FLOOD'] == 1).to_numpy()]]

In [165]:
# Stack the labelled flood rows on top of the matched non-flood rows.
training_data = pd.concat(objs=[Flood_Exist, Unf_match], axis=0)

In [166]:
# Keep only the feature columns and the FLOOD label.
training_data = training_data.loc[
    :,
    ['PUMP_RATIO', 'HOUR_RAINFALL', 'IMP_SUR_RATIO', 'SLOPE_AVG', 'HIGH', 'F_WEIGHT', 'FLOOD'],
]

In [167]:
# Renumber the rows 0..n-1 and discard the old labels.
training_data = training_data.reset_index(drop=True)

In [168]:
# Empty frame that will collect one weighted column per feature.
weight_data = pd.DataFrame()

In [169]:
# Seed every weight column with a copy of the raw feature it is derived from.
for weight_col, source_col in (
    ('Hourly_Rainfall_Weight', 'HOUR_RAINFALL'),
    ('Impervious_Surface_Weight', 'IMP_SUR_RATIO'),
    ('SLOPE_AVG_Weight', 'SLOPE_AVG'),
    ('HIGH_Weight', 'HIGH'),
):
    weight_data[weight_col] = training_data[source_col].copy()

In [170]:
# Weights

## 강우량 가중치

In [171]:
# Piecewise weight for hourly rainfall: the raw value is divided by a bin-specific
# constant. Bins are (<=15], (15, 35], (35, 60], (60, 80] and (>80).
#
# Computed for the whole column at once and assigned as a full column, instead of
# the chained `weight_data[col][idx] = ...` form, which pandas warns about and which
# does not update the frame when Copy-on-Write is enabled.
rain = training_data['HOUR_RAINFALL']
weight_data['Hourly_Rainfall_Weight'] = pd.Series(
    np.select(
        # np.select takes the first matching condition, so the cumulative upper
        # bounds reproduce the if/elif chain.
        [rain <= 15, rain <= 35, rain <= 60, rain <= 80, rain > 80],
        [
            rain / (46),
            rain / (18 * 2**2),
            rain / (17 * 3**2),
            rain / (11 * 4**2),
            rain / (6.28 * 5**2),
        ],
        # Values that match no bin (NaN) keep the raw value, as before.
        default=rain,
    ),
    index=rain.index,
)

## 불투수면 가중치

In [172]:
# Piecewise weight for the impervious-surface ratio: the raw value is divided by a
# bin-specific constant. Bins are (<=15], (15, 25], (25, 40], (40, 55] and (>55).
#
# Computed for the whole column at once and assigned as a full column, instead of
# the chained `weight_data[col][idx] = ...` form, which pandas warns about and which
# does not update the frame when Copy-on-Write is enabled.
imp_ratio = training_data['IMP_SUR_RATIO']
weight_data['Impervious_Surface_Weight'] = pd.Series(
    np.select(
        # np.select takes the first matching condition, so the cumulative upper
        # bounds reproduce the if/elif chain.
        [imp_ratio <= 15, imp_ratio <= 25, imp_ratio <= 40, imp_ratio <= 55, imp_ratio > 55],
        [
            imp_ratio / 6.66,
            imp_ratio / (20 * 2),
            imp_ratio / (13.33 * 3),
            imp_ratio / (33.33 * 4),
            imp_ratio / (26.67 * 5),
        ],
        # Values that match no bin (NaN) keep the raw value, as before.
        default=imp_ratio,
    ),
    index=imp_ratio.index,
)

## 경사도 가중치

In [173]:
# Piecewise weight for the average slope: the raw value is divided by a bin-specific
# constant. Bins are (<=10], (10, 15], (15, 25], (25, 50] and (>50).
#
# Computed for the whole column at once and assigned as a full column, instead of
# the chained `weight_data[col][idx] = ...` form, which pandas warns about and which
# does not update the frame when Copy-on-Write is enabled.
slope = training_data['SLOPE_AVG']
weight_data['SLOPE_AVG_Weight'] = pd.Series(
    np.select(
        # np.select takes the first matching condition, so the cumulative upper
        # bounds reproduce the if/elif chain.
        [slope <= 10, slope <= 15, slope <= 25, slope <= 50, slope > 50],
        [
            slope / (41.67 / 5**2),
            slope / (25 / 4**2),
            slope / (16.67 / 3**2),
            slope / (8.33 / 2**2),
            slope / (8.33 / 1**2),
        ],
        # Values that match no bin (NaN) keep the raw value, as before.
        default=slope,
    ),
    index=slope.index,
)

## 고도 가중치

In [174]:
# Piecewise weight for the HIGH column: the raw value is divided by a bin-specific
# constant. Bins are (<=20], (20, 40], (40, 60], (60, 80] and (>80).
#
# Computed for the whole column at once and assigned as a full column, instead of
# the chained `weight_data[col][idx] = ...` form, which pandas warns about and which
# does not update the frame when Copy-on-Write is enabled.
high = training_data['HIGH']
weight_data['HIGH_Weight'] = pd.Series(
    np.select(
        # np.select takes the first matching condition, so the cumulative upper
        # bounds reproduce the if/elif chain.
        [high <= 20, high <= 40, high <= 60, high <= 80, high > 80],
        [
            high / (79.62 / 1**2),
            high / (10.18 / 2**2),
            high / (4.63 / 3**2),
            high / (2.78 / 4**2),
            high / (2.78 / 5**2),
        ],
        # Values that match no bin (NaN) keep the raw value, as before.
        default=high,
    ),
    index=high.index,
)

In [175]:
# Append the weight columns to the right of the raw feature columns.
training_data = pd.concat((training_data, weight_data), axis='columns')

In [176]:
training_data

Unnamed: 0,PUMP_RATIO,HOUR_RAINFALL,IMP_SUR_RATIO,SLOPE_AVG,HIGH,F_WEIGHT,FLOOD,Hourly_Rainfall_Weight,Impervious_Surface_Weight,SLOPE_AVG_Weight,HIGH_Weight
0,4.115226e-07,4.8,62.32,1.0,3.66,0.075019,0.0,0.104348,0.467342,0.599952,0.045968
1,4.115226e-07,9.6,62.32,1.0,3.66,0.075019,0.0,0.208696,0.467342,0.599952,0.045968
2,4.115226e-07,18.7,62.32,1.0,3.66,0.075019,1.0,0.259722,0.467342,0.599952,0.045968
3,4.115226e-07,0.5,62.32,4.5,5.75,0.075019,0.0,0.010870,0.467342,2.699784,0.072218
4,4.115226e-07,5.0,62.32,4.5,5.75,0.075019,1.0,0.108696,0.467342,2.699784,0.072218
...,...,...,...,...,...,...,...,...,...,...,...
791,1.364381e-08,90.0,7.70,11.0,59.65,0.016667,,0.573248,1.156156,7.040000,115.950324
792,1.364381e-08,90.0,7.70,45.0,33.49,0.016667,,0.573248,1.156156,21.608643,13.159136
793,1.364381e-08,90.0,7.70,22.5,5.32,0.016667,,0.573248,1.156156,12.147570,0.066817
794,1.364381e-08,90.0,7.70,45.0,4.92,0.016667,,0.573248,1.156156,21.608643,0.061794


In [177]:
# Keep the weighted features (in place of the raw ones) together with PUMP_RATIO,
# F_WEIGHT and the FLOOD label.
training_data = training_data.loc[
    :,
    [
        'PUMP_RATIO',
        'Hourly_Rainfall_Weight',
        'Impervious_Surface_Weight',
        'SLOPE_AVG_Weight',
        'HIGH_Weight',
        'F_WEIGHT',
        'FLOOD',
    ],
]

In [178]:
# Replace every remaining NaN with 0. This applies to all columns, not only FLOOD
# (the matched non-flood rows have no FLOOD value after the concat).
training_data = training_data.fillna(value=0)

In [179]:
# Store the label as an integer (0 / 1) instead of a float.
training_data = training_data.astype({'FLOOD': int})

In [180]:
# Renumber the rows 0..n-1 before saving.
training_data = training_data.reset_index(drop=True)

In [181]:
# Write the final training table as UTF-8 CSV without the row index.
# NOTE(review): this path starts with '../Data/' while the rain CSVs are read from
# '../data/'; on a case-sensitive filesystem these are different directories - confirm.
training_data.to_csv('../Data/final_data/training/geo_data/geo_training_data.csv', encoding='utf-8',index=False)

In [182]:
training_data

Unnamed: 0,PUMP_RATIO,Hourly_Rainfall_Weight,Impervious_Surface_Weight,SLOPE_AVG_Weight,HIGH_Weight,F_WEIGHT,FLOOD
0,4.115226e-07,0.104348,0.467342,0.599952,0.045968,0.075019,0
1,4.115226e-07,0.208696,0.467342,0.599952,0.045968,0.075019,0
2,4.115226e-07,0.259722,0.467342,0.599952,0.045968,0.075019,1
3,4.115226e-07,0.010870,0.467342,2.699784,0.072218,0.075019,0
4,4.115226e-07,0.108696,0.467342,2.699784,0.072218,0.075019,1
...,...,...,...,...,...,...,...
791,1.364381e-08,0.573248,1.156156,7.040000,115.950324,0.016667,0
792,1.364381e-08,0.573248,1.156156,21.608643,13.159136,0.016667,0
793,1.364381e-08,0.573248,1.156156,12.147570,0.066817,0.016667,0
794,1.364381e-08,0.573248,1.156156,21.608643,0.061794,0.016667,0
