In [1]:
import numpy      as np
import pandas     as pd
import seaborn    as sns
from sklearn.model_selection   import train_test_split
import matplotlib.pyplot as plt
import missingno as msno

In [2]:
# Load the male and female hospital CSVs (both files are read as cp949).
hospital_male_frm, hospital_female_frm = (
    pd.read_csv(csv_path, encoding='cp949')
    for csv_path in ("./hospital_male_test.csv", "./hospital_female_test.csv")
)

In [3]:
# Difference between the `avg_rhm` and `min_rhm` columns, added to both frames.
# (An earlier variant stored twice this difference as "rhm_min_max_dif"; it is no longer used.)
for frm in (hospital_male_frm, hospital_female_frm):
    frm["rhm_min_avg_dif"] = frm["avg_rhm"] - frm["min_rhm"]

In [4]:
# `max_ta` minus `min_ta` for every row, stored as 'min_max_Ta' in both frames.
for frm in (hospital_male_frm, hospital_female_frm):
    frm['min_max_Ta'] = frm['max_ta'].sub(frm['min_ta'])

In [5]:
# `max_ps` minus `min_ps` for every row, stored as 'min_max_ps' in both frames.
for frm in (hospital_male_frm, hospital_female_frm):
    frm['min_max_ps'] = frm['max_ps'].sub(frm['min_ps'])

In [6]:
# Patient occurrence rate relative to the regional population.
# The ratio is scaled by 10**6 because metropolitan-city / province populations
# come in units of millions, which makes the values easier to read during EDA.
PER_MILLION = 10**6
for frm in (hospital_male_frm, hospital_female_frm):
    frm["D/R"] = (frm["frequency"] / frm["total_pop"]) * PER_MILLION

In [7]:
# Show area, date and the D/R rate: female frame first, then the male frame.
preview_cols = ["area", "tma", "D/R"]
display(hospital_female_frm.loc[:, preview_cols])
print()
display(hospital_male_frm.loc[:, preview_cols])

Unnamed: 0,area,tma,D/R
0,강원,2011-12-01,3.930972
1,경기,2011-12-01,0.844301
2,경남,2011-12-01,0.608269
3,경북,2011-12-01,0.744342
4,광주,2011-12-01,2.710549
...,...,...,...
31581,전남,2016-12-31,
31582,전북,2016-12-31,
31583,제주,2016-12-31,
31584,충남,2016-12-31,





Unnamed: 0,area,tma,D/R
0,강원,2011-12-01,3.879588
1,경기,2011-12-01,0.664965
2,경남,2011-12-01,1.201378
3,경북,2011-12-01,4.425670
4,광주,2011-12-01,0.000000
...,...,...,...
31581,전남,2016-12-31,
31582,전북,2016-12-31,
31583,제주,2016-12-31,
31584,충남,2016-12-31,


In [8]:
# Check the days on which no patients occurred (D/R at or below 0.001).
fail1_frm = hospital_male_frm.loc[hospital_male_frm["D/R"].le(0.001)]
fail2_frm = hospital_female_frm.loc[hospital_female_frm["D/R"].le(0.001)]

# Blank line, then the D/R column of each filtered frame (male first, then female).
for zero_rate_frm in (fail1_frm, fail2_frm):
    print()
    display(zero_rate_frm[["D/R"]])




Unnamed: 0,D/R
4,0.0
9,0.0
10,0.0
14,0.0
15,0.0
...,...
24829,0.0
24830,0.0
24832,0.0
24834,0.0





Unnamed: 0,D/R
5,0.0
9,0.0
10,0.0
14,0.0
17,0.0
...,...
24831,0.0
24833,0.0
24834,0.0
24835,0.0


In [9]:
# Persist the male frame with the derived columns.
# index=False keeps the row index out of the file; otherwise a plain read_csv
# of this file brings it back as a spurious "Unnamed: 0" data column.
hospital_male_frm.to_csv('./hospital_male_0726_test.csv', encoding='cp949', index=False)

In [10]:
# Persist the female frame with the derived columns.
# index=False keeps the row index out of the file; otherwise a plain read_csv
# of this file brings it back as a spurious "Unnamed: 0" data column.
hospital_female_frm.to_csv('./hospital_female_0726_test.csv', encoding='cp949', index=False)

---
## 한파/폭염 컬럼 추가 (컬럼 2개)

- 주의보, 경보 : 각각 나눠서 0(정상),1(주의보),2(경보)
- 한파 : 전날, 당일, 다음날 >  당일에 기입
    - 주의보(1)
        - 최저기온 차 전날 10 이상 & 당일 3도 이하 >> 아래 조건에 영하 12도 있으므로 아래 조건으로 대체
        - 최저기온 오늘, 내일 영하 12도 이하
    - 경보(2)
        - 최저기온 차 전날 15 이상 & 당일 3도 이하 >> 아래 조건에 영하 12도 있으므로 아래 조건으로 대체
        - 최저기온 오늘, 내일 영하 15도 이하

- 폭염 : 전날, 당일 > 당일에 기입
    - 주의보(1)
        - 최고기온 어제, 오늘 33도 이상
    - 경보(2)
        - 최고기온 어제, 오늘 35도 이상



In [12]:
# Reload the intermediate male/female files.
# If a file was written together with its row index, read_csv exposes that
# index as an "Unnamed: 0" column. Drop it when present (errors="ignore" makes
# this a no-op otherwise) so it does not end up as a data column in the final CSV.
male = (
    pd.read_csv("./hospital_male_0726_test.csv", encoding='cp949')
    .drop(columns="Unnamed: 0", errors="ignore")
)
female = (
    pd.read_csv("./hospital_female_0726_test.csv", encoding='cp949')
    .drop(columns="Unnamed: 0", errors="ignore")
)

In [13]:
# Convert `tma` to datetime and reorder the rows by date with a fresh 0..n-1
# index; the shift() calls further down rely on this date ordering.
male = (
    male.assign(tma=pd.to_datetime(male["tma"]))
    .sort_values(by="tma")
    .reset_index(drop=True)
)
female = (
    female.assign(tma=pd.to_datetime(female["tma"]))
    .sort_values(by="tma")
    .reset_index(drop=True)
)

In [14]:
import datetime

# Distinct area labels, taken from the male frame.
area_lst = male["area"].unique()
area_lst

array(['강원', '충북', '충남', '제주', '전북', '인천', '울산', '세종', '전남', '부산', '대전',
       '대구', '광주', '경북', '경남', '경기', '서울'], dtype=object)

## 하루 전·다음 날 기온을 임시 보조 변수(lag/lead)로 생성

In [15]:
# Add each area's previous-day / next-day temperatures as helper columns.
# groupby('area').shift() shifts within every area of the frame in a single
# pass, so the result no longer depends on `area_lst` (built from the male
# frame only) and the frame is not re-scanned once per area.
# Assumes the rows are already sorted by date with one row per area per day,
# so that "one row back" means "one day back" — TODO confirm there are no date gaps.
for gen in [male, female]:
    by_area = gen.groupby('area')

    # Compute all shifted series first, then attach them to the frame.
    prev_day_max = by_area['max_ta'].shift(1)    # previous day's max temperature
    prev_day_min = by_area['min_ta'].shift(1)    # previous day's min temperature
    next_day_min = by_area['min_ta'].shift(-1)   # next day's min temperature

    gen['max_ta_1b'] = prev_day_max
    gen['min_ta_1b'] = prev_day_min
    gen['min_ta_1a'] = next_day_min

## 폭염(heat_wave)

In [16]:
# Heat-wave level per row: 0 = normal, 1 = advisory, 2 = warning.
# A level applies when both the previous day's and the current day's max
# temperature reach its threshold (33 for advisory, 35 for warning).
# Rows with a missing previous-day value compare as False and stay at 0.
for gen in (male, female):
    two_days_ge_33 = gen['max_ta_1b'].ge(33) & gen['max_ta'].ge(33)
    two_days_ge_35 = gen['max_ta_1b'].ge(35) & gen['max_ta'].ge(35)

    gen['heat_wave'] = 0
    gen.loc[two_days_ge_33, 'heat_wave'] = 1
    # Applied last so the warning level overrides the advisory level.
    gen.loc[two_days_ge_35, 'heat_wave'] = 2

## 한파(cold_wave)

In [17]:
# Cold-wave level per row: 0 = normal, 1 = advisory, 2 = warning.
# Two rules feed each level:
#   - sustained cold: today's and the next day's min temperature are both at or
#     below the threshold (-12 advisory, -15 warning);
#   - sudden drop: today's min is at least 10 (advisory) / 15 (warning) degrees
#     lower than the previous day's min, and today's min is at or below 3.
# Comparisons against missing neighbour-day values are False, so those rows
# can only be flagged by the rule whose inputs are present.
# NOTE(review): the markdown spec above says the sudden-drop rule was replaced
# by the sustained-cold rule, but both are applied here — confirm the intent.
for gen in (male, female):
    change_from_prev = gen['min_ta'] - gen['min_ta_1b']
    today_le_3 = gen['min_ta'].le(3)

    advisory = (
        (gen['min_ta'].le(-12) & gen['min_ta_1a'].le(-12))
        | (change_from_prev.le(-10) & today_le_3)
    )
    warning = (
        (gen['min_ta'].le(-15) & gen['min_ta_1a'].le(-15))
        | (change_from_prev.le(-15) & today_le_3)
    )

    gen['cold_wave'] = 0
    gen.loc[advisory, 'cold_wave'] = 1
    # Applied last so the warning level overrides the advisory level.
    gen.loc[warning, 'cold_wave'] = 2
    
    

In [18]:
# Show the first rows of each frame (male first, then female) to check the new columns.
for preview_frm in (male, female):
    display(preview_frm.head())

Unnamed: 0,area,tma,yyyy,mm,dd,weekday,avg_ta,max_ta,min_ta,sum_gsr,...,avg_age,rhm_min_avg_dif,min_max_Ta,min_max_ps,D/R,max_ta_1b,min_ta_1b,min_ta_1a,heat_wave,cold_wave
0,강원,2011-12-01,2011,12,1,3,1.281944,3.573239,-0.271831,0.108493,...,39.3,16.669545,3.84507,2.99375,3.879588,,,-0.347222,0,0
1,충북,2011-12-01,2011,12,1,3,4.775,7.370833,2.195833,0.0,...,38.0,15.830769,5.175,2.8,5.076265,,,-0.433333,0,0
2,충남,2011-12-01,2011,12,1,3,6.2875,8.8375,4.571875,0.173437,...,38.6,12.15,4.265625,2.62,0.0,,,3.1125,0,0
3,제주,2011-12-01,2011,12,1,3,9.314286,11.547619,8.071429,0.0,...,36.9,6.544444,3.47619,2.5,0.0,,,7.838095,0,0
4,전북,2011-12-01,2011,12,1,3,6.02069,8.4,4.282759,0.306667,...,38.9,14.627273,4.117241,2.6,2.141323,,,2.914286,0,0


Unnamed: 0,area,tma,yyyy,mm,dd,weekday,avg_ta,max_ta,min_ta,sum_gsr,...,avg_age,rhm_min_avg_dif,min_max_Ta,min_max_ps,D/R,max_ta_1b,min_ta_1b,min_ta_1a,heat_wave,cold_wave
0,강원,2011-12-01,2011,12,1,3,1.281944,3.573239,-0.271831,0.108493,...,42.3,16.669545,3.84507,2.99375,3.930972,,,-0.347222,0,0
1,충북,2011-12-01,2011,12,1,3,4.775,7.370833,2.195833,0.0,...,40.8,15.830769,5.175,2.8,3.871357,,,-0.433333,0,0
2,충남,2011-12-01,2011,12,1,3,6.2875,8.8375,4.571875,0.173437,...,41.5,12.15,4.265625,2.62,0.963139,,,3.1125,0,0
3,제주,2011-12-01,2011,12,1,3,9.314286,11.547619,8.071429,0.0,...,40.0,6.544444,3.47619,2.5,0.0,,,7.838095,0,0
4,전북,2011-12-01,2011,12,1,3,6.02069,8.4,4.282759,0.306667,...,42.2,14.627273,4.117241,2.6,2.127594,,,2.914286,0,0


다음 날 데이터가 없는 2016년 12월 31일의 경우, 전 지역의 min_ta 값이 모두 영하 12도 이상이므로 해당 변수에서 '정상'에 해당한다. 따라서 0으로 처리한다.

In [19]:
# Last 20 rows of the female frame that fall in December 2016.
is_dec_2016 = female['yyyy'].eq(2016) & female['mm'].eq(12)
female.loc[is_dec_2016].tail(20)

Unnamed: 0,area,tma,yyyy,mm,dd,weekday,avg_ta,max_ta,min_ta,sum_gsr,...,avg_age,rhm_min_avg_dif,min_max_Ta,min_max_ps,D/R,max_ta_1b,min_ta_1b,min_ta_1a,heat_wave,cold_wave
31566,경기,2016-12-30,2016,12,30,4,-4.336364,1.578182,-10.961818,0.0,...,40.1,29.308571,12.54,4.416667,,-0.141818,-7.458182,-4.921818,0,0
31567,강원,2016-12-30,2016,12,30,4,-4.837838,0.905405,-11.917568,0.0,...,45.0,31.618519,12.822973,5.166727,,-1.17973,-9.213514,-4.525676,0,0
31568,서울,2016-12-30,2016,12,30,4,-2.853571,1.432143,-8.078571,0.0,...,42.0,21.1,9.510714,4.0,,-0.635714,-5.996429,-2.446429,0,0
31569,제주,2016-12-31,2016,12,31,5,3.997143,7.994286,0.062857,0.0,...,41.9,22.135484,7.931429,3.046667,,7.08,-1.088571,,0,0
31570,전북,2016-12-31,2016,12,31,5,1.012903,5.864516,-3.435484,0.820968,...,44.7,15.777778,9.3,3.466667,,3.890323,-7.783871,,0,0
31571,전남,2016-12-31,2016,12,31,5,2.539189,7.959459,-2.978378,0.0,...,46.4,21.753535,10.937838,2.573333,,5.682432,-3.641892,,0,0
31572,인천,2016-12-31,2016,12,31,5,2.645,6.77,-1.005,0.0,...,40.8,15.008333,7.775,2.74375,,2.5,-7.085,,0,0
31573,울산,2016-12-31,2016,12,31,5,3.928571,8.685714,-0.585714,0.0,...,39.9,17.866667,9.271429,3.1,,7.157143,-3.3,,0,0
31574,세종,2016-12-31,2016,12,31,5,0.6,4.8,-2.3,0.0,...,37.5,14.7,7.1,3.2,,3.4,-10.04,,0,0
31575,서울,2016-12-31,2016,12,31,5,1.307143,5.425,-2.446429,0.0,...,42.0,13.335294,7.871429,2.8,,1.432143,-8.078571,,0,0


## 임시 보조 변수 drop (max_ta_1b, min_ta_1b, min_ta_1a)

In [20]:
# Remove the shifted neighbour-day temperature columns from both frames, in place.
helper_cols = ['max_ta_1b', 'min_ta_1b', 'min_ta_1a']
for gen in (male, female):
    gen.drop(columns=helper_cols, inplace=True)

## csv 파일로 저장

In [21]:
# Write both final frames as cp949 CSV files, without the row index.
final_outputs = (
    (male, "./hospital_male_0729_test.csv"),
    (female, "./hospital_female_0729_test.csv"),
)
for out_frm, out_path in final_outputs:
    out_frm.to_csv(out_path, encoding='cp949', index=False)