In [1]:
# Mount Google Drive (Colab only) so files under /content/drive/ can be read and written.
# NOTE(review): the saved output of this cell shows "MessageError: ignored",
# i.e. the mount did not succeed in the recorded run.
from google.colab import drive
drive.mount('/content/drive/')

# 라이브러리
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

import statsmodels.graphics.tsaplots as sgt

import warnings
warnings.filterwarnings('ignore')

# 폰트 깨짐 방지
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# Font installation commands, left disabled; NOTE(review): presumably they must
# be run once per fresh runtime before the font below is available - confirm.
# !sudo apt-get install -y fonts-nanum
# !sudo fc-cache -fv
# !rm ~/.cache/matplotlib -rf

# Use NanumBarunGothic as the matplotlib font family.
plt.rc('font', family='NanumBarunGothic')

MessageError: ignored

# 서리 데이터 전처리

In [None]:
# Load the raw frost CSV (EUC-KR encoded), give the four frost columns
# descriptive names, and drop the row labelled 0.
# NOTE(review): row 0 is presumably a secondary header row - confirm against the file.
data = (
    pd.read_csv(
        '/content/drive/MyDrive/삼성화재&Postech 데이터 리스크 /데이터/서리.csv',
        encoding='EUC-KR',
    )
    .rename(
        columns={
            '서리': '서리시작',
            '서리.1': '서리시작(평비)',
            '서리.2': '서리끝',
            '서리.3': '서리끝(평비)',
        }
    )
    .drop(index=0)
)
data

Unnamed: 0,지점,년도,서리시작,서리시작(평비),서리끝,서리끝(평비)
1,강릉,1988.0,1988-12-30,,1989-03-26,
2,강릉,1989.0,1989-12-13,,1990-03-26,
3,강릉,1990.0,1990-12-06,,1991-04-02,
4,강릉,1991.0,1991-12-05,,1992-03-19,
5,강릉,1992.0,1992-11-02,,1993-04-12,
...,...,...,...,...,...,...
2013,충주,2011.0,2011-10-18,-4일,2012-04-08,-5일
2014,충주,2012.0,2012-10-18,-4일,2013-04-13,0일
2015,충주,2013.0,2013-10-26,4일,2014-04-06,-7일
2016,충주,2014.0,2014-10-17,-5일,,


#### 결측값 처리

In [None]:
# Check missing values per column
data.isna().sum()   # --> reports zero although the data does contain missing values

지점          0
년도          0
서리시작        0
서리시작(평비)    0
서리끝         0
서리끝(평비)     0
dtype: int64

In [None]:
# List the stations present in the frost data and count them
print(data['지점'].unique())
print(data['지점'].nunique(), "개")

['강릉' '대관령' '동해' '북강릉' '북춘천' '삼척' '속초' '영월' '원주' '인제' '철원' '춘천' '태백' '홍천'
 '동두천' '수원' '양평' '이천' '파주' '거제' '거창' '남해' '밀양' '북창원' '산청' '진주' '창원' '통영'
 '합천' '구미' '문경' '봉화' '상주' '안동' '영덕' '영주' '영천' '울릉도' '울진' '의성' '포항' '광주'
 '대구' '대구(기)' '대전' '부산' '서울' '울산' '강화' '백령도' '인천' '고흥' '목포' '무안' '순천' '여수'
 '완도' '장흥' '주암' '진도(첨찰산)' '해남' '흑산도' '고창' '고창군' '군산' '남원' '부안' '임실' '장수'
 '전주' '정읍' '고산' '서귀포' '성산' '성산포' '제주' '금산' '보령' '부여' '서산' '천안' '홍성' '보은'
 '제천' '청주' '추풍령' '충주']
87 개


In [None]:
# Convert the textual missing-value placeholders to NaN, column by column:
# date columns use ' ' or '결측', the deviation (평비) columns use ' ' or '―'.
# The result is assigned back rather than calling replace(..., inplace=True) on
# a column selection: that chained in-place form operates on an intermediate
# object, is deprecated, and leaves `data` unchanged under pandas copy-on-write.
data['서리시작'] = data['서리시작'].replace([' ', '결측'], np.nan)
data['서리시작(평비)'] = data['서리시작(평비)'].replace([' ', '―'], np.nan)
data['서리끝'] = data['서리끝'].replace([' ', '결측'], np.nan)
data['서리끝(평비)'] = data['서리끝(평비)'].replace([' ', '―'], np.nan)

In [None]:
# Re-check missing values per column now that the placeholders are NaN
data.isna().sum()

지점            0
년도            0
서리시작         12
서리시작(평비)    151
서리끝          62
서리끝(평비)     200
dtype: int64

In [None]:
# Remove rows whose frost start or frost end is recorded as '관측 안됨'
# ("not observed"); they are considered unnecessary for the analysis.
#  - '관측 안됨': the phenomenon did not occur
#    (https://data.kma.go.kr/data/seasonObs/seasonObsDataList.do?pgmNo=648)
# Rows where the value is NaN are kept (NaN != '관측 안됨' evaluates to True).
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not write into a slice of the unfiltered frame.
data = data[
    (data['서리시작'] != '관측 안됨') & (data['서리끝'] != '관측 안됨')
].copy()

In [None]:
# Display the frame after the placeholder clean-up and the '관측 안됨' row filter
data

Unnamed: 0,지점,년도,서리시작,서리시작(평비),서리끝,서리끝(평비)
1,강릉,1988.0,1988-12-30,,1989-03-26,
2,강릉,1989.0,1989-12-13,,1990-03-26,
3,강릉,1990.0,1990-12-06,,1991-04-02,
4,강릉,1991.0,1991-12-05,,1992-03-19,
5,강릉,1992.0,1992-11-02,,1993-04-12,
...,...,...,...,...,...,...
2013,충주,2011.0,2011-10-18,-4일,2012-04-08,-5일
2014,충주,2012.0,2012-10-18,-4일,2013-04-13,0일
2015,충주,2013.0,2013-10-26,4일,2014-04-06,-7일
2016,충주,2014.0,2014-10-17,-5일,,


In [None]:
# Inspect the column dtypes before converting them to appropriate types
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1985 entries, 1 to 2017
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   지점        1985 non-null   object 
 1   년도        1985 non-null   float64
 2   서리시작      1973 non-null   object 
 3   서리시작(평비)  1864 non-null   object 
 4   서리끝       1926 non-null   object 
 5   서리끝(평비)   1817 non-null   object 
dtypes: float64(1), object(5)
memory usage: 108.6+ KB


In [None]:
# Convert the date columns to datetime64 and the deviation (평비) columns to
# numbers. The deviations are strings such as '-4일': the '일' suffix is stripped
# and the whole column is parsed in one vectorised pd.to_numeric call (values
# that cannot be parsed become NaN) instead of one call per element via apply.
# assign() builds a new frame, so nothing is written into a slice of an earlier frame.
data = data.assign(**{
    '서리시작': pd.to_datetime(data['서리시작']),
    '서리시작(평비)': pd.to_numeric(
        data['서리시작(평비)'].str.replace('일', '', regex=False), errors='coerce'
    ),
    '서리끝': pd.to_datetime(data['서리끝']),
    '서리끝(평비)': pd.to_numeric(
        data['서리끝(평비)'].str.replace('일', '', regex=False), errors='coerce'
    ),
})

In [None]:
# Re-check the dtypes after the datetime / numeric conversion
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1985 entries, 1 to 2017
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   지점        1985 non-null   object        
 1   년도        1985 non-null   float64       
 2   서리시작      1973 non-null   datetime64[ns]
 3   서리시작(평비)  1864 non-null   float64       
 4   서리끝       1926 non-null   datetime64[ns]
 5   서리끝(평비)   1817 non-null   float64       
dtypes: datetime64[ns](2), float64(3), object(1)
memory usage: 108.6+ KB


#### 서리끝(평비)만 최종 데이터 프레임으로...

In [None]:
# Keep only the frost-end information: discard both frost-start columns.
data1 = data.drop(columns=['서리시작', '서리시작(평비)'])

In [None]:
# Keep only the rows that have a frost end date (rows with a missing '서리끝' are removed).
data1 = data1[data1['서리끝'].notna()]

평비 결측값 채워넣기

In [None]:
# Day of year of the frost end date (Jan 1 -> 1), stored in column '365'.
# Uses the built-in .dt.dayofyear accessor instead of subtracting Jan 1 of the
# same year and adding one day by hand; cast to int64 to keep the dtype that
# the timedelta-based `.dt.days` computation produced.
data1['365'] = data1['서리끝'].dt.dayofyear.astype('int64')

In [None]:
# Per-station mean of the frost-end day of year, as a two-column frame
# ('지점', '평균값').
mean_values = (
    data1.groupby('지점', as_index=False)['365']
    .mean()
    .rename(columns={'365': '평균값'})
)

In [None]:
# Display the per-station mean of the '365' (day-of-year) column
mean_values

Unnamed: 0,지점,평균값
0,강릉,81.550000
1,강화,91.300000
2,거제,84.500000
3,거창,113.230769
4,고산,133.857143
...,...,...
82,합천,93.736842
83,해남,108.700000
84,홍성,95.500000
85,홍천,105.368421


In [None]:
# Fill missing frost-end deviations ('서리끝(평비)') with the difference between
# the row's day of year and its station's mean day of year.
# Helper columns left over from a previous run of this cell are dropped first,
# so re-running does not produce '평균값_x' / '평균값_y' and fail on the next line.
data1 = pd.merge(
    data1.drop(columns=['평균값', '임시_서리끝(평비))'], errors='ignore'),
    mean_values,
    on='지점',
    how='left',
)
data1['임시_서리끝(평비))'] = data1['365'] - data1['평균값']
# Assign the result back: fillna(..., inplace=True) on a column selection works
# on an intermediate object and does not update `data1` under pandas copy-on-write.
# NOTE(review): the imputed values are deviations from this dataset's own
# per-station mean, and the output shows they differ from the provided values
# where both exist (e.g. 4.00 vs 3.15) - confirm the two are comparable.
data1['서리끝(평비)'] = data1['서리끝(평비)'].fillna(data1['임시_서리끝(평비))'])

In [None]:
# Inspect the merged frame, including the helper columns used for the imputation
data1

Unnamed: 0,지점,년도,서리끝,서리끝(평비),365,평균값,임시_서리끝(평비))
0,강릉,1988.0,1989-03-26,3.45,85,81.550000,3.450000
1,강릉,1989.0,1990-03-26,3.45,85,81.550000,3.450000
2,강릉,1990.0,1991-04-02,10.45,92,81.550000,10.450000
3,강릉,1991.0,1992-03-19,-2.55,79,81.550000,-2.550000
4,강릉,1992.0,1993-04-12,20.45,102,81.550000,20.450000
...,...,...,...,...,...,...,...
1921,충주,2009.0,2010-04-17,4.00,107,103.846154,3.153846
1922,충주,2010.0,2011-04-13,0.00,103,103.846154,-0.846154
1923,충주,2011.0,2012-04-08,-5.00,99,103.846154,-4.846154
1924,충주,2012.0,2013-04-13,0.00,103,103.846154,-0.846154


In [None]:
# Final frame: remove the two helper columns that were only needed for imputation.
data_final = data1.drop(columns=['평균값', '임시_서리끝(평비))'])

In [None]:
# Confirm that no missing values remain in the final frame
data_final.isna().sum() # all handled

지점         0
년도         0
서리끝        0
서리끝(평비)    0
365        0
dtype: int64

In [None]:
# Write the preprocessed frame to Google Drive as CSV, without the index column
data_final.to_csv('/content/drive/MyDrive/삼성화재&Postech 데이터 리스크 /데이터/서리)1차전처리데이터.csv', index=False)