# 시계열 데이터 전처리 (결측치 채우기)
---

In [67]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.tsa.api as tsa

## 함수 정리
---

In [3]:
def datetime_convert(data: pd.DataFrame, column: str, split: str = "-") -> pd.DataFrame:
    """Return a copy of ``data`` with ``column`` converted to datetime.

    Args:
        data (pd.DataFrame): Input dataframe; it is copied, not modified.
        column (str): Name of the column holding the date values.
        split (str): Separator used inside the date values (e.g. ``"."`` for
            ``"1997.01"``). Any separator other than ``"-"`` is replaced by
            ``"-"`` before parsing.

    Returns:
        pd.DataFrame: Copy of ``data`` whose ``column`` has been parsed with
        ``pd.to_datetime``.
    """
    data_c = data.copy()

    if split != "-":
        # regex=False makes the separator a literal string. Without it the
        # result depends on the pandas version's default, and a "." separator
        # interpreted as a regex would match every character.
        # NOTE(review): values are stringified as-is, so a float such as
        # 1997.10 is seen here as "1997.1"; load such columns as text.
        data_c[column] = (
            data_c[column].astype(str).str.replace(split, "-", regex=False)
        )

    data_c[column] = pd.to_datetime(data_c[column])
    return data_c

In [37]:
def connect_year_month(data: pd.DataFrame, year: str, month: str, is_num: bool = True) -> pd.DataFrame:
    """Merge separate year and month columns into a single datetime column.

    The combined datetime is stored in the ``year`` column and the ``month``
    column is removed from the result.

    Args:
        data (pd.DataFrame): Input dataframe; it is copied, not modified.
        year (str): Name of the year column.
        month (str): Name of the month column.
        is_num (bool): ``True`` when both columns hold bare numbers. When
            ``False``, a trailing "년" on the year and a trailing "월" on the
            month are stripped before parsing.

    Returns:
        pd.DataFrame: Copy of ``data`` with ``year`` as datetime and without
        the ``month`` column.
    """
    data_c = data.copy()
    year_text = data_c[year].astype(str)
    month_text = data_c[month].astype(str)

    if not is_num:
        # Strip the suffix row by row rather than deciding from the first row
        # only; this also works for empty frames and for columns where only
        # some values carry the suffix.
        year_text = year_text.str.rstrip("년")
        month_text = month_text.str.rstrip("월")

    # Zero-pad so every value has the same "YYYY-MM" shape ("2017-1" and
    # "2017-10" would otherwise differ in layout within one column).
    month_text = month_text.str.zfill(2)

    data_c[year] = pd.to_datetime(year_text + "-" + month_text)
    return data_c.drop(columns=month)

## 데이터 로딩
---

In [33]:
# Load the source files
data_path_1 = "./용도별판매전력량추이_1997_2014.csv"
data_path_2 = "./용도별판매전력량추이_2015_2021.csv"

# 1997-2014 data
# Read "시점" as text: its values look like 1997.01 (YYYY.MM). If pandas
# infers a float, 1997.10 collapses to 1997.1 and is later parsed as January
# instead of October.
df_97_14 = pd.read_csv(data_path_1, encoding="euc-kr", dtype={"시점": str})

# 2015-2021 data
df_15_21_raw = pd.read_csv(data_path_2, encoding="euc-kr")
# Drop the row labelled 0 (presumably a secondary header row - TODO confirm)
df_15_21 = df_15_21_raw.drop(0, axis=0)

# Row masks by year
c1 = df_15_21["시점"] == '2015'
c2 = df_15_21["시점"] == '2016'
c3 = df_15_21["시점"] == '2021'

# Extract the 2015 and 2016 rows separately
# (masks are combined with `|`, the element-wise OR for boolean Series)
df_15_16 = df_15_21[c1 | c2]

# 2017-2020 data: remove the 2015/2016/2021 rows, then the "합계" rows
idx_151621 = list(df_15_21[c1 | c2 | c3].index)
df_17_20 = df_15_21.drop(idx_151621)
idx_sum = list(df_17_20[df_17_20["월별(1)"] == "합계"].index)
df_17_20 = df_17_20.drop(idx_sum)

## 데이터 정제
---

In [7]:
# Preview the first five rows of the 1997-2014 frame
df_97_14.head(5)

Unnamed: 0,시점,소계,가정용,공공용,서비스업,농림어업,광업,제조업
0,1997.01,16224867,2814138,557379,3616542,301811,80421,8854576
1,1997.02,15363840,2788312,558610,3551611,327617,70471,8067220
2,1997.03,16094268,2555048,528672,3333225,345359,81877,9250086
3,1997.04,16266027,2629204,555354,3430624,292235,84596,9274014
4,1997.05,16028072,2525076,511468,3333483,302913,80963,9274169


In [41]:
# Preview the first rows of the 2017-2020 frame
df_17_20.head()

Unnamed: 0,시점,월별(1),합계,가정용,공공용,서비스업 및 기타,농림어업,광업,제조업
4,2017,1월,45196953,5818186,2242732,13615459,1551606,152302,21816668
5,2017,2월,44422400,5908998,2223466,13847567,1664694,143111,20634564
6,2017,3월,42558535,5114346,2055510,11889602,1423129,158768,21917181
7,2017,4월,40811060,5350759,1866996,10995210,1282616,156978,21158500
8,2017,5월,38743894,4948471,1606449,9897405,1051543,153789,21086238


In [55]:
# Unified column names applied to both cleaned frames
elec_col_names = [
    "시점", "합계", "가정용", "공공용",
    "서비스업", "농림어업", "광업", "제조업",
]

# 1997-2014: parse the "YYYY.MM" time column, then rename the columns
df_97_14_pre = datetime_convert(df_97_14, column="시점", split=".")
df_97_14_pre.columns = elec_col_names

# 2017-2020: merge the year and month columns, then rename the columns
df_17_20_pre = connect_year_month(df_17_20, year="시점", month="월별(1)", is_num=False)
df_17_20_pre.columns = elec_col_names

# Stack both periods row-wise with a fresh 0..n-1 index
df_97_20_pre = pd.concat((df_97_14_pre, df_17_20_pre), ignore_index=True)

  data_c[column] = data_c[column].astype(str).str.replace(split, "-")


In [70]:
# Display the combined 1997-2020 frame
df_97_20_pre

Unnamed: 0,시점,합계,가정용,공공용,서비스업,농림어업,광업,제조업
0,1997-01-01,16224867,2814138,557379,3616542,301811,80421,8854576
1,1997-02-01,15363840,2788312,558610,3551611,327617,70471,8067220
2,1997-03-01,16094268,2555048,528672,3333225,345359,81877,9250086
3,1997-04-01,16266027,2629204,555354,3430624,292235,84596,9274014
4,1997-05-01,16028072,2525076,511468,3333483,302913,80963,9274169
...,...,...,...,...,...,...,...,...
259,2020-08-01,44599216,7171231,2112628,12968200,1432700,110871,20803586
260,2020-09-01,45110794,7643034,1993412,12489373,1535383,124192,21325400
261,2020-10-01,39065310,5598894,1688593,10438455,1164339,121749,20053280
262,2020-11-01,41147027,5821666,1909100,10989999,1580276,156931,20689055
