# **시계열데이터처리**

### 1. 날짜 요소 추출

#### 1-1. 날짜 타입으로 변환
- pd.to_datetime(날짜데이터, format = '입력되는 날짜 형식')
- format : 입력 문자열이 어떤 날짜 형식으로 쓰여 있는지 알려주는 옵션 <br>
      예) pd.to_datetime(date, format = '%d/%m/%Y') -> '31/12/2020'처럼 '일/월/연도' 형태로 입력된 날짜를 변환

#### 1-2. 날짜 요소 추출
    1. 연도 : date.dt.year
    2. 월 : date.dt.month
    3. 일 : date.dt.day
    4. 요일 : date.dt.weekday (월요일 = 0, ..., 일요일 = 6)
    5. 요일 이름 : date.dt.day_name()

### 2. 시간에 따른 흐름 추가

#### 2-1. shift
- 시계열 데이터에서 시간의 흐름 전후로 정보를 이동시킬 때 사용
- 참조 : https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.shift.html

#### 2-2. rolling + 집계함수
- 시간의 흐름에 따라 일정 기간 동안 평균을 이동하면서 구하기
- rolling(n) : n은 윈도우 크기(한 번에 집계할 행 수)이며, 기본값이 없는 필수 인자
- min_periods : 값을 계산하는 데 필요한 최소 데이터 수 (정수 윈도우인 경우 기본값은 윈도우 크기 n)
- 참조 : https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rolling.html

#### 2-3. diff
- 특정 시점 데이터와 이전 시점 데이터의 차이 구하기 (diff(n) : n 기본값은 1, 즉 바로 이전 행과의 차이)
- 참조 : https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.diff.html

In [92]:
import pandas as pd
import numpy as np

In [93]:
# Load the World Bank development indicators CSV from the current working
# directory and display the resulting frame.
data = pd.read_csv("./world_bank_development_indicators.csv")
data

Unnamed: 0,country,date,agricultural_land%,forest_land%,land_area,avg_precipitation,trade_in_services%,control_of_corruption_estimate,control_of_corruption_std,access_to_electricity%,...,multidimensional_poverty_headcount_ratio%,gini_index,birth_rate,death_rate,life_expectancy_at_birth,population,rural_population,voice_and_accountability_estimate,voice_and_accountability_std,intentional_homicides
0,Afghanistan,1960-01-01,,,,,,,,,...,,,50.340,31.921,32.535,8622466.0,7898093.0,,,
1,Afghanistan,1961-01-01,57.801696,,652230.0,327.0,,,,,...,,,50.443,31.349,33.068,8790140.0,8026804.0,,,
2,Afghanistan,1962-01-01,57.893688,,652230.0,327.0,,,,,...,,,50.570,30.845,33.547,8969047.0,8163985.0,,,
3,Afghanistan,1963-01-01,57.970348,,652230.0,327.0,,,,,...,,,50.703,30.359,34.016,9157465.0,8308019.0,,,
4,Afghanistan,1964-01-01,58.066940,,652230.0,327.0,,,,,...,,,50.831,29.867,34.494,9355514.0,8458694.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16775,Zimbabwe,2018-01-01,41.876696,45.332093,386850.0,657.0,4.469742,-1.227581,0.125622,45.400288,...,,,32.074,7.972,61.414,15052184.0,10204026.0,-1.140975,0.123371,4.876369
16776,Zimbabwe,2019-01-01,41.876696,45.213002,386850.0,657.0,6.927164,-1.273280,0.135450,46.682095,...,,50.3,31.518,8.043,61.292,15354608.0,10408889.0,-1.164705,0.118156,5.145035
16777,Zimbabwe,2020-01-01,41.876696,45.093912,386850.0,657.0,5.118949,-1.289440,0.142061,52.747667,...,,,31.009,8.132,61.124,15669666.0,10617452.0,-1.113716,0.120647,4.977770
16778,Zimbabwe,2021-01-01,,,,,,-1.257897,0.154067,48.979927,...,,,30.537,9.057,59.253,15993524.0,10827136.0,-1.136934,0.121119,6.139985


In [94]:
# Summarize column names, non-null counts and dtypes of the freshly loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16780 entries, 0 to 16779
Data columns (total 50 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   country                                    16780 non-null  object 
 1   date                                       16780 non-null  object 
 2   agricultural_land%                         15298 non-null  float64
 3   forest_land%                               7914 non-null   float64
 4   land_area                                  15608 non-null  float64
 5   avg_precipitation                          10086 non-null  float64
 6   trade_in_services%                         9165 non-null   float64
 7   control_of_corruption_estimate             4564 non-null   float64
 8   control_of_corruption_std                  4564 non-null   float64
 9   access_to_electricity%                     7348 non-null   float64
 10  renewvable_energy_cons

In [95]:
# Convert the string 'date' column to datetime64 and keep it as a new 'Date'
# column (the original string column is left untouched).
# The format is passed explicitly (values look like '1960-01-01') so parsing
# does not depend on pandas' format inference.
# NOTE(review): assumes every value in 'date' follows '%Y-%m-%d' — confirm.
data['Date'] = pd.to_datetime(data['date'], format='%Y-%m-%d')

In [96]:
# Re-inspect the schema after adding the 'Date' column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16780 entries, 0 to 16779
Data columns (total 51 columns):
 #   Column                                     Non-Null Count  Dtype         
---  ------                                     --------------  -----         
 0   country                                    16780 non-null  object        
 1   date                                       16780 non-null  object        
 2   agricultural_land%                         15298 non-null  float64       
 3   forest_land%                               7914 non-null   float64       
 4   land_area                                  15608 non-null  float64       
 5   avg_precipitation                          10086 non-null  float64       
 6   trade_in_services%                         9165 non-null   float64       
 7   control_of_corruption_estimate             4564 non-null   float64       
 8   control_of_corruption_std                  4564 non-null   float64       
 9   access_to_electri

In [97]:
# Working frame holding only the parsed date and the population figures.
# An explicit copy is taken because later cells add columns to `temp`; this
# makes it unambiguous that `data` itself is never modified through `temp`.
temp = data.loc[:, ['Date', 'population']].copy()

In [98]:
# Display the working frame (Date, population).
temp

Unnamed: 0,Date,population
0,1960-01-01,8622466.0
1,1961-01-01,8790140.0
2,1962-01-01,8969047.0
3,1963-01-01,9157465.0
4,1964-01-01,9355514.0
...,...,...
16775,2018-01-01,15052184.0
16776,2019-01-01,15354608.0
16777,2020-01-01,15669666.0
16778,2021-01-01,15993524.0


In [99]:
# shift
# Add a "previous year's population" column by moving the population series
# down one row.
# The frame stacks many countries on top of each other, so the shift is done
# per country (grouping by data['country'], which shares temp's row index).
# A plain shift() would give the first year of each country the last
# population value of the country listed just before it.
# NOTE(review): assumes rows are ordered by year within each country with no
# missing years, so "previous row" means "previous year" — confirm.
temp['last_year_population'] = (
    temp['population'].groupby(data['country']).shift()
)

temp

Unnamed: 0,Date,population,last_year_population
0,1960-01-01,8622466.0,
1,1961-01-01,8790140.0,8622466.0
2,1962-01-01,8969047.0,8790140.0
3,1963-01-01,9157465.0,8969047.0
4,1964-01-01,9355514.0,9157465.0
...,...,...,...
16775,2018-01-01,15052184.0,14751101.0
16776,2019-01-01,15354608.0,15052184.0
16777,2020-01-01,15669666.0,15354608.0
16778,2021-01-01,15993524.0,15669666.0


In [100]:
# rolling
# Moving average of the population over a 2-row window (current row and the
# row before it). min_periods=1 lets the first row of a group produce a value
# from a single observation instead of NaN.
# The window is applied per country (grouping by data['country'], which shares
# temp's row index) so that an average never mixes two different countries at
# the boundary between them.
temp['population_2year'] = (
    temp['population']
    .groupby(data['country'])
    .transform(lambda series: series.rolling(2, min_periods=1).mean())
)
temp.head(10)

Unnamed: 0,Date,population,last_year_population,population_2year
0,1960-01-01,8622466.0,,8622466.0
1,1961-01-01,8790140.0,8622466.0,8706303.0
2,1962-01-01,8969047.0,8790140.0,8879593.5
3,1963-01-01,9157465.0,8969047.0,9063256.0
4,1964-01-01,9355514.0,9157465.0,9256489.5
5,1965-01-01,9565147.0,9355514.0,9460330.5
6,1966-01-01,9783147.0,9565147.0,9674147.0
7,1967-01-01,10010030.0,9783147.0,9896588.5
8,1968-01-01,10247780.0,10010030.0,10128905.0
9,1969-01-01,10494489.0,10247780.0,10371134.5


In [101]:
# Difference between each row's population and the previous row's population.
# Computed per country (grouping by data['country'], which shares temp's row
# index): the first row of every country is NaN rather than the difference
# against the last row of the preceding country.
temp['population_diff'] = temp['population'].groupby(data['country']).diff()
temp.head(10)

Unnamed: 0,Date,population,last_year_population,population_2year,population_diff
0,1960-01-01,8622466.0,,8622466.0,
1,1961-01-01,8790140.0,8622466.0,8706303.0,167674.0
2,1962-01-01,8969047.0,8790140.0,8879593.5,178907.0
3,1963-01-01,9157465.0,8969047.0,9063256.0,188418.0
4,1964-01-01,9355514.0,9157465.0,9256489.5,198049.0
5,1965-01-01,9565147.0,9355514.0,9460330.5,209633.0
6,1966-01-01,9783147.0,9565147.0,9674147.0,218000.0
7,1967-01-01,10010030.0,9783147.0,9896588.5,226883.0
8,1968-01-01,10247780.0,10010030.0,10128905.0,237750.0
9,1969-01-01,10494489.0,10247780.0,10371134.5,246709.0


In [102]:
# Direction of the population change versus the previous row:
# increase -> 1, decrease -> -1, unchanged -> 0.
# The difference is taken once, per country (grouping by data['country'],
# which shares temp's row index), so a country's first row is never compared
# with the preceding country's last row.
# Rows with no computable difference (NaN: first row of a country or missing
# population) are mapped to 0, matching the original nested np.where result.
temp['pop_Diff'] = (
    np.sign(temp['population'].groupby(data['country']).diff())
    .fillna(0)
    .astype(int)
)
temp.head(10)

Unnamed: 0,Date,population,last_year_population,population_2year,population_diff,pop_Diff
0,1960-01-01,8622466.0,,8622466.0,,0
1,1961-01-01,8790140.0,8622466.0,8706303.0,167674.0,1
2,1962-01-01,8969047.0,8790140.0,8879593.5,178907.0,1
3,1963-01-01,9157465.0,8969047.0,9063256.0,188418.0,1
4,1964-01-01,9355514.0,9157465.0,9256489.5,198049.0,1
5,1965-01-01,9565147.0,9355514.0,9460330.5,209633.0,1
6,1966-01-01,9783147.0,9565147.0,9674147.0,218000.0,1
7,1967-01-01,10010030.0,9783147.0,9896588.5,226883.0,1
8,1968-01-01,10247780.0,10010030.0,10128905.0,237750.0,1
9,1969-01-01,10494489.0,10247780.0,10371134.5,246709.0,1
