# DatetimeIndex

DatetimeIndex 생성

In [1]:
# 이 장에서는 사용하는 패키지를 임포트하고
# 플로티를 그래프 서버로 설정합니다.
import pandas as pd 
import numpy as np 
pd.options.plotting.backend = "plotly"

In [2]:
# 이 명령은 시작하는 타임스태프, 데이터 포인트 갯수,
# 빈도를 기준으로 DatetimeIndex를 만듭니다. ("D": 매일)
daily_index = pd.date_range("2020-02-28", periods=4, freq="D")
daily_index

DatetimeIndex(['2020-02-28', '2020-02-29', '2020-03-01', '2020-03-02'], dtype='datetime64[ns]', freq='D')

In [3]:
# 이 명령은 시작/끝 타임스탬프를 기준으로 DatetimeIndex를 만듭니다
# 이 빈도는 '매주 일요일'인 "W-SUN"입니다
weekly_index = pd.date_range('2020-01-01', '2020-01-31', freq="W-SUN")
weekly_index

DatetimeIndex(['2020-01-05', '2020-01-12', '2020-01-19', '2020-01-26'], dtype='datetime64[ns]', freq='W-SUN')

In [4]:
# 주 단윌 인텍스로 동작하는 데이터프레임을 만듭니다.
# 일요일에만 개관하는 박물관 방문자 수 같은 경우에는 쓸 수 있습니다
pd.DataFrame(data=[21, 15, 33, 34], columns=["visitors"], index=weekly_index)

Unnamed: 0,visitors
2020-01-05,21
2020-01-12,15
2020-01-19,33
2020-01-26,34


In [5]:
msft = pd.read_csv("MSFT.csv")
msft.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8622 entries, 0 to 8621
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       8622 non-null   object 
 1   Open       8622 non-null   float64
 2   High       8622 non-null   float64
 3   Low        8622 non-null   float64
 4   Close      8622 non-null   float64
 5   Adj Close  8622 non-null   float64
 6   Volume     8622 non-null   int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 471.6+ KB


In [6]:
msft.loc[:, "Date"] = pd.to_datetime(msft["Date"]) 
msft.dtypes

Date         datetime64[ns]
Open                float64
High                float64
Low                 float64
Close               float64
Adj Close           float64
Volume                int64
dtype: object

In [7]:
msft = pd.read_csv("MSFT.csv",
                   index_col="Date", parse_dates=["Date"])
msft.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8622 entries, 1986-03-13 to 2020-05-27
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       8622 non-null   float64
 1   High       8622 non-null   float64
 2   Low        8622 non-null   float64
 3   Close      8622 non-null   float64
 4   Adj Close  8622 non-null   float64
 5   Volume     8622 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 471.5 KB


In [8]:
msft.loc[:, "Volume"] = msft["Volume"].astype("float")
msft["Volume"].dtypes

dtype('float64')

In [9]:
msft = msft.sort_index()

In [10]:
msft.index.date

array([datetime.date(1986, 3, 13), datetime.date(1986, 3, 14),
       datetime.date(1986, 3, 17), ..., datetime.date(2020, 5, 22),
       datetime.date(2020, 5, 26), datetime.date(2020, 5, 27)],
      dtype=object)

DatetimeIndex 필터

In [11]:
msft.loc["2019", "Adj Close"]

Date
2019-01-02     99.099190
2019-01-03     95.453529
2019-01-04     99.893005
2019-01-07    100.020401
2019-01-08    100.745613
                 ...    
2019-12-24    156.515396
2019-12-26    157.798309
2019-12-27    158.086731
2019-12-30    156.724243
2019-12-31    156.833633
Name: Adj Close, Length: 252, dtype: float64

In [12]:
msft.loc["2019-06": "2020-05", "Adj Close"].plot()

시간대

In [13]:
# 날짜에 시간 정보를 추가합니다
msft_close = msft.loc[:, ["Adj Close"]].copy()
msft_close.index = msft_close.index + pd.DateOffset(hours=16)
msft_close.head()

Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
1986-03-13 16:00:00,0.062205
1986-03-14 16:00:00,0.064427
1986-03-17 16:00:00,0.065537
1986-03-18 16:00:00,0.063871
1986-03-19 16:00:00,0.06276


In [14]:
# 타임스태프가 시간대를 인식하게 만듭니다
msft_close = msft_close.tz_localize("America/New_York")
msft_close.head()

Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
1986-03-13 16:00:00-05:00,0.062205
1986-03-14 16:00:00-05:00,0.064427
1986-03-17 16:00:00-05:00,0.065537
1986-03-18 16:00:00-05:00,0.063871
1986-03-19 16:00:00-05:00,0.06276


In [15]:
msft_close = msft_close.tz_convert("UTC")
msft_close.loc["2020-01-02", "Adj Close"] # 서머타임이 아닌 21:00

Date
2020-01-02 21:00:00+00:00    159.737595
Name: Adj Close, dtype: float64

In [16]:
msft_close.loc["2020-05-01", "Adj Close"] # 서머타임인 20:00

Date
2020-05-01 20:00:00+00:00    174.085175
Name: Adj Close, dtype: float64

# 널리 쓰이는 시계열 조작

행 이동과 퍼센트 값 변화

In [17]:
msft_close.head()

Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
1986-03-13 21:00:00+00:00,0.062205
1986-03-14 21:00:00+00:00,0.064427
1986-03-17 21:00:00+00:00,0.065537
1986-03-18 21:00:00+00:00,0.063871
1986-03-19 21:00:00+00:00,0.06276


In [18]:
msft_close.shift(1).head()

Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
1986-03-13 21:00:00+00:00,
1986-03-14 21:00:00+00:00,0.062205
1986-03-17 21:00:00+00:00,0.064427
1986-03-18 21:00:00+00:00,0.065537
1986-03-19 21:00:00+00:00,0.063871


In [19]:
returns = np.log(msft_close / msft_close.shift(1))
returns = returns.rename(columns={"Adj Close": "returns"})
returns.head()

Unnamed: 0_level_0,returns
Date,Unnamed: 1_level_1
1986-03-13 21:00:00+00:00,
1986-03-14 21:00:00+00:00,0.035097
1986-03-17 21:00:00+00:00,0.017082
1986-03-18 21:00:00+00:00,-0.025749
1986-03-19 21:00:00+00:00,-0.017547


In [20]:
# 매일매일의 로그 수익율을 히스토그램으로 그립니다
returns.plot.hist()

In [21]:
simple_rets = msft_close.pct_change()
simple_rets = simple_rets.rename(columns={"Adj": "simple rets"})
simple_rets.head()

Unnamed: 0_level_0,Adj Close
Date,Unnamed: 1_level_1
1986-03-13 21:00:00+00:00,
1986-03-14 21:00:00+00:00,0.035721
1986-03-17 21:00:00+00:00,0.017229
1986-03-18 21:00:00+00:00,-0.025421
1986-03-19 21:00:00+00:00,-0.017394


리베이스와 상관관계

In [22]:
parts = []  # 개별 데이터프레임을 모을 리스트
for ticker in ["AAPL", "AMZN", "GOOGL", "MSFT"]:
    # "usecols" 인자를 서서 Date와 Adj Close 열만 읽습니다
    adj_close = pd.read_csv(f"{ticker}.csv",
                            index_col="Date", parse_dates=["Date"],
                            usecols=["Date", "Adj Close"])
    # 열 이름을 증권 시세를 나타내는 약자로 바꿉니다
    adj_close = adj_close.rename(columns={"Adj Close": ticker})
    # parts 리스트에 주식 데이터프레임을 이어붙입니다
    parts.append(adj_close)

In [23]:
# 네 개의 데이터프레임을 하나로 조합합니다
adj_close = pd.concat(parts, axis=1)
adj_close

Unnamed: 0_level_0,AAPL,AMZN,GOOGL,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1980-12-12,0.405683,,,
1980-12-15,0.384517,,,
1980-12-16,0.356296,,,
1980-12-17,0.365115,,,
1980-12-18,0.375698,,,
...,...,...,...,...
2020-05-22,318.890015,2436.879883,1413.239990,183.509995
2020-05-26,316.730011,2421.860107,1421.369995,181.570007
2020-05-27,318.109985,2410.389893,1420.280029,181.809998
2020-05-28,318.250000,2401.100098,1418.239990,


In [24]:
adj_close = adj_close.dropna()
adj_close.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3970 entries, 2004-08-19 to 2020-05-27
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AAPL    3970 non-null   float64
 1   AMZN    3970 non-null   float64
 2   GOOGL   3970 non-null   float64
 3   MSFT    3970 non-null   float64
dtypes: float64(4)
memory usage: 155.1 KB


In [25]:
# 2019년 6월 - 2020년 5월 데이터를 샘플로 사용합니다
adj_close_sample = adj_close.loc["2019-06":"2020-05", :]
rebased_prices = adj_close_sample / adj_close_sample.iloc[0, :] * 100
rebased_prices.head()

Unnamed: 0_level_0,AAPL,AMZN,GOOGL,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-06-03,100.0,100.0,100.0,100.0
2019-06-04,103.658406,102.178197,101.51626,102.770372
2019-06-05,105.331787,102.706347,100.567998,104.998342
2019-06-06,106.878255,103.643316,100.868362,106.658885
2019-06-07,109.723025,106.577701,102.852495,109.646189


In [26]:
rebased_prices.plot()

In [27]:
returns = np.log(adj_close / adj_close.shift(1))
returns.corr()

Unnamed: 0,AAPL,AMZN,GOOGL,MSFT
AAPL,1.0,0.42491,0.503497,0.486065
AMZN,0.42491,1.0,0.48669,0.485725
GOOGL,0.503497,0.48669,1.0,0.525645
MSFT,0.486065,0.485725,0.525645,1.0


In [28]:
import plotly.express as px

In [29]:
fig = px.imshow(returns.corr(),
                x=adj_close.columns,
                y=adj_close.columns,
                color_continuous_scale=list(
                    reversed(px.colors.sequential.RdBu)),
                zmin=-1, zmax=1)
fig.show()

리샘플링

In [30]:
end_of_month = adj_close.resample("M").last()
end_of_month.head()

Unnamed: 0_level_0,AAPL,AMZN,GOOGL,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2004-08-31,2.132708,38.139999,51.236237,17.67363
2004-09-30,2.396127,40.860001,64.864868,17.900215
2004-10-31,3.240182,34.130001,95.415413,18.107374
2004-11-30,4.146072,39.68,91.081078,19.344421
2004-12-31,3.982207,44.290001,96.491493,19.27948


In [31]:
end_of_month.resample("D").asfreq().head() # 변환 없음

Unnamed: 0_level_0,AAPL,AMZN,GOOGL,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2004-08-31,2.132708,38.139999,51.236237,17.67363
2004-09-01,,,,
2004-09-02,,,,
2004-09-03,,,,
2004-09-04,,,,


In [32]:
end_of_month.resample("W-FRI").ffill().head() # 앞으로 채움

Unnamed: 0_level_0,AAPL,AMZN,GOOGL,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2004-09-03,2.132708,38.139999,51.236237,17.67363
2004-09-10,2.132708,38.139999,51.236237,17.67363
2004-09-17,2.132708,38.139999,51.236237,17.67363
2004-09-24,2.132708,38.139999,51.236237,17.67363
2004-10-01,2.396127,40.860001,64.864868,17.900215


롤링 윈도우

In [33]:
# MSFT의 2019년 데이터로 산출한 이동 평균 그래프
msft19 = msft.loc["2019", ["Adj Close"]].copy()
# 25일 단위로 움직이는 이동 평귱을 데이터프레임의 새로운 열로 추가합니다
msft19.loc[:, "25day average"] = msft19["Adj Close"].rolling(25).mean()
msft19.plot()