# 11 금융, 경제 데이터 애플리케이션
> ## 애플리케이션 데이터 준비



In [1]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame
from datetime import datetime
from datetime import timedelta
from dateutil.parser import parse
from pandas.tseries.offsets import Hour, Minute, Day, MonthEnd

In [2]:
%pylab inline
pylab.rcParams['figure.figsize'] = (16, 8)

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [6]:
from pandas_datareader import data, wb

___
## 2. 다른 빈도를 가지는 시계열 연산

In [11]:
# Weekly series: three random values stamped on consecutive Wednesdays
# (freq 'W-WED'), starting 2012-06-13.
ts1 = Series(np.random.randn(3), 
            index = pd.date_range('2012/6/13', periods = 3, freq = 'W-WED'))
ts1

2012-06-13   -1.208427
2012-06-20   -0.176913
2012-06-27    0.233428
Freq: W-WED, dtype: float64

In [15]:
# Upsample to business-day frequency; business days that have no weekly
# observation come back as NaN.
ts1.resample('B').first()

2012-06-13   -1.208427
2012-06-14         NaN
2012-06-15         NaN
2012-06-18         NaN
2012-06-19         NaN
2012-06-20   -0.176913
2012-06-21         NaN
2012-06-22         NaN
2012-06-25         NaN
2012-06-26         NaN
2012-06-27    0.233428
Freq: B, dtype: float64

In [16]:
# Same upsampling, but each weekly value is carried forward over the
# business days that follow it.
ts1.resample('B').ffill()

2012-06-13   -1.208427
2012-06-14   -1.208427
2012-06-15   -1.208427
2012-06-18   -1.208427
2012-06-19   -1.208427
2012-06-20   -0.176913
2012-06-21   -0.176913
2012-06-22   -0.176913
2012-06-25   -0.176913
2012-06-26   -0.176913
2012-06-27    0.233428
Freq: B, dtype: float64

- ffill로 NA 값을 채우는 것이 가능하지만, 불규칙한 시계열에는 효과적인 방법이 아님

In [18]:
# Irregularly spaced series: six random values on hand-picked dates.
dates = pd.DatetimeIndex(['2012/6/12', '2012/6/17', '2012/6/18',
                          '2012/6/21', '2012/6/22', '2012/6/29'])
ts2 = Series(np.random.randn(len(dates)), index = dates)
ts2

2012-06-12   -1.095421
2012-06-17    0.976283
2012-06-18    0.969631
2012-06-21    0.434578
2012-06-22   -0.767224
2012-06-29    1.478586
dtype: float64

In [20]:
# Align ts1 onto ts2's dates, carrying the most recent ts1 value forward.
# Dates earlier than ts1's first observation have nothing to carry and stay NaN.
ts1.reindex(ts2.index, method = 'ffill')

2012-06-12         NaN
2012-06-17   -1.208427
2012-06-18   -1.208427
2012-06-21   -0.176913
2012-06-22   -0.176913
2012-06-29    0.233428
dtype: float64

- ts2의 날짜 색인을 유지하기 위해 reindex 사용 가능
- 동일한 빈도로 리샘플링하는 방식으로 합치기 가능

In [21]:
# Add the two series on ts2's index after forward-filling ts1 onto it.
ts2 + ts1.reindex(ts2.index, method = 'ffill')

2012-06-12         NaN
2012-06-17   -0.232144
2012-06-18   -0.238796
2012-06-21    0.257664
2012-06-22   -0.944137
2012-06-29    1.712015
dtype: float64


___
Periods

In [25]:
# Period-indexed series at two different frequencies: quarterly GDP
# ('Q-SEP') and annual inflation ('A-SEP'), both with the year ending
# in September.
gdp = Series([1.78, 1.94, 2.08, 2.01, 2.15, 2.31, 2.46],
            index = pd.period_range('1984Q2', periods = 7, freq = 'Q-SEP'))
infl = Series([0.025, 0.045, 0.037, 0.04],
             index = pd.period_range('1982',periods = 4, freq = 'A-SEP'))
gdp

1984Q2    1.78
1984Q3    1.94
1984Q4    2.08
1985Q1    2.01
1985Q2    2.15
1985Q3    2.31
1985Q4    2.46
Freq: Q-SEP, dtype: float64

In [26]:
infl

1982    0.025
1983    0.045
1984    0.037
1985    0.040
Freq: A-SEP, dtype: float64

In [28]:
# Convert each annual period to the quarterly period at its end
# (e.g. 1982 -> 1982Q4) so it can be compared with gdp's index.
infl_q = infl.asfreq('Q-SEP', how = 'end')
infl_q

1982Q4    0.025
1983Q4    0.045
1984Q4    0.037
1985Q4    0.040
Freq: Q-SEP, dtype: float64

In [29]:
# Reindex onto gdp's quarters, forward-filling the latest available
# annual value into quarters that have none of their own.
infl_q.reindex(gdp.index, method = 'ffill')

1984Q2    0.045
1984Q3    0.045
1984Q4    0.037
1985Q1    0.037
1985Q2    0.037
1985Q3    0.037
1985Q4    0.040
Freq: Q-SEP, dtype: float64

- 연말 시점의 infl 값과 빈도에 맞추어 Q-SEP로 변환 후 병합
___
## 3. 일별 시간과 현재 최신 데이터 선택하기

In [30]:
# Minute-by-minute timestamps for a single day, 09:30 through 15:59
# on 2012-06-01 (390 points).
# 'min' is used instead of the 'T' alias: both denote a one-minute
# frequency, but recent pandas releases deprecate 'T'. Newer pandas will
# therefore print freq='min' where older output shows freq='T'.
rng = pd.date_range('2012/6/1 09:30', '2012/06/1 15:59', freq = 'min')
rng

DatetimeIndex(['2012-06-01 09:30:00', '2012-06-01 09:31:00',
               '2012-06-01 09:32:00', '2012-06-01 09:33:00',
               '2012-06-01 09:34:00', '2012-06-01 09:35:00',
               '2012-06-01 09:36:00', '2012-06-01 09:37:00',
               '2012-06-01 09:38:00', '2012-06-01 09:39:00',
               ...
               '2012-06-01 15:50:00', '2012-06-01 15:51:00',
               '2012-06-01 15:52:00', '2012-06-01 15:53:00',
               '2012-06-01 15:54:00', '2012-06-01 15:55:00',
               '2012-06-01 15:56:00', '2012-06-01 15:57:00',
               '2012-06-01 15:58:00', '2012-06-01 15:59:00'],
              dtype='datetime64[ns]', length=390, freq='T')

In [31]:
# Extend the one-day minute grid with copies shifted by 1, 2 and 3
# business days, giving four days of timestamps in total.
rng = rng.append([rng + pd.offsets.BDay(i) for i in range(1,4)])
rng

DatetimeIndex(['2012-06-01 09:30:00', '2012-06-01 09:31:00',
               '2012-06-01 09:32:00', '2012-06-01 09:33:00',
               '2012-06-01 09:34:00', '2012-06-01 09:35:00',
               '2012-06-01 09:36:00', '2012-06-01 09:37:00',
               '2012-06-01 09:38:00', '2012-06-01 09:39:00',
               ...
               '2012-06-06 15:50:00', '2012-06-06 15:51:00',
               '2012-06-06 15:52:00', '2012-06-06 15:53:00',
               '2012-06-06 15:54:00', '2012-06-06 15:55:00',
               '2012-06-06 15:56:00', '2012-06-06 15:57:00',
               '2012-06-06 15:58:00', '2012-06-06 15:59:00'],
              dtype='datetime64[ns]', length=1560, freq=None)

In [35]:
# Float series 0.0, 1.0, 2.0, ... with one value per timestamp in rng.
ts = Series(np.arange(len(rng), dtype = float), index = rng)
ts

2012-06-01 09:30:00       0.0
2012-06-01 09:31:00       1.0
2012-06-01 09:32:00       2.0
2012-06-01 09:33:00       3.0
2012-06-01 09:34:00       4.0
2012-06-01 09:35:00       5.0
2012-06-01 09:36:00       6.0
2012-06-01 09:37:00       7.0
2012-06-01 09:38:00       8.0
2012-06-01 09:39:00       9.0
2012-06-01 09:40:00      10.0
2012-06-01 09:41:00      11.0
2012-06-01 09:42:00      12.0
2012-06-01 09:43:00      13.0
2012-06-01 09:44:00      14.0
2012-06-01 09:45:00      15.0
2012-06-01 09:46:00      16.0
2012-06-01 09:47:00      17.0
2012-06-01 09:48:00      18.0
2012-06-01 09:49:00      19.0
2012-06-01 09:50:00      20.0
2012-06-01 09:51:00      21.0
2012-06-01 09:52:00      22.0
2012-06-01 09:53:00      23.0
2012-06-01 09:54:00      24.0
2012-06-01 09:55:00      25.0
2012-06-01 09:56:00      26.0
2012-06-01 09:57:00      27.0
2012-06-01 09:58:00      28.0
2012-06-01 09:59:00      29.0
                        ...  
2012-06-06 15:30:00    1530.0
2012-06-06 15:31:00    1531.0
2012-06-06

In [37]:
from datetime import time

# Indexing with a datetime.time picks the row at that time of day on
# every date in the index.
ts[time(10,0)]

2012-06-01 10:00:00      30.0
2012-06-04 10:00:00     420.0
2012-06-05 10:00:00     810.0
2012-06-06 10:00:00    1200.0
dtype: float64

In [38]:
# Explicit method form of the same time-of-day selection.
ts.at_time(time(10, 0))

2012-06-01 10:00:00      30.0
2012-06-04 10:00:00     420.0
2012-06-05 10:00:00     810.0
2012-06-06 10:00:00    1200.0
dtype: float64

In [39]:
# Rows whose time of day lies between 10:00 and 10:01; both end points
# are included in the result.
ts.between_time(time(10, 0), time(10, 1))

2012-06-01 10:00:00      30.0
2012-06-01 10:01:00      31.0
2012-06-04 10:00:00     420.0
2012-06-04 10:01:00     421.0
2012-06-05 10:00:00     810.0
2012-06-05 10:01:00     811.0
2012-06-06 10:00:00    1200.0
2012-06-06 10:01:00    1201.0
dtype: float64

- 정해진 기간에 데이터가 있을 경우 원하는 값을 얻어내는 것이 어렵지는 않음

In [41]:
# Simulate irregular data: shuffle all positions, take everything from
# the 700th shuffled position onward, and blank those observations out.
indexer = np.sort(np.random.permutation(len(ts))[700:])
irr_ts = ts.copy()
# indexer holds integer positions, not timestamps, so assign through
# .iloc. Plain [] with integers on a DatetimeIndex only works through a
# positional fallback that recent pandas deprecates.
irr_ts.iloc[indexer] = np.nan
irr_ts['2012/06/01 09:50' : '2012/06/01 10:00']

2012-06-01 09:50:00    20.0
2012-06-01 09:51:00     NaN
2012-06-01 09:52:00     NaN
2012-06-01 09:53:00     NaN
2012-06-01 09:54:00    24.0
2012-06-01 09:55:00     NaN
2012-06-01 09:56:00     NaN
2012-06-01 09:57:00    27.0
2012-06-01 09:58:00     NaN
2012-06-01 09:59:00    29.0
2012-06-01 10:00:00     NaN
dtype: float64

In [42]:
# 10:00 on four consecutive business days; asof returns, for each of
# these timestamps, the last non-NaN value at or before it.
selection = pd.date_range('2012/06/01 10:00', periods = 4, freq = 'B')
irr_ts.asof(selection)

2012-06-01 10:00:00      29.0
2012-06-04 10:00:00     418.0
2012-06-05 10:00:00     809.0
2012-06-06 10:00:00    1200.0
Freq: B, dtype: float64

- asof 메서드에 타임스탬프 배열을 넘기면, 해당 시각에 값이 없을 때 **'직전의 유효한 값이 담긴 배열'** 을 반환받을 수 있음
___
## 4. 데이터 이어 붙이기

In [44]:
# Two daily frames with identical columns but date ranges offset by one
# day: data1 holds all ones, data2 holds all twos.
data1 = DataFrame(np.full((6, 3), 1.0),
                  index = pd.date_range('2012/6/12', periods = 6),
                  columns = list('abc'))
data2 = DataFrame(np.full((6, 3), 2.0),
                  index = pd.date_range('2012/6/13', periods = 6),
                  columns = list('abc'))

# Splice them: data1 up to and including 6/14, then data2 from 6/15 on.
spliced = pd.concat([
    data1.loc[:'2012/6/14'],
    data2.loc['2012/6/15':],
])
spliced


Unnamed: 0,a,b,c
2012-06-12,1.0,1.0,1.0
2012-06-13,1.0,1.0,1.0
2012-06-14,1.0,1.0,1.0
2012-06-15,2.0,2.0,2.0
2012-06-16,2.0,2.0,2.0
2012-06-17,2.0,2.0,2.0
2012-06-18,2.0,2.0,2.0


In [45]:
# First piece of the splice: data1 rows up to and including 6/14.
data1.loc[:'2012/6/14']

Unnamed: 0,a,b,c
2012-06-12,1.0,1.0,1.0
2012-06-13,1.0,1.0,1.0
2012-06-14,1.0,1.0,1.0


In [46]:
# Second piece of the splice: data2 rows from 6/15 onward.
data2.loc['2012/6/15':]

Unnamed: 0,a,b,c
2012-06-15,2.0,2.0,2.0
2012-06-16,2.0,2.0,2.0
2012-06-17,2.0,2.0,2.0
2012-06-18,2.0,2.0,2.0


- pd.concat 함수로 2개의 시계열 데이터 병합

In [47]:
# Redefine data2 with an extra column 'd' that data1 lacks, then splice
# again; rows taken from data1 have no value for 'd'.
data2 = DataFrame(np.ones((6,4), dtype = float)*2,
                 columns = list('abcd'),
                 index = pd.date_range('2012/6/13', periods = 6))
spliced = pd.concat([data1.loc[:'2012/6/14'], data2.loc['2012/6/15':]])
spliced

Unnamed: 0,a,b,c,d
2012-06-12,1.0,1.0,1.0,
2012-06-13,1.0,1.0,1.0,
2012-06-14,1.0,1.0,1.0,
2012-06-15,2.0,2.0,2.0,2.0
2012-06-16,2.0,2.0,2.0,2.0
2012-06-17,2.0,2.0,2.0,2.0
2012-06-18,2.0,2.0,2.0,2.0


- 없는 값 병합 시 NaN 반환

In [49]:
# Show the four-column data2 for reference.
data2

Unnamed: 0,a,b,c,d
2012-06-13,2.0,2.0,2.0,2.0
2012-06-14,2.0,2.0,2.0,2.0
2012-06-15,2.0,2.0,2.0,2.0
2012-06-16,2.0,2.0,2.0,2.0
2012-06-17,2.0,2.0,2.0,2.0
2012-06-18,2.0,2.0,2.0,2.0


In [50]:
# Fill the gaps in spliced from data2 and return a new frame.
# The 'd' value on 6/12 stays missing because data2 has no row for
# that date.
spliced_filled = spliced.combine_first(data2)
spliced_filled

Unnamed: 0,a,b,c,d
2012-06-12,1.0,1.0,1.0,
2012-06-13,1.0,1.0,1.0,2.0
2012-06-14,1.0,1.0,1.0,2.0
2012-06-15,2.0,2.0,2.0,2.0
2012-06-16,2.0,2.0,2.0,2.0
2012-06-17,2.0,2.0,2.0,2.0
2012-06-18,2.0,2.0,2.0,2.0


- combine_first 메서드 사용 시 지정 테이블에서 누락된 값을 가져와서 병합시킴

In [51]:
# In-place variant: update() modifies spliced itself. With
# overwrite = False only the missing entries are filled from data2.
spliced.update(data2, overwrite = False)
spliced

Unnamed: 0,a,b,c,d
2012-06-12,1.0,1.0,1.0,
2012-06-13,1.0,1.0,1.0,2.0
2012-06-14,1.0,1.0,1.0,2.0
2012-06-15,2.0,2.0,2.0,2.0
2012-06-16,2.0,2.0,2.0,2.0
2012-06-17,2.0,2.0,2.0,2.0
2012-06-18,2.0,2.0,2.0,2.0


- update 메서드에 overwrite = False 지정 시 누락된 값만 갱신 가능

In [52]:
# Work on a copy, then replace columns 'a' and 'c' with data1's values.
# The assignment aligns on the index, so 6/18 (absent from data1) ends
# up missing in those two columns.
cp_spliced = spliced.copy()
cp_spliced[['a', 'c']] = data1[['a', 'c']]
cp_spliced

Unnamed: 0,a,b,c,d
2012-06-12,1.0,1.0,1.0,
2012-06-13,1.0,1.0,1.0,2.0
2012-06-14,1.0,1.0,1.0,2.0
2012-06-15,1.0,2.0,1.0,2.0
2012-06-16,1.0,2.0,1.0,2.0
2012-06-17,1.0,2.0,1.0,2.0
2012-06-18,,2.0,,2.0


- 위에 소개한 메서드들 사용이 가능하지만, 직접 DataFrame의 칼럼에 값을 대입하는 것이 더 간단함
___
## 5. 수익 지수와 누적 수익

In [53]:
from pandas_datareader import data

In [54]:
# Request AAPL closing prices from 2011-01-01 onward via Google Finance.
# NOTE: the recorded output for this cell is a RemoteDataError, so
# price is never assigned and the slice below does not run.
price = data.get_data_google('AAPL', '2011-01-01')['Close']

price[-5:]

The Google Finance API has not been stable since late 2017. Requests seem
to fail at random. Failure is especially common when bulk downloading.



RemoteDataError: Unable to read URL: https://finance.google.com/finance/historical?q=AAPL&startdate=Jan+01%2C+2011&enddate=Jun+14%2C+2018&output=csv
Response Text:
b'<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8"/><title>Sorry...</title><style> body { font-family: verdana, arial, sans-serif; background-color: #fff; color: #000; }</style></head><body><div><table><tr><td><b><font face=sans-serif size=10><font color=#4285f4>G</font><font color=#ea4335>o</font><font color=#fbbc05>o</font><font color=#4285f4>g</font><font color=#34a853>l</font><font color=#ea4335>e</font></font></b></td><td style="text-align: left; vertical-align: bottom; padding-bottom: 15px; width: 50%"><div style="border-bottom: 1px solid #dfdfdf;">Sorry...</div></td></tr></table></div><div style="margin-left: 4em;"><h1>We\'re sorry...</h1><p>... but your computer or network may be sending automated queries. To protect our users, we can\'t process your request right now.</p></div><div style="margin-left: 4em;">See <a href="https://support.google.com/websearch/answer/86640">Google Help</a> for more information.<br/><br/></div><div style="text-align: center; border-top: 1px solid #dfdfdf;"><a href="https://www.google.com">Google Home</a></div></body></html>'

> # GOOGLE API 문제로 예제 진행 불가능