# Crawling Stock Data

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
# Downloading HTML : requests
url = "http://finance.naver.com/item/main.nhn?code=000660"
print(requests.get(url)) # Response라는 이름의 객체가 나온다

<Response [200]>


In [3]:
html = requests.get(url).text
# print(html) # the same with the original HTML code

In [4]:
# Parsing : BS (BS class의 object 생성)
soup = BeautifulSoup(html, "html5lib")
tags = soup.select("#_dvr")
print(tags)

[<em id="_dvr">0.97</em>]


In [5]:
em_tag = tags[0]
print(em_tag.text, type(em_tag.text))

0.97 <class 'str'>


In [6]:
# Excersise : parsing PER
soup = BeautifulSoup(html, "html5lib")
tags = soup.select("#_per")
print(tags)

[<em id="_per">4.83</em>]


In [7]:
em_tag = tags[0]
print(em_tag.text, type(em_tag.text))

4.83 <class 'str'>


In [8]:
# Chrome에서 구하고자 하는 데이터 '검사' → Copy Selector
# 단, BS에선 nth-child 지원하지 않으므로 다른 방법 찾아야

In [9]:
# 외국인 소진율을 가져와보자
tags = soup.select("#tab_con1 > div.gray > table > tbody > tr.strong > td > em")
#tags = soup.select("#tab_con1 > div:nth-of-type(2) > table > tbody > tr.strong > td > em")
print(tags[0].text)

50.88%


In [10]:
# Default Argument

def hap2(a, b=3):
    return a + b

ret = hap2(3)
print(ret) # 6

6


In [11]:
ret2 = hap2(4, 5)
print(ret2) # 9

9


# Korbit API

In [12]:
# https://apidocs.korbit.co.kr/ko/
# https://api.korbit.co.kr/v1/ticker?currency_pair=btc_krw

# * timestamp : 1970-01-01부터 기록
# 대략 15억 정도면 초 단위
# 15조 정도면 ms 단위

# HTTP SET/POST 프로토콜

In [13]:
# KORBIT 최종 체결가

import requests

payload = {"currency_pair" :"btc_krw"}
url = "https://api.korbit.co.kr/v1/ticker"

r = requests.get(url, params=payload)
print(r.text, type(r.text))

{"timestamp":1582457430496,"last":"11703000"} <class 'str'>


In [14]:
# JSON to Dictionary
contents = r.json()
print(contents, type(contents))

{'timestamp': 1582457430496, 'last': '11703000'} <class 'dict'>


In [15]:
# 응용
print('비트코인 현재가 : ', contents['last'])

비트코인 현재가 :  11703000


In [16]:
# 시간 가져오기
import datetime
timestamp = contents['timestamp']
cur_time = datetime.datetime.fromtimestamp(timestamp/1000) # ms to sec
print(cur_time, type(cur_time))

2020-02-23 20:30:30.496000 <class 'datetime.datetime'>


In [17]:
# 연습문제 - 실패???
pairs = ("btc_krw", "bch_krw", "etc_krw")
payload = {"currency_pair" : pairs}
print(payload, type(payload))

{'currency_pair': ('btc_krw', 'bch_krw', 'etc_krw')} <class 'dict'>


In [18]:
url = "https://api.korbit.co.kr/v1/ticker"

r = requests.get(url, params=payload)

for r in pairs : 
    print(r, contents)

btc_krw {'timestamp': 1582457430496, 'last': '11703000'}
bch_krw {'timestamp': 1582457430496, 'last': '11703000'}
etc_krw {'timestamp': 1582457430496, 'last': '11703000'}


# Pandas

In [19]:
# R의 dataframe 자료구조를 Python에 이식
# 새롭지 않다!

In [20]:
from pandas import Series # 가장 많이 쓰는 importing 방법

data = [100, 200, 300]
data *3

[100, 200, 300, 100, 200, 300, 100, 200, 300]

In [21]:
s = Series(data)
print(s, type(s))

0    100
1    200
2    300
dtype: int64 <class 'pandas.core.series.Series'>


In [22]:
print(s * 3)

0    300
1    600
2    900
dtype: int64


In [23]:
# index 부여

s = Series([8286500, 8146000, 7430000, 7410000, 7433000],
           index=['2018-04-13', '2018-04-12', '2018-04-11', '2018-04-10', '2018-04-09'])

print(s[0])

8286500


In [24]:
print(s['2018-04-13'])

8286500


In [25]:
print(s[[0,2,4]])

2018-04-13    8286500
2018-04-11    7430000
2018-04-09    7433000
dtype: int64


In [26]:
print(s['2018-04-13':'2018-04-11'])

2018-04-13    8286500
2018-04-12    8146000
2018-04-11    7430000
dtype: int64


In [27]:
# Dataframe 생성

from pandas import DataFrame

raw_data = {'open': [100, 90, 80, 70],
        'high': [110, 112, 90, 80],
        'low' : [90, 80, 70, 60],
        'close': [95, 85, 75, 63]}
df = DataFrame(raw_data)
print(df)

   open  high  low  close
0   100   110   90     95
1    90   112   80     85
2    80    90   70     75
3    70    80   60     63


In [28]:
df = DataFrame(raw_data, columns=['open', 'high', 'low', 'close'])
print(df)
# 파이썬 3.7 이전 버전에선 딕셔너리에 순서 존재 X, 따로 지정해주면 좋다.

   open  high  low  close
0   100   110   90     95
1    90   112   80     85
2    80    90   70     75
3    70    80   60     63


In [29]:
# 엑셀로 보내기
df.to_excel("dump.xlsx")
# saved at root directory

In [30]:
# 날짜는 날짜데이터로 넣자
# 아래 예시는 날짜를 datetime으로 받아오는 과정이 더 필요함
date_list = ['2018-04-14', '2018-04-13', '2018-04-12', '2018-04-11']
df = DataFrame(raw_data, columns=['open', 'high', 'low', 'close'], index=date_list)
print(df)

            open  high  low  close
2018-04-14   100   110   90     95
2018-04-13    90   112   80     85
2018-04-12    80    90   70     75
2018-04-11    70    80   60     63


In [31]:
print(df['open'])

2018-04-14    100
2018-04-13     90
2018-04-12     80
2018-04-11     70
Name: open, dtype: int64


In [32]:
print(df[['open', 'close']])

            open  close
2018-04-14   100     95
2018-04-13    90     85
2018-04-12    80     75
2018-04-11    70     63


In [None]:
df04 = df['2018-04'] # 이 상태로는 실행 X
df04.iloc[0] # work well on Spyder

In [34]:
# datetime.strptime()
import datetime

date = '2018-12-02'

date2 = datetime.datetime.strptime(date, '%Y-%m-%d')
print(date2, type(date2))

2018-12-02 00:00:00 <class 'datetime.datetime'>


# pybithumb

In [35]:
# Anaconda Prompt에서
# python -m pip install Pybithumb -- upgrade pip

import pybithumb

df = pybithumb.get_ohlcv("BTC")
print(df)

                           open       close        high         low  \
BTC                                                                   
2013-12-27 00:00:00    737000.0    755000.0    755000.0    737000.0   
2013-12-28 00:00:00    750000.0    750000.0    750000.0    750000.0   
2013-12-29 00:00:00    750000.0    739000.0    750000.0    728000.0   
2013-12-30 00:00:00    740000.0    768000.0    772000.0    740000.0   
2013-12-31 00:00:00    768000.0    768000.0    800000.0    763000.0   
...                         ...         ...         ...         ...   
2020-02-19 00:00:00  11421000.0  11855000.0  11940000.0  11418000.0   
2020-02-20 00:00:00  11858000.0  11460000.0  11959000.0  11350000.0   
2020-02-21 00:00:00  11460000.0  11536000.0  11613000.0  11350000.0   
2020-02-22 00:00:00  11536000.0  11489000.0  11591000.0  11397000.0   
2020-02-23 20:00:00  11489000.0  11709000.0  11800000.0  11442000.0   

                          volume  
BTC                               
2013-1

In [36]:
print(df['2018-12-02'])

                 open      close       high        low       volume
BTC                                                                
2018-12-02  4742000.0  4710000.0  4848000.0  4613000.0  3660.440148
