## Yahoo, Google Finance API가 중지되었으니 다른 방법으로 크롤링을 해보자

In [1]:
import pandas as pd

# Download the listed-company table from kind.krx.co.kr, zero-pad the stock
# code to six digits, keep only the company-name and stock-code columns, and
# rename them to English ('name', 'code').
code_df = (
    pd.read_html(
        'http://kind.krx.co.kr/corpgeneral/corpList.do?method=download&searchType=13',
        header=0,
    )[0]
    .assign(종목코드=lambda table: table['종목코드'].map('{:06d}'.format))
    .loc[:, ['회사명', '종목코드']]
    .rename(columns={'회사명': 'name', '종목코드': 'code'})
)

# Preview the first rows in the notebook.
code_df.head()

Unnamed: 0,name,code
0,DSR,155660
1,GS,78930
2,GS글로벌,1250
3,HDC현대산업개발,294870
4,LG이노텍,11070


In [2]:
# Given a company name, look up its stock code and plug it into the
# Naver Finance ("http://finance.naver.com") daily-quote URL.

def get_url(item_name, code_df):
    """Return the Naver Finance daily-price URL for ``item_name``.

    Args:
        item_name: Company name, matched for exact equality against the
            ``name`` column of ``code_df``.
        code_df: DataFrame with a ``name`` column and a ``code`` column.

    Returns:
        The ``sise_day.nhn`` URL string with the matching code filled in.
        If several rows share the name, the first one is used.

    Raises:
        ValueError: If no row of ``code_df`` has ``name == item_name``.
    """
    # A boolean mask (rather than a formatted ``query`` string) keeps names
    # that contain quote characters from breaking the lookup.
    matches = code_df.loc[code_df['name'] == item_name, 'code']
    if matches.empty:
        raise ValueError(
            "no stock code found for item name {!r}".format(item_name)
        )

    # Read the scalar value directly. ``Series.to_string()`` is a display
    # helper whose padding used to leak spaces into the URL, which then had
    # to be stripped out again.
    code = str(matches.iloc[0]).strip()

    url = 'http://finance.naver.com/item/sise_day.nhn?code={}'.format(code)
    print("요청 URL = {}".format(url))
    return url


item_name='GS'
url = get_url(item_name, code_df)

# Collect the first table of each of pages 1-20 of the daily-quote listing,
# then build the DataFrame `df` with a single concat. `DataFrame.append`
# was removed in pandas 2.0, and calling it once per page re-copied all
# previously collected rows every iteration.
frames = []
for page in range(1, 21):
    pg_url = '{url}&page={page}'.format(url=url, page=page)
    frames.append(pd.read_html(pg_url, header=0)[0])

df = pd.concat(frames, ignore_index=True)

# Drop every row that contains a missing value.
df = df.dropna()

df.head()

요청 URL = http://finance.naver.com/item/sise_day.nhn?code=078930


Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량
1,2020.07.07,37000.0,0.0,37000.0,37000.0,37000.0,0.0
2,2020.07.06,37000.0,50.0,37000.0,37200.0,36400.0,148789.0
3,2020.07.03,36950.0,250.0,37200.0,37500.0,36600.0,180077.0
4,2020.07.02,37200.0,850.0,36350.0,37200.0,36200.0,201004.0
5,2020.07.01,36350.0,100.0,36500.0,36800.0,36250.0,110097.0


In [3]:
# Rename the Korean column headers to English and cast the price/volume
# columns to int in one pass.
df = (
    df.rename(columns={
        '날짜': 'date',
        '종가': 'close',
        '전일비': 'diff',
        '시가': 'open',
        '고가': 'high',
        '저가': 'low',
        '거래량': 'volume',
    })
    .astype(dict.fromkeys(
        ['close', 'diff', 'open', 'high', 'low', 'volume'], int
    ))
)
# Parse the 'date' column into datetime values.
df['date'] = pd.to_datetime(df['date'])
# Order the rows by date, oldest first.
df = df.sort_values('date')
# Show the first five rows.
df.head()


Unnamed: 0,date,close,diff,open,high,low,volume
298,2019-09-17,50400,200,51000,51000,50200,82769
297,2019-09-18,50700,300,50500,50800,49800,128242
296,2019-09-19,50000,700,50400,50400,49650,119660
295,2019-09-20,51200,1200,49550,51200,49550,174895
294,2019-09-23,50700,500,50900,50900,50300,55549


In [4]:
!pip install plotly

Collecting plotly
  Using cached plotly-4.8.2-py2.py3-none-any.whl (11.5 MB)
Installing collected packages: plotly
Successfully installed plotly-4.8.2


In [5]:
import plotly.offline as offline
import plotly.graph_objs as go

In [12]:
# Enable plotly's notebook rendering mode before any figure is drawn.
offline.init_notebook_mode(connected=True)

# Single line series: closing price over date, labelled with the item name.
trace = go.Scatter(
    x=df['date'],
    y=df['close'],
    name=item_name,
)
data = [trace]

In [13]:
# Figure layout: a title, plus a date x-axis with a range slider and range
# selector buttons for the last 1, 3 and 6 months and for the full range.
layout = {
    'title': '{}의 종가(close) Time Series'.format(item_name),
    'xaxis': {
        'rangeselector': {
            'buttons': [
                {
                    'count': months,
                    'label': '{}m'.format(months),
                    'step': 'month',
                    'stepmode': 'backward',
                }
                for months in (1, 3, 6)
            ] + [{'step': 'all'}],
        },
        'rangeslider': {},
        'type': 'date',
    },
}


In [14]:
# Combine the trace list and the layout dict into a single plotly Figure.
fig = go.Figure(data=data,layout=layout)

In [15]:
# Draw the figure with plotly's offline notebook plotting helper.
offline.iplot(fig)