In [1]:
%matplotlib inline
import numpy as np
import re
import pandas as pd
from bs4 import BeautifulSoup
from sys import argv
from urllib.request import urlopen
from urllib.error import HTTPError
import requests
import itertools
import datetime as dt


# 함수 및 기본 세팅

In [2]:
def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx"
    args = [iter(iterable)] * n
    return list(itertools.zip_longest(*args, fillvalue=fillvalue))

In [3]:
def get_item(soup):
    return soup.find('table', attrs={'summary':"외국인 기관 순매매 거래량에 관한표이며 날짜별로 정보를 제공합니다."}).text.strip()

In [4]:
def get_num(soup):
    return soup.find('table', attrs={'class':"type2"}).text.strip()

In [5]:
column = [
 '날짜',
 '종가',
 '전일비',
 '등락률',
 '거래량',
 '기관 순매매량',
 '외국인 순매매량',
 '외국인 보유주수',
 '외국인 보유율']

In [6]:
column2 = ['날짜',
 '종가',
 '전일비',
 '시가',
 '고가',
 '저가',
 '거래량']

# 종목 코드 / 기간설정

In [7]:
code="005930"  #stock code

In [8]:
length = 5       #length of stock. 1page => about 20~30days.

# 파싱

## 외국인 및 기관 거래량

In [22]:
stock = []

In [23]:
for num in np.arange(1,length):
    page = requests.get("https://finance.naver.com/item/frgn.nhn?code={0}&page={1}".format(code,num))
    data = page.content
    stock_surround = BeautifulSoup(data, "html.parser")
    a = get_item(stock_surround).splitlines()
    ab = [i.split('\t')[-1] for i in a]
    stockdata = list(filter(lambda item: item.strip(), ab))[12:]
    smalldata = grouper(stockdata,9)
    for i in range(len(smalldata)):
        stock.append(smalldata[i])

In [None]:
stockDF = pd.DataFrame(data=stock, columns=column)


## 기본 주식 데이터

In [35]:
stock2 = []

In [36]:
for num in np.arange(1,length*2-1):
    page2 = requests.get("https://finance.naver.com/item/sise_day.nhn?code={0}&page={1}".format(code,num))
    data2 = page2.content
    stock_surround2 = BeautifulSoup(data2, "html.parser")
    a2 = get_num(stock_surround2).splitlines()
    ab2 = [i.split('\t')[-1] for i in a2]
    stocktable2 = list(filter(lambda item: item.strip(), ab2))
    stockdata2 = stocktable2[7:]
    smalldata2 = grouper(stockdata2,7)
    for i in range(len(smalldata2)):
        stock2.append(smalldata2[i])

In [37]:
stockDF2 = pd.DataFrame(data=stock2, columns=column2)


# data merge

In [67]:
stockDF2 = stockDF2.drop(["전일비"],axis=1)

In [64]:
stockDF = stockDF.drop(["종가","거래량","전일비","외국인 보유주수"],axis=1)

KeyError: "['종가' '거래량' '전일비' '외국인 보유주수'] not found in axis"

In [68]:
totalstock = pd.merge(stockDF, stockDF2, how='inner', on='날짜')

# data wrangling

In [70]:
totalstock["종가"] = totalstock["종가"].str.replace(",","").astype(float) 

In [71]:
totalstock["시가"] = totalstock["시가"].str.replace(",","").astype(float) 

In [72]:
totalstock["고가"] = totalstock["고가"].str.replace(",","").astype(float) 

In [73]:
totalstock["저가"] = totalstock["저가"].str.replace(",","").astype(float) 

In [74]:
totalstock["거래량"] = totalstock["거래량"].str.replace(",","").astype(float) 

In [75]:
totalstock["외국인 순매매량"] = totalstock["외국인 순매매량"].str.replace(",","").astype(float) 

In [76]:
totalstock["기관 순매매량"] = totalstock["기관 순매매량"].str.replace(",","").astype(float) 

In [77]:
totalstock["외국인 보유율"] = totalstock["외국인 보유율"].str[:-1].astype(float)

In [78]:
totalstock.등락률 = totalstock.등락률.str[:-1].astype(float)

In [96]:
totalstock["날짜"] = pd.to_datetime(stockDF.날짜)
totalstock["날짜"] = totalstock["날짜"]- totalstock.iloc[-1].날짜
totalstock["날짜"] = totalstock["날짜"].dt.days

In [99]:
totalstock

Unnamed: 0,날짜,등락률,기관 순매매량,외국인 순매매량,외국인 보유율,종가,시가,고가,저가,거래량
0,118,-1.99,180682.0,-1820539.0,52.46,44250.0,43800.0,44650.0,43700.0,10088229.0
1,116,0.22,265249.0,181822.0,52.49,45150.0,44850.0,45400.0,44850.0,6409259.0
2,115,-0.77,368891.0,-596295.0,52.49,45050.0,44950.0,45100.0,44650.0,9803831.0
3,112,-3.20,-4204792.0,-1439825.0,52.49,45400.0,46150.0,46400.0,44850.0,16670643.0
4,111,0.21,2295371.0,-2095059.0,52.52,46900.0,47000.0,47050.0,46450.0,12010886.0
5,110,0.21,-909721.0,1249339.0,52.55,46800.0,47000.0,47000.0,46550.0,6429338.0
6,109,1.97,-1043219.0,2234436.0,52.53,46700.0,46300.0,46750.0,45900.0,9058174.0
7,108,0.11,-781180.0,1677733.0,52.50,45800.0,46150.0,46150.0,45750.0,6771979.0
8,105,0.44,-954176.0,1797.0,52.48,45750.0,45850.0,45900.0,45450.0,8274944.0
9,104,-2.15,-2068770.0,-335326.0,52.48,45550.0,46550.0,46800.0,45500.0,8560924.0


## 피클데이터 저장

In [98]:
totalstock.to_pickle("{0}.bz2".format(code))

In [None]:
stockDF2