In [1]:
%matplotlib inline
import numpy as np
import re
import pandas as pd
from bs4 import BeautifulSoup
from sys import argv
from urllib.request import urlopen
from urllib.error import HTTPError
import requests
import itertools
import datetime as dt


# Define Functions and Settings

In [2]:
def grouper(iterable, n, fillvalue=None):
    """Split ``iterable`` into a list of ``n``-item tuples.

    The last tuple is padded with ``fillvalue`` when the input length is not a
    multiple of ``n``.

    Example:
        grouper('ABCDEFG', 3, 'x')
        -> [('A', 'B', 'C'), ('D', 'E', 'F'), ('G', 'x', 'x')]
    """
    shared = iter(iterable)
    # The same iterator is passed n times, so every output tuple consumes
    # n consecutive items from the input.
    chunks = itertools.zip_longest(*((shared,) * n), fillvalue=fillvalue)
    return list(chunks)

In [3]:
def get_item(soup):
    """Return the stripped text of the foreign/institutional net-trading table.

    The table is selected by an exact match on its ``summary`` attribute.
    If ``soup.find`` yields ``None`` (no such table), the ``.text`` access
    raises ``AttributeError``.
    """
    summary_filter = {'summary': "외국인 기관 순매매 거래량에 관한표이며 날짜별로 정보를 제공합니다."}
    table = soup.find('table', attrs=summary_filter)
    return table.text.strip()

In [4]:
def get_num(soup):
    """Return the stripped text of the first ``<table class="type2">`` in ``soup``.

    If ``soup.find`` yields ``None`` (no such table), the ``.text`` access
    raises ``AttributeError``.
    """
    class_filter = {'class': "type2"}
    table = soup.find('table', attrs=class_filter)
    return table.text.strip()

In [5]:
# Column labels for the foreign/institutional table: nine values per scraped row.
column = [
    '날짜',             # date
    '종가',             # closing price
    '전일비',           # change from previous day
    '등락률',           # percent change
    '거래량',           # trading volume
    '기관 순매매량',    # institutional net trading volume
    '외국인 순매매량',  # foreign net trading volume
    '외국인 보유주수',  # shares held by foreigners
    '외국인 보유율',    # foreign ownership ratio
]

In [6]:
# Column labels for the daily price table: seven values per scraped row.
column2 = [
    '날짜',    # date
    '종가',    # closing price
    '전일비',  # change from previous day
    '시가',    # opening price
    '고가',    # daily high
    '저가',    # daily low
    '거래량',  # trading volume
]

# Stock code and length of history

In [7]:
# Stock code: sent as the `code` query parameter of both finance.naver.com
# requests and reused as the base name of the saved pickle file.
code = "035720"

In [8]:
# Length of the history to fetch, expressed as a page bound: the scraping loops
# request pages 1..length-1 of frgn.nhn and pages 1..2*length-2 of sise_day.nhn.
# One page covers roughly 20-30 days.
length = 20

# Parsing

## Foreign and institutional volume

In [9]:
# Accumulates one 9-value tuple per scraped row of the foreign/institutional table.
stock = list()

In [10]:
# Start from an empty accumulator so re-running this cell does not append
# duplicate rows to the previous run's results.
stock = []

# Walk the paginated foreign/institutional table: pages 1 .. length-1.
for num in range(1, length):
    page = requests.get(
        "https://finance.naver.com/item/frgn.nhn?code={0}&page={1}".format(code, num),
        timeout=10,  # do not let a stalled connection hang the notebook forever
    )
    # Fail here on an HTTP error status instead of on a confusing parse error below.
    page.raise_for_status()
    data = page.content
    stock_surround = BeautifulSoup(data, "html.parser")
    a = get_item(stock_surround).splitlines()
    # Keep only the text after the last tab on each line of the table text.
    ab = [i.split('\t')[-1] for i in a]
    # Drop blank entries, then skip the first 12 values
    # (presumably the table's header labels -- verify against the page markup).
    stockdata = [item for item in ab if item.strip()][12:]
    # Regroup the flat value list into rows of 9, matching the 9 names in `column`.
    # grouper pads an incomplete final row with None.
    smalldata = grouper(stockdata, 9)
    stock.extend(smalldata)

In [11]:
# One DataFrame row per scraped tuple, labelled with the names in `column`.
stockDF = pd.DataFrame(
    stock,
    columns=column,
)


## Stock Data

In [12]:
# Accumulates one 7-value tuple per scraped row of the daily price table.
stock2 = list()

In [13]:
# Start from an empty accumulator so re-running this cell does not append
# duplicate rows to the previous run's results.
stock2 = []

# Walk the paginated daily price table: pages 1 .. 2*length-2.
for num in range(1, length * 2 - 1):
    page2 = requests.get(
        "https://finance.naver.com/item/sise_day.nhn?code={0}&page={1}".format(code, num),
        timeout=10,  # do not let a stalled connection hang the notebook forever
    )
    # Fail here on an HTTP error status instead of on a confusing parse error below.
    page2.raise_for_status()
    data2 = page2.content
    stock_surround2 = BeautifulSoup(data2, "html.parser")
    a2 = get_num(stock_surround2).splitlines()
    # Keep only the text after the last tab on each line of the table text.
    ab2 = [i.split('\t')[-1] for i in a2]
    # Drop blank entries.
    stocktable2 = [item for item in ab2 if item.strip()]
    # Skip the first 7 values
    # (presumably the table's header labels -- verify against the page markup).
    stockdata2 = stocktable2[7:]
    # Regroup the flat value list into rows of 7, matching the 7 names in `column2`.
    # grouper pads an incomplete final row with None.
    smalldata2 = grouper(stockdata2, 7)
    stock2.extend(smalldata2)

In [14]:
# One DataFrame row per scraped tuple, labelled with the names in `column2`.
stockDF2 = pd.DataFrame(
    stock2,
    columns=column2,
)


# data merge

In [15]:
# Discard the day-over-day change column from the price frame before merging.
stockDF2 = stockDF2.drop(columns=["전일비"])

In [16]:
# Discard columns from the foreign/institutional frame before merging.
# 종가 and 거열량-style duplicates aside, 종가 and 거래량 also exist in stockDF2,
# so dropping them here avoids suffixed duplicate columns after the merge.
stockDF = stockDF.drop(
    columns=["종가", "거래량", "전일비", "외국인 보유주수"],
)

In [17]:
# Join the two frames on the date column, keeping only dates present in both.
totalstock = pd.merge(
    stockDF,
    stockDF2,
    on='날짜',
    how='inner',
)

# data wrangling

In [18]:
# Closing price: remove commas from the scraped text, then cast to float.
totalstock["종가"] = (
    totalstock["종가"]
    .str.replace(",", "")
    .astype(float)
)

In [19]:
# Opening price: remove commas from the scraped text, then cast to float.
totalstock["시가"] = (
    totalstock["시가"]
    .str.replace(",", "")
    .astype(float)
)

In [20]:
# Daily high: remove commas from the scraped text, then cast to float.
totalstock["고가"] = (
    totalstock["고가"]
    .str.replace(",", "")
    .astype(float)
)

In [21]:
# Daily low: remove commas from the scraped text, then cast to float.
totalstock["저가"] = (
    totalstock["저가"]
    .str.replace(",", "")
    .astype(float)
)

In [22]:
# Trading volume: remove commas from the scraped text, then cast to float.
totalstock["거래량"] = (
    totalstock["거래량"]
    .str.replace(",", "")
    .astype(float)
)

In [23]:
# Foreign net trading volume: remove commas from the scraped text, then cast to float.
totalstock["외국인 순매매량"] = (
    totalstock["외국인 순매매량"]
    .str.replace(",", "")
    .astype(float)
)

In [24]:
# Institutional net trading volume: remove commas from the scraped text, then cast to float.
totalstock["기관 순매매량"] = (
    totalstock["기관 순매매량"]
    .str.replace(",", "")
    .astype(float)
)

In [25]:
# Foreign ownership ratio: drop the final character of each value
# (presumably a trailing '%' -- confirm against the scraped text), then cast to float.
totalstock["외국인 보유율"] = (
    totalstock["외국인 보유율"]
    .str[:-1]
    .astype(float)
)

In [26]:
# Percent change: drop the final character of each value
# (presumably a trailing '%' -- confirm against the scraped text), then cast to float.
totalstock["등락률"] = (
    totalstock["등락률"]
    .str[:-1]
    .astype(float)
)

In [27]:
# Parse the merged frame's own date strings. Column assignment aligns on index
# labels, so parsing the pre-merge frame's dates would attach wrong dates (or NaT)
# to any row whose position changed because the inner merge dropped or
# duplicated rows.
totalstock["날짜"] = pd.to_datetime(totalstock["날짜"])

# 기간: whole days between each row's date and the date of the last row,
# so the last row is the zero reference point.
reference_date = totalstock["날짜"].iloc[-1]
totalstock["기간"] = (totalstock["날짜"] - reference_date).dt.days

In [28]:
# Show the merged, type-converted frame as this cell's output.
totalstock

Unnamed: 0,날짜,등락률,기관 순매매량,외국인 순매매량,외국인 보유율,종가,시가,고가,저가,거래량,기간
0,2018-08-17,-0.39,65833.0,-7821.0,24.84,127000.0,128500.0,129000.0,126500.0,325943.0,568
1,2018-08-16,0.00,107192.0,-28291.0,24.85,127500.0,124500.0,128000.0,124000.0,478674.0,567
2,2018-08-14,2.82,117535.0,-16977.0,24.91,127500.0,125000.0,128500.0,123000.0,518080.0,565
3,2018-08-13,-3.12,25179.0,-4321.0,24.93,124000.0,126500.0,127500.0,122500.0,636617.0,564
4,2018-08-10,1.99,419755.0,36761.0,24.99,128000.0,125500.0,129500.0,124000.0,1269641.0,561
5,2018-08-09,5.46,325660.0,199748.0,24.93,125500.0,121000.0,126000.0,119500.0,1622449.0,560
6,2018-08-08,-0.83,105446.0,-15822.0,24.65,119000.0,122000.0,124500.0,118000.0,1216684.0,559
7,2018-08-07,5.73,57785.0,-59047.0,24.69,120000.0,114000.0,120000.0,113000.0,742353.0,558
8,2018-08-06,0.44,17136.0,15476.0,24.75,113500.0,113500.0,114500.0,112500.0,189971.0,557
9,2018-08-03,1.80,-6462.0,26766.0,24.73,113000.0,111500.0,113000.0,111500.0,169521.0,554


## Save as a pickle

In [29]:
# Persist the final frame to a pickle file named after the stock code.
pickle_path = "{0}.bz2".format(code)
totalstock.to_pickle(pickle_path)