In [1]:
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
# Turn off matplotlib's Unicode minus sign so negative tick labels use a plain
# hyphen; fonts chosen for Korean text may lack the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

import platform
# Location of the malgun.ttf font file that is used on Windows.
path = 'c:/Windows/Fonts/malgun.ttf'
from matplotlib import font_manager, rc

# Choose matplotlib's default font family according to the operating system.
current_os = platform.system()
if current_os == 'Windows':
    # Read the family name out of the font file, then register it as default.
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
elif current_os == 'Darwin':
    rc('font', family='AppleGothic')
else:
    # Any other platform keeps matplotlib's default font.
    print('Unknown system... sorry~~~~~')



In [2]:
import requests
from bs4 import BeautifulSoup

In [3]:

# Download the listed-company table from kind.krx.co.kr; read_html returns a
# list of tables and the first one is used.
code_df = pd.read_html('http://kind.krx.co.kr/corpgeneral/corpList.do?method=download&searchType=13', header=0)[0]
code_df = (
    code_df
    # Stock codes are six digits long, so zero-pad the parsed integers.
    .assign(종목코드=code_df['종목코드'].map('{:06d}'.format))
    # Only the company name and the stock code are needed.
    .loc[:, ['회사명', '종목코드']]
    # Swap the Korean column headers for English ones.
    .rename(columns={'회사명': 'name', '종목코드': 'code'})
)
code_df.head()


Unnamed: 0,name,code
0,DSR,155660
1,GS,78930
2,GS글로벌,1250
3,HDC현대산업개발,294870
4,LG이노텍,11070


In [4]:
def get_url(item_name, code_df):
    """Build the Naver Finance daily-quote URL for a company name.

    Parameters
    ----------
    item_name : str
        Company name to look up. It must equal a value in ``code_df['name']``
        exactly.
    code_df : pandas.DataFrame
        Lookup table with a ``name`` column and a ``code`` column.

    Returns
    -------
    str
        ``http://finance.naver.com/item/sise_day.nhn?code=<code>`` built from
        the code of the first row whose name matches.

    Raises
    ------
    ValueError
        If no row of ``code_df`` has the requested name.
    """
    # Match with a boolean mask instead of interpolating the name into a
    # query() expression: a name containing a quote character would otherwise
    # produce an invalid expression.
    matches = code_df.loc[code_df['name'] == item_name, 'code']
    if matches.empty:
        # Without this check an empty result would be rendered as text and
        # silently embedded in the URL.
        raise ValueError("no stock code found for item_name={!r}".format(item_name))

    # Use a single scalar code even if the name appears more than once.
    code = str(matches.iloc[0]).strip()
    url = 'http://finance.naver.com/item/sise_day.nhn?code={code}'.format(code=code)

    print("요청 URL = {}".format(url))
    return url


In [5]:

# Empty DataFrame that will hold the daily quote data.
df = pd.DataFrame()

# Build the daily-quote URL for 기업은행.
item_name = '기업은행'
url = get_url(item_name=item_name, code_df=code_df)

요청 URL = http://finance.naver.com/item/sise_day.nhn?code=024110


In [6]:
# Fetch every page of the daily-quote listing and combine the tables once.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and calling
# it per page re-copies all previously collected rows on each iteration.
# Building `df` from the page list also means re-running this cell does not
# stack duplicate rows onto an already-filled frame.
# NOTE(review): the page range (1..256) is hard-coded — confirm it still
# matches the number of pages served for this stock.
page_frames = [
    pd.read_html('{url}&page={page}'.format(url=url, page=page), header=0)[0]
    for page in range(1, 257)
]
df = pd.concat(page_frames, ignore_index=True)

In [7]:
# Display the combined scrape result before any cleaning.
df

Unnamed: 0,날짜,종가,전일비,시가,고가,저가,거래량
0,,,,,,,
1,2020.06.02,9000.0,520.0,8470.0,9050.0,8460.0,5531440.0
2,2020.06.01,8480.0,200.0,8300.0,8500.0,8300.0,2861703.0
3,2020.05.29,8280.0,40.0,8110.0,8400.0,8030.0,4819669.0
4,2020.05.28,8240.0,280.0,8170.0,8380.0,8060.0,7549725.0
...,...,...,...,...,...,...,...
3835,2010.01.20,14350.0,400.0,14850.0,14850.0,14200.0,2113118.0
3836,2010.01.19,14750.0,300.0,14400.0,14900.0,14350.0,1845561.0
3837,2010.01.18,14450.0,100.0,14250.0,14550.0,14000.0,1084530.0
3838,2010.01.15,14350.0,150.0,14250.0,14500.0,14150.0,1137035.0


In [8]:
# Remove every row that has a missing value in any column; the scraped tables
# contain rows that are entirely empty.
df = df.dropna(axis=0, how='any')

In [9]:
# Print the column dtypes and non-null counts of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2560 entries, 1 to 3838
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   날짜      2560 non-null   object 
 1   종가      2560 non-null   float64
 2   전일비     2560 non-null   float64
 3   시가      2560 non-null   float64
 4   고가      2560 non-null   float64
 5   저가      2560 non-null   float64
 6   거래량     2560 non-null   float64
dtypes: float64(6), object(1)
memory usage: 160.0+ KB


In [10]:
# Give the Korean column headers English names.
df = df.rename(
    columns={
        '날짜': 'date',
        '종가': 'close',
        '전일비': 'diff',
        '시가': 'open',
        '고가': 'high',
        '저가': 'low',
        '거래량': 'volume',
    }
)

In [11]:
df = (
    df
    # Cast the price and volume columns to int.
    .astype(dict.fromkeys(['close', 'diff', 'open', 'high', 'low', 'volume'], int))
    # Parse the 'date' column into datetime values.
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    # Order the rows by date in descending order (most recent day first).
    .sort_values(by='date', ascending=False)
)
# Show the first five rows.
df.head()


Unnamed: 0,date,close,diff,open,high,low,volume
1,2020-06-02,9000,520,8470,9050,8460,5531440
2,2020-06-01,8480,200,8300,8500,8300,2861703
3,2020-05-29,8280,40,8110,8400,8030,4819669
4,2020-05-28,8240,280,8170,8380,8060,7549725
5,2020-05-27,7960,190,7860,7990,7810,4295846


In [12]:
# Spot check: closing price of the row labelled 1 (index labels were kept
# from the scrape, so this is a label lookup, not a position).
df.at[1,'close']

9000

In [13]:
# Signed change in close: each row's close minus the close of the row below it.
# Rows are sorted newest first, so the row below is the previous trading day;
# the last row has nothing below it and gets NaN.
df['diff+-'] = df['close'] - df['close'].shift(-1)

In [14]:
df

Unnamed: 0,date,close,diff,open,high,low,volume,diff+-
1,2020-06-02,9000,520,8470,9050,8460,5531440,520.0
2,2020-06-01,8480,200,8300,8500,8300,2861703,200.0
3,2020-05-29,8280,40,8110,8400,8030,4819669,40.0
4,2020-05-28,8240,280,8170,8380,8060,7549725,280.0
5,2020-05-27,7960,190,7860,7990,7810,4295846,190.0
...,...,...,...,...,...,...,...,...
3834,2010-01-21,14200,150,14200,14450,14100,1208311,-150.0
3835,2010-01-20,14350,400,14850,14850,14200,2113118,-400.0
3836,2010-01-19,14750,300,14400,14900,14350,1845561,300.0
3837,2010-01-18,14450,100,14250,14550,14000,1084530,100.0


In [16]:
# Label each row by the sign of its change, as strings:
#   '1'    -> close went up
#   '0'    -> close went down
#   'none' -> no change, or NaN (NaN fails both comparisons)
updown = [
    '1' if change > 0 else '0' if change < 0 else 'none'
    for change in df['diff+-']
]

In [17]:
# Attach the direction labels as a new 'up/down' column (one label per row, in row order).
df['up/down'] = updown

In [18]:
# Display the frame with the new 'diff+-' and 'up/down' columns.
df

Unnamed: 0,date,close,diff,open,high,low,volume,diff+-,up/down
1,2020-06-02,9000,520,8470,9050,8460,5531440,520.0,1
2,2020-06-01,8480,200,8300,8500,8300,2861703,200.0,1
3,2020-05-29,8280,40,8110,8400,8030,4819669,40.0,1
4,2020-05-28,8240,280,8170,8380,8060,7549725,280.0,1
5,2020-05-27,7960,190,7860,7990,7810,4295846,190.0,1
...,...,...,...,...,...,...,...,...,...
3834,2010-01-21,14200,150,14200,14450,14100,1208311,-150.0,0
3835,2010-01-20,14350,400,14850,14850,14200,2113118,-400.0,0
3836,2010-01-19,14750,300,14400,14900,14350,1845561,300.0,1
3837,2010-01-18,14450,100,14250,14550,14000,1084530,100.0,1


In [19]:
# Write the labelled quotes to a CSV file; the path is relative, so the file
# lands in the current working directory.
df.to_csv("IBK 크롤링-jm.csv")