# 라이브러리 및 옵션

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import datetime
from datetime import timedelta

In [2]:
# 데이터 분석 기본 라이브러리 관련
import os
import sys
import platform
import warnings
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook

# EDA 관련
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
#import missingno as msno

# 데이터 전처리 관련
import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [3]:
# Notebook-wide display setup: silence warnings, widen the pandas display
# limits, pick a plot style and select a font that can render Korean labels.
warnings.filterwarnings(action='ignore')

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 200
plt.style.use('ggplot')

# platform.system() reports 'Windows' / 'Darwin' / 'Linux' directly.
# Splitting platform.platform() is unreliable: on recent Python versions the
# macOS string starts with 'macOS-...', so the 'Darwin' branch never matched.
current_os = platform.system()
if current_os == 'Windows':
    print(f'현재 사용되는 운영체제: {current_os}')
    plt.rc('font', family='Malgun Gothic')  # Korean-capable font on Windows
    # Draw the minus sign as an ASCII hyphen instead of U+2212.
    plt.rc("axes", unicode_minus=False)
elif current_os == 'Darwin':  # macOS
    print("본 내용은 Windows 환경에서 분석됐습니다. 다른 OS로 하시게 될 경우 에러가 날 수 있습니다.")
    plt.rc('font', family='AppleGothic')  # Korean-capable font on macOS
    # Same minus-sign setting as the Windows branch, for consistent axes.
    plt.rc("axes", unicode_minus=False)

현재 사용되는 운영체제: Windows


# 크롤링

- 삼성전자(시총 기준 large cap)
- SK하이닉스(시총 기준 large cap)
- NAVER(시총 기준 large cap)
- 씨젠(코로나 테마)
- 우리들휴브레인(코로나)
- 현대차(자동차 대표주)
- DGB금융지주(은행)
- 미스터블루(출판)
- 셀트리온
- 데일리블록체인
- 소리바다
- 한화솔루션
- 아모레퍼시픽
- CJ대한통운
- GS건설

In [4]:
# Lower bound for the crawl, rendered as a 'YYYY.MM.DD' string.
startdate = datetime.datetime(2000, 1, 1).strftime('%Y.%m.%d')

In [24]:
# (code, name) pairs for the stocks to crawl; `codes` and `names` are the two
# parallel lists derived from this table, in the same order.
_targets = [
    ('005930', '삼성전자'),
    ('000660', 'SK하이닉스'),
    ('035420', 'NAVER'),
    ('096530', '씨젠'),
    ('118000', '우리들휴브레인'),
    ('005380', '현대차'),
    ('139130', 'DGB금융지주'),
    ('207760', '미스터블루'),
    ('068270', '셀트리온'),
    ('139050', '데일리블록체인'),
    ('053110', '소리바다'),
    ('009830', '한화솔루션'),
    ('090430', '아모레퍼시픽'),
    ('000120', 'CJ대한통운'),
    ('006360', 'GS건설'),
]
codes = [stock_code for stock_code, _ in _targets]
names = [stock_name for _, stock_name in _targets]

In [25]:
# Endpoint that serves the paginated daily-price table for one stock code.
SISE_DAY_URL = 'http://finance.naver.com/item/sise_day.nhn'
# NOTE(review): the site has been reported to serve an empty page to the
# default python-requests User-Agent; a browser-like value is sent to be safe.
# Confirm against the live site.
REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0'}
# Seconds to wait per request so a stalled connection cannot hang the crawl.
REQUEST_TIMEOUT = 10


def _get_last_page(code):
    """Return the number of the last daily-price page for *code*.

    The number is read from the ``page`` query parameter of the link inside
    the ``td.pgRR`` cell of the ``table.Nnavi`` navigation table. When that
    table or cell is missing, 1 is returned so that only the first page is
    requested.

    Raises:
        requests.RequestException: if the request fails or returns an HTTP
            error status.
    """
    from urllib.parse import parse_qs, urlsplit

    response = requests.get(
        SISE_DAY_URL,
        params={'code': code},
        headers=REQUEST_HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    # Only the navigation href is read from this response.
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    navi = soup.find('table', class_='Nnavi')
    last_cell = navi.find('td', class_='pgRR') if navi is not None else None
    if last_cell is None or last_cell.a is None:
        return 1

    query = parse_qs(urlsplit(last_cell.a.get('href', '')).query)
    return int(query.get('page', ['1'])[0])


def parse_page(code, page):
    """Fetch one daily-price page and return its first table as a DataFrame.

    Rows containing any NaN are dropped. On any failure the traceback is
    printed and ``None`` is returned, so the caller must check for ``None``.
    """
    import traceback
    from io import StringIO

    try:
        response = requests.get(
            SISE_DAY_URL,
            params={'code': code, 'page': page},
            headers=REQUEST_HEADERS,
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # Wrap the markup in StringIO: recent pandas versions deprecate
        # passing a literal HTML string to read_html.
        table = pd.read_html(StringIO(str(soup.find('table'))), header=0)[0]
        return table.dropna()
    except Exception:
        # Best effort: report the failure and let the caller decide.
        traceback.print_exc()
        return None


# For every stock, walk the pages from the first one, keep the rows dated
# after `startdate`, and write them to '<name>.csv' sorted by date.
for code, name in zip(codes, names):
    last_page = _get_last_page(code)

    frames = []
    for page in range(1, last_page + 1):
        data = parse_page(code, page)
        if data is None or '날짜' not in data.columns:
            print(f'{name}({code}) {page}페이지를 읽지 못해 건너뜁니다.')
            continue

        # Dates are 'YYYY.MM.DD' strings, so string comparison orders them
        # chronologically.
        data_filtered = data[data['날짜'] > startdate]
        frames.append(data_filtered)

        # Some rows fell on or before `startdate`: later pages are not needed.
        if len(data) > len(data_filtered):
            break

    if not frames:
        print(f'{name}({code}): 수집된 데이터가 없어 CSV를 저장하지 않습니다.')
        continue

    # Concatenate once at the end instead of growing a DataFrame per page.
    df = pd.concat(frames).sort_values(by=['날짜'], axis=0)
    df.to_csv(name + '.csv', index=False, encoding='cp949')