In [1]:
import io
import sqlite3
import time
import urllib
import urllib.parse
from datetime import date, timedelta

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

#### 1. DB 생성

In [2]:
# Open the SQLite database file (sqlite3 creates it if it does not exist yet)
# and keep one shared connection/cursor for the rest of the notebook.
DB_PATH = './data/meat_price.db'
conn = sqlite3.connect(DB_PATH)
cur = conn.cursor()

In [3]:
# Create the price table. IF NOT EXISTS keeps this cell safe to re-run without
# swallowing unrelated database errors the way a bare `except: pass` would.
cur.execute(
    'create table if not exists '
    '가격정보(고유번호, 기준일자, 페이지, 지역, 판매점, 상품명, 가격)'
)
# Reset: to wipe previously collected rows, run the statement below manually.
#cur.execute('delete from 가격정보')

<sqlite3.Cursor at 0x7e40340>

#### 2. 데이터 수집

In [4]:
# Open the price portal and run the initial search
def accessPage():
    """Start Chrome, load the item price list and submit the first search.

    Returns:
        The selenium Chrome driver, left on the page reached after clicking
        the search button.

    Raises:
        Exception: any error raised while loading or driving the page is
            re-raised after the browser has been closed, so a failed start
            does not leave a stray Chrome process behind.
    """
    # No positional driver path: Selenium 4 removed that argument, and in
    # Selenium 3 the default path is already 'chromedriver' looked up on PATH.
    driver = webdriver.Chrome()
    try:
        print('Start Accessing Page...')
        url = 'https://www.price.go.kr/tprice/portal/dailynecessitypriceinfo/priceiteminfo/getPriceItemInfoList.do'
        price = {'goodClassCode':'030101000',
                 'goodSmlclsCode':'030101002',
                 'pageUnit':'50'}
        url_args = '{}?{}'.format(url, urllib.parse.urlencode(price))
        driver.get(url_args)

        # Click each store-type checkbox (per the original note this queries
        # all store types), then submit the search form.
        buttons = ['chk_LM','chk_DP','chk_SM','chk_TR','chk_CS']
        for bt in buttons:
            # find_element('id', ...) exists in Selenium 3 and 4, unlike the
            # removed find_element_by_id helper.
            driver.find_element('id', bt).click()
        driver.find_element('id', 'search_btn').click()
        print('Accessing done...')
    except Exception:
        driver.quit()
        raise

    return driver

In [5]:
# Crawler
def getData(driver, code):
    """Scrape every result page for one code and store the rows in 가격정보.

    Args:
        driver: selenium driver already showing the price-list page
            (for example the one returned by accessPage).
        code: string made of YYYYMMDD followed by a 3-character product id,
            e.g. '20170106050'. The suffix '050' stands for product id '50'.

    Returns:
        None. Rows are inserted through the module-level ``cur`` and
        committed on ``conn`` after each page.

    Raises:
        KeyError: if the product id in ``code`` is not in the known goods.
    """
    # Split the code into the reference date and the product
    goods = {'50':'쇠고기등심(1등급)','247':'쇠고기불고기(1+등급)','248':'쇠고기등심(1+등급)','157':'쇠고기불고기(1등급)'}
    year = code[:4]
    month = code[4:6]
    day = code[6:8]
    goodid = code[-3:] if code[-3:] != '050' else '50'
    goodnm = goods[goodid]
    dt = year+'-'+month+'-'+day

    # Set the search form. The year selection is attempted twice, as in the
    # original (presumably the drop-down is sometimes not ready - TODO confirm).
    # `except Exception` keeps Ctrl-C / kernel interrupt from being retried.
    try:
        Select(driver.find_element('id', 'inspectYear')).select_by_value(year)
    except Exception:
        Select(driver.find_element('id', 'inspectYear')).select_by_value(year)
    time.sleep(2)
    Select(driver.find_element('id', 'inspectMonth')).select_by_value(month)
    time.sleep(2)
    Select(driver.find_element('id', 'inspectDay')).select_by_value(day)
    Select(driver.find_element('id', 'goodId')).select_by_value(goodid)

    # Start from the page after the last one already stored for this
    # date/product, so an interrupted batch resumes instead of duplicating.
    cur.execute(
        'select max(페이지) from 가격정보 where 기준일자=? and 상품명=?',
        (dt, goodnm),
    )
    fetch = cur.fetchone()[0]
    page = (0 if fetch is None else fetch) + 1

    # Collect pages until the site reports no results
    print('Start batch from {:} page...(code : {:})'.format(page, code))
    insert_sql = 'insert into 가격정보 values (?,?,?,?,?,?,?)'
    while True:
        # Jump to the page and read the result table
        driver.execute_script("fn_gotoPage('"+str(page)+"')")
        html = WebDriverWait(driver, 10).until(EC.presence_of_element_located(('css selector', "table.table_t1"))).get_attribute('innerHTML')
        # Wrap the markup in StringIO: passing literal HTML to read_html is
        # deprecated in recent pandas releases.
        df = pd.read_html(io.StringIO('<table>'+html+'</table>'))[0].iloc[:,:4]
        # Stop on the "no results" message, or on a table with no rows at all
        # (which would otherwise raise IndexError on iloc[0, 0]).
        if df.empty or df.iloc[0,0] == "검색된 내용이 없습니다.":
            break
        # Store the page: (code, date, page) + the first four table columns
        rows = [(code, dt, page) + row
                for row in df.itertuples(index=False, name=None)]
        cur.executemany(insert_sql, rows)
        conn.commit()
        print('{} page... done'.format(page))
        page += 1
        # Randomised pause, about 10 s on average and never under 5 s,
        # between page requests.
        time.sleep(max(10+2*np.random.normal(),5))
    print('End batch...(code : {:})'.format(code))

In [6]:
# Generate the collection dates
def genDate(start, end=date(2018,7,20), step_days=7):
    """Return evenly spaced dates from ``start`` up to and including ``end``.

    Args:
        start: first date (datetime.date) of the series.
        end: last date that may appear in the series; defaults to
            2018-07-20, the value that used to be hard-coded here.
        step_days: gap between consecutive dates in days (default 7).

    Returns:
        List of 'YYYY-MM-DD' strings; empty when ``start`` is after ``end``.

    Raises:
        ValueError: if ``step_days`` is less than 1 (the loop would never
            terminate otherwise).
    """
    if step_days < 1:
        raise ValueError('step_days must be a positive number of days')
    step = timedelta(days=step_days)
    dates = []
    current = start
    while current <= end:
        dates.append(current.strftime('%Y-%m-%d'))
        current += step
    return dates

In [7]:
# Build the unique codes
def genCode(dates, goodids):
    """Pair every date with every product id and return the resulting codes.

    Args:
        dates: iterable of 'YYYY-MM-DD' strings.
        goodids: iterable of product id strings.

    Returns:
        List of strings: the date digits (YYYYMMDD) followed by the product
        id, with id '50' written as '050'. Ordered by date first, then by
        product id in the order given.
    """
    def suffix(goodid):
        # Only '50' gets a leading zero; every other id is used as-is.
        return '0' + goodid if goodid == '50' else goodid

    return [
        day[:4] + day[5:7] + day[-2:] + suffix(goodid)
        for day in dates
        for goodid in goodids
    ]

In [8]:
# Look up the newest stored code
def setLastCode():
    """Return the largest 고유번호 stored in 가격정보 as an int.

    Reads through the module-level ``cur``.

    Returns:
        int: the maximum code, or -1 when the query yields no value
        (e.g. nothing has been collected yet).
    """
    cur.execute('select max(고유번호) from 가격정보')
    # An aggregate without GROUP BY always yields exactly one row.
    last_code = cur.fetchone()[0]
    return -1 if last_code is None else int(last_code)

In [9]:
# Collection settings: dates generated from the start date, crossed with
# the product ids to collect.
START_DATE = date(2017, 1, 6)
goodids = ['50', '157', '247', '248']
dates = genDate(START_DATE)
codes = genCode(dates, goodids)

In [None]:
# Initialise: open a browser only if this session does not already hold one
if 'driver' not in dir():
    driver = accessPage()

# Codes below the newest stored code are skipped; the newest one itself is
# run again so getData can continue from the page after its last stored page.
last_code = setLastCode()
for code in codes:
    if int(code) >= last_code:
        getData(driver, code)
    else:
        print('{:} is passed'.format(code))

20170106050 is passed
Start batch from 2 page...(code : 20170106157)
2 page... done
End batch...(code : 20170106157)
Start batch from 1 page...(code : 20170106247)
1 page... done
2 page... done


In [13]:
# Check: load every stored row into a DataFrame and display it
sql = 'select * from 가격정보'
rows = cur.execute(sql).fetchall()
data = pd.DataFrame(
    rows,
    columns=('고유번호','기준일자', '페이지', '지역', '판매점', '상품명', '가격'),
)
data

Unnamed: 0,고유번호,기준일자,페이지,지역,판매점,상품명,가격
0,20170106050,2017-01-06,1,경상남도,마산어시장,쇠고기등심(1등급),"4,980원/100g 4,980원/100g"
1,20170106050,2017-01-06,1,충청남도,천안남산중앙시장,쇠고기등심(1등급),"5,300원/100g 5,300원/100g"
2,20170106050,2017-01-06,1,전라북도,전주중앙시장,쇠고기등심(1등급),"5,830원/100g 5,830원/100g"
3,20170106050,2017-01-06,1,서울특별시,농협유통창동점,쇠고기등심(1등급),"5,980원/100g 5,980원/100g"
4,20170106050,2017-01-06,1,경상북도,롯데슈퍼경주점,쇠고기등심(1등급),"5,990원/100g 5,990원/100g"
5,20170106050,2017-01-06,1,경기도,롯데슈퍼공도점,쇠고기등심(1등급),"5,990원/100g 5,990원/100g"
6,20170106050,2017-01-06,1,경기도,롯데슈퍼광명소하2점,쇠고기등심(1등급),"5,990원/100g 5,990원/100g"
7,20170106050,2017-01-06,1,경기도,롯데슈퍼광주역동점,쇠고기등심(1등급),"5,990원/100g 5,990원/100g"
8,20170106050,2017-01-06,1,부산광역시,롯데슈퍼괴정점,쇠고기등심(1등급),"5,990원/100g 5,990원/100g"
9,20170106050,2017-01-06,1,충청북도,롯데슈퍼금천점,쇠고기등심(1등급),"5,990원/100g 5,990원/100g"
