In [66]:
# -*- coding: utf-8 -*-

import os
import re
import sys
import csv
import time
import string
import logging
import requests
import argparse
from datetime import datetime, timedelta

from os import mkdir
from os.path import isdir

class Crawler():
    def __init__(self, prefix="data"):
        ''' Make directory if not exist when initialize '''
        if not isdir(prefix):
            mkdir(prefix)
        self.prefix = prefix

    def _clean_row(self, row):
        ''' Clean comma and spaces '''
        for index, content in enumerate(row):
            row[index] = re.sub(",", "", content.strip())
        return row

    def _record(self, stock_id, row):
        ''' Save row to csv file '''
        f = open('{}/{}.csv'.format(self.prefix, stock_id), 'a')
        cw = csv.writer(f, lineterminator='\n')
        cw.writerow(row)
        f.close()

    def _get_tse_data(self, date_tuple):
        date_str = '{0}{1:02d}{2:02d}'.format(date_tuple[0], date_tuple[1], date_tuple[2])
        url = 'http://www.twse.com.tw/exchangeReport/MI_INDEX'

        query_params = {
            'date': date_str,
            'response': 'json',
            'type': 'ALL',
            '_': str(round(time.time() * 1000) - 500)
        }

        # Get json data
        page = requests.get(url, params=query_params)

        if not page.ok:
            logging.error("Can not get TSE data at {}".format(date_str))
            return

        content = page.json()

        # For compatible with original data
        date_str_mingguo = '{0}/{1:02d}/{2:02d}'.format(date_tuple[0] - 1911, date_tuple[1], date_tuple[2])

        for data in content['data9']:
            sign = '-' if data[9].find('green') > 0 else ''
            row = self._clean_row([
                date_str_mingguo, # 日期
                data[2], # 成交股數
                data[4], # 成交金額
                data[5], # 開盤價
                data[6], # 最高價
                data[7], # 最低價
                data[8], # 收盤價
                sign + data[10], # 漲跌價差
                data[3], # 成交筆數
            ])

            self._record(data[0].strip(), row)

    def _get_otc_data(self, date_tuple):
        date_str = '{0}/{1:02d}/{2:02d}'.format(date_tuple[0] - 1911, date_tuple[1], date_tuple[2])
        ttime = str(int(time.time()*100))
        url = 'http://www.tpex.org.tw/web/stock/aftertrading/daily_close_quotes/stk_quote_result.php?l=zh-tw&d={}&_={}'.format(date_str, ttime)
        page = requests.get(url)

        if not page.ok:
            logging.error("Can not get OTC data at {}".format(date_str))
            return

        result = page.json()

        if result['reportDate'] != date_str:
            logging.error("Get error date OTC data at {}".format(date_str))
            return

        for table in [result['mmData'], result['aaData']]:
            for tr in table:
                row = self._clean_row([
                    date_str,
                    tr[8], # 成交股數
                    tr[9], # 成交金額
                    tr[4], # 開盤價
                    tr[5], # 最高價
                    tr[6], # 最低價
                    tr[2], # 收盤價
                    tr[3], # 漲跌價差
                    tr[10] # 成交筆數
                ])
                self._record(tr[0], row)


    def get_data(self, date_tuple):
        print('Crawling {}'.format(date_tuple))
        self._get_tse_data(date_tuple)
        self._get_otc_data(date_tuple)

In [67]:
if not os.path.isdir('log'):
    os.makedirs('log')
logging.basicConfig(filename='log/crawl-error.log',
    level=logging.ERROR,
    format='%(asctime)s\t[%(levelname)s]\t%(message)s',
    datefmt='%Y/%m/%d %H:%M:%S')

# Get arguments
parser = argparse.ArgumentParser(description='Crawl data at assigned day')
parser.add_argument('day', type=int, nargs='*',
    help='assigned day (format: YYYY MM DD), default is today')
parser.add_argument('-b', '--back', action='store_true',
    help='crawl back from assigned day until 2004/2/11')
parser.add_argument('-c', '--check', action='store_true',
    help='crawl back 10 days for check data')

_StoreTrueAction(option_strings=['-c', '--check'], dest='check', nargs=0, const=True, default=False, type=None, choices=None, help='crawl back 10 days for check data', metavar=None)

In [73]:
first_day = datetime.today()
last_day = datetime(2004, 2, 11)
crawler = Crawler()
max_error = 5
error_times = 0
while error_times < max_error and first_day >= last_day:
    try:
        crawler.get_data((first_day.year, first_day.month, first_day.day))
        error_times = 0
    except:
        date_str = first_day.strftime('%Y/%m/%d')
        logging.error('Crawl raise error {}'.format(date_str))
        error_times += 1
        continue
    finally:
        first_day -= timedelta(1)

Crawling (2019, 9, 19)
Crawling (2019, 9, 18)
Crawling (2019, 9, 17)
Crawling (2019, 9, 16)
Crawling (2019, 9, 15)


ERROR:root:Crawl raise error 2019/09/15


Crawling (2019, 9, 14)


ERROR:root:Crawl raise error 2019/09/14


Crawling (2019, 9, 13)


ERROR:root:Crawl raise error 2019/09/13


Crawling (2019, 9, 12)
Crawling (2019, 9, 11)
Crawling (2019, 9, 10)
Crawling (2019, 9, 9)
Crawling (2019, 9, 8)


ERROR:root:Crawl raise error 2019/09/08


Crawling (2019, 9, 7)


ERROR:root:Crawl raise error 2019/09/07


Crawling (2019, 9, 6)
Crawling (2019, 9, 5)
Crawling (2019, 9, 4)
Crawling (2019, 9, 3)
Crawling (2019, 9, 2)
Crawling (2019, 9, 1)


ERROR:root:Crawl raise error 2019/09/01


Crawling (2019, 8, 31)


ERROR:root:Crawl raise error 2019/08/31


Crawling (2019, 8, 30)
Crawling (2019, 8, 29)
Crawling (2019, 8, 28)
Crawling (2019, 8, 27)
Crawling (2019, 8, 26)
Crawling (2019, 8, 25)


ERROR:root:Crawl raise error 2019/08/25


Crawling (2019, 8, 24)


ERROR:root:Crawl raise error 2019/08/24


Crawling (2019, 8, 23)
Crawling (2019, 8, 22)
Crawling (2019, 8, 21)
Crawling (2019, 8, 20)
Crawling (2019, 8, 19)
Crawling (2019, 8, 18)


ERROR:root:Crawl raise error 2019/08/18


Crawling (2019, 8, 17)


ERROR:root:Crawl raise error 2019/08/17


Crawling (2019, 8, 16)
Crawling (2019, 8, 15)
Crawling (2019, 8, 14)
Crawling (2019, 8, 13)
Crawling (2019, 8, 12)
Crawling (2019, 8, 11)


ERROR:root:Crawl raise error 2019/08/11


Crawling (2019, 8, 10)


ERROR:root:Crawl raise error 2019/08/10


Crawling (2019, 8, 9)


ERROR:root:Crawl raise error 2019/08/09


Crawling (2019, 8, 8)
Crawling (2019, 8, 7)
Crawling (2019, 8, 6)
Crawling (2019, 8, 5)
Crawling (2019, 8, 4)


ERROR:root:Crawl raise error 2019/08/04


Crawling (2019, 8, 3)


ERROR:root:Crawl raise error 2019/08/03


Crawling (2019, 8, 2)
Crawling (2019, 8, 1)
Crawling (2019, 7, 31)
Crawling (2019, 7, 30)
Crawling (2019, 7, 29)
Crawling (2019, 7, 28)


ERROR:root:Crawl raise error 2019/07/28


Crawling (2019, 7, 27)


ERROR:root:Crawl raise error 2019/07/27


Crawling (2019, 7, 26)
Crawling (2019, 7, 25)
Crawling (2019, 7, 24)
Crawling (2019, 7, 23)
Crawling (2019, 7, 22)
Crawling (2019, 7, 21)


ERROR:root:Crawl raise error 2019/07/21


Crawling (2019, 7, 20)


ERROR:root:Crawl raise error 2019/07/20


Crawling (2019, 7, 19)
Crawling (2019, 7, 18)
Crawling (2019, 7, 17)
Crawling (2019, 7, 16)
Crawling (2019, 7, 15)
Crawling (2019, 7, 14)


ERROR:root:Crawl raise error 2019/07/14


Crawling (2019, 7, 13)


ERROR:root:Crawl raise error 2019/07/13


Crawling (2019, 7, 12)
Crawling (2019, 7, 11)
Crawling (2019, 7, 10)
Crawling (2019, 7, 9)
Crawling (2019, 7, 8)
Crawling (2019, 7, 7)


ERROR:root:Crawl raise error 2019/07/07


Crawling (2019, 7, 6)


ERROR:root:Crawl raise error 2019/07/06


Crawling (2019, 7, 5)
Crawling (2019, 7, 4)
Crawling (2019, 7, 3)
Crawling (2019, 7, 2)
Crawling (2019, 7, 1)
Crawling (2019, 6, 30)


ERROR:root:Crawl raise error 2019/06/30


Crawling (2019, 6, 29)


ERROR:root:Crawl raise error 2019/06/29


Crawling (2019, 6, 28)
Crawling (2019, 6, 27)
Crawling (2019, 6, 26)
Crawling (2019, 6, 25)
Crawling (2019, 6, 24)
Crawling (2019, 6, 23)


ERROR:root:Crawl raise error 2019/06/23


Crawling (2019, 6, 22)


ERROR:root:Crawl raise error 2019/06/22


Crawling (2019, 6, 21)
Crawling (2019, 6, 20)
Crawling (2019, 6, 19)
Crawling (2019, 6, 18)
Crawling (2019, 6, 17)
Crawling (2019, 6, 16)


ERROR:root:Crawl raise error 2019/06/16


Crawling (2019, 6, 15)


ERROR:root:Crawl raise error 2019/06/15


Crawling (2019, 6, 14)
Crawling (2019, 6, 13)
Crawling (2019, 6, 12)
Crawling (2019, 6, 11)
Crawling (2019, 6, 10)
Crawling (2019, 6, 9)


ERROR:root:Crawl raise error 2019/06/09


Crawling (2019, 6, 8)


ERROR:root:Crawl raise error 2019/06/08


Crawling (2019, 6, 7)


ERROR:root:Crawl raise error 2019/06/07


Crawling (2019, 6, 6)
Crawling (2019, 6, 5)
Crawling (2019, 6, 4)
Crawling (2019, 6, 3)
Crawling (2019, 6, 2)


ERROR:root:Crawl raise error 2019/06/02


Crawling (2019, 6, 1)


ERROR:root:Crawl raise error 2019/06/01


Crawling (2019, 5, 31)
Crawling (2019, 5, 30)
Crawling (2019, 5, 29)
Crawling (2019, 5, 28)
Crawling (2019, 5, 27)
Crawling (2019, 5, 26)


ERROR:root:Crawl raise error 2019/05/26


Crawling (2019, 5, 25)


ERROR:root:Crawl raise error 2019/05/25


Crawling (2019, 5, 24)
Crawling (2019, 5, 23)
Crawling (2019, 5, 22)
Crawling (2019, 5, 21)
Crawling (2019, 5, 20)
Crawling (2019, 5, 19)


ERROR:root:Crawl raise error 2019/05/19


Crawling (2019, 5, 18)


ERROR:root:Crawl raise error 2019/05/18


Crawling (2019, 5, 17)
Crawling (2019, 5, 16)
Crawling (2019, 5, 15)
Crawling (2019, 5, 14)
Crawling (2019, 5, 13)
Crawling (2019, 5, 12)


ERROR:root:Crawl raise error 2019/05/12


Crawling (2019, 5, 11)


ERROR:root:Crawl raise error 2019/05/11


Crawling (2019, 5, 10)
Crawling (2019, 5, 9)
Crawling (2019, 5, 8)
Crawling (2019, 5, 7)
Crawling (2019, 5, 6)
Crawling (2019, 5, 5)


ERROR:root:Crawl raise error 2019/05/05


Crawling (2019, 5, 4)


ERROR:root:Crawl raise error 2019/05/04


Crawling (2019, 5, 3)
Crawling (2019, 5, 2)
Crawling (2019, 5, 1)


ERROR:root:Crawl raise error 2019/05/01


Crawling (2019, 4, 30)
Crawling (2019, 4, 29)
Crawling (2019, 4, 28)


ERROR:root:Crawl raise error 2019/04/28


Crawling (2019, 4, 27)


ERROR:root:Crawl raise error 2019/04/27


Crawling (2019, 4, 26)
Crawling (2019, 4, 25)
Crawling (2019, 4, 24)
Crawling (2019, 4, 23)
Crawling (2019, 4, 22)
Crawling (2019, 4, 21)


ERROR:root:Crawl raise error 2019/04/21


Crawling (2019, 4, 20)


ERROR:root:Crawl raise error 2019/04/20


Crawling (2019, 4, 19)
Crawling (2019, 4, 18)
Crawling (2019, 4, 17)
Crawling (2019, 4, 16)
Crawling (2019, 4, 15)
Crawling (2019, 4, 14)


ERROR:root:Crawl raise error 2019/04/14


Crawling (2019, 4, 13)


ERROR:root:Crawl raise error 2019/04/13


Crawling (2019, 4, 12)
Crawling (2019, 4, 11)
Crawling (2019, 4, 10)
Crawling (2019, 4, 9)
Crawling (2019, 4, 8)
Crawling (2019, 4, 7)


ERROR:root:Crawl raise error 2019/04/07


Crawling (2019, 4, 6)


ERROR:root:Crawl raise error 2019/04/06


Crawling (2019, 4, 5)


ERROR:root:Crawl raise error 2019/04/05


Crawling (2019, 4, 4)


ERROR:root:Crawl raise error 2019/04/04


Crawling (2019, 4, 3)
Crawling (2019, 4, 2)
Crawling (2019, 4, 1)
Crawling (2019, 3, 31)


ERROR:root:Crawl raise error 2019/03/31


Crawling (2019, 3, 30)


ERROR:root:Crawl raise error 2019/03/30


Crawling (2019, 3, 29)
Crawling (2019, 3, 28)
Crawling (2019, 3, 27)
Crawling (2019, 3, 26)
Crawling (2019, 3, 25)
Crawling (2019, 3, 24)


ERROR:root:Crawl raise error 2019/03/24


Crawling (2019, 3, 23)


ERROR:root:Crawl raise error 2019/03/23


Crawling (2019, 3, 22)
Crawling (2019, 3, 21)
Crawling (2019, 3, 20)
Crawling (2019, 3, 19)
Crawling (2019, 3, 18)


ERROR:root:Crawl raise error 2019/03/18


Crawling (2019, 3, 17)


ERROR:root:Crawl raise error 2019/03/17


Crawling (2019, 3, 16)


ERROR:root:Crawl raise error 2019/03/16


Crawling (2019, 3, 15)
Crawling (2019, 3, 14)
Crawling (2019, 3, 13)
Crawling (2019, 3, 12)
Crawling (2019, 3, 11)


ERROR:root:Crawl raise error 2019/03/10


Crawling (2019, 3, 10)
Crawling (2019, 3, 9)


ERROR:root:Crawl raise error 2019/03/09


Crawling (2019, 3, 8)
Crawling (2019, 3, 7)
Crawling (2019, 3, 6)
Crawling (2019, 3, 5)
Crawling (2019, 3, 4)
Crawling (2019, 3, 3)


ERROR:root:Crawl raise error 2019/03/03


Crawling (2019, 3, 2)


ERROR:root:Crawl raise error 2019/03/02


Crawling (2019, 3, 1)


ERROR:root:Crawl raise error 2019/03/01


Crawling (2019, 2, 28)


ERROR:root:Crawl raise error 2019/02/28


Crawling (2019, 2, 27)
Crawling (2019, 2, 26)
Crawling (2019, 2, 25)
Crawling (2019, 2, 24)


ERROR:root:Crawl raise error 2019/02/24


Crawling (2019, 2, 23)


ERROR:root:Crawl raise error 2019/02/23


Crawling (2019, 2, 22)
Crawling (2019, 2, 21)
Crawling (2019, 2, 20)
Crawling (2019, 2, 19)
Crawling (2019, 2, 18)
Crawling (2019, 2, 17)


ERROR:root:Crawl raise error 2019/02/17


Crawling (2019, 2, 16)


ERROR:root:Crawl raise error 2019/02/16


Crawling (2019, 2, 15)
Crawling (2019, 2, 14)
Crawling (2019, 2, 13)
Crawling (2019, 2, 12)
Crawling (2019, 2, 11)
Crawling (2019, 2, 10)


ERROR:root:Crawl raise error 2019/02/10


Crawling (2019, 2, 9)


ERROR:root:Crawl raise error 2019/02/09


Crawling (2019, 2, 8)


ERROR:root:Crawl raise error 2019/02/08


Crawling (2019, 2, 7)


ERROR:root:Crawl raise error 2019/02/07


Crawling (2019, 2, 6)


ERROR:root:Crawl raise error 2019/02/06
