In [2]:
import requests
import pandas as pd
import sqlite3
import os
import time


def retry_requests(url, headers, timeout=30):
    """GET ``url``, retrying after network failures.

    Up to three attempts are made, with a one-minute pause between a failed
    attempt and the next one. The HTTP status code is not inspected here.

    Args:
        url: Address to fetch.
        headers: HTTP headers forwarded to ``requests.get``.
        timeout: Seconds to wait on the server before the attempt counts as
            failed. Without a timeout a stalled connection would block forever.

    Returns:
        The response of the first attempt that does not raise, or ``None``
        when every attempt fails.
    """
    max_attempts = 3

    for attempt in range(1, max_attempts + 1):
        try:
            return requests.get(url, headers=headers, timeout=timeout)
        except requests.exceptions.RequestException:
            # Only request-level failures are retried; KeyboardInterrupt and
            # programming errors propagate instead of being swallowed.
            if attempt == max_attempts:
                # No further attempt follows, so waiting would be wasted time.
                break
            print('發生錯誤，等待1分鐘後嘗試')
            time.sleep(60)

    return None

def get_monthly_reports(date):
    """Download and tidy the monthly revenue table for one month.

    Args:
        date: Object exposing ``year`` and ``month`` that ``pd.to_datetime``
            accepts (for example a ``pd.Timestamp``).

    Returns:
        A DataFrame indexed by (``證券代號``, ``日期``), or ``None`` when the page
        could not be fetched, could not be parsed, or holds no 11-column table.
    """
    from io import StringIO  # local import keeps this block self-contained

    # The URL takes the year offset by 1911 (ROC/Minguo calendar year).
    url = 'https://mops.twse.com.tw/nas/t21/sii/t21sc03_{year}_{month}_0.html'.format(year=date.year - 1911,
                                                                                      month=date.month)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }

    response = retry_requests(url, headers)
    if response is None:
        # Every attempt failed; report "no data" instead of crashing on None.
        return None
    response.encoding = 'big5'

    try:
        # Wrap the text in a file-like object: newer pandas deprecates passing
        # literal HTML strings, and older versions accept file-likes as well.
        tables = pd.read_html(StringIO(response.text))
    except Exception:
        # Best effort: a page without tables (or unparsable HTML) means no data.
        return None

    # Only the 11-column tables are kept; everything else on the page is dropped.
    report_tables = [table for table in tables if len(table.columns) == 11]
    if not report_tables:
        # pd.concat raises on an empty list, so bail out explicitly.
        return None

    df = pd.concat(report_tables)

    # Drop the outer level of the two-level column header.
    df.columns = df.columns.droplevel(0)

    df = (
        df.drop(['公司名稱', '備註'], axis=1)
          .rename(columns={'公司代號': '證券代號'})
          .reset_index(drop=True)
    )

    # Remove the '合計' (total) rows so only per-company rows remain.
    df = df[df['證券代號'] != '合計'].copy()

    df['日期'] = pd.to_datetime(date)

    return df.set_index(['證券代號', '日期'])

def save_monthly_reports(new_df, db_path='data.db', excel_file=None):
    """Merge ``new_df`` into the stored monthly reports and export to Excel.

    Rows are keyed by (``證券代號``, ``日期``); when a key is present both in the
    stored table and in ``new_df``, the row from ``new_df`` wins.

    Args:
        new_df: DataFrame indexed by (``證券代號``, ``日期``). ``None`` or an empty
            frame is accepted and contributes no rows.
        db_path: SQLite database file holding the ``monthly_reports`` table.
        excel_file: Destination workbook; defaults to
            ``data/excel_files/monthly_reports.xlsx``.

    Returns:
        None. Nothing is written when there is neither stored nor new data.
    """
    if excel_file is None:
        excel_file = os.path.join('data',
                                  'excel_files',
                                  'monthly_reports.xlsx')

    key_columns = ['證券代號', '日期']

    connection = sqlite3.connect(db_path)
    try:
        # Check for the table explicitly rather than swallowing every read
        # error: a failed read of an existing table must not be mistaken for
        # "no data yet", because the table is replaced further down.
        table_exists = connection.execute(
            "select 1 from sqlite_master where type = 'table' and name = 'monthly_reports'"
        ).fetchone() is not None

        if table_exists:
            stored_df = pd.read_sql('select * from monthly_reports', connection,
                                    parse_dates=['日期'], index_col=key_columns)
        else:
            stored_df = pd.DataFrame()

        # Skip empty frames: they carry no key columns and would break
        # drop_duplicates after reset_index.
        frames = [frame for frame in (stored_df, new_df)
                  if frame is not None and not frame.empty]
        if not frames:
            return

        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        final_df = (
            pd.concat(frames)
              .reset_index()
              .drop_duplicates(subset=key_columns, keep='last')
              .set_index(key_columns)
              .sort_index()
        )

        final_df.to_sql('monthly_reports', connection, if_exists='replace')
    finally:
        # Release the database handle even if reading or writing failed.
        connection.close()

    target_dir = os.path.dirname(excel_file)
    if target_dir:
        # os.makedirs('') raises, so only create a directory when one is named.
        os.makedirs(target_dir, exist_ok=True)

    final_df.to_excel(excel_file)
    

In [3]:
# Preview the timestamps produced by freq='MS' for this range.
month_starts = pd.date_range('20200101', '20200301', freq='MS')
for date in month_starts:
    print(date)

2020-01-01 00:00:00
2020-02-01 00:00:00
2020-03-01 00:00:00


In [4]:
def update_monthly_reports(start_date, end_date, wait_seconds=15):
    """Fetch every monthly report in a date range and persist the result.

    Args:
        start_date: First date of the range, in any form ``pd.date_range`` accepts.
        end_date: Last date of the range (inclusive).
        wait_seconds: Pause between consecutive downloads.

    Returns:
        The concatenation of all months that were fetched. When no month
        returned data, an empty DataFrame is returned and nothing is saved.
    """
    monthly_frames = []

    for position, date in enumerate(pd.date_range(start_date, end_date, freq='MS')):

        if position:
            # Throttle between requests only; no pause before the first
            # download or after the last one.
            time.sleep(wait_seconds)

        df = get_monthly_reports(date)

        if df is not None:
            monthly_frames.append(df)
            print('{} 抓取完成'.format(date.strftime('%Y%m%d')))
        else:
            print('{} 找不到資料'.format(date.strftime('%Y%m%d')))

    if not monthly_frames:
        # Nothing was downloaded, so there is nothing to merge into the store.
        return pd.DataFrame()

    # Concatenate once: DataFrame.append was removed in pandas 2.0 and
    # growing a frame inside the loop copies it on every iteration.
    main_df = pd.concat(monthly_frames)

    save_monthly_reports(main_df)

    return main_df

In [5]:
# Fetch the reports for the months starting 2020-01 through 2020-03, save
# them, and display the returned frame as the cell's output.
update_monthly_reports('20200101', '20200301')

20200101 抓取完成
20200201 抓取完成
發生錯誤，等待1分鐘後嘗試
20200301 抓取完成


Unnamed: 0_level_0,Unnamed: 1_level_0,當月營收,上月營收,去年當月營收,上月比較增減(%),去年同月增減(%),當月累計營收,去年累計營收,前期比較增減(%)
證券代號,日期,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1101,2020-01-01,7502141,12349908,9394240,-39.25,-20.14,7502141,9394240,-20.14
1102,2020-01-01,5210319,8115203,7905737,-35.79,-34.09,5210319,7905737,-34.09
1103,2020-01-01,165029,157879,177060,4.52,-6.79,165029,177060,-6.79
1104,2020-01-01,359966,457657,479518,-21.34,-24.93,359966,479518,-24.93
1108,2020-01-01,296395,377849,353189,-21.55,-16.08,296395,353189,-16.08
...,...,...,...,...,...,...,...,...,...
9941,2020-03-01,2482208,2227332,2129670,11.44,16.55,7134163,6196210,15.13
9942,2020-03-01,255403,157955,265779,61.69,-3.90,606568,721602,-15.94
9944,2020-03-01,259318,167653,330005,54.67,-21.41,609643,823249,-25.94
9945,2020-03-01,1364091,1373005,1352660,-0.64,0.84,3905806,3596279,8.60
