### To Do List
##### Done
* check all data form (sii, otc, pub, rotc)

##### Not Yet
* design storage structure
* build up sqlite3 db table / csv version as well
* parsing with loop (and using tenacity)
* make it a independ .py file
* set cron job and keep the code clean

In [39]:
import requests
import pandas as pd
import numpy as np
import math
import os
from tenacity import retry, stop_after_attempt, wait_fixed
from datetime import datetime, timedelta

In [77]:
@retry(stop=stop_after_attempt(3), wait=wait_fixed(5))
def financial_statement(year, season, type='PL'):

    if year >= 1000:
        year -= 1911

    if type == 'PL': # 綜合損益彙總表
        url = 'https://mops.twse.com.tw/mops/web/ajax_t163sb04'
    elif type == 'BS': # 資產負債彙總表
        url = 'https://mops.twse.com.tw/mops/web/ajax_t163sb05'
    else:
        print('type does not match')
    
    df_final = pd.DataFrame()
    
    for corp_type in ["sii", "otc", "pub", "rotc"]:
        r = requests.post(url, {
            'encodeURIComponent':1,
            'step':1,
            'firstin':1,
            'off':1,
            'TYPEK':corp_type,# sii上市，otc上櫃，rotc興櫃，pub公開發行
            'year':str(year),
            'season':str(season),
        })

        r.encoding = 'utf8'
            
        dfs = pd.read_html(r.text, header=None)
        df = pd.concat(dfs[1:], axis=0, sort=False)
        df['年份'] = pd.Series([year] * df.shape[0])
        df['季度'] = pd.Series([season] * df.shape[0])
        df = pd.set_index(['公司名稱']).apply(lambda s: pd.to_numeric(s, errors='ceorce'))
        df['年份'] = pd.Series([year] * df.shape[0])
        df['季度'] = pd.Series([season] * df.shape[0])
        df['公司名稱'] = df.index
        df['公司代號'] = df['公司代號'].astype(str)
        df = df.set_index('公司代號')
        df_final = pd.concat([df_final, df], axis=0, sort=False)
            
    return df_final

@retry(stop=stop_after_attempt(3), wait=wait_fixed(5))
def financial_analysis(year, season): # 營益分析彙總表
    
    if year >= 1000:
        year -= 1911
    
    url = 'https://mops.twse.com.tw/mops/web/ajax_t163sb06'
    
    df_final = pd.DataFrame()
    
    for corp_type in ["sii", "otc", "pub", "rotc"]:
        try:
            r = requests.post(url, {
                'encodeURIComponent':1,
                'step':1,
                'firstin':1,
                'off':1,
                'TYPEK':corp_type, #otc pub rotc sii
                'year':str(year),
                'season':str(season),
            })
        
            r.encoding = 'utf8'
            dfs = pd.read_html(r.text, header=None)
            dfs[0].columns = dfs[0].iloc[0]
            df = dfs[0]
            df['年份'] = pd.Series([year] * df.shape[0])
            df['季度'] = pd.Series([season] * df.shape[0])
            df = df.set_index(['公司名稱']).apply(lambda s: pd.to_numeric(s, errors='ceorce'))
            df = df[~df['公司代號'].apply(lambda x: math.isnan(x))]
            df['公司名稱'] = df.index
            df['公司代號'] = df['公司代號'].astype(int).astype(str)
            df = df.set_index('公司代號')
            df_final = pd.concat([df_final, df], axis=0, sort=False)
        except Exception as e:
            continue

    return df_final

In [41]:
storage = "financial_statement/"

if not os.path.exists(storage+'duration_coverage_FS.csv'):
    pd.DataFrame({'Season':[], 'Created_at':[]}).to_csv(storage+'duration_coverage_FS.csv', index=False)
existed_season = pd.read_csv(storage+'duration_coverage_FS.csv')['Season'].tolist()

df_PL = pd.read_csv('P&L.csv') if os.path.exists('P&L.csv') else pd.DataFrame()
df_BS = pd.read_csv('Balance_Sheet.csv') if os.path.exists('Balance_Sheet.csv') else pd.DataFrame()
df_FA = pd.read_csv('Financial_Analysis.csv') if os.path.exists('Financial_Analysis.csv') else pd.DataFrame()

In [None]:
year = 2013 - 1911
season = 3
url = 'https://mops.twse.com.tw/mops/web/ajax_t163sb04'
df_final = pd.DataFrame()
r = requests.post(url, {
    'encodeURIComponent':1,
    'step':1,
    'firstin':1,
    'off':1,
    'isQuery': 'Y',
    'TYPEK':"otc", # sii上市，otc上櫃，rotc興櫃，pub公開發行
    'year':str(year),
    'season':str(season),

})
r.encoding = 'utf8'
r.text

In [100]:
#dfs = pd.read_html(r.text, header=None)
#df = pd.concat(dfs[1:], axis=0, sort=False)
#df['年份'] = pd.Series([year] * df.shape[0])
#df['季度'] = pd.Series([season] * df.shape[0])
#df = df.set_index(['公司名稱']).apply(lambda s: pd.to_numeric(s, errors='ceorce'))
df['公司名稱'] = df.index
df['公司代號'] = df['公司代號'].astype(str)
df = df.set_index('公司代號')
df

Unnamed: 0_level_0,收益,支出及費用,營業利益,營業外損益,稅前淨利（淨損）,所得稅利益（費用）,繼續營業單位本期淨利（淨損）,停業單位損益,合併前非屬共同控制股權損益,本期淨利（淨損）,...,繼續營業單位稅前損益,所得稅（費用）利益,本期稅後淨利（淨損）,收入,支出,繼續營業單位稅前淨利（淨損）,其他綜合損益,年份,季度,公司名稱
公司代號,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6015,786345.0,726791.0,59554.0,142376.0,201930.0,5447.0,207377,,,207377.0,...,,,,,,,,102,3,宏遠證券
6016,1437683.0,1576334.0,-138651.0,86777.0,-51874.0,-17381.0,-69255,,,-69255.0,...,,,,,,,,102,3,康和證券
6020,349513.0,124016.0,225497.0,21221.0,246718.0,18707.0,265425,0.0,,265425.0,...,,,,,,,,102,3,大展證
6021,525912.0,376332.0,149580.0,56071.0,205651.0,-5357.0,200294,,,200294.0,...,,,,,,,,102,3,大慶證
6023,1876831.0,1693253.0,183578.0,369604.0,553182.0,-98233.0,454949,,,454949.0,...,,,,,,,,102,3,元大期貨
1258,,,,,47350.0,,36642,,,36642.0,...,,,,,,,,102,3,其祥-KY
1259,,,,,43198.0,,25616,,,25616.0,...,,,,,,,,102,3,安心
1333,,,,,14755.0,,10727,0.0,,10727.0,...,,,,,,,,102,3,恩得利
1336,,,,,-22348.0,,-19066,,,-19066.0,...,,,,,,,,102,3,台翰
1565,,,,,1344054.0,,1158217,,,1158217.0,...,,,,,,,,102,3,精華光學


In [78]:
financial_statement(2014, 3, type='PL')

RetryError: RetryError[<Future at 0x11f24df28 state=finished raised AttributeError>]

In [42]:
# Main Part - Start from 2013-1
for year in list(range(2013, datetime.now().year+1)):
    for season in list(range(1, 5)):
        handling_season = "{0}-{1}".format(str(year), str(season))
        
        record_str = handling_season+" - 綜合損益彙總表"

        if record_str in existed_season:
            print("Pass: ", record_str)
        else:
            print("Handling: ", record_str)
            try:
	            df_PL = pd.concat([df_PL, financial_statement(year, season, type='PL')], axis=0, sort=False)
	            duration_covered = duration_covered.append(pd.DataFrame({'Season':[record_str], 'Created_at':[datetime.now()]}), sort=True)
	            duration_covered.to_csv(storage+'duration_coverage_FS.csv', index=False)
            except Exception as e:
            	print("Failed")
        
        record_str = handling_season+" - 資產負債彙總表"

        if record_str in existed_season:
            print("Pass: ", record_str)
        else:
            print("Handling: ", record_str)
            try:
                df_BS = pd.concat([df_BS, financial_statement(year, season, type='BS')], axis=0, sort=False)
                duration_covered = duration_covered.append(pd.DataFrame({'Season':[record_str], 'Created_at':[datetime.now()]}), sort=True)
                duration_covered.to_csv(storage+'duration_coverage_FS.csv', index=False)
            except Exception as e:
                print("Failed")
                
        
        record_str = handling_season+" - 營益分析彙總表"

        if record_str in existed_season:
            print("Pass: ", record_str)
        else:
            print("Handling: ", record_str)
            try:
                df_FA = pd.concat([df_FA, financial_analysis(year, season)], axis=0, sort=False)
                duration_covered = duration_covered.append(pd.DataFrame({'Season':[record_str], 'Created_at':[datetime.now()]}), sort=True)
                duration_covered.to_csv(storage+'duration_coverage_FS.csv', index=False)
            except Exception as e:
                print("Failed")
                

Pass:  2013-1 - 綜合損益彙總表
Pass:  2013-1 - 資產負債彙總表
Pass:  2013-1 - 營益分析彙總表
Pass:  2013-2 - 綜合損益彙總表
Pass:  2013-2 - 資產負債彙總表
Pass:  2013-2 - 營益分析彙總表
Handling:  2013-3 - 綜合損益彙總表
Failed
Handling:  2013-3 - 資產負債彙總表


KeyboardInterrupt: 

In [None]:
df_PL.to_csv(storage+'P&L.csv')
df_BS.to_csv(storage+'Balance_Sheet.csv')
df_FA.to_csv(storage+'Financial_Analysis.csv') 