### To Do List
##### Done
* check all data form (sii, otc, pub, rotc)

##### Not Yet
* design storage structure
* build up sqlite3 db table / csv version as well
* parsing with loop (and using tenacity)
* make it an independent .py file
* set cron job and keep the code clean

In [None]:
import requests
import pandas as pd
import numpy as np
import math
import os
from tenacity import retry, stop_after_attempt, wait_fixed
from datetime import datetime, timedelta

In [None]:
@retry(stop=stop_after_attempt(3), wait=wait_fixed(5))
def financial_statement(year, season, type='PL'):
    """Download one quarter of a MOPS summary statement for every company type.

    Args:
        year: Reporting year. Values >= 1000 are reduced by 1911 before being
            sent (presumably converting Gregorian years to the ROC calendar
            year the site expects).
        season: Quarter number, sent to the site as a string.
        type: 'PL' for the consolidated income statement summary or 'BS' for
            the balance sheet summary.

    Returns:
        A DataFrame indexed by company code (公司代號, as str) holding the
        numeric statement columns plus 年份, 季度 and 公司名稱, with the rows of
        the sii, otc, pub and rotc company types stacked in that order.

    Raises:
        ValueError: if ``type`` is neither 'PL' nor 'BS'.
        Any error raised by the request or by HTML parsing propagates so the
        ``retry`` decorator can re-run the whole download; once its attempts
        are exhausted the decorator raises its own error to the caller.
    """
    # Local import: read_html expects a file-like object rather than raw HTML.
    from io import StringIO

    if year >= 1000:
        year -= 1911

    if type == 'PL':  # consolidated income statement summary
        url = 'https://mops.twse.com.tw/mops/web/ajax_t163sb04'
    elif type == 'BS':  # balance sheet summary
        url = 'https://mops.twse.com.tw/mops/web/ajax_t163sb05'
    else:
        # Fail with a clear message instead of printing and then crashing on
        # an unbound ``url`` further down.
        raise ValueError("type does not match: expected 'PL' or 'BS', got {!r}".format(type))

    frames = []

    for corp_type in ["sii", "otc", "pub", "rotc"]:
        r = requests.post(url, {
            'encodeURIComponent': 1,
            'step': 1,
            'firstin': 1,
            'off': 1,
            'TYPEK': corp_type,  # sii: listed, otc: OTC, rotc: emerging, pub: public
            'year': str(year),
            'season': str(season),
        })
        r.encoding = 'utf8'

        dfs = pd.read_html(StringIO(r.text), header=None)
        # The first parsed table is skipped; the remaining ones are stacked.
        df = pd.concat(dfs[1:], axis=0, sort=False)
        df = df.set_index(['公司名稱']).apply(lambda s: pd.to_numeric(s, errors='coerce'))

        # Scalars broadcast to every row; assigning a RangeIndex Series here
        # would not align with the company-name index and would yield NaN.
        df['年份'] = year
        df['季度'] = season
        df['公司名稱'] = df.index

        # Same code handling as financial_analysis: drop rows whose code is
        # not numeric, then cast through int so codes never read '1101.0'.
        df = df[df['公司代號'].notna()].copy()
        df['公司代號'] = df['公司代號'].astype(int).astype(str)
        frames.append(df.set_index('公司代號'))

    return pd.concat(frames, axis=0, sort=False)

@retry(stop=stop_after_attempt(3), wait=wait_fixed(5))
def financial_analysis(year, season):  # profitability analysis summary
    """Download one quarter of the MOPS profitability analysis summary.

    Args:
        year: Reporting year. Values >= 1000 are reduced by 1911 before being
            sent (presumably converting Gregorian years to the ROC calendar
            year the site expects).
        season: Quarter number, sent to the site as a string.

    Returns:
        A DataFrame indexed by company code (公司代號, as str) with the numeric
        ratio columns plus 年份, 季度 and 公司名稱. Company types whose download
        or parsing fails are skipped, so the result may be empty.
    """
    # Local import: read_html expects a file-like object rather than raw HTML.
    from io import StringIO

    if year >= 1000:
        year -= 1911

    url = 'https://mops.twse.com.tw/mops/web/ajax_t163sb06'

    frames = []

    for corp_type in ["sii", "otc", "pub", "rotc"]:
        try:
            r = requests.post(url, {
                'encodeURIComponent': 1,
                'step': 1,
                'firstin': 1,
                'off': 1,
                'TYPEK': corp_type,  # otc pub rotc sii
                'year': str(year),
                'season': str(season),
            })
            r.encoding = 'utf8'

            dfs = pd.read_html(StringIO(r.text), header=None)
            df = dfs[0]
            # The table arrives without a parsed header; its first row holds
            # the column names.
            df.columns = df.iloc[0]
            df['年份'] = year
            df['季度'] = season
            df = df.set_index(['公司名稱']).apply(lambda s: pd.to_numeric(s, errors='coerce'))

            # Rows whose company code is not numeric (e.g. the header row
            # promoted above) become NaN after coercion and are dropped.
            df = df[df['公司代號'].notna()].copy()
            df['公司名稱'] = df.index
            df['公司代號'] = df['公司代號'].astype(int).astype(str)
            frames.append(df.set_index('公司代號'))
        except Exception as e:
            # Deliberately best-effort per company type, but report what was
            # skipped instead of hiding the failure.
            print("Skipped: ", corp_type, "-", e)
            continue

    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, axis=0, sort=False)

In [None]:
storage = "financial_statement/"

# Make sure the output directory exists before anything is written into it.
os.makedirs(storage, exist_ok=True)

# Coverage log: one row per "<year>-<season> - <statement>" already fetched.
coverage_file = storage+'duration_coverage_FS.csv'
if not os.path.exists(coverage_file):
    pd.DataFrame({'Season':[], 'Created_at':[]}).to_csv(coverage_file, index=False)
# Keep the whole log in memory: the main loop appends to `duration_covered`.
duration_covered = pd.read_csv(coverage_file)
existed_season = duration_covered['Season'].tolist()

# Reload earlier results from the same directory they are saved to; reading
# from the working directory would start from empty frames and later
# overwrite the saved files.
# NOTE(review): the files are saved with their index, so it comes back as an
# extra leading column on reload - confirm whether that is intended.
df_PL = pd.read_csv(storage+'P&L.csv') if os.path.exists(storage+'P&L.csv') else pd.DataFrame()
df_BS = pd.read_csv(storage+'Balance_Sheet.csv') if os.path.exists(storage+'Balance_Sheet.csv') else pd.DataFrame()
df_FA = pd.read_csv(storage+'Financial_Analysis.csv') if os.path.exists(storage+'Financial_Analysis.csv') else pd.DataFrame()

In [None]:
# Main Part - Start from 2013-1

# Load the coverage log here so this cell does not rely on a variable that no
# other cell is guaranteed to have defined.
if os.path.exists(storage+'duration_coverage_FS.csv'):
    duration_covered = pd.read_csv(storage+'duration_coverage_FS.csv')
else:
    duration_covered = pd.DataFrame({'Season':[], 'Created_at':[]})


def _mark_covered(record_str):
    """Add record_str to the coverage log and write the log to disk at once."""
    global duration_covered
    entry = pd.DataFrame({'Season':[record_str], 'Created_at':[datetime.now()]})
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    duration_covered = pd.concat([duration_covered, entry], sort=True)
    duration_covered.to_csv(storage+'duration_coverage_FS.csv', index=False)


def _collect(collected, record_str, fetch):
    """Return `collected` extended by fetch(), unless record_str is already covered.

    On any failure the input frame is returned (with whatever was already
    appended) and the record is not marked as covered, so a later run retries.
    """
    if record_str in existed_season:
        print("Pass: ", record_str)
        return collected

    print("Handling: ", record_str)
    try:
        fetched = fetch()
        if fetched.empty:
            # An empty result (e.g. a quarter not published yet) must not be
            # logged as covered, otherwise it would never be fetched again.
            raise ValueError("no rows returned")
        collected = pd.concat([collected, fetched], axis=0, sort=False)
        _mark_covered(record_str)
    except Exception as e:
        print("Failed: ", record_str, "-", e)
    return collected


for year in range(2013, datetime.now().year+1):
    for season in range(1, 5):
        handling_season = "{0}-{1}".format(str(year), str(season))

        # consolidated income statement summary
        df_PL = _collect(df_PL, handling_season+" - 綜合損益彙總表",
                         lambda: financial_statement(year, season, type='PL'))

        # balance sheet summary
        df_BS = _collect(df_BS, handling_season+" - 資產負債彙總表",
                         lambda: financial_statement(year, season, type='BS'))

        # profitability analysis summary
        df_FA = _collect(df_FA, handling_season+" - 營益分析彙總表",
                         lambda: financial_analysis(year, season))
                

In [None]:
# Write each accumulated table to its CSV file inside the storage directory.
for _table, _csv_name in (
    (df_PL, 'P&L.csv'),
    (df_BS, 'Balance_Sheet.csv'),
    (df_FA, 'Financial_Analysis.csv'),
):
    _table.to_csv(storage+_csv_name)

In [107]:
# Read the saved profitability table back from disk and preview its first rows.
fa_csv_path = storage+'Financial_Analysis.csv'
df_FA = pd.read_csv(fa_csv_path)
df_FA.head()

Unnamed: 0,公司代號,營業收入(百萬元),毛利率(%)(營業毛利)/(營業收入),營業利益率(%)(營業利益)/(營業收入),稅前純益率(%)(稅前純益)/(營業收入),稅後純益率(%)(稅後純益)/(營業收入),年份,季度,公司名稱
0,1101,24114.05,12.95,8.4,9.71,8.29,102,1,台泥
1,1102,13931.55,6.4,2.44,11.41,9.84,102,1,亞泥
2,1103,741.19,-6.06,-20.21,4.56,8.05,102,1,嘉泥
3,1104,1248.07,9.78,2.42,14.92,14.7,102,1,環球水泥
4,1108,1203.67,12.61,8.16,7.03,5.31,102,1,幸福水泥


In [109]:
# Show every stored quarter for the company whose code is 8477.
df_FA.loc[df_FA['公司代號'] == 8477]

Unnamed: 0,公司代號,營業收入(百萬元),毛利率(%)(營業毛利)/(營業收入),營業利益率(%)(營業利益)/(營業收入),稅前純益率(%)(稅前純益)/(營業收入),稅後純益率(%)(稅後純益)/(營業收入),年份,季度,公司名稱
16429,8477,1099.54,16.49,1.79,2.25,1.87,104,2,創業家
20186,8477,2417.0,16.24,1.75,2.15,1.79,104,4,創業家
22281,8477,834.06,15.44,3.4,3.72,3.09,105,1,創業家
23965,8477,1622.06,15.21,2.02,2.36,1.96,105,2,創業家
26053,8477,2317.4,15.73,1.63,1.99,1.65,105,3,創業家
27768,8477,3130.5,16.44,1.61,1.99,1.65,105,4,創業家
29885,8477,835.65,18.55,0.83,1.36,1.12,106,1,創業家
31587,8477,1720.38,17.76,0.46,0.91,0.69,106,2,創業家
33710,8477,2626.12,17.27,0.32,0.75,0.58,106,3,創業家
35416,8477,3745.81,16.4,0.24,0.65,0.5,106,4,創業家
