# Main class: `MOPS_ALL_2018` (MOPS financial report scraper)

In [63]:
import requests
from requests.adapters import HTTPAdapter
from bs4 import BeautifulSoup as bs4
import re
import pandas as pd
import numpy as np
import json

class MOPS_ALL_2018(object):
    """Download one report page from mops.twse.com.tw and parse its HTML tables.

    The page is requested once, in ``__init__``; every ``<table>`` element of the
    response is kept in ``self.tables``.  The ``fetch_*`` methods then turn
    individual tables (selected by position) into plain lists/dicts that can be
    serialised with ``json``.

    Attributes set by the constructor:
        sid, year, season, rid -- the values interpolated into the query string
            (``CO_ID``, ``SYEAR``, ``SSEASON``, ``REPORT_ID``).
        header -- HTTP headers sent with the request (a browser User-Agent).
        url    -- the fully formatted request URL.
        tables -- list of parsed ``<table>`` elements; ``[]`` when the page could
            not be fetched with HTTP 200 or could not be parsed.
    """

    def __init__(self, sid, year, season, rid):
        self.sid = sid
        self.year = year
        self.season = season
        self.rid = rid
        keyword = {'sid': self.sid, 'y': self.year, 's': self.season, 'rid': self.rid}
        self.header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 Safari/537.36'}
        self.url = 'https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID={sid}&SYEAR={y}&SSEASON={s}&REPORT_ID={rid}'.format(**keyword)
        # Always defined, so callers can test len(self.tables) even after a failed fetch.
        self.tables = []
        self.get_tbl()

    def fetchall(self):
        """Parse the three statement tables and bundle them into one dict.

        Returns:
            ``{'data': [self.Balance, self.ProfitLoss, self.CashFlows]}``; the
            same object is also stored as ``self.jsondata``.

        Raises:
            IndexError: if ``self.tables`` has fewer than four entries.
        """
        self.fetch_BS()
        self.fetch_PLS()
        self.fetch_CFS()
        self.jsondata = {'data': [self.Balance, self.ProfitLoss, self.CashFlows]}
        return self.jsondata

    def get_tbl(self):
        """Request ``self.url`` and store every ``<table>`` of the response.

        The response is decoded as big5.  ``self.tables`` is reset to ``[]``
        first and stays empty when the status code is not 200 or when parsing
        raises (the parsing error is printed, not re-raised).  Network errors
        raised by ``requests`` are NOT caught here and propagate to the caller.

        Returns:
            The list stored in ``self.tables``.
        """
        self.tables = []
        # The context manager closes the session's connections when done.
        with requests.Session() as web_ss:
            web_ss.mount("https://", HTTPAdapter(max_retries=3))
            res = web_ss.get(url=self.url, headers=self.header, timeout=5)
            res.encoding = 'big5'
            if res.status_code == 200:
                try:
                    soup = bs4(res.text, "lxml")
                    self.tables = soup.select('table')
                except Exception as e:
                    # Best effort: report the parsing problem and leave tables empty.
                    print(e)
        return self.tables

    def _parse_statement(self, Ntable):
        """Convert table number ``Ntable`` into a list of ``[label, value]`` rows.

        For every ``<tr>`` that contains ``<td>`` cells, each cell's text is
        stripped and has its commas removed.  A cell containing at least one
        character in U+4E00..U+9FFF becomes ``[level, text]``, where ``level``
        is the number of U+3000 (ideographic space) characters in the raw cell
        text; any other cell becomes the cleaned string.  Only the first two
        cells of each row are kept.
        """
        rows = []
        for tr in self.tables[Ntable].select('tr'):
            cells = []
            for td in tr.select('td'):
                raw = td.text
                level = raw.count(u'\u3000')
                text = raw.strip().replace(',', '')
                if re.search(u'[\u4e00-\u9fff]', text):
                    cells.append([level, text])
                else:
                    cells.append(text)
            if cells:
                rows.append(cells[:2])
        return rows

    def fetch_BS(self, Ntable=1):
        """Parse table ``Ntable`` (default 1) into ``self.Balance``.

        Returns:
            ``{'Balance': rows}`` with rows as produced by ``_parse_statement``.
        """
        self.Balance = {'Balance': self._parse_statement(Ntable)}
        return self.Balance

    def fetch_PLS(self, Ntable=2):
        """Parse table ``Ntable`` (default 2) into ``self.ProfitLoss``.

        Returns:
            ``{'ProfitLoss': rows}`` with rows as produced by ``_parse_statement``.
        """
        self.ProfitLoss = {'ProfitLoss': self._parse_statement(Ntable)}
        return self.ProfitLoss

    def fetch_CFS(self, Ntable=3):
        """Parse table ``Ntable`` (default 3) into ``self.CashFlows``.

        Returns:
            ``{'CashFlows': rows}`` with rows as produced by ``_parse_statement``.
        """
        self.CashFlows = {'CashFlows': self._parse_statement(Ntable)}
        return self.CashFlows

    def fetch_SES(self, Ntable=4):
        """Parse table ``Ntable`` (default 4) into header texts and value rows.

        All ``<th>`` texts of the table (newlines removed) are collected, in
        document order, into one flat list; each ``<tr>`` that has ``<td>``
        cells contributes one list of cell texts (newlines and commas removed).

        Returns:
            ``{'col_index': [...], 'row_value': [[...], ...]}``, also stored as
            ``self.equity_dict``.

        Raises:
            IndexError: if ``self.tables`` has no entry at ``Ntable``.
        """
        column = []
        tbl = []
        for tr in self.tables[Ntable].select("tr"):
            row = [td.text.replace("\n", "").replace(",", "") for td in tr.select("td")]
            if row:
                tbl.append(row)
            column.extend(th.text.replace("\n", "") for th in tr.select("th"))
        self.equity_dict = {'col_index': column, 'row_value': tbl}
        return self.equity_dict

In [64]:
# Smoke test: fetch a single report (company 3031, 2017 season 3, report id 'C')
# and display the parsed statements as the cell output.
code, year, season, rid = '3031', '2017', '3', 'C'
stock = MOPS_ALL_2018(code, year, season, rid)
data = stock.fetchall()
data

{'data': [{'Balance': [[[1, '資產'], ''],
    [[2, '流動資產'], ''],
    [[3, '現金及約當現金'], ''],
    [[4, '現金及約當現金總額'], '532105'],
    [[3, '透過損益按公允價值衡量之金融資產－流動'], ''],
    [[4, '透過損益按公允價值衡量之金融資產－流動合計'], '100234'],
    [[3, '應收票據淨額'], ''],
    [[4, '應收票據淨額'], '60058'],
    [[3, '應收帳款淨額'], ''],
    [[4, '應收帳款淨額'], '476343'],
    [[3, '應收帳款－關係人淨額'], ''],
    [[4, '應收帳款－關係人淨額'], '87583'],
    [[3, '應收建造合約款'], '215896'],
    [[3, '當期所得稅資產'], ''],
    [[4, '本期所得稅資產合計'], '0'],
    [[3, '存貨'], ''],
    [[4, '存貨合計'], '324816'],
    [[3, '其他流動資產'], ''],
    [[4, '其他金融資產－流動'], '318422'],
    [[4, '其他流動資產－其他'], '29969'],
    [[4, '其他流動資產合計'], '348391'],
    [[3, '流動資產合計'], '2145426'],
    [[2, '非流動資產'], ''],
    [[3, '備供出售金融資產－非流動'], ''],
    [[4, '備供出售金融資產－非流動淨額'], '181592'],
    [[3, '以成本衡量之金融資產－非流動'], ''],
    [[4, '以成本衡量之金融資產－非流動淨額'], '319330'],
    [[3, '採用權益法之投資'], ''],
    [[4, '採用權益法之投資淨額'], '121950'],
    [[3, '不動產、廠房及設備'], ''],
    [[4, '不動產、廠房及設備合計'], '766283'],
    [[3, '遞延所得稅資產'], '19908'],


In [53]:
# Display the request URL that the current `stock` object built in its constructor.
stock.url

'https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=3031&SYEAR=2017&SSEASON=3&REPORT_ID=C'

# Fetch DATA

In [None]:
import os
import time
import random
from requests.exceptions import ConnectionError
from codes import codes


def dlcheck(path):
    """Return the first four characters of every entry name found in ``path``.

    The result is a list with one item per directory entry, in the order
    ``os.listdir`` reports them; names shorter than four characters are
    returned whole.
    """
    return [entry[:4] for entry in os.listdir(path)]

# Report period to download; it also determines the output directory names.
year = '2017'
season = '3'
path = year + 'Q' + season              # receives the BS / PLS / CFS JSON files
csvpath = year + 'Q' + season + 'SES'   # receives the *_ses.json files

os.makedirs(path, exist_ok=True)
os.makedirs(csvpath, exist_ok=True)

# Intersection of the two folders: a code counts as already downloaded only
# when it has a file in BOTH of them.
code_cap = set(dlcheck(path)) & set(dlcheck(csvpath))
print(code_cap)


for code, v in codes.items():
    # Only entries whose type is "股票" and whose market is "上市" are downloaded.
    if not (v.type == "股票" and v.market == "上市"):
        continue
    if code in code_cap:
        continue

    try:
        rid = 'C'
        stock = MOPS_ALL_2018(code, year, season, rid)
        if len(stock.tables) < 3:
            # Too few tables for report id 'C': pause, then retry with report id 'A'.
            time.sleep(random.uniform(2, 5))
            rid = 'A'
            stock = MOPS_ALL_2018(code, year, season, rid)
            if len(stock.tables) < 3:
                with open('nofinreport.txt', 'a') as f:
                    f.write('nodata:' + code + ':' + stock.url + '\n')
                print(stock.url)
                print(code + ' no finance data!')
                continue

        try:
            print('fetch BS, PLS and CFS from:', stock.url)
            data = stock.fetchall()
            filename = path + '/' + code + '-' + year + '-' + 'Q' + season + '.json'
            with open(filename, 'w', encoding='utf8') as f:
                json.dump(data, f)

            print('fetch SES from:', stock.url)
            jsonname = csvpath + '/' + code + '-' + year + '-' + 'Q' + season + '_ses.json'
            ses_data = stock.fetch_SES()
            with open(jsonname, 'w', encoding='utf8') as f:
                json.dump(ses_data, f)
        except IndexError:
            # The page has at least 3 tables but not all of the table positions
            # the fetch_* methods index (1-4).  Log the code and keep crawling
            # instead of aborting the whole run.
            print('error-nodata:', code)
            with open('error.log', 'a+') as f:
                f.write('nodata:' + code + ':' + stock.url + '\n\n')

        # Random pause between companies.
        time.sleep(random.uniform(2, 5))

    except (ConnectionError, requests.exceptions.Timeout):
        # Read timeouts are requests.exceptions.Timeout, not ConnectionError, so
        # both are handled.  The current code is skipped for this run; because it
        # is not in both folders it will be picked up again on the next run.
        code_cap = set(dlcheck(path)) & set(dlcheck(csvpath))
        time.sleep(120)
        continue
print('download finished!')

set()
fetch BS, PLS and CFS from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1101&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch SES from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1101&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch BS, PLS and CFS from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1102&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch SES from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1102&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch BS, PLS and CFS from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1103&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch SES from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1103&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch BS, PLS and CFS from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1104&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch SES from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1104&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch BS, PLS and CFS from: https://mops.t

fetch BS, PLS and CFS from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1312&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch SES from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1312&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch BS, PLS and CFS from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1313&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch SES from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1313&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch BS, PLS and CFS from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1314&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch SES from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1314&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch BS, PLS and CFS from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1315&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch SES from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1315&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch BS, PLS and CFS from: https://mops.twse.co

fetch BS, PLS and CFS from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1442&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch SES from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1442&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch BS, PLS and CFS from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1443&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch SES from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1443&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch BS, PLS and CFS from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1444&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch SES from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1444&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch BS, PLS and CFS from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1445&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch SES from: https://mops.twse.com.tw/server-java/t164sb01?step=1&CO_ID=1445&SYEAR=2017&SSEASON=3&REPORT_ID=C
fetch BS, PLS and CFS from: https://mops.twse.co

In [47]:
# Inspect the parsed <table> elements held by whatever `stock` currently refers to
# in the kernel (this depends on state left behind by an earlier cell).
stock.tables

[<table border="0" width="800px">
 <tr>
 <th align="center" nowrap="">公司代號：</th>
 <td align="center" nowrap=""><input id="CO_ID" maxlength="6" name="CO_ID" size="15" type="text" value="1418"/></td>
 <td align="center" nowrap=""><input id="qryId" name="qryId" onclick="showDialog('','/server-java/t164sb01?step=2','success');" type="button" value="代號查詢"/></td>
 <th align="center" nowrap="">年度：</th>
 <td align="center" nowrap=""> <select id="SYEAR" name="SYEAR"> <option value="2013"> 102 </option> <option value="2014"> 103 </option> <option value="2015"> 104 </option> <option value="2016"> 105 </option> <option selected="selected" value="2017"> 106 </option> <option value="2018"> 107 </option> <option value="2019"> 108 </option> <option value="2020"> 109 </option> <option value="2021"> 110 </option> </select> </td>
 <th align="center" nowrap="">季別：</th>
 <td align="center" nowrap=""><b>第 <select id="SSEASON" name="SSEASON"> <option value="1"> 1 </option> <option value="2"> 2 </option> <opt