In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import sys
import csv
from random import randint
import re
import json

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}


In [56]:
def getIndusValue(keySoup):
    '''
    Build a cleaned, ';'-separated string from the cell that follows an
    industry key cell.

    keySoup : soup of the industry key cell; the value is read from its next
              sibling <td>.
    output  : the sibling's text where each "code name" pair is joined as
              "code_name" and every remaining run of spaces becomes ';'.
              Returns 'indusValueError' when the sibling cell cannot be read.
    '''
    try:
        # The value cell is the <td> right after the key cell.
        indus_type = keySoup.find_next_sibling('td').text
        # Normalise CR / LF / TAB / NBSP to plain spaces. No '|' in the class:
        # inside [...] it is a literal pipe, not an alternation operator.
        indus_type = re.sub(r'[\r\n\t \xa0]', ' ', indus_type)
        # Glue a trailing digit of the code to the following Chinese name.
        indus_type = re.sub(r'([0-9])[ ]+([\u4E00-\u9FFF\u3000-\u303F])', r'\1_\2', indus_type)
        # Whatever spacing is left separates one industry entry from the next.
        indus_type = re.sub(r'[ ]+', r';', indus_type)
    except Exception:
        # Best effort: a malformed row yields a marker instead of aborting,
        # while KeyboardInterrupt / SystemExit still propagate.
        return 'indusValueError'
    return indus_type

# getIndusValue(keySoups[9])



def parseElements(keySoups):
    '''
    Turn the key cells of a company detail table into a dict.

    keySoups : soups of all keys, i.e. soup.find('tbody').findAll('td', {'class':'txt_td'})
    output   : dict mapping each key cell's text to the cleaned value taken
               from the next sibling <td>.  A value that cannot be read is
               replaced by a marker string ('compNameError' for the company
               name, '<key>Error' for ordinary fields) instead of raising.
    '''
    dataDict = dict()
    for keysoup in keySoups:
        # Skip anything that does not expose a 'text' attribute.
        if not hasattr(keysoup, 'text'):
            continue
        key = keysoup.text
        if key == '所營事業資料':
            # The industry list needs getIndusValue() to parse.
            dataDict[key] = getIndusValue(keysoup)
            continue
        if key == '公司名稱':
            # The company name is taken from the span's onclick attribute,
            # keeping only characters in U+4E00-U+9FFF and U+3000-U+303F.
            try:
                compName = keysoup.find_next_sibling('td').span['onclick']
                dataDict[key] = re.sub(r'[^\u4E00-\u9FFF\u3000-\u303F]', '', compName)
            except Exception:
                dataDict[key] = 'compNameError'
            continue
        # Every other field: first direct text node of the value cell.
        try:
            value = keysoup.find_next_sibling('td').find(text=True, recursive=False)
            # Strip CR / LF / TAB / space / NBSP only; '|' inside [...] would
            # be a literal pipe and must not be removed from the data.
            dataDict[key] = re.sub(r'[\r\n\t \xa0]', '', value)
        except Exception:
            # Exception (not a bare except) so Ctrl-C still stops the crawl.
            dataDict[key] = key + 'Error'
    return dataDict

def getDataFromUrl(url, timeout=30):
    '''
    Download one company detail page and return its fields as a dict.

    url     : address of the detail page.
    timeout : seconds handed to requests.get(); requests waits indefinitely
              when no timeout is given, so a stalled server would otherwise
              block the caller forever.
    output  : dict produced by parseElements(); an empty dict (after printing
              a message) when the response contains no <tbody>.
    '''
    # Local name so the module-level 'headers' is not shadowed.
    requestHeaders = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
                      'Referer': 'https://findbiz.nat.gov.tw/fts/query/QueryList/queryList.do'}
    r = requests.get(url, headers=requestHeaders, timeout=timeout)
    soup = BeautifulSoup(r.text, 'html.parser')
    tbody = soup.find('tbody')
    if tbody is None:
        # find() returns None when the tag is absent; report it instead of
        # failing with an AttributeError on None.
        print('getDataFromUrl: no <tbody> found for %s' % (url))
        return {}
    keySoups = tbody.findAll('td', {'class':'txt_td'})
    return parseElements(keySoups)

# Sample detail pages for manual testing; exactly one is active at a time.
# url = 'https://findbiz.nat.gov.tw/fts/query/QueryCmpyDetail/queryCmpyDetail.do?objectId=SEM3MDQ0MDE2Mw==&banNo=70440163'
url = 'https://findbiz.nat.gov.tw/fts/query/QueryCmpyDetail/queryCmpyDetail.do?objectId=SEMyODY3OTA5Mg==&banNo=28679092'
# url = 'https://findbiz.nat.gov.tw/fts/query/QueryCmpyDetail/queryCmpyDetail.do?objectId=SEM1MjU4MDY5Nw==&banNo=52580697'
# url = 'https://findbiz.nat.gov.tw/fts/query/QueryCmpyDetail/queryCmpyDetail.do?objectId=SEMxNjIxMTQxOQ==&banNo=16211419'


# with open('data.txt', 'w') as outfile:
# json.dumps(getDataFromUrl(url)).decode('unicode-escape').encode('utf8')
def appendJsonInfo(outputFile, url, value=None):
    '''
    Fetch the company data behind url and append it to outputFile as one
    '"<url>": {...}' member of a JSON object.

    outputFile : path of the file to append to (written as UTF-8).
    url        : company detail page; used both to fetch the data and as key.
    value      : unused; kept so existing three-argument calls keep working.
    '''
    # ensure_ascii=False keeps the Chinese text readable in the file without
    # post-processing the \uXXXX escapes by hand (which breaks on escaped
    # backslashes and on surrogate pairs).
    jsonString = json.dumps(getDataFromUrl(url), ensure_ascii=False)
    # Peek at the end of the file: a separating comma is only needed when an
    # earlier member is already there, not right after the opening '{' or in
    # an empty / missing file.
    try:
        with open(outputFile, 'rb') as existing:
            existing.seek(0, 2)  # whence=2: relative to the end of the file
            existing.seek(max(existing.tell() - 16, 0))
            tail = existing.read().rstrip()
    except FileNotFoundError:
        tail = b''
    separator = ',\n' if tail and not tail.endswith(b'{') else ''
    with open(outputFile, "a", encoding='utf-8') as text_file:
        # json.dumps() quotes and escapes the key properly.
        text_file.write('{}{}:'.format(separator, json.dumps(url, ensure_ascii=False)))
        text_file.write(jsonString)

Decoded Unicode: {"統一編號": "28679092", "公司所在地": "臺北市大安區杭州南路2段9巷3號1樓", "所營事業資料": ";F401010_國際貿易業;F111090_建材批發業;F211010_建材零售業;E801010_室內裝潢業;J101030_廢棄物清除業;J101010_建築物清潔服務業;J101090_廢棄物清理業;E599010_配管工程業;E801020_門窗安裝工程業;E801030_室內輕鋼架工程業;E801040_玻璃安裝工程業;E801070_廚具、衛浴設備安裝工程業;E603120_噴砂工程業;A102080_園藝服務業;I503010_景觀、室內設計業;F101100_花卉批發業;E903010_防蝕、防銹工程業;F106010_五金批發業;F107010_漆料、塗料批發業;F207010_漆料、塗料零售業;F399040_無店面零售業;E901010_油漆工程業;ZZ99999_除許可業務外，得經營法令非禁止或限制之業務;", "公司狀況": "核准設立", "核准設立日期": "096年05月24日", "代表人姓名": "王慮成", "公司名稱": "宏實業有限公司", "最後核准變更日期": "105年06月27日", "資本總額(元)": "1,000,000", "登記機關": "臺北市政府"}


In [57]:
# Load the scraped records. The context manager closes the file handle and the
# explicit encoding avoids depending on the platform's default codec.
with open('data2.json', encoding='utf-8') as json_file:
    data = json.load(json_file)
data

{'1': {'代表人姓名': '',
  '停業日期(起)': '088年08月09日',
  '停業日期(迄)': '089年08月08日',
  '公司名稱': '實有限公司',
  '公司所在地': '臺北市大安區敦化南路２段２１６號２０樓',
  '公司狀況': '解散(089年01月11日建商二字第89251862號)',
  '所營事業資料': ';０１一般進出口貿易業務︹許可業務除外︺;０２電子五金機械電器器材之買賣及進出口業務;０３電腦設備通信器材之設計按裝及買賣進出口業務;０４事務用品器械之買賣進出口業務;０５Ｊ６０２０１０演藝活動業;',
  '最後核准變更日期': '089年01月11日',
  '核准設立日期': '086年09月02日',
  '登記機關': '臺北市政府',
  '統一編號': '16211419',
  '資本總額(元)': '5,000,000'},
 'second': {'代表人姓名': '張貞琳',
  '公司名稱': '釆烽有限公司',
  '公司所在地': '臺北市松山區敦化北路170號3樓',
  '公司狀況': '核准設立',
  '所營事業資料': ';F101040_家畜家禽批發業;F101050_水產品批發業;F101130_蔬果批發業;F101990_其他農、畜、水產品批發業;F102020_食用油脂批發業;F102040_飲料批發業;F102050_茶葉批發業;F102170_食品什貨批發業;F103010_飼料批發業;F107020_染料、顏料批發業;F107170_工業助劑批發業;F107200_化學原料批發業;F107990_其他化學製品批發業;F112040_石油製品批發業;F113010_機械批發業;F113990_其他機械器具批發業;F119010_電子材料批發業;F121010_食品添加物批發業;F203010_食品什貨、飲料零售業;F207020_染料、顏料零售業;F207170_工業助劑零售業;F207200_化學原料零售業;F207990_其他化學製品零售業;F213080_機械器具零售業;F213990_其他機械器具零售業;F219010_電子材料零售業;F221010_食品添加物零售業;F399040_無店面零售業;F401010_國際貿易業;ZZ99999_除許可業務

In [27]:
# companies = soup.findAll('div', {'class': 'panel panel-default'})
def getCompanyHref(company):
    '''
    Return the absolute detail-page url of one company entry.

    company : soup of company
    output  : 'https://findbiz.nat.gov.tw' + the href of the entry's first
              <a>, with CR / LF removed; 'urlError' when that href cannot
              be read.
    Example :
    companies = soup.findAll('div', {'class': 'panel panel-default'})
    company = companies[7]
    '''
    try:
        href = company.a['href']
    except Exception:
        # No <a> or no href attribute. Exception (not a bare except) so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        return 'urlError'
    # Reuse the value read above; only line breaks are stripped ('|' inside
    # [...] would be a literal pipe, so it is not part of the class).
    href_fix = 'https://findbiz.nat.gov.tw' + re.sub(r'[\r\n]', '', href)
    return href_fix
# getCompanyHref(companies[2])

def getCompanyName(company):
    '''
    Return the company name shown in one company entry.

    company : soup of company
    output  : text of the entry's first <a> with CR / LF / TAB / spaces
              removed; 'companyNameError' when it cannot be read.
    Example :
    companies = soup.findAll('div', {'class': 'panel panel-default'})
    company = companies[7]
    '''
    try:
        # Only whitespace is stripped; '|' inside [...] would be a literal
        # pipe, so it is not part of the class.
        company_name = re.sub(r'[\r\n\t ]', '', company.a.text)
    except Exception:
        # Exception (not a bare except) keeps Ctrl-C working during a crawl.
        return 'companyNameError'
    # company's info: kept for a possible later extension
    # company_info = re.sub(r'[\r|\n|\t| ]', '', company.findAll('div')[1].text)
    return company_name
# getCompanyName(companies[2])


def getPageTotal(soup):
    '''
    Read the number of result pages from a query result page.

    soup   : request page's soup
    output : page count as an int, so it can be used directly in range().
             Returns 1 (after printing a message) when the total cannot be
             located or parsed.
    '''
    try:
        totalTab = soup.find('span', {'id': 'lblTopTotal'})
    except Exception:
        print('getPageTotal fail !')
        return 1
    try:
        testString = re.sub(r'[\n|\t|\r|\xa0]', '', totalTab.parent.text)
        # Drop everything up to the last '分', everything from the following
        # '頁' onward, and the thousands separators.
        pageTotal = re.sub(r'(^.*[分]|[頁].*$|,)', '', testString)
        # Both exits now return an int; int() tolerates surrounding spaces
        # and raises ValueError on anything that is not a number.
        return int(pageTotal)
    except Exception:
        print('getPageTotal parsing fail !')
        return 1
# getPageTotal(soup)

def setPayload(currentPage, busiItemSub):
    '''
    Assemble the form fields sent as POST data for one query-list page.

    currentPage : page number to request, e.g. '103' (passed through str()).
    busiItemSub : business item code, e.g. 'C301010' (passed through str()).
    output      : dict holding the page field, the fixed query fields and the
                  business item field, in that order.
    '''
    pageField = ('pagingModel.currentPage', str(currentPage))
    # Fields that are the same for every request.
    fixedFields = [
        ('model.qryCond', '公司'),
        ('model.isAlive', 'true'),
        ('model.cmpyType', 'true'),
        ('model.infoType', 'D'),
    ]
    itemField = ('model.busiItemSub', str(busiItemSub))
    return dict([pageField] + fixedFields + [itemField])
# setPayload(1234, 'asdf')


# Crawl driver: for every business item in config/items.csv, walk all result
# pages of the query list and append each listed company's detail data to
# outputFile as one member of a single JSON object.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
          'Referer': 'https://findbiz.nat.gov.tw/fts/query/QueryList/queryList.do'}
url = 'https://findbiz.nat.gov.tw/fts/query/QueryList/queryList.do'

items = pd.read_csv('config/items.csv')
items = items['itemId']
outputFile = 'urls/url.csv'
# Seconds to wait for the server; requests never times out on its own.
REQUEST_TIMEOUT = 30


with open(outputFile, "w", encoding='utf-8') as text_file:
    text_file.write('{')
for item in items:
    # Find pageTotal of every item, say, C301010 has 21 pages.
    print('Iterating item: %s now...' % (item))
    payload = setPayload(1, item)
    # Random pause between requests to avoid hammering the server.
    time.sleep(randint(2,6))
    try:
        r = requests.post(url, headers=headers, data=payload, timeout=REQUEST_TIMEOUT)
    except requests.RequestException:
        print('Request item fail...')
        continue
    soup = BeautifulSoup(r.text, 'html.parser')
    # range() below needs an int; coerce here so a textual page count works.
    try:
        pageTotal = int(getPageTotal(soup))
    except (TypeError, ValueError):
        print('Invalid page total for item %s, assuming 1 page.' % (item))
        pageTotal = 1
    for page in range(1, pageTotal + 1):
        # Request all pages of the item, say, C301010.
        print('Iterating item: %s, page: %d now...' % (item, page))
        payload = setPayload(page, item)
        time.sleep(randint(2,6))
        try:
            r = requests.post(url, headers=headers, data=payload, timeout=REQUEST_TIMEOUT)
        except requests.RequestException:
            print('Request page fail...')
            continue
        soup = BeautifulSoup(r.text, 'html.parser')
        companies = soup.findAll('div', {'class': 'panel panel-default'})
        for company in companies:
            # Parse all company's href and name from the page.
            # companyName = getCompanyHref(company)
            href = getCompanyHref(company)
            if href == 'urlError':
                # getCompanyHref() found no link; there is nothing to fetch.
                print('Company href fail, skipping...')
                continue
            # Write information into outputFile. The third argument is not
            # used by appendJsonInfo(), so None is passed explicitly.
            try:
                appendJsonInfo(outputFile, href, None)
            except (requests.RequestException, AttributeError) as error:
                # One unreachable or unparsable detail page must not abort
                # the whole crawl.
                print('Company detail fail: %s (%s)' % (href, error))
with open(outputFile, "a", encoding='utf-8') as text_file:
    text_file.write('}')


{'model.busiItemSub': 'asdf',
 'model.cmpyType': 'true',
 'model.infoType': 'D',
 'model.isAlive': 'true',
 'model.qryCond': '公司',
 'pagingModel.currentPage': '1234'}