<a href="https://colab.research.google.com/github/henryho1324/taiwan_business_dat_API/blob/main/%E4%BB%A5%E7%B5%B1%E7%B7%A8%E6%9F%A5%E8%A9%A2%E6%94%BF%E5%BA%9C%E7%87%9F%E6%A5%AD%EF%BC%8F%E5%85%AC%E5%8F%B8%E7%99%BB%E8%A8%98%E8%B3%87%E6%96%99.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 定義API 串接函數
目的：串接政府開放資料 API，以統一編號（統編）查詢商業登記與公司登記之資料。


In [None]:
def _roc_to_ad_datestr(roc_datestr):
    """Convert a numeric ROC (民國) date string to an A.D. date string.

    The conversion adds 19110000 to the integer value, i.e. 1911 to the year
    part. ``int()`` raises ``ValueError``/``TypeError`` for non-numeric input.
    """
    return str(int(roc_datestr) + 19110000)


def tax_id_basic_info(tax_id, *, timeout=30):
    """Look up basic registration data for a tax ID via the GCIS open-data API.

    The business-registration (商業登記) datasets are queried first. If that
    lookup fails for any reason, the company-registration (公司登記) dataset is
    queried instead. If both fail, the tax ID is reported as invalid.

    Note that network or parsing failures are indistinguishable from "not
    found" in the return value; the underlying error is logged at DEBUG level.

    Args:
        tax_id: The tax ID (統一編號). It is converted with ``str()`` before use.
        timeout: Seconds passed to ``requests.get`` for every HTTP request, so
            a stalled connection cannot block a batch run forever.

    Returns:
        A dict that always contains ``'tax_id'`` (str) and
        ``'tax_id_is_valid'`` (bool). On success it also contains
        ``'business_name'``, ``'responsible_name'`` and
        ``'business_last_change_datestr'``; ``'business_type'`` is present only
        when the data came from the business-registration datasets.
    """
    import logging

    import requests

    logger = logging.getLogger(__name__)
    api_base = "https://data.gcis.nat.gov.tw/od/data/api"
    response_format = "json"
    tax_id_str = str(tax_id)

    def first_record(dataset_id, odata_filter):
        """Return the first record of the JSON list returned by one dataset query."""
        url = f"{api_base}/{dataset_id}?$format={response_format}&$filter={odata_filter}"
        # NOTE(review): verify=False disables TLS certificate verification. It is
        # kept as in the original -- presumably required by this host's
        # certificate chain; confirm and remove if possible.
        response = requests.get(url, verify=False, timeout=timeout)
        return response.json()[0]

    # Registrations are split by business scale into "business registration"
    # and "company registration". Try business registration first.
    try:
        # Business registration basic data, application 3 (https://reurl.cc/Vj32yy).
        # The responsible person and last-change date live in application 1,
        # but application 1 can only be queried with the 'Agency' value that
        # application 3 returns.
        agency = first_record(
            "426D5542-5F05-43EB-83F9-F1300F14E1F1",
            f"President_No eq {tax_id_str}",
        )['Agency']

        # Business registration basic data, application 1 (https://reurl.cc/Vj32yy).
        record = first_record(
            "7E6AFA72-AD6A-46D3-8681-ED77951D912D",
            f"President_No eq {tax_id_str} and Agency eq {agency}",
        )
        return {
            'tax_id': tax_id_str,
            'tax_id_is_valid': True,
            'business_name': record['Business_Name'],  # business name
            'responsible_name': record['Responsible_Name'],  # responsible person
            # organisation type (sole proprietorship, partnership, etc.)
            'business_type': record['Business_Organization_Type_Desc'],
            'business_last_change_datestr': _roc_to_ad_datestr(
                record['Business_Last_Change_Date']
            ),
        }
    except Exception:
        # Best-effort fallback. `Exception` (not a bare except) lets
        # KeyboardInterrupt/SystemExit stop a long batch run.
        logger.debug("Business registration lookup failed for %s",
                     tax_id_str, exc_info=True)

    try:
        # Company registration basic data, application 1 (https://reurl.cc/Vj32yy).
        # This dataset only needs the tax ID (Business_Accounting_NO).
        record = first_record(
            "5F64D864-61CB-4D0D-8AD9-492047CC1EA6",
            f"Business_Accounting_NO eq {tax_id_str}",
        )
        return {
            'tax_id': tax_id_str,
            'tax_id_is_valid': True,
            'business_name': record['Company_Name'],  # company name
            'responsible_name': record['Responsible_Name'],  # responsible person
            'business_last_change_datestr': _roc_to_ad_datestr(
                record['Change_Of_Approval_Data']
            ),
        }
    except Exception:
        logger.debug("Company registration lookup failed for %s",
                     tax_id_str, exc_info=True)

    # Neither dataset produced a usable record.
    return {'tax_id': tax_id_str, 'tax_id_is_valid': False}


#### 自定義函數測試

In [None]:
# Smoke test: look up a single known tax ID and display the returned dict
tax_id_basic_info("50819934")



{'business_last_change_datestr': '20210906',
 'business_name': '台灣星豆有限公司',
 'responsible_name': '馬雅芬',
 'tax_id': '50819934',
 'tax_id_is_valid': True}

### 將 tax_id_basic_info 函數應用於存有統編資料之csv檔

In [None]:
# import packages
import numpy as np
import pandas as pd
from google.colab import files

In [None]:
# CSV data source: nationwide business (tax) registration dataset (https://data.gov.tw/dataset/9400)
# Both ID columns are read as str so the IDs stay strings rather than numbers.
df = pd.read_csv('/content/drive/MyDrive/Data/vat_num/全國營業(稅籍)登記資料集.csv', dtype={'統一編號':str, '總機構統一編號':str})

# In this dataset small and medium businesses have a null '總機構統一編號'
# (head-office tax ID) and only a '統一編號' (tax ID), so fall back to the latter.
df['總機構統一編號'] = df['總機構統一編號'].fillna(df['統一編號'])

# Collect the distinct head-office tax IDs (a NumPy array).
# Rows where both ID columns are null are still NaN after the fill; drop them
# so NaN is never passed to the API lookup as the string 'nan'.
tax_id_list = df['總機構統一編號'].dropna().unique()

#### 測試 (100 observations)

In [None]:
# Take the first 100 tax IDs as a small trial subset
tax_id_list_short = tax_id_list[:100]

# Show the subset size, then the subset itself, one per line
print(len(tax_id_list_short), tax_id_list_short, sep="\n")

100
[nan '38965019' '61194605' '82554400' '47588972' '21822468' '99925659'
 '19246243' '87004192' '99944261' '25277284' '72397281' '45540235'
 '89771986' '82551213' '72430556' '91016889' '17713903' '38965257'
 '45736596' '28921040' '47525098' '10925136' '61432901' '61755042'
 '87160263' '42586052' '26825181' '34889201' '94927616' '61362905'
 '39756399' '55891293' '87160828' '45676767' '61410505' '77370108'
 '42764318' '88561272' '83128593' '55967194' '87003412' '21471364'
 '84987494' '45593598' '12041266' '99381678' '72597942' '72443665'
 '85996750' '82376975' '47522228' '61782646' '81507204' '18138479'
 '25473954' '47590484' '77375618' '82404727' '87159603' '82403305'
 '87163374' '14605472' '39701114' '87820130' '61502103' '82552331'
 '84859977' '10920575' '82400877' '12817106' '61918968' '82554295'
 '39698856' '77385658' '82550397' '38880758' '47353124' '25228159'
 '14609815' '14603220' '25419220' '82373512' '82551065' '18126847'
 '88409376' '61799436' '39919106' '87841698' '81509414

In [None]:
# Call tax_id_basic_info once per tax ID and keep the returned dicts in input order
list_of_dict = [tax_id_basic_info(tax_id) for tax_id in tax_id_list_short]

list_of_dict



[{'tax_id': 'nan', 'tax_id_is_valid': False},
 {'business_last_change_datestr': '20150413',
  'business_name': '原味商行',
  'business_type': '獨資',
  'responsible_name': '連麗雀',
  'tax_id': '38965019',
  'tax_id_is_valid': True},
 {'business_last_change_datestr': '20180918',
  'business_name': '和興商店',
  'business_type': '獨資',
  'responsible_name': '黃秀英',
  'tax_id': '61194605',
  'tax_id_is_valid': True},
 {'business_last_change_datestr': '20191106',
  'business_name': '啓輝環管企業社',
  'business_type': '獨資',
  'responsible_name': '簡裕政',
  'tax_id': '82554400',
  'tax_id_is_valid': True},
 {'business_last_change_datestr': '20161101',
  'business_name': '龍昇工程行',
  'business_type': '獨資',
  'responsible_name': '張玉井',
  'tax_id': '47588972',
  'tax_id_is_valid': True},
 {'tax_id': '21822468', 'tax_id_is_valid': False},
 {'business_last_change_datestr': '20141209',
  'business_name': '百味香小吃部',
  'business_type': '獨資',
  'responsible_name': '蕭英志',
  'tax_id': '99925659',
  'tax_id_is_valid': True},
 {

In [None]:
# Turn the list of result dicts into a DataFrame, one row per looked-up tax ID
api_info_df = pd.DataFrame(data=list_of_dict)
api_info_df

Unnamed: 0,tax_id,tax_id_is_valid,business_name,responsible_name,business_type,business_last_change_datestr
0,,False,,,,
1,38965019,True,原味商行,連麗雀,獨資,20150413
2,61194605,True,和興商店,黃秀英,獨資,20180918
3,82554400,True,啓輝環管企業社,簡裕政,獨資,20191106
4,47588972,True,龍昇工程行,張玉井,獨資,20161101
...,...,...,...,...,...,...
95,14610088,True,品瑄食品,柯,獨資,20210823
96,87003503,True,氏修商行,丁氏修,獨資,20200309
97,88455743,True,金栩工程行,張羽淳,獨資,20211020
98,25321768,True,吉軒水產行,田淑綿,獨資,20171024
