## 從公益資訊中心網站抓取NPO組織之名稱與電子郵件信箱

Website: http://www.npo.org.tw/npolist.aspx?nowPage=1&tid=146  
Lecture notes: https://drive.google.com/drive/u/0/folders/1R16elMdFuviTiYnFvSigca9_ljnblMlY

### Steps
1. Get the 機構代碼, 機構屬性, 非營利組織名稱 for an NPO
2. Get the email address for the same NPO
3. Loop through the NPOs on the same page
4. Loop through all pages

In [None]:
from urllib.request import urlopen
import urllib
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import re

In [None]:
def get_email(orgid):
    """Fetch an organisation's introduction page and return its e-mail text.

    Args:
        orgid: Organisation id interpolated into the ``orgid`` query
            parameter of the introduction-page URL.

    Returns:
        The text of the second child node of the last ``<h3>`` element that
        contains the string ``'Email：'``, or ``None`` when no such ``<h3>``
        exists on the page.
    """
    # The context manager closes the HTTP response once the body is read.
    with urlopen(f"http://www.npo.org.tw/orgnpointroduction.aspx?tid=200&orgid={orgid}") as url:
        url_npo = url.read().decode('utf-8')
    parser = BeautifulSoup(url_npo, 'html.parser')

    # Default for pages without an e-mail heading; previously the name was
    # left unbound and the function raised UnboundLocalError.
    email = None
    for line in parser.find_all("h3"):
        if line.find(string='Email：'):
            # NOTE(review): assumes the address is the second child of the
            # heading (the first presumably being the label) - confirm.
            email = list(line.children)[1].get_text()
    return email


# Column-aligned accumulator: one list per output column, one entry per NPO.
# Only created when it does not exist yet, so re-running this cell in a live
# session keeps the rows already collected, while a fresh kernel no longer
# fails with NameError on the first append.
if 'npos' not in globals():
    npos = {'機構代碼': [], "機構屬性": [], "非營利組織名稱": [], "Email": []}

# NOTE(review): the range starts at page 51 - presumably pages 1-50 were
# scraped in an earlier run; confirm before relying on the output.
for page_num in range(51, 491):
    print(f'Parsing page {page_num}...')
    # The context manager closes the HTTP response once the body is read.
    with urlopen(f"http://www.npo.org.tw/npolist.aspx?nowPage={page_num}&tid=146") as url:
        url_html = url.read().decode('utf-8')
    soup = BeautifulSoup(url_html, 'html.parser')
    ls = soup.find_all('td')

    cnt = 0
    for line in ls:
        # .get() returns None for a <td> without a data-th attribute instead
        # of raising KeyError like line.attrs['data-th'] would.
        column = line.get('data-th')

        if column == '機構代碼':
            # Organisation id, plus the e-mail scraped from its detail page.
            npos['機構代碼'].append(line.get_text())
            npos['Email'].append(get_email(int(line.get_text())))
            cnt += 1
        elif column == '機構屬性':
            # Organisation type.
            npos['機構屬性'].append(line.get_text())
        elif column == '非營利組織名稱':
            # Organisation name.
            npos['非營利組織名稱'].append(line.get_text())

    print(f'Number of NPOs in page {page_num}: {cnt}')
    print('----------')

    # Checkpoint every 50 pages so a crash does not lose everything.
    if page_num % 50 == 0:
        # All columns must have the same length for the DataFrame to build.
        print("Check: ", [len(v) for v in npos.values()])
        df = pd.DataFrame.from_dict(npos)
        df.to_csv(f"~/Downloads/NPOs_List/df{page_num}.csv", index=False)
        print(f"Export df{page_num}.csv")
        print('----------')

Parsing page 51...


NameError: ignored

In [None]:
# Write everything accumulated in `npos` to the CSV named after the last page.
page_num = 490
final_csv_path = f"~/Downloads/NPOs_List/df{page_num}.csv"
df = pd.DataFrame.from_dict(npos)
df.to_csv(final_csv_path, index=False)

### Grab detailed information about NPOs

In [None]:
def _h4_field(headings, h4_index, child_index, skip=0):
    """Return one child node of ``headings[h4_index]``, or NaN if absent.

    Args:
        headings: Sequence of ``<h4>`` tags found on the page.
        h4_index: Position of the wanted ``<h4>`` in ``headings``.
        child_index: Position of the wanted node among that tag's children.
        skip: Number of leading characters to drop from the node; ``0``
            returns the node unchanged.

    Returns:
        The (optionally sliced) child node, or ``np.nan`` when either index
        is out of range.
    """
    try:
        node = list(headings[h4_index].children)[child_index]
    except IndexError:
        return np.nan
    return node[skip:] if skip else node


# (output column, <h4> index, child index, leading characters to drop).
# NOTE(review): the 5-character skip presumably strips a label such as
# "成立日期：" from the node text - confirm against the live page markup.
_DETAIL_LAYOUT = (
    ('成立日期', 10, 0, 5),
    ('成立主旨', 13, 2, 0),
    ('工作重點', 14, 2, 0),
    ('服務區域', 15, 0, 5),
    ('服務項目', 18, 0, 5),
)

df = pd.read_csv("~/Downloads/NPOs_List/NPOs_Contact_Info.csv")
total = df.shape[0]

# Column-aligned accumulator: one list per output column, one entry per NPO.
npos_context = {'機構代碼': [], '成立日期': [], '成立主旨': [], '工作重點': [],
                '服務區域': [], '服務項目': []}

# The first column of the contact-info file supplies the organisation ids.
for i, orgid in enumerate(df.iloc[:, 0]):
    npos_context['機構代碼'].append(orgid)

    # Fetch the detail page; the context manager closes the HTTP response
    # as soon as the body has been read.
    with urlopen(f"http://www.npo.org.tw/orgnpointroduction.aspx?tid=200&orgid={orgid}") as url:
        url_npo = url.read().decode('utf-8')
    parser = BeautifulSoup(url_npo, 'html.parser')
    headings = parser.find_all('h4')

    # Every field is read from a fixed position; a missing one becomes NaN
    # so that all columns stay the same length.
    for column, h4_index, child_index, skip in _DETAIL_LAYOUT:
        npos_context[column].append(
            _h4_field(headings, h4_index, child_index, skip)
        )

    # Report progress every 50 organisations.
    if i % 50 == 0:
        # All columns must have the same length for the DataFrame to build.
        print("Check: ", [len(v) for v in npos_context.values()])
        print(f"Progress: {i}/{total} {int(round(i/total*100))}%")
        print('----------')

    # Checkpoint every 1500 organisations (this also fires at i == 0).
    if i % 1500 == 0:
        df_context = pd.DataFrame.from_dict(npos_context)
        df_new = df.merge(df_context, on="機構代碼")
        df_new.to_csv(f"~/Downloads/NPOs_List_New/df{i}.csv", index=False)


# Merge the scraped details with the contact info and export the result.
# The final file is ';'-separated, unlike the ','-separated checkpoints.
df_context = pd.DataFrame.from_dict(npos_context)
df_new = df.merge(df_context, on="機構代碼")
df_new.to_csv("~/Downloads/NPOs_List_New/df_final.csv", index=False, sep=';')

Check:  [1, 1, 1, 1, 1, 1]
Progress: 0/7350 0%
----------
Check:  [51, 51, 51, 51, 51, 51]
Progress: 50/7350 1%
----------
Check:  [101, 101, 101, 101, 101, 101]
Progress: 100/7350 1%
----------
Check:  [151, 151, 151, 151, 151, 151]
Progress: 150/7350 2%
----------
Check:  [201, 201, 201, 201, 201, 201]
Progress: 200/7350 3%
----------
Check:  [251, 251, 251, 251, 251, 251]
Progress: 250/7350 3%
----------
Check:  [301, 301, 301, 301, 301, 301]
Progress: 300/7350 4%
----------
Check:  [351, 351, 351, 351, 351, 351]
Progress: 350/7350 5%
----------
Check:  [401, 401, 401, 401, 401, 401]
Progress: 400/7350 5%
----------
Check:  [451, 451, 451, 451, 451, 451]
Progress: 450/7350 6%
----------
Check:  [501, 501, 501, 501, 501, 501]
Progress: 500/7350 7%
----------
Check:  [551, 551, 551, 551, 551, 551]
Progress: 550/7350 7%
----------
Check:  [601, 601, 601, 601, 601, 601]
Progress: 600/7350 8%
----------
Check:  [651, 651, 651, 651, 651, 651]
Progress: 650/7350 9%
----------
Check:  [701,

Check:  [5251, 5251, 5251, 5251, 5251, 5251]
Progress: 5250/7350 71%
----------
Check:  [5301, 5301, 5301, 5301, 5301, 5301]
Progress: 5300/7350 72%
----------
Check:  [5351, 5351, 5351, 5351, 5351, 5351]
Progress: 5350/7350 73%
----------
Check:  [5401, 5401, 5401, 5401, 5401, 5401]
Progress: 5400/7350 73%
----------
Check:  [5451, 5451, 5451, 5451, 5451, 5451]
Progress: 5450/7350 74%
----------
Check:  [5501, 5501, 5501, 5501, 5501, 5501]
Progress: 5500/7350 75%
----------
Check:  [5551, 5551, 5551, 5551, 5551, 5551]
Progress: 5550/7350 76%
----------
Check:  [5601, 5601, 5601, 5601, 5601, 5601]
Progress: 5600/7350 76%
----------
Check:  [5651, 5651, 5651, 5651, 5651, 5651]
Progress: 5650/7350 77%
----------
Check:  [5701, 5701, 5701, 5701, 5701, 5701]
Progress: 5700/7350 78%
----------
Check:  [5751, 5751, 5751, 5751, 5751, 5751]
Progress: 5750/7350 78%
----------
Check:  [5801, 5801, 5801, 5801, 5801, 5801]
Progress: 5800/7350 79%
----------
Check:  [5851, 5851, 5851, 5851, 5851, 5

In [None]:
# Copy the merged table to the system clipboard, without the index column.
# The ';'-separated CSV export is kept below for reference but is disabled.
# df_new.to_csv(f"~/Downloads/NPOs_List_New/df_final.csv", index=False, sep=';')
df_new.to_clipboard(index=False)

In [None]:
# Display the rows labelled 7036 through 7050 (.loc slices by label and
# includes both endpoints) to spot-check the merged data.
df_new.loc[7036:7050,]

Unnamed: 0,機構代碼,機構屬性,非營利組織名稱,Email,成立日期,成立主旨,工作重點,服務區域,服務項目
7036,190,,陳茂榜紀念文教基金會,naoki@sampo.org.tw,1970/01/14,1.舉辦或獎助文教事業 2.推動兩岸學術交流,1.新力行講學金：大專學生依產業、社會或國際三類任選其一撰寫企劃報告，經評審後，優勝者給予講...,,
7037,189,,中華啟能基金會,,1983/10/29,1創設庇護機構，施以技能訓練及生活輔導.2推廣智能障礙者的救育.3提供智能障礙者的工作機會4...,,,
7038,188,,祐生研究基金會,archilife@archilife.org,1978/12/30,"籌設生態研究中心, 提供研究生活居住環境定性定量之品質改進方案, 舉辦獎助鼓勵有志之士就各該...","1.桑思特組:\n2.共生生態研究:\n3.知識庫:拆解各學科知識,成為可即時使用之資料\n...",,
7039,187,綜合性服務,善立文教慈愛基金會,sunlit@seed.net.tw,1994/03/22,倡導藝術創作及鑑賞活動，提昇國民生活品質，舉辦教育、公益事業為宗旨。,1.以”推愛洗車場”作為啟智學生就業中繼站，八十六年九月獲天下雜誌報導．,台中市,"營隊活動, 急難救助, 獎助學金, 職訓補助, 社區服務"
7040,2891,,台北市林盤文化教育基金會,,1900/01/01,,,,
7041,2885,,林公熊徵學田基金會,,1900/01/01,,,,
7042,2878,,三清道家道教文化基金會,skyaaa@ms19.hinet.net,1900/01/01,,,,
7043,2875,,念慈文教基金會,,1900/01/01,,,,
7044,186,,三光文教科技基金會,,1900/01/01,為推廣文化教育事業，社會公益活動為宗旨,,,
7045,185,,吳修齊紀念先嚴吳克讀公、先慈陳氏勤娘文教公益基金會,,1979/08/21,以文教公益基金獎勵優秀清寒學生，救濟貧困殘胞，贊助教育文化及慈善公益事業為目的,,,
