# Charity Scraper
### The Plan:
1. Stage 1 scraping: Scrape each charity's register_index URL from the main search page
1. Stage 2 scraping: Open each charity and scrape characteristics
1. Stage 3 analysis:
  - clean date data
  - Use generated charity_index to filter for websites with desired characteristics
    * income
    * reporting
    * cause helped
    * has website and charity_url

## Stage 1 Scraping
#### todo:
- scrape 10 at a time
- rerun
- add placeholder headings

In [1]:
# Initialisation and loadup: imports plus the two module-level settings
# (driverpath, size) that the scraping functions below read.
from http import server
import pandas as pd
import os
import time
import numpy as np
import copy

import selenium as sl
from selenium import webdriver

from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
# Value returned by ChromeDriverManager().install(); it is handed to Service(...)
# whenever a Chrome driver is created. Presumably the local chromedriver path --
# confirm against the webdriver_manager documentation.
driverpath=ChromeDriverManager().install()
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC
# Value appended to the search URL's "size" filter when page URLs are built.
size = 'Medium'

In [2]:
# Column layout shared by every scraped row, and the output folders for the
# Stage 1 / Stage 2 scrape results.
columns = ["Index_Link", "Legal_Name", "Town/Suburb","State", "Status", "Size","ABN","Website","Revenue","Expenses","Cause/s","Last_report_date","ref_religion"]

# exist_ok=True tolerates reruns of this cell, while real failures (e.g. a
# permission error) are still raised instead of being silently swallowed.
for folder in ('s1scrapedata', 's2scrapedata'):
    os.makedirs(folder, exist_ok=True)

In [3]:

def getpageurl(p, size_filter=None):
    """Build the ACNC charity-search URL for one results page.

    Args:
        p: Page number to request; it is converted with ``str``.
        size_filter: Charity size to filter on (a string such as ``'Medium'``).
            When omitted, the module-level ``size`` setting is used, which
            keeps existing ``getpageurl(p)`` calls working unchanged.

    Returns:
        The search URL, pre-filtered by charity size and asking for 100
        results per page.
    """
    if size_filter is None:
        # Fall back to the notebook-wide setting only when no explicit
        # filter was supplied.
        size_filter = size

    lefturl = "https://www.acnc.gov.au/charity/charities?items_per_page=100&page="
    righturl = "&f[]=size%3A" + size_filter
    return lefturl + str(p) + righturl

In [4]:
def extract_page_data(d):
    """Scrape the search-results table from the page loaded in driver ``d``.

    Waits up to 90 seconds for a ``<tbody>`` element, then reads every
    ``<tr>`` inside it: the first link's ``href`` plus the text of the first
    six ``<td>`` cells (name, town, state, status, size, ABN). The remaining
    entries of the module-level ``columns`` list are filled with ``None``.

    Args:
        d: A Selenium WebDriver that has already navigated to a search page.

    Returns:
        A DataFrame with one row per table row and the ``columns`` layout
        (empty if the table has no rows), or ``None`` if the table could not
        be located or parsed.
    """
    try:
        WebDriverWait(d,90).until(EC.presence_of_element_located((By.TAG_NAME,"tbody")))
        # NOTE(review): presumably gives the rows time to finish rendering
        # after the table body first appears -- confirm it is still needed.
        time.sleep(1)
        table = d.find_element(by=By.TAG_NAME, value="tbody")

        records = []
        for row in table.find_elements(by=By.TAG_NAME, value="tr"):
            link = row.find_element(by=By.TAG_NAME, value="a").get_attribute("href")
            # Look the cells up once per row rather than once per field.
            cells = row.find_elements(by=By.TAG_NAME, value="td")
            # Unpacking fails loudly (ValueError) if a row has fewer than six
            # cells, which routes malformed pages to the handler below.
            name, town, state, status, charity_size, abn = (
                cell.text for cell in cells[:6]
            )
            scraped = [link, name, town, state, status, charity_size, abn]
            # Columns that are only filled in by later stages stay empty here.
            records.append(scraped + [None] * (len(columns) - len(scraped)))

        return pd.DataFrame(records, columns=columns)
    except Exception as exc:
        # The handler must not reference names from the caller's scope (the
        # previous message used an undefined ``page`` and raised NameError).
        try:
            location = d.current_url
        except Exception:
            location = "<unknown url>"
        print('Table could not be found on page ' + str(location) + ' (' + repr(exc) + ')')
        return None


    

In [5]:
#   df=pd.DataFrame([0,1,'b'], index=[10,20,30]).transpose()
#   b=pd.DataFrame([2,3,5], index=[10,20,30]).transpose()
#   df=pd.concat([df,b],ignore_index=True)
#   print(df)
#   df=pd.concat([df,copy.deepcopy(b)],ignore_index=True)
#   print(df)
#   
#   df.to_csv('./folder/file.csv')

In [6]:
# Scratch check: np.vstack turns a 1-D array plus a new row into a 2-row matrix.
a = np.zeros(3, dtype=int)
print(a)
a = np.vstack([a, np.array([1, 2, 3])])
print(a)


[0 0 0]
[[0 0 0]
 [1 2 3]]


In [7]:
def open_drivers(num_drivers=10):
    """Start ``num_drivers`` Chrome WebDriver instances.

    Args:
        num_drivers: How many browser instances to launch.

    Returns:
        A list with the started drivers, in launch order.

    Raises:
        Whatever ``webdriver.Chrome`` raises if a browser fails to start; in
        that case the browsers that were already started are quit first.
    """
    drivers = []
    try:
        for _ in range(num_drivers):
            drivers.append(webdriver.Chrome(service=Service(driverpath)))
    except BaseException:
        # The caller never receives the partial list, so it could not clean
        # these up itself; quit them here to avoid orphaned browser processes.
        for started in drivers:
            started.quit()
        raise
    return drivers

In [8]:
def _save_checkpoint(frames, scraped_pages, last_page):
    """Persist the rows collected so far, then the index of scraped pages."""
    if frames:
        csv_path = './s1scrapedata/S1scrape_topage' + str(last_page + 1) + '.csv'
        # Append when the file already exists (e.g. a previously failed page
        # is retried on a later run) so earlier rows are never overwritten.
        file_exists = os.path.exists(csv_path)
        pd.concat(frames, ignore_index=True).to_csv(
            csv_path,
            mode='a' if file_exists else 'w',
            header=not file_exists,
        )
    # The index is written after the rows, so a page is only ever recorded as
    # scraped once its data is on disk.
    np.save('./s1scrapedata/scrapedpageindex.npy', scraped_pages)


def run_scraper(total_pages=83, num_drivers=3, checkpoint_every=15):
    """Scrape every not-yet-scraped search page and save the results.

    Pages are numbered ``0 .. total_pages - 1``. Pages already listed in
    ``./s1scrapedata/scrapedpageindex.npy`` are skipped, so the function can
    be rerun to resume an interrupted scrape. The remaining pages are handled
    in batches of ``num_drivers``: each driver loads one page, then each
    loaded page is extracted. Rows are written to
    ``./s1scrapedata/S1scrape_topage<N>.csv`` whenever ``N`` (the 1-based
    page number) is a multiple of ``checkpoint_every``, and once more at the
    end for any remaining rows; that final file is named after the last page
    processed.

    Args:
        total_pages: Number of result pages to cover.
        num_drivers: Number of browser instances used per batch.
        checkpoint_every: Save a CSV and the page index every this many pages.

    Raises:
        ValueError: If ``num_drivers`` or ``checkpoint_every`` is below 1.
    """
    if num_drivers < 1:
        raise ValueError("num_drivers must be at least 1")
    if checkpoint_every < 1:
        raise ValueError("checkpoint_every must be at least 1")

    try:
        ScrapedSearchPages = np.load('./s1scrapedata/scrapedpageindex.npy')
    except (OSError, ValueError, EOFError):
        # No usable index yet: start from the first page.
        ScrapedSearchPages = np.array([])

    already_scraped = {int(p) for p in ScrapedSearchPages.tolist()}
    pending = [p for p in range(total_pages) if p not in already_scraped]
    if not pending:
        return

    drivers = open_drivers(num_drivers)
    try:
        frames = []
        for start in range(0, len(pending), num_drivers):
            # Pair each driver with exactly one page, so the extraction step
            # never reads a driver that is still showing an earlier page.
            batch = list(zip(drivers, pending[start:start + num_drivers]))

            # Load up sites
            for driver, page in batch:
                driver.get(getpageurl(page))

            # Scraping
            for driver, page in batch:
                page_data = extract_page_data(d=driver)
                if page_data is None:
                    # Leave the page out of the index so a rerun retries it.
                    print("Page " + str(page) + " could not be scraped; it will be retried on the next run")
                else:
                    frames.append(page_data)
                    ScrapedSearchPages = np.append(ScrapedSearchPages, page)

                if (page + 1) % checkpoint_every == 0:
                    _save_checkpoint(frames, ScrapedSearchPages, page)
                    frames = []

                print("Progress: " + str(page + 1) + "/" + str(total_pages))

        # Final save of whatever was collected since the last checkpoint.
        _save_checkpoint(frames, ScrapedSearchPages, pending[-1])
    finally:
        # Kill drivers whether the scrape finished or raised.
        for driver in drivers:
            driver.quit()
    

In [9]:
run_scraper()

Progress: 31/83
Progress: 32/83
Progress: 33/83
Progress: 34/83
Progress: 35/83
Progress: 36/83
Progress: 37/83
Progress: 38/83
Progress: 39/83
Progress: 40/83
Progress: 41/83
Progress: 42/83
Progress: 43/83
Progress: 44/83
Progress: 45/83
Progress: 46/83
Progress: 47/83
Progress: 48/83
Progress: 49/83
Progress: 50/83
Progress: 51/83
Progress: 52/83
Progress: 53/83
Progress: 54/83
Progress: 55/83
Progress: 56/83
Progress: 57/83
Progress: 58/83
Progress: 59/83
Progress: 60/83
Progress: 61/83
Progress: 62/83
Progress: 63/83
Progress: 64/83
Progress: 65/83
Progress: 66/83
Progress: 67/83
Progress: 68/83
Progress: 69/83
Progress: 70/83
Progress: 71/83
Progress: 72/83
Progress: 73/83
Progress: 74/83
Progress: 75/83
Progress: 76/83
Progress: 77/83
Progress: 78/83
Progress: 79/83
Progress: 80/83
Progress: 81/83
Progress: 82/83
Progress: 83/83
Progress: 84/83
