In [7]:
import os
import pandas as pd
import numpy as np


In [8]:
#get list of files to rescrape
files = os.listdir(os.getcwd()+'/s2scrapedata')
files = [i for i in files if '__.csv' in i]

megadf = pd.DataFrame()

for f in files:
    tempdf = pd.read_csv('./s2scrapedata/'+f)
    megadf  = pd.concat((megadf,tempdf))

officialdf = pd.read_csv('../datadotgov_main.csv')
officialdf['ABN'] = officialdf['ABN'].astype(str)




In [9]:
#begin cleaning

megadf.index = megadf['ABN']
megadf = megadf.drop(['Website', 'Revenue', 'Expenses', 'Cause/s', 'Last_report_date', 'ref_religion','Unnamed: 0','ABN'], axis=1)



In [10]:
#add in interesting official data (namely no responsible persons, reg date and establishment date)

megadf = pd.merge(megadf,officialdf[["ABN",'Number_of_Responsible_Persons','Registration_Date','Date_Organisation_Established']],on='ABN',how='left')
megadf.to_csv('combined_scraped_list.csv')

In [11]:
#develop search/filter lists

# for free websites
freedomainlist = [  'webnode',
                    'wixsite',
                    'weebly',
                    'wordpress',
                    'hubspot',
                    'hs-sites',
                    'godaddy',
                    'site123',
                    'mozello',
                    'webstarts',
                    'jimdofree',
                    'ucraft',
                    'webflow',
                    'strinkingly',
                    'blog']
regexsearchstring = ''
for d in freedomainlist:
    regexsearchstring = (regexsearchstring+d+"|")[:-1]

webfilter_freesites     = megadf['website'].str.contains(pat=regexsearchstring,regex=True,na=False)
webfilter_free_orNone   = megadf['website'].str.contains(pat=regexsearchstring,regex=True,na=True)

# for religious reference
relfilter = megadf['religious_ref']==0

# for upto date reporting
recentreport_filter = megadf['lastreport'].str.contains('2021|2022',na=False)

# for state
statefilter = megadf['State']=='QLD'

# greater than one responsible person filter
teamsizefilter = megadf['Number_of_Responsible_Persons']>1

# name doesn't contain "trust" or "fund"
trustfundfilter  = ~megadf['Legal_Name'].str.contains('trust|fund|Trust|Fund|TRUST|FUND',regex=True,na=False)

# name doesn't contain "RSL", "Meals on Wheels" or'lifesave'
bigbusinessfilter  = ~megadf['Legal_Name'].str.contains('RSL|Returned And Services League|Returned Service League|Returned Services League|Rsl|Returned & Services League|Meals on Wheels|Meals On Wheels|Lifesave|Surf Life|Parents And Friends',regex=True,na=False)

In [12]:


filters = webfilter_free_orNone*relfilter*recentreport_filter*teamsizefilter*statefilter*trustfundfilter*bigbusinessfilter


megadf[filters].sort_values(by=['revenue'], ascending=False).iloc[91:110]


Unnamed: 0,ABN,Index_Link,Legal_Name,Town/Suburb,State,Status,Size,website,revenue,expenses,causes,lastreport,religious_ref,Number_of_Responsible_Persons,Registration_Date,Date_Organisation_Established
13421,44825274317,https://www.acnc.gov.au/charity/charities/46af...,Home Of St Francis,Annerley,QLD,Registered,Small,,42144.0,36000.0,Adults - aged 25 to under 65; Adults - aged 65...,6 Jun 2022,0.0,2.0,03/12/2012,01/01/1974
10762,11759339591,https://www.acnc.gov.au/charity/charities/18a8...,Stuartholme Sacre Coeur Associaton,Toowong,QLD,Registered,Small,,42083.0,30632.0,Adults - aged 25 to under 65; Children - aged ...,15 Jun 2022,0.0,5.0,03/12/2012,01/01/1926
22780,51746985267,https://www.acnc.gov.au/charity/charities/4f17...,Bonza Banga Charity Bbq's Street Outreach,Caboolture,QLD,Registered,Small,,42000.0,38000.0,Adults - aged 25 to under 65; People at risk o...,8 Apr 2022,0.0,7.0,03/12/2012,01/01/2007
1188,95137744526,https://www.acnc.gov.au/charity/charities/da6e...,Kids R Us National Aog Children's Ministry Mov...,Springwood,QLD,Registered,Small,,41030.0,80086.0,Children - aged 6 to under 15,17 Feb 2022,0.0,2.0,03/12/2012,01/01/1996
12616,67095850849,https://www.acnc.gov.au/charity/charities/ac74...,Seq Catchments Members' Association,Brisbane,QLD,Registered,Small,,40008.0,40008.0,General community in Australia,29 Jan 2022,0.0,6.0,03/12/2012,01/01/2009
9384,88705721797,https://www.acnc.gov.au/charity/charities/729f...,Surat Hospital Auxiliary,Surat,QLD,Registered,Small,,39039.0,9996.0,Families,27 Jan 2022,0.0,3.0,03/12/2012,01/01/1959
13938,81609213323,https://www.acnc.gov.au/charity/charities/55a9...,Mutchilba Community Centre Inc,Dimbulah,QLD,Registered,Small,,38675.0,39101.0,Families; General community in Australia; Peop...,6 Apr 2021,0.0,5.0,03/12/2012,01/01/1993
3923,88491208849,https://www.acnc.gov.au/charity/charities/e0f5...,Kingsley College P&F INC.,Berserker,QLD,Registered,Small,,38009.0,1792.0,Children - aged 6 to under 15; Youth - 15 to u...,12 Jul 2022,0.0,3.0,03/12/2012,01/07/2010
21028,21219909767,https://www.acnc.gov.au/charity/charities/640d...,Mackay Woodturners Assn Inc,Mackay,QLD,Registered,Small,,37033.0,22110.0,Adults - aged 25 to under 65; Adults - aged 65...,7 Feb 2022,0.0,3.0,03/12/2012,01/01/1987
7095,16613668009,https://www.acnc.gov.au/charity/charities/2913...,Charleville Airfield Museum Ltd,Charleville,QLD,Registered,Small,,37008.0,37747.0,Families; People in rural/regional/remote comm...,7 Apr 2022,0.0,7.0,08/08/2016,14/07/2016
