# ACMA Scraper
##### ©Haris Hassan

## Libraries

In [None]:
##=============================================================================
## ACMAscrapper V6.0.2
##=============================================================================
##
## Scraper for the Register of Radiocommunications Licences of the Australian Communications and Media Authority
# 
#Author Haris Hassan
#Email haris.hassan@radhaz.com.au 
#linkedin https://www.linkedin.com/in/hassanharis/
#
##=============================================================================
# Import libraries

import re
import time
import requests
import random
import string
from bs4 import BeautifulSoup
import pandas as pd
pd.set_option('display.max_colwidth', None)
#Reset index of dataframe after sorting
import cProfile, pstats



## User Input

In [None]:
#### Replace Site
# ACMA site ID; it is appended as pSITE_ID to the site lookup URL.
sitecode = 35842

##### Filters
# The switches below are passed to AssignmentsFilter().

# When True, keep only rows whose 'T/R' column equals 'T'.
TRANSMITTER_ONLY = True

# When True, keep only rows whose 'Client' column equals THIS_CLIENT_ONLY_NAME.
THIS_CLIENT_ONLY = False
THIS_CLIENT_ONLY_NAME = ''

# When True, drop rows whose 'Client' column equals IGNORE_CLIENT_NAME.
IGNORE_CLIENT = False
IGNORE_CLIENT_NAME = ''

# When True, keep only rows whose 'Frequency (MHz)' is below MIN_FREQ or above
# MAX_FREQ, i.e. rows inside the MIN_FREQ..MAX_FREQ range are dropped.
FREQUENCY_FILTER = False
MIN_FREQ = 650
MAX_FREQ = 3800

## Functions

In [None]:
def AssignmentsFilter(ACMAdata, TRANSMITTER_ONLY, THIS_CLIENT_ONLY,
                      THIS_CLIENT_ONLY_NAME, IGNORE_CLIENT, IGNORE_CLIENT_NAME,
                      FREQUENCY_FILTER, MAX_FREQ, MIN_FREQ):
    """Apply the enabled row filters to the assignments table.

    Args:
        ACMAdata: DataFrame of assignments. Only the columns needed by the
            enabled filters are read ('T/R', 'Client', 'Frequency (MHz)').
        TRANSMITTER_ONLY: Keep only rows whose 'T/R' equals 'T'.
        THIS_CLIENT_ONLY: Keep only rows whose 'Client' equals
            THIS_CLIENT_ONLY_NAME.
        THIS_CLIENT_ONLY_NAME: Client name used by THIS_CLIENT_ONLY.
        IGNORE_CLIENT: Drop rows whose 'Client' equals IGNORE_CLIENT_NAME.
        IGNORE_CLIENT_NAME: Client name used by IGNORE_CLIENT.
        FREQUENCY_FILTER: Keep only rows whose 'Frequency (MHz)' is below
            MIN_FREQ or above MAX_FREQ.
        MAX_FREQ: Upper bound used by FREQUENCY_FILTER (note: it precedes
            MIN_FREQ in the signature).
        MIN_FREQ: Lower bound used by FREQUENCY_FILTER.

    Returns:
        The filtered DataFrame; ACMAdata itself when no filter is enabled.
    """
    # (switch, rule) pairs, applied in this order; each rule returns the
    # boolean mask of rows to keep.
    rules = (
        (TRANSMITTER_ONLY, lambda frame: frame['T/R'].eq('T')),
        (THIS_CLIENT_ONLY, lambda frame: frame['Client'].eq(THIS_CLIENT_ONLY_NAME)),
        (IGNORE_CLIENT, lambda frame: frame['Client'].ne(IGNORE_CLIENT_NAME)),
        (FREQUENCY_FILTER, lambda frame: frame['Frequency (MHz)'].lt(MIN_FREQ)
                                         | frame['Frequency (MHz)'].gt(MAX_FREQ)),
    )

    kept = ACMAdata
    for switched_on, rows_to_keep in rules:
        if switched_on:
            kept = kept[rows_to_keep(kept)]
    return kept

In [None]:
def FreqToMhz(FREQ):
    """Convert frequency strings such as '2.4 GHz' or '900 MHz' to MHz.

    The result always has exactly one entry per input item, so it can be
    assigned straight to a DataFrame column of the same length.

    Args:
        FREQ: Iterable of frequency strings (for example a pandas Series),
            each a number followed by one of the units GHz, MHz, kHz or Hz.
            The unit is matched case-insensitively and the space before it
            is optional.

    Returns:
        A list of floats in MHz, in input order. Items that cannot be parsed
        (empty strings, non-string values, unknown units, non-numeric text)
        become ``float('nan')`` instead of being dropped.
    """
    # Multiplier that converts a value expressed in each unit to MHz.
    to_mhz = {'ghz': 1000.0, 'mhz': 1.0, 'khz': 1e-3, 'hz': 1e-6}
    # Group 1 is the numeric text, group 2 the unit at the end of the string.
    unit_at_end = re.compile(r'^(.*?)\s*(GHz|MHz|kHz|Hz)\s*$', re.IGNORECASE | re.DOTALL)

    frequency = []
    for x in FREQ:
        parsed = unit_at_end.match(x) if isinstance(x, str) else None
        value = float('nan')
        if parsed is not None:
            try:
                value = float(parsed.group(1)) * to_mhz[parsed.group(2).lower()]
            except ValueError:
                # Text before the unit is not a number: keep the NaN placeholder
                # so the output stays aligned with the input.
                value = float('nan')
        frequency.append(value)
    return frequency

In [None]:
def AssignmentsNextPage(ACMAdata, ACMAwebpage):
    """Append the assignment rows of a listing page, and of every page after it.

    Each page is parsed for the table with class "tablelist responsive". The
    first table row is skipped as the header row; every other row that has
    <td> cells becomes one DataFrame row, and the href of the row's first
    link is stored in a 'links' column inserted at position 1. The anchor
    titled "Next Page" is then followed until no such anchor is left.

    Args:
        ACMAdata: DataFrame the scraped rows are concatenated onto.
        ACMAwebpage: Response of the first page to parse; only its ``.text``
            attribute is read.

    Returns:
        ACMAdata with the rows of all visited pages appended. The per-page
        row indices are not reset.
    """
    # Iterate instead of recursing so long listings cannot hit the recursion limit.
    while True:
        AssignmentsSoup = BeautifulSoup(ACMAwebpage.text, 'lxml')
        Assignments = AssignmentsSoup.find('table', {"class": "tablelist responsive"})
        if Assignments is None:
            # Nothing to parse on this page: return what has been collected so far.
            print('No "tablelist responsive" table found; stopping pagination.')
            return ACMAdata

        AssignmentsHeaders = [th.text.strip() for th in Assignments.select('th')]

        page_rows = []
        page_links = []
        for row in Assignments.find_all('tr')[1:]:
            cells = row.find_all('td')
            if not cells:
                # Rows without data cells carry no assignment.
                continue
            page_rows.append([cell.text.strip() for cell in cells])
            # A row without any link gets None instead of raising IndexError.
            first_anchor = row.find('a')
            page_links.append(first_anchor.get('href') if first_anchor is not None else None)

        # Build the page frame in one go rather than growing it row by row.
        page_data = pd.DataFrame(page_rows, columns=AssignmentsHeaders)
        page_data.insert(1, 'links', page_links)
        ACMAdata = pd.concat([ACMAdata, page_data], axis=0)

        # Use only the first "Next Page" anchor; joining the hrefs of several
        # such anchors would produce an invalid URL.
        next_anchor = AssignmentsSoup.find('a', {'title': "Next Page"})
        next_href = next_anchor.get('href') if next_anchor is not None else None
        if not next_href or not next_href.strip():
            return ACMAdata

        ACMAwebpage = requests.get('https://web.acma.gov.au' + next_href, timeout=60)
        print(ACMAwebpage)

In [None]:
# Pool of browser User-Agent strings. One entry is picked at random for the
# site lookup request and for each antenna detail request.
UserAgents = [
    'Mozilla/5.0 (Linux; Android 12; SM-S906N Build/QP1A.190711.020; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/80.0.3987.119 Mobile Safari/537.36',
    'Mozilla/5.0 (Linux; Android 10; SM-G996U Build/QP1A.190711.020; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Mobile Safari/537.36',
    'Mozilla/5.0 (Linux; Android 10; SM-G980F Build/QP1A.190711.020; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/78.0.3904.96 Mobile Safari/537.36',
    'Mozilla/5.0 (Linux; Android 9; SM-G973U Build/PPR1.180610.011) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Mobile Safari/537.36',
    'Mozilla/5.0 (Linux; Android 8.0.0; SM-G960F Build/R16NW) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.84 Mobile Safari/537.36',
    'Mozilla/5.0 (Linux; Android 12; Pixel 6 Build/SD1A.210817.023; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/94.0.4606.71 Mobile Safari/537.36',
    'Mozilla/5.0 (Linux; Android 11; Pixel 5 Build/RQ3A.210805.001.A1; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/92.0.4515.159 Mobile Safari/537.36',
    'Mozilla/5.0 (iPhone14,3; U; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Mobile/19A346 Safari/602.1',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36',
    'Mozilla/5.0 (Linux; Android 5.0.2; SAMSUNG SM-T550 Build/LRX22G) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/3.3 Chrome/38.0.2125.102 Safari/537.36',
    'Mozilla/5.0 (Linux; Android 7.0; SM-T827R4 Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.116 Safari/537.36',
    'Mozilla/5.0 (Linux; Android 7.0; Pixel C Build/NRD90M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/52.0.2743.98 Safari/537.36',
    'Mozilla/5.0 (Linux; Android 11; Lenovo YT-J706X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
    'Mozilla/5.0 (Linux; Android 12; SM-X906C Build/QP1A.190711.020; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/80.0.3987.119 Mobile Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'
]

In [None]:
# Fetch the ACMA site lookup page for the configured site, sending a randomly
# chosen User-Agent. The timeout keeps the notebook from hanging indefinitely
# on an unresponsive server.
site_url = 'https://web.acma.gov.au/rrl/site_search.site_lookup?pSITE_ID=' + str(sitecode)
ACMAwebpage = requests.get(site_url, headers={'User-Agent': random.choice(UserAgents)}, timeout=60)
soup = BeautifulSoup(ACMAwebpage.text, 'lxml')

# The site details are read from the first <table> of the page.
# NOTE(review): this used to be select_one('table', {"class": "tabledetail"}),
# but select_one() takes its second argument as a namespace mapping, not an
# attribute filter, so the first table was always the one used. If filtering by
# class is wanted, 'table.tabledetail' is the selector - confirm against the page.
SiteDetailsTable = soup.select_one('table')
if SiteDetailsTable is None:
    # Fail here with a clear message; the following cells cannot run without
    # SiteDetailsTitle / SiteDetails.
    raise RuntimeError('No site details table found at ' + site_url)

# First cell of each row is the label, second cell is the value.
SiteDetailsTitle = [td.text for td in SiteDetailsTable.select('td:nth-of-type(1)')]
SiteDetails = [td.text.strip() for td in SiteDetailsTable.select('td:nth-of-type(2)')]

In [None]:
# Pair every site-detail label with the value found at the same position.
SiteDetailsDictionary = {
    label: SiteDetails[position]
    for position, label in enumerate(SiteDetailsTitle)
}
# Collapse every run of whitespace in the location to a single space.
location_words = SiteDetailsDictionary['Location'].split()
SiteDetailsDictionary['Location'] = ' '.join(location_words)

In [None]:
# Notebook cell output: show the parsed site details.
SiteDetailsDictionary

In [None]:
# Locate the table with class "tablelist responsive" on the site page and
# start an empty frame whose columns are that table's header cells.
ACMAtable = soup.find('table', attrs={"class": "tablelist responsive"})
ACMAheaders = [header_cell.get_text().strip() for header_cell in ACMAtable.find_all('th')]
ACMAdata = pd.DataFrame(columns=ACMAheaders)

In [None]:
# Scrape the first listing page and every following page, then renumber the
# rows 0..n-1 because each page contributes its own 0-based index.
ACMAdata = AssignmentsNextPage(ACMAdata, ACMAwebpage).reset_index(drop=True)

In [None]:
# Add a numeric 'Frequency (MHz)' column converted from the textual
# 'Frequency' column; AssignmentsFilter() reads it when FREQUENCY_FILTER is on.
ACMAdata['Frequency (MHz)'] = FreqToMhz(ACMAdata['Frequency'])
#ACMAdata_final['Frequency (MHz)']
#ACMAdata_final.loc[(ACMAdata_final['Frequency (MHz)'] < 650) | (ACMAdata_final['Frequency (MHz)']>3800), 'links']

In [None]:
# Apply the user-selected filters. Keyword arguments make the pairing explicit
# (the function takes MAX_FREQ before MIN_FREQ).
AcmaDataFiltered = AssignmentsFilter(
    ACMAdata,
    TRANSMITTER_ONLY=TRANSMITTER_ONLY,
    THIS_CLIENT_ONLY=THIS_CLIENT_ONLY,
    THIS_CLIENT_ONLY_NAME=THIS_CLIENT_ONLY_NAME,
    IGNORE_CLIENT=IGNORE_CLIENT,
    IGNORE_CLIENT_NAME=IGNORE_CLIENT_NAME,
    FREQUENCY_FILTER=FREQUENCY_FILTER,
    MAX_FREQ=MAX_FREQ,
    MIN_FREQ=MIN_FREQ,
)

In [None]:
# Unique detail-page links in order of first appearance, so each page is
# requested only once.
ACMAdatalinks = AcmaDataFiltered['links'].drop_duplicates().tolist()
display(ACMAdatalinks)

## Pulling Antenna Details from Web Links

In [None]:
# Profile the scraping loop below; the collected statistics are printed in a
# later cell.
profiler = cProfile.Profile()
profiler.enable()

# Columns of the antenna table, in display order.
AntennaTableHeader = ['Device Registration ID', 'Antenna', 'Client','Device Type','Emission Center Frequency','Transmitter Power',
                      'Antenna Height (AGL)',
                      'Antenna Polarisation','Antenna Azimuth', 'Antenna Tilt','Licence Number','Date Authorised',
                      'Destination Link']
AntennaTable = pd.DataFrame(columns = AntennaTableHeader)
# URLs whose processing raised an unexpected exception.
NotFoundLinks = []

for acmalink in ACMAdatalinks:
    st = time.time()
    # Built before the try block so the exception handler below always reports
    # the URL of the current link.
    url = 'https://web.acma.gov.au' + str(acmalink)
    try:
        headstr = random.choice(UserAgents)
        # The timeout turns a stalled request into an exception, which the
        # handler below records in NotFoundLinks.
        page = requests.get(url, headers={'User-Agent': headstr}, timeout=60)
        soup2 = BeautifulSoup(page.text, 'lxml')

        # Label/value pairs come from the first <table> of the page: first cell
        # of a row is the label (cells with a colspan attribute are skipped),
        # second cell is the value.
        # NOTE(review): the {"class": "tabledetail"} dict that used to be passed
        # to select_one() was taken as a namespace mapping, not a class filter,
        # so dropping it does not change which table is selected.
        try:
            DetailTable = soup2.select_one('table')
            Antennaheaders = [td.text for td in DetailTable.select('td:nth-of-type(1)') if not td.has_attr('colspan')]
            AntennaValues = [td.text.strip() for td in DetailTable.select('td:nth-of-type(2)')]
        except Exception as e:
            print (e)
            continue

        # Text of the 5th cell of every data row of the
        # "tablelist linked-responsive" table, when the page has one.
        LinkToList = []
        LinkToTable = soup2.find("table", {"class": "tablelist linked-responsive"})
        if LinkToTable:
            for j in LinkToTable.find_all('tr')[1:]:
                LinkToList.append([tv.text.strip() for tv in j.select('td:nth-of-type(5)')])

        # For URLs ending in '/1' a second table is read in the same way and
        # added as an extra row.
        ReadSecondTable = url.endswith('/1')
        if ReadSecondTable:
            try:
                DetailTable2 = soup2.select_one('table:nth-of-type(2)')
                Antennaheaders2 = [td.text for td in DetailTable2.select('td:nth-of-type(1)') if not td.has_attr('colspan')]
                AntennaValues2 = [td.text.strip() for td in DetailTable2.select('td:nth-of-type(2)')]
            except Exception as e:
                print(e)
                print(acmalink)
                continue

        if LinkToList:
            # Normalise whitespace, drop duplicates (keeping first-seen order)
            # and leave out entries that mention this site's own location.
            # A new list is built rather than removing items from the list
            # being iterated, which skipped the entry after every removal.
            NormalisedLinks = [' '.join(cell.split()) for row in LinkToList for cell in row]
            SiteLocation = SiteDetailsDictionary['Location']
            LinkToListTemp = [dest for dest in dict.fromkeys(NormalisedLinks) if SiteLocation not in dest]
            DestinationLink = str(LinkToListTemp)
        else:
            DestinationLink = 'n/a'

        AntennaValues.append(DestinationLink)
        Antennaheaders.append('Destination Link')

        # Labels and values are paired by position.
        AntennaDictionary = {header: AntennaValues[i] for i, header in enumerate(Antennaheaders)}
        AntennaTable_toAdd = pd.DataFrame([AntennaDictionary])
        AntennaTable = pd.concat([AntennaTable, AntennaTable_toAdd], ignore_index=True)

        if ReadSecondTable:
            AntennaValues2.append(DestinationLink)
            Antennaheaders2.append('Destination Link')
            AntennaTable_toAdd2 = pd.DataFrame([{header: AntennaValues2[i] for i, header in enumerate(Antennaheaders2)}])
            AntennaTable = pd.concat([AntennaTable, AntennaTable_toAdd2], ignore_index=True)
    except Exception as e:
        print(e)
        print('\nException')
        print(url)
        print('\n')
        NotFoundLinks.append(url)
        continue
    et = time.time()
    # get the execution time
    elapsed_time = et - st
    print(url)
    print(headstr)
    print('Execution time:', elapsed_time, 'seconds')

profiler.disable()

## Formatting the Data

In [None]:
# Missing cells become empty strings so the string operations below are safe.
AntennaTable = AntennaTable.fillna('')


def _reorder_antenna_fields(raw):
    """Return 'A,B,C' as 'C B A' with the first and last fields title-cased.

    The text is split on the first two commas only. Values that are not
    strings, or that have fewer than three comma-separated fields (such as the
    empty string left by fillna), are returned unchanged instead of raising
    IndexError.
    """
    if not isinstance(raw, str):
        return raw
    fields = raw.split(',', 2)
    if len(fields) < 3:
        return raw
    return fields[2].title() + ' ' + fields[1] + ' ' + fields[0].title()


# Plain str.split is used here: pandas 2.x no longer accepts the split limit
# as a positional argument of Series.str.split.
AntennaTable['Antenna'] = [_reorder_antenna_fields(x) for x in AntennaTable['Antenna']]

# When an 'EFL ID' column was scraped, append it to the device registration ID.
if 'EFL ID' in AntennaTable:
    AntennaTable['Device Registration ID'] = AntennaTable['Device Registration ID'].astype(str) + AntennaTable['EFL ID']


In [None]:
  
# Shorten antenna descriptions. regex=False makes these literal replacements
# on every pandas version (the default of `regex` changed in pandas 2.0).
AntennaTable['Antenna'] = (
    AntennaTable['Antenna']
    .str.strip()
    .str.replace('Rf Industries', 'RFI', regex=False)
    .str.replace('Parallel Array Of Vertical Dipoles', 'Vertical Dipole Array', regex=False)
)

# Shorten client names.
AntennaTable['Client'] = (
    AntennaTable['Client']
    .str.replace('Limited', 'Ltd', regex=False)
    .str.replace('New South Wales Government Telecommunications Authority', 'NSWTA', regex=False)
)


def _reverse_date_fields(raw):
    """Return 'a/b/c' as 'c-b-a' (first field title-cased).

    Values that are not strings, or that have fewer than three '/'-separated
    fields (such as an empty string), are returned unchanged instead of
    raising IndexError.
    """
    if not isinstance(raw, str):
        return raw
    fields = raw.split('/', 2)
    if len(fields) < 3:
        return raw
    return fields[2] + '-' + fields[1] + '-' + fields[0].title()


# Plain str.split is used in the helper: pandas 2.x no longer accepts the
# split limit as a positional argument of Series.str.split.
AntennaTable['Date Authorised'] = [_reverse_date_fields(x) for x in AntennaTable['Date Authorised']]

# Replace English month abbreviations by their two-digit month number.
_MONTH_NUMBERS = {
    'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
    'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
    'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
}
for _month_name, _month_number in _MONTH_NUMBERS.items():
    AntennaTable['Date Authorised'] = AntennaTable['Date Authorised'].str.replace(_month_name, _month_number, regex=False)

# Numeric frequency column (MHz) used in the exported table.
AntennaTable['Freq (MHz)'] = FreqToMhz(AntennaTable['Emission Center Frequency'])


In [None]:


# Columns, in order, that are displayed here and written by the export cell.
Renamed_headers = [
    'Device ID', 'Antenna', 'Client', 'Type', 'Freq (MHz)', 'Power', 'Height',
    'Polarisation', 'Azimuth', 'Tilt', 'Licence', 'Date Authorised', 'Destination Link',
]

# Scraped column name -> shorter name used in the export.
ColumnRenames = {
    'Device Registration ID': 'Device ID',
    'Device Type': 'Type',
    'Emission Center Frequency': 'Frequency',
    'Transmitter Power': 'Power',
    'Antenna Height (AGL)': 'Height',
    'Antenna Polarisation': 'Polarisation',
    'Antenna Azimuth': 'Azimuth',
    'Antenna Tilt': 'Tilt',
    'Licence Number': 'Licence',
}

# rename() returns a new frame, so AntennaTable keeps its original column names.
AntennaTable_export = AntennaTable.rename(columns=ColumnRenames).reset_index(drop=True)
# Number the rows from 1 instead of 0.
AntennaTable_export.index = AntennaTable_export.index + 1
display(AntennaTable_export.loc[:, Renamed_headers])


In [None]:
# Show the URLs recorded in NotFoundLinks, i.e. the links whose processing
# raised an exception in the scraping loop.
display(NotFoundLinks)

## Export

In [None]:
# Folder the workbook is written to.
EXPORT_DIR = r'C:\Users\Mewtwo\Desktop'

# The site name becomes part of the file name, so every character that Windows
# does not allow in a file name (not only slashes) is replaced by a space.
# NOTE(review): AntennaDictionary is the dictionary built for the last link
# processed successfully in the scraping loop - presumably all links of a site
# share the same 'Site' value; confirm.
SiteNameForFile = re.sub(r'[\\/:*?"<>|\x00-\x1f]', ' ', AntennaDictionary['Site'])
ExportPath = EXPORT_DIR + '\\ACMA ' + str(sitecode) + ' ' + SiteNameForFile + '.xlsx'

AntennaTable_export.to_excel(ExportPath, columns = Renamed_headers, index=True)

In [None]:
# Report the profile collected around the scraping loop, ordered by call count.
stats = pstats.Stats(profiler)
stats.sort_stats('ncalls')
stats.print_stats()

In [None]:
#AntennaTable_export.to_html(r'C:\Users\Mewtwo\Desktop\Antennadata.html', columns = Renamed_headers, index=True)
#import subprocess
#subprocess.call('wkhtmltoimage -f png --width 0 Antennadata.html Antennadata.png', shell=True)