In [1]:
import re
import json
import urllib3
from bs4 import BeautifulSoup

In [2]:
# initialization
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
http = urllib3.PoolManager()
target_urls = []
scrapped_urls = []

In [3]:
# arion scraper attributes
arion_base_url = 'https://arion.aut.ac.nz/ArionMain/CourseInfo/Information/Qualifications/'
arion_initial_url = 'QualificationTypes.aspx'

target_urls.append(arion_initial_url)

In [4]:
# arion scrapper methods

# scrap nested urls
def scrap_arion_urls():
    for url in target_urls:
        request = http.request('GET', arion_base_url + url)
        soup = BeautifulSoup(request.data, 'lxml')
        for element in soup.find_all('a', 'Navigation'):
            scrapped_urls.append(element)

# extract entries from scrapped urls, return a dictionary
def get_arion_entries(dictionary_name: str):
    dictionary = {}
    dictionary.setdefault(dictionary_name, {})
    previous_degree: str
    previous_degree = ''
    counter = 0
    for tag in scrapped_urls:
        print(previous_degree)
        if tag.text == previous_degree:
            counter += 1
            temp_text = tag.text + ' ' + str(counter)
            print (temp_text)
            dictionary[dictionary_name].setdefault(temp_text, {})
        else:
            dictionary[dictionary_name].setdefault(tag.text, {})
            previous_degree = tag.text
    return dictionary

# extract paper entries for creating a paper list only
def get_arion_paper_list():
    dictionary = {}
    target_entries = []
    return_text = 'Returning to Qualification Details'
    for tag in scrapped_urls:
        target_entries.append(tag.text)
    while return_text in target_entries:
        target_entries.remove(return_text)
    counter = 0
    for entry in target_entries:
        if counter % 2 is 0:
            dictionary.setdefault(entry, [])
        else:
            dictionary[target_entries[(counter - 1)]] = entry
        counter += 1
    paper_list = []
    for key in dictionary:
        paper_list.append({'value': key, 'synonyms': [key, dictionary[key], re.sub('[^A-Z]', '', dictionary[key])]})
    return paper_list

# extract a set of papers for each degress
def get_arion_course_set(dictionary: dict):
    counter = 0
    for degree in dictionary['qualifications']:
        scrapped_course_urls = []
        request = http.request('GET', arion_base_url + target_urls[counter])
        soup = BeautifulSoup(request.data, 'lxml')
        for element in soup.find_all('a', 'Navigation'):
            scrapped_course_urls.append(element)
        target_papers = []
        for tag in scrapped_course_urls:
            target_papers.append(tag.text)
        return_text = 'Returning to Qualification Details'
        while return_text in target_papers:
            target_papers.remove(return_text)
        iterator = 0
        temp_paper_code: str
        for paper in target_papers:
            if iterator % 2 is 0:
                dictionary['qualifications'][degree].setdefault(paper, [])
                temp_paper_code = paper
            else:
                dictionary['qualifications'][degree][temp_paper_code].append({'name': paper})
            iterator += 1
        counter += 1
    return dictionary
    
# add scrapped urls to target urls for the next round, then clear
def clean_up(is_refined: bool):
    target_urls.clear()
    for url in scrapped_urls:
        url = url.get('href')
        if not is_refined:
            url = url.replace('../', '')
        target_urls.append(url)
    scrapped_urls.clear()

In [5]:
# arion scrapping process_01 -> getting qualification type links
scrap_arion_urls()
clean_up(True)

In [6]:
# arion scrapping process_02 -> getting qualification liks
scrap_arion_urls()
qualification_dict = get_arion_entries('qualifications')
clean_up(True)


Doctor of Education
Doctor of Health Science
Doctor of Philosophy
Master of Analytics
Master of Applied Finance
Master of Applied Science
Master of Arts
Master of Arts 1
Master of Arts
Master of Arts Management
Master of Business
Master of Business Administration
Master of Communication Studies
Master of Computer and Information Sciences
Master of Computer and Information Sciences 2
Master of Computer and Information Sciences
Master of Construction Management
Master of Creative Technologies
Master of Creative Writing
Master of Cultural and Creative Practice
Master of Design
Master of Education
Master of Education Practice
Master of Educational Leadership
Master of Emergency Management
Master of Emergency Management 3
Master of Emergency Management
Master of Engineering
Master of Engineering Project Management
Master of English and New Media Studies
Master of Forensic Information Technology
Master of Gastronomy
Master of Geographic Information Science
Master of Global Business
Master o

In [7]:
# arion scrapping process_03 -> getting table of papers links
scrap_arion_urls()
clean_up(False)

In [8]:
# arion scrapping process_04 -> gettting degrees with a set of papers
qualification_dict = get_arion_course_set(qualification_dict)

In [None]:
# arion scrapping process_05 -> getting paper links
scrap_arion_urls()

In [None]:
# arion scrapping process_06 -> getting a list of papers to parse in json for Rudy's paper list entity
paper_list = get_arion_paper_list()

In [9]:
# write a degree list with a corresponding set of papers to a json file for firebase
with open('degrees.json', 'w') as outfile:
    json.dump(qualification_dict, outfile, indent=4)

In [None]:
# write a paper list as a json file for dialogflow
with open('papers.json', 'w') as outfile:
    json.dump(paper_list, outfile, indent=4)