In [None]:
# importing libraries
import re
import json
import urllib3
from bs4 import BeautifulSoup

In [None]:
# initialization
# silence urllib3's InsecureRequestWarning for the rest of the session
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# one pool manager shared by every scraper function below
http = urllib3.PoolManager()
# relative URLs to fetch in the current scraping round
target_urls = []
# anchor tags collected during the current round; emptied by clean_up()
scrapped_urls = []
# NOTE(review): no function visible in this notebook reads this global;
# get_requisites binds its own local `title` -- confirm whether it is still needed
title = ''

In [None]:
# arion scraper attributes
# prefix prepended to every relative URL before it is requested
arion_base_url = 'https://arion.aut.ac.nz/ArionMain/CourseInfo/Information/Qualifications/'
# entry page that the first scraping round starts from
arion_initial_url = 'QualificationTypes.aspx'
# seed the first round; NOTE: re-running this cell appends the seed a second time
target_urls.append(arion_initial_url)

In [None]:
# arion scrapper methods

# scrap nested urls
def scrap_arion_urls():
    """Fetch every page in ``target_urls`` and collect its navigation anchors.

    Each entry of ``target_urls`` is appended to ``arion_base_url`` and
    requested with GET.  Every ``<a>`` tag with the CSS class ``Navigation``
    found in the response is appended to the global ``scrapped_urls`` list.
    """
    for relative_url in target_urls:
        response = http.request('GET', arion_base_url + relative_url)
        page = BeautifulSoup(response.data, 'lxml')
        scrapped_urls.extend(page.find_all('a', 'Navigation'))

# scrap nested entries into a list
def scrap_simple_entries():
    """Return the text of every anchor currently held in ``scrapped_urls``."""
    return [anchor.text for anchor in scrapped_urls]
    
# extract entries from scrapped urls, return a dictionary
def get_arion_entries(dictionary_name: str):
    """Build ``{dictionary_name: {entry: {}, ...}}`` from ``scrapped_urls``.

    Anchor texts are read in order.  A text that equals the most recently
    recorded "new" text is stored under ``'<text> <n>'``, where ``n`` is a
    running count of such repeats; the count is shared by all texts and is
    never reset.  Any other text is stored as-is and becomes the new
    reference for the repeat check.  Existing keys are left untouched.

    Args:
        dictionary_name: the single top-level key of the returned dict.

    Returns:
        A dict mapping ``dictionary_name`` to a dict of entry name -> ``{}``.
    """
    entries = {}
    last_label = ''
    repeat_count = 0
    for anchor in scrapped_urls:
        label = anchor.text
        if label != last_label:
            last_label = label
            key = label
        else:
            repeat_count += 1
            key = label + ' ' + str(repeat_count)
        entries.setdefault(key, {})
    return {dictionary_name: entries}

# extract paper entries for creating a paper list only
def get_arion_paper_list():
    """Build the paper entity list from the anchors in ``scrapped_urls``.

    The anchor texts are read in order; every 'Returning to Qualification
    Details' link is dropped and the remaining texts are consumed as
    consecutive (paper code, paper name) pairs.  A code that occurs more than
    once keeps its first position and takes the last name seen.  A trailing
    code with no name after it is ignored (it previously caused a TypeError).

    Returns:
        A list of dicts ``{'value': code, 'synonyms': [code, name, initials]}``
        where ``initials`` is ``name`` with every character outside ``A-Z``
        removed.
    """
    return_text = 'Returning to Qualification Details'
    labels = [tag.text for tag in scrapped_urls if tag.text != return_text]

    # Even positions are codes, odd positions are the matching names; zip()
    # silently drops an unpaired trailing code.
    names_by_code = dict(zip(labels[0::2], labels[1::2]))

    return [
        {'value': code, 'synonyms': [code, name, re.sub('[^A-Z]', '', name)]}
        for code, name in names_by_code.items()
    ]

# extract a set of papers for each degree
def get_arion_course_set(dictionary: dict):
    """Attach the papers of every qualification to ``dictionary``.

    The i-th key of ``dictionary['qualifications']`` is matched with the i-th
    entry of ``target_urls``.  That page is fetched, and the texts of its
    ``Navigation`` anchors (minus the 'Returning to Qualification Details'
    links) are read as alternating paper code / paper name.  Each code becomes
    a key under the qualification, holding a list of ``{'name': ...}`` dicts.

    Args:
        dictionary: dict with a ``'qualifications'`` mapping of qualification
            name -> dict; it is updated in place.

    Returns:
        The same ``dictionary`` object.

    Raises:
        IndexError: if ``target_urls`` has fewer entries than there are
            qualifications.
    """
    return_text = 'Returning to Qualification Details'
    qualifications = dictionary['qualifications']
    for index, degree in enumerate(qualifications):
        response = http.request('GET', arion_base_url + target_urls[index])
        soup = BeautifulSoup(response.data, 'lxml')
        paper_texts = [
            anchor.text
            for anchor in soup.find_all('a', 'Navigation')
            if anchor.text != return_text
        ]
        papers = qualifications[degree]
        paper_code = None
        for position, text in enumerate(paper_texts):
            # Compare by value: `is 0` tests object identity and only works
            # by accident of CPython's small-int caching.
            if position % 2 == 0:
                paper_code = text
                papers.setdefault(paper_code, [])
            else:
                papers[paper_code].append({'name': text})
    return dictionary

# extract requisite information for every existing courses at AUT
def get_requisites(dictionary: dict):
    """Add requisite information to the papers already listed in ``dictionary``.

    Every URL in ``target_urls`` is fetched relative to ``arion_base_url``.
    Pages without a ``<td width="150">`` cell are skipped; otherwise that
    cell's stripped text is used as the paper key under
    ``dictionary['papers']``.

    Args:
        dictionary: dict with a ``'papers'`` mapping; it is updated in place.
            Each page title must already be a key of that mapping, otherwise
            the lookup inside the loop raises ``KeyError``.

    Returns:
        The same ``dictionary`` object.
    """
    for url in target_urls:
        request = http.request('GET', arion_base_url + url)
        soup = BeautifulSoup(request.data, "lxml")
        # first <td> whose width attribute is '150'; presumably the paper code cell -- TODO confirm against the page markup
        title = soup.find('td', {'width': '150'})
        if title is not None:
            title = title.text.strip()
            # tables and anchors are searched independently, both by the same id prefix
            guide_urls = soup.find_all('table', id = re.compile('^wucControl_repQualifications__ctl1_wucPaperRequisites'))
            requisite_urls = soup.find_all('a', id = re.compile('^wucControl_repQualifications__ctl1_wucPaperRequisites'))
            # NOTE: this loop rebinds `url`, shadowing the outer loop variable of the same name
            for url in guide_urls:
                # text of the first <td> located after the table tag is used as the requisite-type key
                requisite_type = url.find_next('td')
                dictionary['papers'][title].setdefault(requisite_type.text, [])
                for tag in requisite_urls:
                    temp_list = dictionary['papers'][title][requisite_type.text]
                    if tag.text not in temp_list:
                        dictionary['papers'][title][requisite_type.text].append(tag.text)
                        nextTag = tag
                        # NOTE(review): removing from requisite_urls while iterating over it makes
                        # the loop skip the anchor that follows each removed one -- confirm intent
                        requisite_urls.remove(tag)
                        # NOTE(review): nextTag is `tag` itself, which came from find_all('a'),
                        # so this break looks unreachable -- possibly meant to inspect the following element
                        if nextTag.name != 'a':
                            break
            #print(title)
            #print(dictionary['papers'][title])
    return dictionary
                    
# sort paper entries in the right order of the paper code and its name
def sort_entries(entries: list):
    """Turn a flat ``[code, name, code, name, ...]`` list into a papers dict.

    Args:
        entries: flat list in which every even-indexed item is a paper code
            and the item after it is that paper's name.  A trailing unpaired
            item is ignored.

    Returns:
        ``{'papers': {code: {'name': name}, ...}}``.  A code that appears
        several times keeps its first position and takes the last name seen.
    """
    papers = {}
    # Dict keys already de-duplicate the codes.  The earlier
    # "{'name': ...} not in values()" guard compared a dict against the stored
    # name strings, so it was always true and is dropped here.
    for code, name in zip(entries[0::2], entries[1::2]):
        papers[code] = {'name': name}
    return {'papers': papers}

# add scrapped urls to target urls for the next round, then clear
def clean_up(is_refined: bool):
    """Promote the collected anchors to the next round's target URLs.

    ``target_urls`` is emptied and refilled with the ``href`` of every anchor
    in ``scrapped_urls``; afterwards ``scrapped_urls`` is emptied.

    Args:
        is_refined: when False, every ``'../'`` occurrence is stripped from
            each href before it is stored; when True hrefs are stored as-is.
    """
    target_urls.clear()
    for anchor in scrapped_urls:
        href = anchor.get('href')
        target_urls.append(href if is_refined else href.replace('../', ''))
    scrapped_urls.clear()

In [None]:
# arion scrapping process_01 -> getting qualification type links
# fetch the seed page(s) in target_urls and collect their Navigation anchors
scrap_arion_urls()
# hrefs are taken as-is (is_refined=True) and become the targets of the next round
clean_up(True)

In [None]:
# arion scrapping process_02 -> getting qualification links
scrap_arion_urls()
# record the qualification names before clean_up() empties scrapped_urls
qualification_dict = get_arion_entries('qualifications')
clean_up(True)

In [None]:
# arion scrapping process_03 -> getting table of papers links
scrap_arion_urls()
# is_refined=False: every '../' is stripped from the hrefs before they become targets
clean_up(False)

In [None]:
# arion scrapping process_04 -> getting degrees with a set of papers
# pairs the i-th qualification with the i-th entry of target_urls;
# NOTE: the dict is updated in place, so re-running this cell appends the paper names again
qualification_dict = get_arion_course_set(qualification_dict)

In [None]:
# arion scrapping process_05 -> getting paper links
# target_urls still holds the table-of-papers links from process_03 (process_04 only reads it)
scrap_arion_urls()

In [None]:
# arion scrapping process_06 -> getting a list of papers to parse in json for Rudy's paper list entity
# reads scrapped_urls without modifying it, so the anchors stay available for process_07
paper_list = get_arion_paper_list()

In [None]:
# arion scrapping process_07 -> getting a flat list (not a dict) of paper texts, to be sorted with requisites later on
return_text = 'Returning to Qualification Details'
# keep every scraped anchor text except the "return" navigation links
papers = [entry for entry in scrap_simple_entries() if entry != return_text]

In [None]:
# arion scrapping process_08 -> making a dictionary of papers only to store requisites and avoid duplicates
paper_dict = sort_entries(papers)
# the hrefs of the paper anchors become the targets of the requisite round
clean_up(True)

In [None]:
# arion scrapping process_09 -> scrapping requisite properties from arion to make a complete list
# fetches every paper page in target_urls and updates paper_dict in place
paper_dict = get_requisites(paper_dict)

In [None]:
# write a degree list with a corresponding set of papers to a json file for firebase
# 'w' mode: degrees.json in the current working directory is created or overwritten
with open('degrees.json', 'w') as outfile:
    json.dump(qualification_dict, outfile, indent = 4)

In [None]:
# write a paper list as a json file for dialogflow
# 'w' mode: papers.json in the current working directory is created or overwritten
with open('papers.json', 'w') as outfile:
    json.dump(paper_list, outfile, indent = 4)

In [None]:
# write a paper list with requisite information for firebase
# 'w' mode: requisites.json in the current working directory is created or overwritten
with open('requisites.json', 'w') as outfile:
    json.dump(paper_dict, outfile, indent = 4)