Crawler for FAQ Database
---

In [131]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

For our database, we will be crawling FAQs from the following websites:
1. Official HKU website on admissions: https://admissions.hku.hk/faqs
    - HKDSE (Local Students)
    - HKDSE (Non-local Students)
    - International/Overseas Qualifications
    - Bachelor's Degree
    - Hong Kong Sub-degrees (for senior year entry)
2. AAO: https://aao.hku.hk/faq/
3. HKU Science programmes
    - BSc 6901: https://www.scifac.hku.hk/prospective/ug/6901-bsc/faq#On-Admission

---
# 1. FAQ - Official HKU website on admissions
1. HKDSE (Local Students)
2. HKDSE (Non-local Students)
3. International/Overseas Qualifications
4. Bachelor's Degree
5. Hong Kong Sub-degrees (for senior year entry)

method: manual data entry

---
# 2. FAQ - AAO
https://aao.hku.hk/faq/

In [269]:
from bs4 import BeautifulSoup
from time import sleep
import requests

In [147]:
# helper function
def get_faq_html(url, verify=True, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    verify : bool, default True
        Forwarded to ``requests.get``; pass ``False`` to skip TLS
        certificate verification.
    timeout : float or tuple, default 30
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without it a stalled connection would block the kernel forever.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failure, timeout, or a 4xx/5xx response.
    """
    page = requests.get(url, verify=verify, timeout=timeout)

    # Fail loudly on HTTP errors so an error page is never scraped and
    # saved as if it were FAQ content.
    page.raise_for_status()

    return BeautifulSoup(page.text, 'html.parser')

In [148]:
# helper function
def get_aao_faq(url):
    """Scrape the AAO FAQ page into a DataFrame of question/answer pairs.

    Every ``<p>`` element on the page is inspected. Its text is split at the
    first newline: the part before it (stripped) is the question, the rest
    (with tab characters removed) is the answer. Paragraphs whose text
    contains no newline are skipped.

    Parameters
    ----------
    url : str
        Address of the AAO FAQ page.

    Returns
    -------
    pandas.DataFrame
        One row per FAQ, with columns ``'Question'`` and ``'Answer'``.
    """
    # NOTE(review): TLS certificate verification is disabled for this request.
    # Presumably the site's certificate chain fails validation; confirm this
    # is still required before relying on the scraped content.
    soup = get_faq_html(url, verify=False)

    rows = []
    for faq in soup.find_all('p'):
        parts = faq.text.split('\n', 1)

        # A paragraph without a newline has no answer part; skip it
        # explicitly instead of relying on a catch-all ``except``.
        if len(parts) < 2:
            continue

        question = parts[0].strip()
        answer = parts[1].replace('\t', '')
        rows.append([question, answer])

    return pd.DataFrame(rows, columns=['Question', 'Answer'])

In [149]:
# Crawl the AAO FAQ page and collect its question/answer pairs
# into a DataFrame (columns: 'Question', 'Answer').
df_aao_faq = get_aao_faq('https://aao.hku.hk/faq/')



In [138]:
# Save the AAO FAQs DataFrame to CSV.
# NOTE(review): to_csv is called with its defaults, so the row index is
# written as an extra leading column — confirm downstream readers expect it.
# Assumes the 'datasets/' directory already exists — TODO confirm.
df_aao_faq.to_csv('datasets/df_aao_faq.csv')

---
# 3. FAQ - HKU Science programmes
- BSc 6901 (Bachelor of Science): https://www.scifac.hku.hk/prospective/ug/6901-bsc/faq
- BASc (Bachelor of Arts & Sciences): https://www.socsc.hku.hk/basc/prospective-students/faq/

## 3.1 FAQ - BSc 6901 Bachelor of Science

In [264]:
# helper function
def get_6901_faq(url):
    """Scrape the BSc 6901 FAQ page into a DataFrame of question/answer pairs.

    Questions are read from ``<div class="panel__heading">`` elements and
    answers from ``<div class="panel__collapse collapse">`` elements; the two
    lists are paired by position.

    Parameters
    ----------
    url : str
        Address of the BSc 6901 FAQ page.

    Returns
    -------
    pandas.DataFrame
        One row per FAQ, with columns ``'Question'`` and ``'Answer'``.

    Raises
    ------
    ValueError
        If the page yields a different number of questions and answers,
        since positional pairing would then be unreliable.
    """
    def _clean(text):
        # Trim surrounding whitespace and turn non-breaking spaces into plain ones.
        return text.strip().replace('\xa0', ' ')

    soup = get_faq_html(url)

    question_nodes = soup.find_all('div', {'class': 'panel__heading'})
    # The attrs filter must be a dict; the multi-word value matches the exact
    # string of the class attribute.
    answer_nodes = soup.find_all('div', {'class': 'panel__collapse collapse'})

    questions = [_clean(node.text) for node in question_nodes]

    answers = []
    for node in answer_nodes:
        bullets = node.find_all('li')
        if bullets:
            # Bulleted answers are flattened to a single '; '-separated line;
            # only the <li> texts are kept.
            answers.append('; '.join(_clean(li.text) for li in bullets))
        else:
            answers.append(_clean(node.text).replace('\r\n', ' '))

    if len(questions) != len(answers):
        raise ValueError(
            'Found {} questions but {} answers at {}; the page layout may '
            'have changed.'.format(len(questions), len(answers), url)
        )

    return pd.DataFrame({'Question': questions, 'Answer': answers})

In [267]:
# Crawl the BSc 6901 FAQ page and collect its question/answer pairs
# into a DataFrame (columns: 'Question', 'Answer').
df_6901_faq = get_6901_faq('https://www.scifac.hku.hk/prospective/ug/6901-bsc/faq')

In [268]:
# Save the BSc 6901 FAQs DataFrame to CSV.
# NOTE(review): to_csv is called with its defaults, so the row index is
# written as an extra leading column — confirm downstream readers expect it.
# Assumes the 'datasets/' directory already exists — TODO confirm.
df_6901_faq.to_csv('datasets/df_6901_faq.csv')

## 3.2 FAQ - BASc Bachelor of Arts & Sciences
method: manual data entry