Crawler for FAQ Database
---

In [131]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

For our database, we will be crawling FAQs from the following websites:
1. Official HKU website on admissions: https://admissions.hku.hk/faqs
    - HKDSE (Local Students)
    - HKDSE (Non-local Students)
    - International/Overseas Qualifications
    - Bachelor's Degree
    - Hong Kong Sub-degrees (for senior year entry)
2. AAO: https://aao.hku.hk/faq/
3. HKU Science programmes
    - BSc 6901: https://www.scifac.hku.hk/prospective/ug/6901-bsc/faq#On-Admission

---
# 1. Official HKU website on admissions
1. HKDSE (Local Students)
2. HKDSE (Non-local Students)
3. International/Overseas Qualifications
4. Bachelor's Degree
5. Hong Kong Sub-degrees (for senior year entry)

In [27]:
from bs4 import BeautifulSoup
from time import sleep
import requests

In [30]:
def get_html(url, timeout=30):
    """Download a web page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests`` may block indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the built-in ``'html.parser'``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.RequestException
        On connection problems or when the timeout expires.
    """
    page = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of silently parsing an error page.
    page.raise_for_status()

    return BeautifulSoup(page.text, 'html.parser')

In [31]:
# Fetch the HKDSE FAQ page; as the last expression of the cell, the parsed
# HTML is echoed as the cell output so the page structure can be inspected.
get_html('https://admissions.hku.hk/faqs/hkdse')

<!DOCTYPE html>

<html dir="ltr" lang="en" prefix="content: http://purl.org/rss/1.0/modules/content/  dc: http://purl.org/dc/terms/  foaf: http://xmlns.com/foaf/0.1/  og: http://ogp.me/ns#  rdfs: http://www.w3.org/2000/01/rdf-schema#  schema: http://schema.org/  sioc: http://rdfs.org/sioc/ns#  sioct: http://rdfs.org/sioc/types#  skos: http://www.w3.org/2004/02/skos/core#  xsd: http://www.w3.org/2001/XMLSchema# ">
<head>
<meta charset="utf-8"/>
<script async="" src="https://www.googletagmanager.com/gtag/js?id=UA-193577211-1"></script>
<script>window.dataLayer = window.dataLayer || [];function gtag(){dataLayer.push(arguments)};gtag("js", new Date());gtag("config", "UA-193577211-1", {"groups":"default","anonymize_ip":true});</script>
<meta content="Drupal 8 (https://www.drupal.org)" name="Generator"/>
<meta content="width" name="MobileOptimized"/>
<meta content="true" name="HandheldFriendly"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<style>div#sliding-popup

---
# 2. AAO
https://aao.hku.hk/faq/

In [38]:
# helper function
# retrieve HTML of the FAQs
def get_faq_html(url, verify=False, timeout=30):
    """Download an FAQ page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the FAQ page to fetch.
    verify : bool or str, optional
        Passed straight to ``requests.get``. The default ``False`` keeps the
        original behaviour of skipping TLS certificate verification.
        SECURITY NOTE: with verification disabled the connection can be
        intercepted; pass ``True`` (or a CA bundle path) whenever the
        server's certificate chain validates.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout ``requests`` may block indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the built-in ``'html.parser'``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.RequestException
        On connection problems or when the timeout expires.
    """
    page = requests.get(url, verify=verify, timeout=timeout)
    # Fail loudly on an error status instead of silently parsing an error page.
    page.raise_for_status()

    return BeautifulSoup(page.text, 'html.parser')

In [141]:
# helper function
# retrieve dataframe of all AAO FAQs
# return: DATAFRAME

def get_aao_faq(url):
    """Scrape an FAQ page into a DataFrame of question/answer pairs.

    Every ``<p>`` element whose text contains a newline is treated as one
    FAQ entry: the text before the first newline becomes the question
    (surrounding whitespace stripped) and everything after it becomes the
    answer (tab characters removed). ``<p>`` elements without a newline are
    skipped.

    Parameters
    ----------
    url : str
        Address of the FAQ page; fetched through ``get_faq_html``.

    Returns
    -------
    pandas.DataFrame
        One row per FAQ with the columns ``'Question'`` and ``'Answer'``.
        The frame is empty (but has both columns) when nothing matches.
    """
    soup = get_faq_html(url)

    # NOTE(review): this presumes each FAQ on the page is a single <p> with
    # the question on its first line -- confirm against the live markup.
    rows = []
    for faq in soup.find_all('p'):
        # Split once at the first newline; an empty separator means there
        # was no newline, i.e. the paragraph is not a question/answer pair.
        question, newline, answer = faq.text.partition('\n')
        if not newline:
            continue

        rows.append([question.strip(), answer.replace('\t', '')])

    return pd.DataFrame(rows, columns=['Question', 'Answer'])

In [142]:
# collect AAO FAQs into a DataFrame with 'Question' and 'Answer' columns
df_aao_faq = get_aao_faq('https://aao.hku.hk/faq/')



In [138]:
# save AAO FAQs dataframe to csv (relative path, so it lands in the current
# working directory); to_csv is called with its defaults, so the DataFrame
# index is written as the first, unnamed column
df_aao_faq.to_csv('df_aao_faq.csv')