Crawler for Department of Computer Science (Type 3)
---

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 1. Metadata of CS Courses for an Academic Year
- All courses offered in 2022-23 are listed in HTML table format
- We use BeautifulSoup for the crawler

We crawl the following fields:
1. Year
2. Type (Core/Elective)
3. Course Code
4. Course Title
5. Term
6. Staff (can have multiple names)
7. Moodle link
8. Course description link
9. Staff link

In [3]:
from bs4 import BeautifulSoup
from time import sleep
import requests

**[Helper Function] Retrieve HTML of the metatable**

In [121]:
def get_meta_table_html(academic_year, timeout=30):
    """Fetch the course-offering page for one academic year and return its course table.

    Parameters
    ----------
    academic_year : str or int
        Value placed in the ``acadYear`` query parameter (e.g. ``'2022'``).
        It is converted with ``str()`` so an integer year also works.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    The second ``<table>`` element found on the page, as returned by
    BeautifulSoup's ``find_all``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    IndexError
        If the page contains fewer than two tables.
    """
    # define url to crawl based on academic year
    META_URL = 'https://www.cs.hku.hk/index.php/programmes/course-offered?acadYear=' + str(academic_year)

    # fetch the page; fail loudly on HTTP errors instead of parsing an error page
    meta_page = requests.get(META_URL, timeout=timeout)
    meta_page.raise_for_status()
    meta_soup = BeautifulSoup(meta_page.text, 'html.parser')

    # the course list is the second table on the page
    all_tables = meta_soup.find_all('table')
    if len(all_tables) < 2:
        raise IndexError(
            'expected at least 2 tables at %s but found %d' % (META_URL, len(all_tables))
        )

    return all_tables[1]

**[Function] Retrieve metatable dataframe of all courses offered in an academic year**

In [337]:
def get_meta_df(academic_year):
    """Build a DataFrame describing every course listed for an academic year.

    Each ``<tr>`` of the table returned by ``get_meta_table_html`` is walked
    child by child. For every child that exposes text (other than a bare
    newline) the text is collected, followed by the ``href`` of the first
    ``<a>`` inside it when there is one.

    Parameters
    ----------
    academic_year : str
        Passed to ``get_meta_table_html`` and stored in the 'Academic Year' column.

    Returns
    -------
    pandas.DataFrame
        Columns: 'Academic Year', 'Course Code', 'Moodle Link', 'Course Title',
        'Course Link', 'Term', 'Staff'.
    """
    # call helper function to get metatable HTML
    meta_table = get_meta_table_html(academic_year)

    # get data in each row
    rows = []
    for tr in meta_table.find_all('tr'):
        row = []
        for cell in tr:
            # children without a usable .text and bare newline separators carry no data
            text = getattr(cell, 'text', None)
            if text is None or text == '\n':
                continue
            row.append(text)

            # take the first link in the cell, if any; a cell without <a> yields
            # None, and a plain-string child yields an int from str.find —
            # neither has .get, so no link is recorded
            anchor = cell.find('a')
            link = anchor.get('href') if hasattr(anchor, 'get') else None
            if link is not None:
                row.append(link)

        # keep only rows that produced more than 4 fields (this drops short
        # rows such as, presumably, the header row)
        if len(row) > 4:
            rows.append(row)

    # define column names
    col_names = ['Course Code', 'Moodle Link', 'Course Title', 'Course Link', 'Term', 'Staff', 'Staff Link']

    # convert matrix into a dataframe
    df_meta = pd.DataFrame(rows, columns=col_names)

    # TODO: should we drop staff link or no?
    # drop the staff link (only the first staff member's link was captured)
    df_meta = df_meta.drop(columns=['Staff Link'])

    df_meta.insert(0, 'Academic Year', [academic_year] * len(df_meta))

    return df_meta

In [338]:
# Crawl the course list for academic year '2022' (get_meta_df issues a live HTTP request).
df_meta = get_meta_df('2022')

In [339]:
# Display the metadata table built in the previous cell.
df_meta

Unnamed: 0,Academic Year,Course Code,Moodle Link,Course Title,Course Link,Term,Staff
0,2022,COMP1117A,https://moodle.hku.hk/course/search.php?q=COMP...,"Computer Programming (ActSc, AppAI, DA, IS, Mi...",/index.php/programmes/course-offered?infile=20...,1,Chim T W\n\nTing HF\n\n
1,2022,COMP1117B,https://moodle.hku.hk/course/search.php?q=COMP...,"Computer Programming (Quant Fin, DA, Minor, 2n...",/index.php/programmes/course-offered?infile=20...,2,Choi Loretta\n\nTing HF\n\n
2,2022,ENGG1330A,https://moodle.hku.hk/course/search.php?q=_ENG...,Computer Programming I (A1 - M2),/index.php/programmes/course-offered?infile=20...,1,Schnieders Dirk\n\nChui CK\n\n
3,2022,ENGG1340A,https://moodle.hku.hk/course/search.php?q=_COM...,Computer Programming II,/index.php/programmes/course-offered?infile=20...,1,Chim T W\n\n
4,2022,ENGG1340B,https://moodle.hku.hk/course/search.php?q=_COM...,Computer Programming II,/index.php/programmes/course-offered?infile=20...,2,Qian Chenxiong\n\nChim T W\n\n
...,...,...,...,...,...,...,...
73,2022,COMP3362,https://moodle.hku.hk/course/search.php?q=COMP...,Hands-on AI: Experimentation & Applications,/index.php/programmes/course-offered?infile=20...,1,Choi Loretta\n\n
74,2022,COMP3366,https://moodle.hku.hk/course/search.php?q=COMP...,Quantum Algorithms and Computer Architecture,/index.php/programmes/course-offered?infile=20...,1,Yang Yuxiang\n\n
75,2022,COMP3516,https://moodle.hku.hk/course/search.php?q=COMP...,Data Analytics for IoT,/index.php/programmes/course-offered?infile=20...,2,Wu Chenshu\n\n
76,2022,FITE3010,https://moodle.hku.hk/course/search.php?q=FITE...,Big Data and Data Mining,/index.php/programmes/course-offered?infile=20...,2,Liu Qi\n\n


# 2. Detailed Course Info Page

In [314]:
# helper function to generate section table based on info type from the HTML
def get_section_table(COURSE_URL, info_type, timeout=30):
    """Fetch a course info page and return one of its large section tables.

    Section tables are the ``<table>`` elements with ``border="1"`` and
    ``width="99%"``.

    Parameters
    ----------
    COURSE_URL : str
        Absolute URL of the course info page.
    info_type : str
        ``'basic'`` selects the first section table; any other value
        (e.g. ``'learning_objectives'``, ``'description'``) selects the second.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    IndexError
        If the page does not contain the requested section table.
    """
    # fetch the page; fail loudly on HTTP errors instead of parsing an error page
    course_page = requests.get(COURSE_URL, timeout=timeout)
    course_page.raise_for_status()
    course_soup = BeautifulSoup(course_page.text, 'html.parser')

    # first find the two big section tables
    section_tables = course_soup.find_all('table', {'border': '1', 'width': '99%'})

    # 'basic' -> first table, everything else -> second table
    wanted = 0 if info_type == 'basic' else 1
    if len(section_tables) <= wanted:
        raise IndexError(
            'section table #%d not found at %s (page has %d section tables)'
            % (wanted + 1, COURSE_URL, len(section_tables))
        )

    return section_tables[wanted]

## 2.1 Basic course information
1. Course name
2. Instructor(s)
3. Number of credits
4. Recommended learning hours (?)
5. Pre-requisite(s)
6. Co-requisite(s)
7. Mutually exclusive with (e.g. ENGG1111 or ENGG1330)
8. Remarks
9. Moodle course link

In [340]:
# retrieve the basic info of a specific course page
# return: DATAFRAME

def get_specific_course_basic_info(COURSE_URL):
    """Scrape the 'basic' section table of a course page into a one-row DataFrame.

    Every ``<tr>`` found in the section table (including rows of nested
    tables, since ``find_all`` searches recursively) is reduced to the list of
    its children's texts, and then interpreted purely by its position:

    * row 0      -> 'Academic Year': text of the first cell up to the first '-'
    * row 1      -> column named by cell 3, value from cell 4
    * rows 2, 3  -> ignored
    * rows > 3   -> column named by cell 0, value from cell 1

    Parameters
    ----------
    COURSE_URL : str
        Absolute URL of the course info page.

    Returns
    -------
    pandas.DataFrame
        A single row whose columns are the labels collected above.

    Raises
    ------
    IndexError
        If a row has fewer cells than its position requires.
    """
    # get section table based on the info_type (in this case is 'basic')
    section_table = get_section_table(COURSE_URL, 'basic')

    col_names = []
    values = []
    for i, tr in enumerate(section_table.find_all('tr')):
        # collect the text of each child, skipping children that expose no
        # .text and the bare newline separators between cells
        cells = []
        for node in tr:
            text = getattr(node, 'text', None)
            if text is None or text == '\n':
                continue
            cells.append(text.strip('\n'))

        # TODO: needs BUG FIX - make this dynamic!!! (row positions are hard-coded)
        if i == 0:
            col_names.append('Academic Year')
            values.append(cells[0].split('-')[0])  # TODO: to delete later
        elif i == 1:
            col_names.append(cells[3].strip(':\n'))
            values.append(cells[4])
            # TODO: add instructor info
        elif i > 3:
            col_names.append(cells[0].strip(' :\n'))
            # drop surrounding blanks/newlines and non-breaking spaces from the value
            values.append(cells[1].strip(' \n').replace('\xa0', ''))
        # rows 2 and 3 are intentionally skipped

    df_course = pd.DataFrame(values).transpose()
    df_course.columns = col_names

    return df_course

## 2.2 Learning Objectives (table 2)
1. List of objectives

In [365]:
# retrieve the learning objectives of a specific course page
# return: LIST
def get_specific_course_learning_objectives(COURSE_URL):
    """Return the learning objectives listed on a course page.

    The objectives are read from the first ``<table>`` nested inside the
    second section table of the page, one objective per ``<tr>``. For each
    row, newlines are removed, carriage returns become spaces, and anything
    in front of the first ``[`` is dropped so the entry starts at its
    bracketed label.

    Parameters
    ----------
    COURSE_URL : str
        Absolute URL of the course info page.

    Returns
    -------
    list of str
        One cleaned string per table row, in page order.
    """
    import re

    # get section table based on the info_type (in this case is 'learning_objectives')
    section_table = get_section_table(COURSE_URL, 'learning_objectives')

    # since the learning objectives are in the first table under this section table, we only select the first table
    lo_table = section_table.find('table')

    learning_objectives = []
    for tr in lo_table.find_all('tr'):
        cleaned_text = tr.text.replace('\n', '').replace('\r', ' ')
        # Strip only the prefix before the FIRST '['. A greedy '.*\[' would
        # cut up to the last '[' and destroy objectives that contain another
        # bracket later in the sentence (e.g. "a[i]").
        cleaned_text = re.sub(r'^[^\[]*\[', '[', cleaned_text, count=1)
        learning_objectives.append(cleaned_text)

    return learning_objectives

In [366]:
# Example: scrape the learning objectives from the 2022 COMP1117 course page (live HTTP request).
get_specific_course_learning_objectives('https://www.cs.hku.hk/index.php/programmes/course-offered?infile=2022/comp1117.html')

['[Computational mind] Able to identify possible solutions for problems based on computer programs. ',
 '[Program implementation] Able to implement solutions for problems using Python ',
 '[Program comprehension] Able to understand programs written by others and participate in larger scale system implementation. ']

## 2.3 Syllabus
1. Course description
2. Detailed description (table form)
3. Assessment

In [445]:
# retrieve the course description of a specific course page
# return: SINGLE STRING

def get_specific_course_description(COURSE_URL):
    """Return the 'Calendar Entry' description text of a course page.

    The second section table of the page is searched for ``<p>`` elements
    whose first ``<u>`` child reads exactly ``'Calendar Entry:'``. The
    paragraph text, with newlines and that heading removed, is the
    description. If several paragraphs match, the last one wins.

    Parameters
    ----------
    COURSE_URL : str
        Absolute URL of the course info page.

    Returns
    -------
    str or None
        The description, or ``None`` when the page has no
        'Calendar Entry:' paragraph.
    """
    # the description lives in the second section table (any info_type other than 'basic')
    section_table = get_section_table(COURSE_URL, 'description')

    # default when no matching paragraph exists, so the return below never
    # hits an unbound local
    course_description = None

    for p in section_table.find_all('p'):
        # paragraphs are identified by their underlined heading; skip those without one
        heading = p.find('u')
        if heading is None:
            continue
        p_text = heading.text

        if p_text == 'Calendar Entry:':
            course_description = p.text.replace('\n', '').replace(p_text, '')

    return course_description

In [446]:
# Example: scrape the calendar-entry description from the 2022 COMP1117 course page (live HTTP request).
get_specific_course_description('https://www.cs.hku.hk/index.php/programmes/course-offered?infile=2022/comp1117.html')

'This is an introductory course in computer programming. Students will acquire basic Python programming skills, including syntax, identifiers, control statements, functions, recursions, strings, lists, dictionaries, tuples and files. Searching and sorting algorithms, such as sequential search, binary search, bubble sort, insertion sort and selection sort, will also be covered. '

## 2.4 Aggregate Scraping Results

In [464]:
# retrieve info from all course pages in a particular year 
# return: DATAFRAME
def get_all_course_basic_info(academic_year):
    """Scrape basic info and description for every course offered in a year.

    The course list comes from ``get_meta_df``. For each listed course the
    course page URL is formed by prefixing 'https://www.cs.hku.hk' to its
    'Course Link', the one-row basic-info frame is fetched, and the
    'Course Code' and 'Course Description' columns are added to it.

    Parameters
    ----------
    academic_year : str
        Academic year passed through to ``get_meta_df``.

    Returns
    -------
    pandas.DataFrame
        One row per course; columns are the union of all columns seen, in
        order of first appearance. Each row keeps the index of its one-row
        source frame, so call ``reset_index(drop=True)`` if a unique index
        is needed. An empty DataFrame is returned when no courses are listed.
    """
    df_meta = get_meta_df(academic_year)

    # Collect the per-course frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame inside a loop is quadratic.
    course_frames = []
    for course_link, course_code in zip(df_meta['Course Link'], df_meta['Course Code']):
        course_url = 'https://www.cs.hku.hk' + course_link

        row = get_specific_course_basic_info(course_url)
        row['Course Code'] = course_code
        row['Course Description'] = get_specific_course_description(course_url)

        course_frames.append(row)

    # pd.concat raises on an empty list, so handle "no courses" explicitly
    if not course_frames:
        return pd.DataFrame()

    return pd.concat(course_frames)
    

In [465]:
# Crawl every course page listed for '2022'; each course is fetched twice
# (basic info + description), so this cell may take a while to run.
get_all_course_basic_info('2022')

Unnamed: 0,Academic Year,No. of credit(s),Lecture,Lab session,Pre-requisite(s),Co-requisite(s),Mutually exclusive with,Remarks,Course Code,Course Description,Tutorial,Other,Recommended Learning Hours,Self-study & practical modules,Choi Loretta
0,2022,6,32.5,6.5,,,ENGG1111 or ENGG1330,,COMP1117A,This is an introductory course in computer pro...,,,,,
0,2022,6,32.5,6.5,,,ENGG1111 or ENGG1330,,COMP1117B,This is an introductory course in computer pro...,,,,,
0,2022,6,,,,,COMP1117 or ENGG1111,,ENGG1330A,This is an introductory course designed for fi...,26.0,13.0,,,
0,2022,6,2.0,,ENGG1330,,COMP2113 or COMP2123,,ENGG1340A,This course covers intermediate to advanced co...,13.0,,Self-study & practical modules:\n 39.0\n\...,39.0,
0,2022,6,2.0,,ENGG1330,,COMP2113 or COMP2123,,ENGG1340B,This course covers intermediate to advanced co...,13.0,,Self-study & practical modules:\n 39.0\n\...,39.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,2022,6,,,COMP3314,,COMP3359,,COMP3362,This course allows students to experience a co...,,,,,
0,2022,6,,,MATH1853; and COMP2119,,,,COMP3366,Quantum computing can perform hard computation...,3.0,,,,
0,2022,6,,,COMP2119,,,,COMP3516,"This course introduces basic concepts, technol...",6.0,,,,
0,2022,6,,,FITE1010 or MATH1853 or MATH2101; and COMP2119...,,COMP3323,,FITE3010,To study some important topics and techniques ...,11.0,,,,
