Crawler for Department of Computer Science (Type 3)
---

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 1. Metadata of CS Courses for an Academic Year
- All courses offered in 2022-23 are listed in HTML table format
- We use BeautifulSoup for the crawler

We crawl the following fields:
1. Year
2. Type (Core/Elective)
3. Course Code
4. Course Title
5. Term
6. Staff (can have multiple names)
7. Moodle link
8. Course description link
9. Staff link

In [2]:
from bs4 import BeautifulSoup
from time import sleep
import requests

**[Helper Function] Retrieve HTML of the metatable**

In [3]:
def get_meta_table_html(academic_year):
    """Fetch the course-offering page for one academic year and return its course table.

    Args:
        academic_year: Academic year as a string (e.g. '2022'); it is appended
            verbatim to the ``acadYear`` query parameter of the page URL.

    Returns:
        The second ``<table>`` element found on the page (a BeautifulSoup tag).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within the timeout.
        IndexError: If the page contains fewer than two tables.
    """
    # define url to crawl based on academic year
    META_URL = 'https://www.cs.hku.hk/index.php/programmes/course-offered?acadYear=' + academic_year

    # a timeout keeps an unresponsive server from hanging the notebook forever
    meta_page = requests.get(META_URL, timeout=30)
    # fail loudly on an error status instead of parsing an error page
    meta_page.raise_for_status()
    meta_soup = BeautifulSoup(meta_page.text, 'html.parser')

    # the second table on the page is the one we want to crawl
    page_tables = meta_soup.find_all('table')
    if len(page_tables) < 2:
        raise IndexError(
            'expected at least 2 tables on ' + META_URL + ', found ' + str(len(page_tables))
        )

    return page_tables[1]

**[Function] Retrieve metatable dataframe of all courses offered in an academic year**

In [4]:
def get_meta_df(academic_year):
    """Build the metadata dataframe of all courses offered in an academic year.

    Every ``<tr>`` of the course table is flattened into a list holding, for
    each child node, its text followed by the ``href`` of its first ``<a>``
    (when it has one).

    Args:
        academic_year: Academic year as a string (e.g. '2022'), forwarded to
            ``get_meta_table_html``.

    Returns:
        A dataframe with the columns 'Academic Year', 'Course Code',
        'Moodle Link', 'Course Title', 'Course Link', 'Term' and 'Staff'.
    """
    # call helper function to get metatable HTML
    meta_table = get_meta_table_html(academic_year)

    # get data in each row
    rows = []
    for tr in meta_table.find_all('tr'):
        row = []
        for td in tr:
            try:
                if td.text != '\n':
                    row.append(td.text)
                    link = td.find('a').get('href')
                    if link is not None:
                        row.append(link)
            except AttributeError:
                # raised when a child has no <a> (find() returns None) or is a
                # bare text node; only this error is expected, so nothing else
                # (e.g. KeyboardInterrupt) is swallowed
                continue

        # keep only rows that yielded more than 4 values
        # (presumably this filters out header / section rows - TODO confirm)
        if len(row) > 4:
            rows.append(row)

    # define column names
    col_names = ['Course Code', 'Moodle Link', 'Course Title', 'Course Link', 'Term', 'Staff', 'Staff Link']

    # convert matrix into a dataframe
    df_meta = pd.DataFrame(rows, columns=col_names)

    # TODO: should we drop staff link or no?
    # drop the staff link (returns a new frame instead of mutating in place)
    df_meta = df_meta.drop(columns=['Staff Link'])

    df_meta.insert(0, 'Academic Year', [academic_year] * len(df_meta))

    return df_meta

In [5]:
# crawl the course metadata table for academic year '2022' (performs an HTTP request)
df_meta = get_meta_df('2022')

In [28]:
# save the meta dataset
# NOTE: the 'data/' directory must already exist; the row index is written as the
# first (unnamed) CSV column because index=False is not passed
df_meta.to_csv('data/df_cs_courses_pre.csv')

**Generate an extension dataset that contains course and staff member (can be multiple)**

In [6]:
# take an explicit copy so that later column assignments modify this frame only
# and do not trigger SettingWithCopyWarning against df_meta
df_courses_and_professors = df_meta[['Course Code', 'Course Title', 'Term', 'Staff']].copy()

In [7]:
# split the 'Staff' column into multiple rows if there are more than 1 staff for that course
# (assign() returns a new frame, so nothing is written into a slice of another frame)
df_courses_and_professors = (
    df_courses_and_professors
    .assign(
        Staff=lambda df: df['Staff'].apply(
            lambda x: x.strip().replace('\n\n', '\n').split('\n')
        )
    )
    .explode('Staff')
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [8]:
# explode() repeats the original index label for every split row, so renumber 0..n-1
df_courses_and_professors = df_courses_and_professors.reset_index(drop=True)
# display the resulting course/staff pairs
df_courses_and_professors

Unnamed: 0,Course Code,Course Title,Term,Staff
0,COMP1117A,"Computer Programming (ActSc, AppAI, DA, IS, Mi...",1,Chim T W
1,COMP1117A,"Computer Programming (ActSc, AppAI, DA, IS, Mi...",1,Ting HF
2,COMP1117B,"Computer Programming (Quant Fin, DA, Minor, 2n...",2,Choi Loretta
3,COMP1117B,"Computer Programming (Quant Fin, DA, Minor, 2n...",2,Ting HF
4,ENGG1330A,Computer Programming I (A1 - M2),1,Schnieders Dirk
...,...,...,...,...
82,COMP3516,Data Analytics for IoT,2,Wu Chenshu
83,FITE3010,Big Data and Data Mining,2,Liu Qi
84,FITE3012,E-payment and Crypto-currency,1,Au Allen
85,COMP7310,Artificial intelligence of things,1,Wu Chenshu


In [40]:
# save the dataset to csv
# NOTE: the 'data/' directory must already exist; the row index is written as the
# first (unnamed) CSV column because index=False is not passed
df_courses_and_professors.to_csv('data/df_cs_courses_and_professors.csv')

# 2. Detailed Course Info Page

In [9]:
# helper function to generate section table based on info type from the HTML
def get_section_table(COURSE_URL, info_type):
    """Download a course info page and return one of its two big section tables.

    Args:
        COURSE_URL: Full URL of the course info page.
        info_type: 'basic' selects the first section table; any other value
            selects the second one.

    Returns:
        The selected ``<table border="1" width="99%">`` element (a BeautifulSoup tag).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within the timeout.
        IndexError: If the page has fewer section tables than required.
    """
    # a timeout keeps an unresponsive server from hanging the notebook forever
    course_page = requests.get(COURSE_URL, timeout=30)
    # fail loudly on an error status instead of parsing an error page
    course_page.raise_for_status()
    course_soup = BeautifulSoup(course_page.text, 'html.parser')

    # the two big section tables are identified by their border/width attributes
    section_tables = course_soup.find_all('table', {'border': '1', 'width': '99%'})

    table_index = 0 if info_type == 'basic' else 1
    if len(section_tables) <= table_index:
        raise IndexError(
            'expected at least ' + str(table_index + 1) + ' section table(s) on '
            + str(COURSE_URL) + ', found ' + str(len(section_tables))
        )

    return section_tables[table_index]

## 2.1 Basic course information
1. Course name
2. Instructor(s)
3. Number of credits
4. Recommended learning hours (?)
5. Pre-requisite(s)
6. Co-requisite(s)
7. Mutually exclusive with (e.g. ENGG1111 or ENGG1330)
8. Remarks
9. Moodle course link

In [10]:
# retrieve the basic info of a specific course page
# return: DATAFRAME

def get_specific_course_basic_info(COURSE_URL):
    """Scrape the 'basic' section table of a course page into a one-row dataframe.

    The parsing relies on a fixed row layout of the table:
      * row 0    -> 'Academic Year' (text before the first '-' of the first cell)
      * row 1    -> column name taken from cell 3, value taken from cell 4
      * rows 2-3 -> skipped
      * rows > 3 -> column name taken from cell 0, value taken from cell 1

    Args:
        COURSE_URL: Full URL of the course info page.

    Returns:
        A dataframe with a single row; its columns are the field names
        collected from the table.

    Raises:
        IndexError: If a parsed row has fewer cells than the layout above expects.
    """
    # get section table based on the info_type (in this case is 'basic')
    section_table = get_section_table(COURSE_URL, 'basic')

    col_names = []
    values = []
    for i, tr in enumerate(section_table.find_all('tr')):
        row = []
        for td in tr:
            try:
                if td.text != '\n':
                    row.append(td.text.strip('\n'))
            except AttributeError:
                # child node without a .text attribute; skip it without
                # swallowing unrelated errors such as KeyboardInterrupt
                continue

        # TODO: needs BUG FIX - make this dynamic!!!
        if i == 0:
            col_names.append('Academic Year')
            values.append(row[0].split('-')[0])  # TODO: to delete later
        elif i == 1:
            col_names.append(row[3].strip(':\n'))
            values.append(row[4])
            # TODO: add instructor info
        elif i > 3:
            col_names.append(row[0].strip(' :\n'))
            values.append(row[1].strip(' \n').replace('\xa0', ''))
        # rows 2 and 3 carry nothing that is extracted here

    # one column of values -> transpose into one row, then label the columns
    df_course = pd.DataFrame(values).transpose()
    df_course.columns = col_names

    return df_course

In [11]:
# example: scrape the basic info of COMP2501 (2022) and display the one-row result
get_specific_course_basic_info('https://www.cs.hku.hk/index.php/programmes/course-offered?infile=2022/comp2501.html')

Unnamed: 0,Academic Year,No. of credit(s),Lecture,Tutorial,Pre-requisite(s),Co-requisite(s),Mutually exclusive with,Remarks
0,2022,6,27.0,12.0,COMP1117 or ENGG1330,,STAT1005 or STAT1015,


## 2.2 Learning Objectives (table 2)
1. List of objectives

In [12]:
# retrieve the learning objectives of a specific course page
# return: STRING
def get_specific_course_learning_objectives(COURSE_URL):
    """Scrape the learning objectives of a course page into one '; '-joined string.

    Each row of the first table nested in the second section table is treated
    as one objective. Whatever precedes the first '[' of a row is dropped, so
    the objective starts at its '[Title]' tag.

    Args:
        COURSE_URL: Full URL of the course info page.

    Returns:
        All objectives joined with '; ', or an empty string when the section
        contains no nested table.
    """
    import re

    # any info_type other than 'basic' selects the second section table
    section_table = get_section_table(COURSE_URL, 'learning_objectives')

    # since the learning objectives are in the first table under this section table, we only select the first table
    lo_table = section_table.find('table')
    if lo_table is None:
        return ''

    learning_objectives = []
    for tr in lo_table.find_all('tr'):
        cleaned_text = tr.text.strip().replace('\n', ' ').replace('\r', ' ')
        # strip only the prefix before the FIRST '['; a greedy '.*\[' would cut
        # everything up to the LAST '[' and truncate objectives whose text
        # itself contains brackets
        cleaned_text = re.sub(r'^[^\[]*\[', '[', cleaned_text, count=1)
        learning_objectives.append(cleaned_text)

    return '; '.join(learning_objectives)

In [13]:
# example: scrape the learning objectives of COMP2501 (2022)
get_specific_course_learning_objectives('https://www.cs.hku.hk/index.php/programmes/course-offered?infile=2022/comp2501.html')

'[Data Preparation and Manipulation] Able to demonstrate practical knowledge in data preparation and data manipulation.; [Data Analysis] Able to use appropriate modelling and analysis techniques for data science problems.; [Implementation] Able to implement practical solutions for data science problems.; [Visualization and Communication] Able to communicate data analysis results effectively.'

## 2.3 Syllabus
1. Course description
2. Detailed description (table form)
3. Assessment

In [14]:
# retrieve the course description of a specific course page
# return: SINGLE STRING
def get_specific_course_description(COURSE_URL):
    """Scrape the 'Calendar Entry:' paragraph of a course page.

    Args:
        COURSE_URL: Full URL of the course info page.

    Returns:
        The paragraph text with newlines and the 'Calendar Entry:' heading
        removed, or None when the page has no such paragraph. If several
        paragraphs match, the last one wins.
    """
    # any info_type other than 'basic' selects the second section table
    section_table = get_section_table(COURSE_URL, 'description')

    # default for pages without a 'Calendar Entry:' paragraph, so the return
    # below never hits an unbound local
    course_description = None
    for p in section_table.find_all('p'):
        # the paragraph heading is the underlined part; paragraphs without one are skipped
        heading = p.find('u')
        if heading is None:
            continue

        p_text = heading.text
        if p_text == 'Calendar Entry:':  # get course description
            course_description = p.text.replace('\n', '').replace(p_text, '')

    return course_description

In [15]:
# example: scrape the calendar-entry description of COMP1117 (2022)
get_specific_course_description('https://www.cs.hku.hk/index.php/programmes/course-offered?infile=2022/comp1117.html')

'This is an introductory course in computer programming. Students will acquire basic Python programming skills, including syntax, identifiers, control statements, functions, recursions, strings, lists, dictionaries, tuples and files. Searching and sorting algorithms, such as sequential search, binary search, bubble sort, insertion sort and selection sort, will also be covered. '

In [16]:
# retrieve the course assessment information of a specific course
# return: TUPLE consisting of 2 LISTS
def get_specific_course_assessment(COURSE_URL):
    """Scrape the 'Assessment:' paragraph of a course page.

    The paragraph is split into lines; after the heading line, the lines are
    read as alternating assessment type / weighting entries.

    Args:
        COURSE_URL: Full URL of the course info page.

    Returns:
        A tuple ``(assessment_types, assessment_weightings)`` of two lists.
        Each type has surrounding ':' stripped and the suffix
        ' Weighting in final course grade (%)' appended; each weighting has
        surrounding whitespace and '%' stripped. Both lists are empty when the
        page has no 'Assessment:' paragraph.
    """
    # any info_type other than 'basic' selects the second section table
    section_table = get_section_table(COURSE_URL, 'assessment')

    # default for pages without an 'Assessment:' paragraph, so the code below
    # never hits an unbound local
    course_assessments = []
    for p in section_table.find_all('p'):
        # the paragraph heading is the underlined part; paragraphs without one are skipped
        heading = p.find('u')
        if heading is None:
            continue

        if heading.text == 'Assessment:':  # get course assessment info
            course_assessments = p.get_text(strip=True, separator='\n').strip().split('\n')

    # since the first element in the list is the heading, we remove it from the list
    course_assessments = course_assessments[1:]

    # break up the list into 2 lists: assessment_types and assessment_weightings
    assessment_types = [
        entry.strip(':') + ' Weighting in final course grade (%)'
        for entry in course_assessments[::2]  # even indices
    ]
    assessment_weightings = [
        entry.strip().strip('%')
        for entry in course_assessments[1::2]  # odd indices
    ]

    return assessment_types, assessment_weightings

In [17]:
# example: scrape the assessment types and weightings of COMP1117 (2022)
get_specific_course_assessment('https://www.cs.hku.hk/index.php/programmes/course-offered?infile=2022/comp1117.html')

(['Continuous Assessment Weighting in final course grade (%)',
  'Written Examination Weighting in final course grade (%)'],
 ['50', '50'])

## 2.4 Aggregate Scraping Results

In [18]:
# retrieve info from all course pages in a particular year 
# return: DATAFRAME
def get_all_course_basic_info(academic_year):
    """Scrape every course page of an academic year into one dataframe.

    For each course in the metadata table this collects the basic info, the
    course code and title, the course description, the learning outcomes and
    one column per assessment type.

    Args:
        academic_year: Academic year as a string (e.g. '2022'), forwarded to
            ``get_meta_df``.

    Returns:
        A dataframe with one row per course. Columns are the union of the
        fields found across all courses. Every row keeps index label 0, so
        callers may want ``reset_index(drop=True)``. An empty dataframe is
        returned when the metadata table lists no courses.
    """
    df_meta = get_meta_df(academic_year)

    course_urls = 'https://www.cs.hku.hk' + df_meta['Course Link']

    # collect the per-course frames and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0 and growing a frame inside a
    # loop is quadratic
    course_frames = []
    for course_url, course_code, course_title in zip(
        course_urls, df_meta['Course Code'], df_meta['Course Title']
    ):
        row = get_specific_course_basic_info(course_url)
        row['Course Code'] = course_code
        row['Course Title'] = course_title
        row['Course Description'] = get_specific_course_description(course_url)
        row['Learning Outcomes'] = get_specific_course_learning_objectives(course_url)
        assessment_types, assessment_weightings = get_specific_course_assessment(course_url)

        # separate index name so the loop does not clobber any outer counter
        for j, assessment_type in enumerate(assessment_types):
            row[assessment_type] = assessment_weightings[j]

        course_frames.append(row)

    if not course_frames:
        # pd.concat raises on an empty list
        return pd.DataFrame()

    return pd.concat(course_frames)
    

In [19]:
# crawl every course page listed for '2022' (several HTTP requests per course, so this cell is slow)
df_all_course_details = get_all_course_basic_info('2022')

In [20]:
# save the dataset to csv
# reset_index(drop=True) renumbers the rows 0..n-1 before writing;
# NOTE: the 'data/' directory must already exist
df_all_course_details.reset_index(drop=True).to_csv('data/df_cs_all_course_details_prelim.csv')