In [8]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.support.wait import WebDriverWait
import pandas as pd
import re
from selenium.common.exceptions import NoSuchElementException
import os
from selenium.webdriver.support.color import Color
from tqdm.notebook import trange
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [9]:
# Chrome options shared by every scraper call below.
# "--headless=new" is passed so Chrome is started in its headless mode.
options = Options()
options.add_argument("--headless=new")

In [14]:
# Entry page of the "Content of Courses" listing that every scraper opens.
NTU_CONTENT_OF_COURSES_LINK = 'https://wis.ntu.edu.sg/webexe/owa/aus_subj_cont.main'

# Values selected in the page's `acadsem` dropdown.
ACAD_YEAR_2022_Sem1 = "2022_1"
ACAD_YEAR_2022_Sem2 = "2022_2"

# Location of the ChromeDriver executable, relative to the working directory.
# Built with os.path.join rather than a "\c..." string literal: "\c" is an
# invalid escape sequence that Python only tolerates with a warning.
CHROME_DRIVER = os.path.join(os.getcwd(), "chromedriver-win64", "chromedriver.exe")

In [15]:
def _parse_single_degree_module(driver, module_table, module_index):
    """Parse one course ``<table>`` of the single-degree listing into a dict.

    Args:
        driver: WebDriver that is currently switched into the listing iframe.
        module_table: The ``<table>`` WebElement to parse.
        module_index: Zero-based position of ``module_table`` among the
            iframe's tables; used to build the absolute XPath of a cell.

    Returns:
        dict keyed by output column name. Only fields found in the table are
        set. ``'Prerequisites'``, when present, is always a list of strings.
    """
    module_dict = {}
    rows = module_table.find_elements(By.TAG_NAME, 'tr')

    for row_index, row in enumerate(rows):
        cells = row.find_elements(By.TAG_NAME, 'td')
        if not cells:
            # Nothing to classify in a row without <td> cells; indexing
            # cells[0] would raise IndexError.
            continue

        # Every `.text` access is a WebDriver round trip, so read it once.
        label = cells[0].text

        if re.search(r'\b[A-Z]{2}\d{4}\b', label) and len(label) == 6:
            print("Course Code: " + label)
            module_dict['Course Code'] = label
            module_dict['Course Name'] = cells[1].text
            module_dict['Academic Units'] = cells[2].text
        elif label == 'Grade Type:':
            module_dict['Grade Type'] = cells[1].text
        elif label == 'Prerequisite:':
            module_dict.setdefault('Prerequisites', []).append(cells[1].text)
        elif label == 'Mutually exclusive with:':
            module_dict['Mutually Exclusive'] = cells[1].text
        elif label in ('Not offered as Broadening and Deepening Elective',
                       'Not offered as Unrestricted Elective'):
            module_dict['BDE'] = 'No'
        elif label == '':
            # A row with an empty first cell is treated as a prerequisite
            # continuation when its second cell holds bold text coloured
            # #ff00ff.
            try:
                color = driver.find_element(By.XPATH, "/html/body/table[%d]/tbody/tr[%d]/td[2]/b/font" % (module_index + 1, row_index + 1)).value_of_css_property("color")
                if Color.from_string(color).hex == '#ff00ff':
                    # Append to the list. The previous `+= " " + text` on a
                    # list spliced the text in one character at a time.
                    module_dict.setdefault('Prerequisites', []).append(cells[1].text)
            except NoSuchElementException:
                # NOTE(review): label is '' here, so this stores an empty
                # string and would replace a description captured earlier;
                # kept as in the original, confirm this is intended.
                module_dict['Course Information'] = label
        else:
            module_dict['Course Information'] = label

    return module_dict


def scrape_single_degrees(link, acad_year, options, driver_location):
    """Scrape the course listings of all single-degree programmes.

    Opens ``link`` in Chrome, selects ``acad_year`` in the ``acadsem``
    dropdown, then walks the ``r_course_yr`` dropdown from its second entry
    until it meets ``'---Double Degree---'`` or an empty entry. For every
    programme the listing is loaded into the page's iframe and each
    ``<table>`` found there becomes one output row.

    Args:
        link: URL of the course-content page.
        acad_year: Value to select in the ``acadsem`` dropdown.
        options: Chrome ``Options`` passed to the driver.
        driver_location: Path to the ChromeDriver executable.

    Returns:
        pandas.DataFrame with the columns 'Course Code', 'Course Name',
        'Academic Units', 'Faculty', 'BDE', 'Grade Type', 'Prerequisites',
        'Mutually Exclusive' and 'Course Information'. 'Faculty' is always
        'NA'; 'BDE' defaults to 'Yes' and 'Grade Type' to 'Letter Graded'.
        'Prerequisites', when present, is a list of strings.
    """
    columns = ['Course Code', 'Course Name', 'Academic Units', 'Faculty', 'BDE', 'Grade Type', 'Prerequisites', 'Mutually Exclusive', 'Course Information']
    # Rows are collected here and turned into a DataFrame once at the end;
    # DataFrame.append is deprecated and copies the frame on every call.
    records = []

    service = Service(driver_location)
    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.get(link)

        acad_year_dropdown = Select(driver.find_element(By.NAME, 'acadsem'))
        acad_year_dropdown.select_by_value(acad_year)

        program_dropdown = Select(driver.find_element(By.NAME, 'r_course_yr'))
        list_of_programs = program_dropdown.options

        for index in trange(1, len(list_of_programs)):
            program_name = list_of_programs[index].text

            if program_name == '---Double Degree---' or program_name == '':
                break

            print("current program = " + program_name + ", index = " + str(index))

            # The dropdown is looked up again before each selection, as in
            # the original flow.
            program_dropdown = Select(driver.find_element(By.NAME, 'r_course_yr'))
            program_dropdown.select_by_index(index)

            load_content_of_courses_button = driver.find_element(By.XPATH, "//*[@id='top']/div/section[2]/div/div/p[1]/table/tbody/tr/td[2]/input[1]")

            actions = webdriver.ActionChains(driver)
            actions.move_to_element(load_content_of_courses_button)
            actions.click()
            actions.perform()

            subjects_iframe = driver.find_element(By.TAG_NAME, 'iframe')
            driver.switch_to.frame(subjects_iframe)

            list_of_modules = driver.find_elements(By.TAG_NAME, 'table')
            for module_index, module_table in enumerate(list_of_modules):
                module_dict = _parse_single_degree_module(driver, module_table, module_index)

                module_dict['Faculty'] = 'NA'
                module_dict.setdefault('BDE', 'Yes')
                module_dict.setdefault('Grade Type', 'Letter Graded')

                records.append(module_dict)

            driver.switch_to.default_content()
    finally:
        # Always shut the browser down, even when scraping fails part-way.
        driver.quit()

    return pd.DataFrame(records, columns=columns)


In [19]:
# Scrape the semester-1 single-degree programmes. The loop inside prints each
# programme name and course code as it goes, so this cell's output is long.
sem1_single_degree_modules_df = scrape_single_degrees(NTU_CONTENT_OF_COURSES_LINK, ACAD_YEAR_2022_Sem1, options, CHROME_DRIVER)

  0%|          | 0/576 [00:00<?, ?it/s]

current program = Accountancy Year 1, index = 1
Course Code: GC0001
Course Code: HW0001
Course Code: HY0001
Course Code: LS5005
Course Code: SP0061
current program = Accountancy (GA) Year 1, index = 2
Course Code: AB0403
Course Code: AB1201
Course Code: AB1202
Course Code: AB1301
Course Code: AB1403
Course Code: AB1501
Course Code: AB1601
Course Code: AC1103
Course Code: AC1104
Course Code: BE1401
Course Code: CC0001
Course Code: CC0002
Course Code: CC0003
Course Code: CC0005
Course Code: CC0006
Course Code: CC0007
Course Code: HE5091
Course Code: ML0002
Course Code: ML0003
Course Code: ML0004
Course Code: SP0061
current program = Accountancy (GA) Year 2, index = 3
Course Code: AB0301
Course Code: AB0403
Course Code: AB0502
Course Code: AB0602
Course Code: AB0603
Course Code: AB2008
Course Code: AC2101
Course Code: AC2104
Course Code: AC2105
Course Code: AC2301
Course Code: AC2302
Course Code: AC2401
Course Code: BE1401
Course Code: BE1402
Course Code: CC0006
Course Code: CC0007
Course

In [21]:
# DataFrame.size is rows x columns (the total number of cells), not the row count.
sem1_single_degree_modules_df.size

87525

In [22]:
def _parse_double_degree_module(driver, module_table, module_index):
    """Parse one course ``<table>`` of a double-degree listing into a dict.

    Args:
        driver: WebDriver that is currently switched into the listing iframe.
        module_table: The ``<table>`` WebElement to parse.
        module_index: Zero-based position of ``module_table`` among the
            iframe's tables; used to build the absolute XPath of a cell.

    Returns:
        dict keyed by output column name. Only fields found in the table are
        set. ``'Prerequisites'``, when present, is a single string whose
        parts are joined with one space.
    """
    module_dict = {}
    rows = module_table.find_elements(By.TAG_NAME, 'tr')

    for row_index, row in enumerate(rows):
        cells = row.find_elements(By.TAG_NAME, 'td')
        if not cells:
            # Nothing to classify in a row without <td> cells; indexing
            # cells[0] would raise IndexError.
            continue

        # Every `.text` access is a WebDriver round trip, so read it once.
        label = cells[0].text

        if re.search(r'\b[A-Z]{2}\d{4}\b', label) and len(label) == 6:
            print("Course Code: " + label)
            module_dict['Course Code'] = label
            module_dict['Course Name'] = cells[1].text
            module_dict['Academic Units'] = cells[2].text
        elif label == 'Grade Type:':
            module_dict['Grade Type'] = cells[1].text
        elif label == 'Prerequisite:':
            if 'Prerequisites' in module_dict:
                module_dict['Prerequisites'] += " " + cells[1].text
            else:
                module_dict['Prerequisites'] = cells[1].text
        elif label == 'Mutually exclusive with:':
            module_dict['Mutually Exclusive'] = cells[1].text
        elif label in ('Not offered as Broadening and Deepening Elective',
                       'Not offered as Unrestricted Elective'):
            module_dict['BDE'] = 'No'
        elif label == '':
            # A row with an empty first cell is treated as a prerequisite
            # continuation when its second cell holds bold text coloured
            # #ff00ff.
            try:
                color = driver.find_element(By.XPATH, "/html/body/table[%d]/tbody/tr[%d]/td[2]/b/font" % (module_index + 1, row_index + 1)).value_of_css_property("color")
                if Color.from_string(color).hex == '#ff00ff':
                    if 'Prerequisites' in module_dict:
                        module_dict['Prerequisites'] += " " + cells[1].text
                    else:
                        module_dict['Prerequisites'] = cells[1].text
            except NoSuchElementException:
                # NOTE(review): label is '' here, so this stores an empty
                # string and would replace a description captured earlier;
                # kept as in the original, confirm this is intended.
                module_dict['Course Information'] = label
        else:
            module_dict['Course Information'] = label

    return module_dict


def scrape_double_degrees(link, acad_year, options, driver_location):
    """Scrape the course listings of all double-degree programmes.

    Opens ``link`` in Chrome, selects ``acad_year`` in the ``acadsem``
    dropdown and walks the ``r_course_yr`` dropdown. Entries before the
    ``'---Double Degree---'`` marker are ignored; entries after it are
    scraped until an empty entry is reached. For every scraped programme
    each ``<table>`` inside the page's iframe becomes one output row.

    Args:
        link: URL of the course-content page.
        acad_year: Value to select in the ``acadsem`` dropdown.
        options: Chrome ``Options`` passed to the driver.
        driver_location: Path to the ChromeDriver executable.

    Returns:
        pandas.DataFrame with the columns 'Course Code', 'Course Name',
        'Academic Units', 'Faculty', 'BDE', 'Grade Type', 'Prerequisites',
        'Mutually Exclusive' and 'Course Information'. 'Faculty' is always
        'NA'; 'BDE' defaults to 'Yes' and 'Grade Type' to 'Letter Graded'.
    """
    columns = ['Course Code', 'Course Name', 'Academic Units', 'Faculty', 'BDE', 'Grade Type', 'Prerequisites', 'Mutually Exclusive', 'Course Information']
    # Rows are collected here and turned into a DataFrame once at the end;
    # DataFrame.append is deprecated and copies the frame on every call.
    records = []

    service = Service(driver_location)
    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.get(link)

        acad_year_dropdown = Select(driver.find_element(By.NAME, 'acadsem'))
        acad_year_dropdown.select_by_value(acad_year)

        program_dropdown = Select(driver.find_element(By.NAME, 'r_course_yr'))
        list_of_programs = program_dropdown.options

        double_degree_flag = False

        for index in trange(1, len(list_of_programs)):
            program_name = list_of_programs[index].text

            if program_name == '---Double Degree---':
                double_degree_flag = True
                continue

            if not double_degree_flag:
                # Still in the part of the dropdown before the marker.
                continue

            if program_name == '':
                # First empty entry after the marker ends the section.
                break

            print("current program = " + program_name + ", index = " + str(index))

            program_dropdown = Select(driver.find_element(By.NAME, 'r_course_yr'))
            program_dropdown.select_by_index(index)

            load_content_of_courses_button = driver.find_element(By.XPATH, "//*[@id='top']/div/section[2]/div/div/p[1]/table/tbody/tr/td[2]/input[1]")

            actions = webdriver.ActionChains(driver)
            actions.move_to_element(load_content_of_courses_button)
            actions.click()
            actions.perform()

            subjects_iframe = driver.find_element(By.TAG_NAME, 'iframe')
            driver.switch_to.frame(subjects_iframe)

            list_of_modules = driver.find_elements(By.TAG_NAME, 'table')
            for module_index, module_table in enumerate(list_of_modules):
                module_dict = _parse_double_degree_module(driver, module_table, module_index)

                module_dict['Faculty'] = 'NA'
                module_dict.setdefault('BDE', 'Yes')
                module_dict.setdefault('Grade Type', 'Letter Graded')

                records.append(module_dict)

            driver.switch_to.default_content()
    finally:
        # Always shut the browser down, even when scraping fails part-way.
        driver.quit()

    return pd.DataFrame(records, columns=columns)


In [25]:
def _finalise_other_module(module_dict):
    """Fill the default 'BDE', 'Grade Type' and 'Course Information' values.

    Mutates and returns ``module_dict``; keys that are already present are
    left untouched.
    """
    module_dict.setdefault('BDE', 'Yes')
    module_dict.setdefault('Grade Type', 'Letter Graded')
    module_dict.setdefault('Course Information', '')
    return module_dict


def scrape_rest_of_the_modules(link, acad_year, options, driver_location):
    """Scrape minors, general education, C N Yang and elective listings.

    Opens ``link`` in Chrome, selects ``acad_year`` in the ``acadsem``
    dropdown and walks the ``r_course_yr`` dropdown. An entry is scraped when
    its text contains 'Minor in', 'General Education in' or
    'C N Yang Scholars Programme', or when it comes after the
    '---All Broadening and Deepening/Unrestricted Electives---' marker.

    Inside the iframe all ``<tr>`` rows except the first are read in order.
    A row whose first cell is a six-character course code starts a new
    course; the rows that follow add details to that course until the next
    course-code row.

    Args:
        link: URL of the course-content page.
        acad_year: Value to select in the ``acadsem`` dropdown.
        options: Chrome ``Options`` passed to the driver.
        driver_location: Path to the ChromeDriver executable.

    Returns:
        pandas.DataFrame with the columns 'Course Code', 'Course Name',
        'Academic Units', 'Faculty', 'BDE', 'Grade Type', 'Prerequisites',
        'Mutually Exclusive' and 'Course Information'. 'BDE' defaults to
        'Yes', 'Grade Type' to 'Letter Graded' and 'Course Information' to ''.
    """
    columns = ['Course Code', 'Course Name', 'Academic Units', 'Faculty', 'BDE', 'Grade Type', 'Prerequisites', 'Mutually Exclusive', 'Course Information']
    # Rows are collected here and turned into a DataFrame once at the end;
    # DataFrame.append is deprecated and copies the frame on every call.
    records = []

    service = Service(driver_location)
    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.get(link)

        acad_year_dropdown = Select(driver.find_element(By.NAME, 'acadsem'))
        acad_year_dropdown.select_by_value(acad_year)

        program_dropdown = Select(driver.find_element(By.NAME, 'r_course_yr'))
        list_of_programs = program_dropdown.options

        bde_flag = False

        for index in trange(1, len(list_of_programs)):
            program_name = list_of_programs[index].text

            if '---All Broadening and Deepening/Unrestricted Electives---' in program_name:
                bde_flag = True
                continue

            wanted = (
                'Minor in' in program_name
                or 'General Education in' in program_name
                or 'C N Yang Scholars Programme' in program_name
                or bde_flag
            )
            if not wanted:
                continue

            print("current program = " + program_name + ", index = " + str(index))

            program_dropdown = Select(driver.find_element(By.NAME, 'r_course_yr'))
            program_dropdown.select_by_index(index)

            load_content_of_courses_button = driver.find_element(By.XPATH, "//*[@id='top']/div/section[2]/div/div/p[1]/table/tbody/tr/td[2]/input[1]")

            actions = webdriver.ActionChains(driver)
            actions.move_to_element(load_content_of_courses_button)
            actions.click()
            actions.perform()

            subjects_iframe = driver.find_element(By.TAG_NAME, 'iframe')
            driver.switch_to.frame(subjects_iframe)

            details = driver.find_elements(By.TAG_NAME, 'tr')
            module_dict = {}

            # Index 0 is skipped; details_index + 1 is the 1-based row
            # position used in the XPath lookups below.
            for details_index in range(1, len(details)):
                cells = details[details_index].find_elements(By.TAG_NAME, 'td')
                if not cells:
                    # Nothing to classify in a row without <td> cells;
                    # indexing cells[0] would raise IndexError.
                    continue

                # Every `.text` access is a WebDriver round trip, so read it once.
                label = cells[0].text

                match_for_course_code_pattern_1 = re.search(r'\b[A-Z]{2}\d{4}\b', label)
                match_for_course_code_pattern_2 = re.search(r'\b[A-Z]{3}\d{2}[A-Z]{1}\b', label)

                if (match_for_course_code_pattern_1 or match_for_course_code_pattern_2) and len(label) == 6:
                    # A new course starts: store the one collected so far.
                    if module_dict:
                        records.append(_finalise_other_module(module_dict))
                        module_dict = {}

                    print("Course Code: " + label)
                    module_dict['Course Code'] = label
                    module_dict['Course Name'] = cells[1].text
                    module_dict['Academic Units'] = cells[2].text
                    module_dict['Faculty'] = cells[3].text
                elif label == 'Grade Type:':
                    module_dict['Grade Type'] = cells[1].text
                elif label == 'Prerequisite:':
                    if 'Prerequisites' in module_dict:
                        module_dict['Prerequisites'] += " " + cells[1].text
                    else:
                        module_dict['Prerequisites'] = cells[1].text
                elif label == 'Mutually exclusive with:':
                    module_dict['Mutually Exclusive'] = cells[1].text
                elif label in ('Not offered as Broadening and Deepening Elective',
                               'Not offered as Unrestricted Elective'):
                    module_dict['BDE'] = 'No'
                elif label == '':
                    # A row with an empty first cell is treated as a
                    # prerequisite continuation when its second cell holds
                    # bold text coloured #ff00ff.
                    try:
                        color = driver.find_element(By.XPATH, "/html/body/center/table/tbody/tr[%d]/td[2]/b/font" % (details_index + 1)).value_of_css_property("color")
                        if Color.from_string(color).hex == '#ff00ff':
                            if 'Prerequisites' in module_dict:
                                module_dict['Prerequisites'] += " " + cells[1].text
                            else:
                                module_dict['Prerequisites'] = cells[1].text
                    except NoSuchElementException:
                        # Deliberate: no <font> element means the row is not
                        # a prerequisite continuation, so it is ignored.
                        pass
                else:
                    # Any other text is stored as the course description,
                    # unless the row has a cell whose text is a lone
                    # non-breaking space.
                    try:
                        driver.find_element(By.XPATH, "/html/body/center/table/tbody/tr[%d]/td[text()='\u00A0']" % (details_index + 1))
                    except NoSuchElementException:
                        module_dict['Course Information'] = label

            # Store the last course of this programme.
            if module_dict:
                records.append(_finalise_other_module(module_dict))

            driver.switch_to.default_content()
    finally:
        # Always shut the browser down, even when scraping fails part-way.
        driver.quit()

    return pd.DataFrame(records, columns=columns)

In [43]:
# scrape_single_degrees takes the ChromeDriver path as its fourth positional
# argument; calling it without one raises TypeError.
sem1_single_degree_modules_df = scrape_single_degrees(NTU_CONTENT_OF_COURSES_LINK, ACAD_YEAR_2022_Sem1, options, CHROME_DRIVER)

SessionNotCreatedException: Message: session not created: This version of ChromeDriver only supports Chrome version 114
Current browser version is 117.0.5938.62 with binary path C:\Users\blood\Downloads\chrome-win64\chrome.exe
Stacktrace:
Backtrace:
	GetHandleVerifier [0x003DA813+48355]
	(No symbol) [0x0036C4B1]
	(No symbol) [0x00275358]
	(No symbol) [0x002961AC]
	(No symbol) [0x00291EF3]
	(No symbol) [0x00290579]
	(No symbol) [0x002C0C55]
	(No symbol) [0x002C093C]
	(No symbol) [0x002BA536]
	(No symbol) [0x002982DC]
	(No symbol) [0x002993DD]
	GetHandleVerifier [0x0063AABD+2539405]
	GetHandleVerifier [0x0067A78F+2800735]
	GetHandleVerifier [0x0067456C+2775612]
	GetHandleVerifier [0x004651E0+616112]
	(No symbol) [0x00375F8C]
	(No symbol) [0x00372328]
	(No symbol) [0x0037240B]
	(No symbol) [0x00364FF7]
	BaseThreadInitThunk [0x768EFCC9+25]
	RtlGetAppContainerNamedObjectPath [0x77D07B1E+286]
	RtlGetAppContainerNamedObjectPath [0x77D07AEE+238]


In [23]:
# Scrape the semester-1 double-degree programmes (the dropdown entries after
# the '---Double Degree---' marker).
sem1_double_degree_modules_df = scrape_double_degrees(NTU_CONTENT_OF_COURSES_LINK, ACAD_YEAR_2022_Sem1, options, CHROME_DRIVER)

  0%|          | 0/576 [00:00<?, ?it/s]

current program = Accountancy And Business (ACS) Year 2, index = 354
Course Code: BA2202
Course Code: BA2203
current program = Accountancy And Business (ACS) Year 4, index = 355
Course Code: BA3201
Course Code: BA3202
current program = Accountancy And Business (AWM) Year 2, index = 356
Course Code: BF2100
Course Code: BF2201
Course Code: BF2206
Course Code: BF2207
Course Code: BF2209
Course Code: BF2213
Course Code: BF3201
Course Code: BF3204
Course Code: BF3207
current program = Accountancy And Business (AWM) Year 3, index = 357
Course Code: BF2206
Course Code: BF2207
Course Code: BF2209
Course Code: BF2213
Course Code: BF3201
Course Code: BF3204
Course Code: BF3207
current program = Accountancy And Business (AWM) Year 4, index = 358
Course Code: BF2207
Course Code: BF2209
Course Code: BF2213
Course Code: BF3201
Course Code: BF3204
Course Code: BF3207
current program = Accountancy And Business (BA) Year 2, index = 359
Course Code: BC2402
Course Code: BC2406
Course Code: BC3405
Course 

In [26]:
# Scrape the remaining semester-1 listings: minors, general education,
# C N Yang Scholars Programme and the BDE/unrestricted elective entries.
sem_1_rest_of_the_modules_df = scrape_rest_of_the_modules(NTU_CONTENT_OF_COURSES_LINK, ACAD_YEAR_2022_Sem1, options, CHROME_DRIVER)

  0%|          | 0/576 [00:00<?, ?it/s]

current program = Minor in Art History, index = 470
Course Code: DA2004
Course Code: DD1003
Course Code: DD3016
Course Code: DF2002
Course Code: DF2005
Course Code: DF2009
Course Code: DP2002
Course Code: DT2007
Course Code: HL2009
Course Code: HL2043
Course Code: HL3001
Course Code: HL3042
Course Code: HL3043
Course Code: HL4014
Course Code: HL4024
Course Code: HH3001
Course Code: HH4005
Course Code: HR1001
Course Code: HR2001
Course Code: HR2005
Course Code: HR2006
Course Code: HR2009
Course Code: HR3002
Course Code: HR3003
Course Code: HR3006
current program = Minor in Applied Physics, index = 471
Course Code: PH1104
Course Code: PH1105
Course Code: PH1107
Course Code: PH2103
Course Code: PH3102
Course Code: PH3601
Course Code: PH3602
current program = Minor in Chinese Creative Writing, index = 472
Course Code: HF5101
Course Code: HF5301
Course Code: HF5501
Course Code: HF5801
current program = Minor in Computing And Data Analysis, index = 473
Course Code: CE1107
Course Code: CE2002

In [24]:
# DataFrame.size is rows x columns (the total number of cells), not the row count.
sem1_double_degree_modules_df.size

25839

In [27]:
# DataFrame.size is rows x columns (the total number of cells), not the row count.
sem_1_rest_of_the_modules_df.size

13185

In [30]:
# Write the three semester-1 frames to <cwd>/scraped-data/.
# to_csv does not create missing directories, so make sure the folder exists
# first; otherwise a fresh checkout fails here after the long scrape.
scraped_data_dir = os.path.join(os.getcwd(), "scraped-data")
os.makedirs(scraped_data_dir, exist_ok=True)

sem1_single_degree_modules_df.to_csv(os.path.join(scraped_data_dir, "sem_1_single_degree_modules.csv"))
sem1_double_degree_modules_df.to_csv(os.path.join(scraped_data_dir, "sem_1_double_degree_modules.csv"))
sem_1_rest_of_the_modules_df.to_csv(os.path.join(scraped_data_dir, "sem_1_rest_of_the_modules.csv"))

In [None]:
# Semester-2 single-degree scrape. The ChromeDriver path is a required fourth
# argument; calling without it raises TypeError.
sem2_single_degree_modules_df = scrape_single_degrees(NTU_CONTENT_OF_COURSES_LINK, ACAD_YEAR_2022_Sem2, options, CHROME_DRIVER)

In [None]:
# Semester-2 double-degree scrape. The ChromeDriver path is a required fourth
# argument; calling without it raises TypeError.
sem2_double_degree_modules_df = scrape_double_degrees(NTU_CONTENT_OF_COURSES_LINK, ACAD_YEAR_2022_Sem2, options, CHROME_DRIVER)

In [None]:
# Semester-2 scrape of the remaining listings. The ChromeDriver path is a
# required fourth argument; calling without it raises TypeError.
sem_2_rest_of_the_modules_df = scrape_rest_of_the_modules(NTU_CONTENT_OF_COURSES_LINK, ACAD_YEAR_2022_Sem2, options, CHROME_DRIVER)

In [10]:
# Write the three semester-2 frames as CSV files into the working directory.
# NOTE(review): the semester-1 export cell writes under "scraped-data/" while
# these go straight into the working directory; confirm which location the
# downstream code expects.
sem2_single_degree_modules_df.to_csv(os.path.join(os.getcwd(), "sem_2_single_degree_modules.csv"))
sem2_double_degree_modules_df.to_csv(os.path.join(os.getcwd(), "sem_2_double_degree_modules.csv"))
sem_2_rest_of_the_modules_df.to_csv(os.path.join(os.getcwd(), "sem_2_rest_of_the_modules.csv"))