In [21]:
import os
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from tqdm import tqdm

## UCI

In [81]:
# Root URL of the UCI catalogue; listing and program paths are appended to it.
uci_base = "https://catalogue.uci.edu"

In [82]:
# Download the undergraduate degree listing page and parse its HTML.
uci_listing_url = uci_base + '/undergraduatedegrees/'
uci_response = requests.get(uci_listing_url)
uci_all = uci_response.text
uci_soup = BeautifulSoup(uci_all, 'html.parser')

In [112]:
# Take the first element that follows the first "tglhead" heading and
# collect every list item inside it; each item is one program entry.
uci_toggle_heading = uci_soup.select_one('h2[class="tglhead"]')
uci_program_container = uci_toggle_heading.find_next_siblings()[0]
uci_programs = uci_program_container.select('li')

In [120]:
# For every listing entry whose trailing (degree) part contains "B", fetch the
# program page and record the program name together with the comma-separated
# texts of the cells that follow a "codecol" cell in its tables.
uci_names = []
uci_reqs = []
for p in tqdm(uci_programs):
    # Entries presumably look like "<program name>, <degree>" -- split on the
    # LAST comma only so commas inside the program name are preserved.
    p_text = p.text.strip()
    p_name, p_sep, p_degree = p_text.rpartition(',')
    # With no comma, rpartition puts the whole text in p_degree, which matches
    # the previous split(',')[-1] filter exactly.
    if 'B' not in p_degree:
        continue
    # An entry without a link has no program page to scrape.
    if p.a is None or not p.a.get('href'):
        continue
    p_url = uci_base + p.a['href'] + "#requirementstext"
    p_reqs = requests.get(p_url).text
    p_soup = BeautifulSoup(p_reqs, 'html.parser')
    p_reqs_ls = [
        cell.text.strip()
        for cell in p_soup.select('td[class="codecol"] + td:not([class]):not([colspan])')
    ]
    # Append both values only after the page was fetched, so the two lists
    # stay aligned even if the loop is interrupted. An entry without a comma
    # keeps its full text as the name instead of an empty string.
    uci_names.append(p_name.strip() if p_sep else p_text)
    uci_reqs.append(', '.join(p_reqs_ls))

100%|██████████| 166/166 [00:27<00:00,  6.08it/s]


In [124]:
# Assemble the scraped UCI lists into a table with one row per program.
uci_columns = {
    'program': uci_names,
    'requirements': uci_reqs,
}
uci_df = pd.DataFrame(uci_columns)

In [125]:
# Preview the first rows of the UCI table as a sanity check.
uci_df.head()

Unnamed: 0,program,requirements
0,Aerospace Engineering,"General Chemistry, Accelerated General Chemist..."
1,African American Studies,"African American Studies I, African American S..."
2,Anthropology,"Introduction to Sociocultural Anthropology, In..."
3,Applied Physics,"Single-Variable Calculus, Single-Variable Calc..."
4,Art,"Art in Context: History, Theory, and Practice,..."


## Princeton

In [170]:
# Root URL of the Princeton site; listing and program paths are appended to it.
pu_base = "https://ua.princeton.edu"

In [171]:
# Download and parse the two concentration listings (A.B. first, then B.S.E.).
pu_ba_url = pu_base + '/academic-units/departmental-concentrations-for-the-degree-of-bachelor-of-arts'
pu_ba = requests.get(pu_ba_url).text
pu_ba_soup = BeautifulSoup(pu_ba, 'html.parser')

pu_bse_url = pu_base + '/academic-units/departmental-concentrations-for-the-degree-of-bachelor-of-science-in-engineering'
pu_bse = requests.get(pu_bse_url).text
pu_bse_soup = BeautifulSoup(pu_bse, 'html.parser')

In [172]:
# Both listing pages are queried with the same selector: anchors that are
# direct children of a ".field-content" element.
pu_program_selector = ".field-content > a"
pu_ba_programs = pu_ba_soup.select(pu_program_selector)
pu_bse_programs = pu_bse_soup.select(pu_program_selector)

In [227]:
# Visit every A.B. and B.S.E. program page and gather, per program, its name,
# its course titles and its course descriptions (each joined with spaces).
pu_names, pu_courses, pu_desc = [], [], []
for link in tqdm(pu_ba_programs + pu_bse_programs):
    pu_names.append(link.text.strip())

    page_html = requests.get(pu_base + link['href']).text
    page = BeautifulSoup(page_html, 'html.parser')

    titles = []
    for heading in page.select('h3[class="course-title"]'):
        # Drop the first two space-separated tokens of the heading
        # (presumably the course code -- TODO confirm), then remove newlines.
        words = heading.text.strip().split(' ')[2:]
        titles.append(re.sub('\n', '', ' '.join(words)))
    pu_courses.append(' '.join(titles))

    descriptions = [node.text.strip() for node in page.select('.course-desc')]
    pu_desc.append(' '.join(descriptions))

100%|██████████| 37/37 [00:19<00:00,  1.95it/s]


In [228]:
# Assemble the Princeton lists into a table; the scalar institution label is
# broadcast to every row by pandas.
pu_columns = {
    'institution': 'Princeton',
    'program': pu_names,
    'courses': pu_courses,
    'descriptions': pu_desc,
}
pu_df = pd.DataFrame(pu_columns)

In [229]:
# Preview the first rows of the Princeton table as a sanity check.
pu_df.head()

Unnamed: 0,institution,program,courses,descriptions
0,Princeton,African American Studies,African American Studies and the Philosophy of...,This course introduces students to the field o...
1,Princeton,Anthropology,Introduction to Anthropology CDEC Human Evolu...,An introduction to anthropology and key topics...
2,Princeton,Architecture,An Introduction to the History of Architecture...,A broad overview of the discipline of architec...
3,Princeton,Art and Archaeology,An Introduction to the History of Art: Meaning...,A team-taught introduction to the history of a...
4,Princeton,Astrophysical Sciences,The Universe Spring QRSN Topics in Modern Astr...,This specially designed course targets the fro...


In [230]:
# Persist the Princeton table. to_csv does not create missing parent
# directories, so make sure the output folder exists first (no-op if it does).
os.makedirs('curriculum', exist_ok=True)
pu_df.to_csv('curriculum/princeton_programs.csv', index=False)

## Texas Tech

In [7]:
# bypass SSLError
# Extend urllib3's default OpenSSL cipher string with extra rules before any
# request to the Texas Tech site is made.
_ssl_util = requests.packages.urllib3.util.ssl_
# The leading ':' separates the new rules from the last existing one; without
# it "HIGH" is fused onto the preceding rule name.
_extra_ciphers = ':HIGH:!DH:!aNULL'
# DEFAULT_CIPHERS no longer exists in urllib3 2.x, so read it defensively
# instead of raising AttributeError there.
_default_ciphers = getattr(_ssl_util, 'DEFAULT_CIPHERS', None)
# The endswith() guard keeps the cell idempotent when it is re-run.
if _default_ciphers is not None and not _default_ciphers.endswith(_extra_ciphers):
    _ssl_util.DEFAULT_CIPHERS = _default_ciphers + _extra_ciphers

In [2]:
# URL of the Texas Tech undergraduate majors listing page.
ttu_base = "https://www.ttu.edu/majors-and-colleges/undergraduate-majors/"

In [8]:
# Download the majors listing page and parse its HTML.
ttu_response = requests.get(ttu_base)
ttu_all = ttu_response.text
ttu_soup = BeautifulSoup(ttu_all, 'html.parser')

In [99]:
# Program links: anchors inside list items of a list directly under ".columns-2".
ttu_program_selector = '.columns-2 > ul > li > a'
ttu_programs = ttu_soup.select(ttu_program_selector)

In [110]:
# Drive a Chrome session through every Texas Tech program page. For each
# program, expand every course entry and collect the course titles and the
# last line of each expanded description, joined with spaces.
driver = webdriver.Chrome()
ttu_names = []
ttu_courses = []
ttu_desc = []
try:
    # The implicit wait is a session-wide setting applied to later element
    # lookups, so it is configured once instead of once per course.
    driver.implicitly_wait(3)
    for p in tqdm(ttu_programs):
        driver.get(p['href'])
        time.sleep(1)
        p_courses = driver.find_elements(By.CSS_SELECTOR, '.acalog-course > span')
        course_name_ls = []
        course_desc_ls = []
        for course in p_courses:
            a = course.find_element(By.TAG_NAME, 'a')
            # Keep the text after the first " - " of the link text; fall back
            # to the whole text when the separator is missing instead of
            # crashing on a None match.
            name_match = re.search('(?<= - ).*', a.text)
            course_name = name_match.group(0) if name_match else a.text
            course_name_ls.append(course_name)
            # Clicking the link expands the course details; the lookup below
            # relies on the implicit wait until the element is present.
            a.click()
            course_desc = course.find_element(By.XPATH, 'table/tbody/tr/td/div[2]').text.split('\n')[-1]
            course_desc_ls.append(course_desc)
            time.sleep(0.5)
        # Append all three values together, after the program was fully
        # scraped, so the result lists stay aligned if the run is interrupted.
        ttu_names.append(p.text)
        ttu_courses.append(' '.join(course_name_ls))
        ttu_desc.append(' '.join(course_desc_ls))
        time.sleep(1)
finally:
    # quit() ends the whole WebDriver session (close() only closes the current
    # window); the finally clause releases the browser on errors as well.
    driver.quit()

100%|██████████| 188/188 [1:35:03<00:00, 30.34s/it]


In [111]:
# Assemble the Texas Tech lists into a table; the scalar institution label is
# broadcast to every row by pandas.
ttu_columns = {
    'institution': 'Texas Tech',
    'program': ttu_names,
    'courses': ttu_courses,
    'descriptions': ttu_desc,
}
ttu_df = pd.DataFrame(ttu_columns)

In [112]:
# Preview the first rows of the Texas Tech table as a sanity check.
ttu_df.head()

Unnamed: 0,institution,program,courses,descriptions
0,Texas Tech,"Accounting, B.B.A.",Foundations of Business Introductory Mathemati...,Provides students with a basic understanding o...
1,Texas Tech,"Advertising, B.A.",Foundations of Media and Communication America...,TCCNS: [COMM1307] A broad survey of media hist...
2,Texas Tech,"Agribusiness, B.S.",Essentials of College Rhetoric Introductory Ma...,TCCNS: [ENGL1301] Prerequisite: Successful com...
3,Texas Tech,"Agricultural and Applied Economics, B.S.",Essentials of College Rhetoric Introductory Ma...,TCCNS: [ENGL1301] Prerequisite: Successful com...
4,Texas Tech,"Agricultural Communications, B.S.",Essentials of College Rhetoric College Algebra...,TCCNS: [ENGL1301] Prerequisite: Successful com...


In [None]:
# Persist the Texas Tech table. to_csv does not create missing parent
# directories, so make sure the output folder exists first (no-op if it does).
os.makedirs('curriculum', exist_ok=True)
ttu_df.to_csv('curriculum/texas_tech_programs.csv', index=False)