In [16]:
"""
Use the BeautifulSoup library to take a directory of HTML files as input and produce a single file that lists all of the course entries (currently written as JSON; course_d_to_text gives a plain-text form).

Initial use case: enable fun final projects for INST 126.
"""

from bs4 import BeautifulSoup
import os
import pandas as pd
import urllib.request, urllib.parse, urllib.error

def html_to_course_list(html):
    """
    Parse the markup of a course-listing page into a list of course entries.

    Params: html: markup of the page, as str or bytes (passed straight to BeautifulSoup)
    Returns: courses: list of dicts, one per course that has a description, with keys
        'code', 'title', 'prerequisite', 'description' and 'credits' (in that order).
        'prerequisite' is the string "None" when no prerequisite is listed.
        'title' and 'credits' are "" when the corresponding element is absent.
        Courses without a description block are skipped.
    Raises: KeyError if an included course element has no 'id' attribute.
    """

    def _text_of(parent, css_class):
        """Return the text of the first descendant with class css_class, or "" if there is none."""
        node = parent.find(attrs={"class": css_class})
        # find() returns None when nothing matches; calling .text on that would raise
        return node.text if node is not None else ""

    # initialize beautifulsoup object for course html page
    soup = BeautifulSoup(html, 'html.parser')

    courses = []
    # for every course (we know it's a course entry if it has class = "course")
    for item in soup.find_all(attrs={"class": "course"}):
        approved_course_text = item.find_all(attrs={"class": "approved-course-text"})
        # only process if there is a course description:
        # block 0 holds the requirement lines, block 1 holds the description
        if len(approved_course_text) < 2:
            continue

        # if several requirement lines mention "Prerequisite", the last one wins
        prerequisite = "None"
        for requirement in approved_course_text[0].find_all('div'):
            if "Prerequisite" in requirement.text:
                prerequisite = requirement.text.replace("Prerequisite: ", "")

        # key order is kept stable because it determines the column order downstream
        courses.append({
            'code': item['id'],
            'title': _text_of(item, "course-title"),
            'prerequisite': prerequisite,
            'description': approved_course_text[1].text,
            'credits': _text_of(item, "course-min-credits"),
        })
    return courses

def course_d_to_text(course):
    """
    Render one course entry as a single " || "-delimited line of text.

    Params: course: dictionary with the keys 'code', 'title', 'description',
        'prerequisite' and 'credits'
    Returns: course_str: the five fields, in that order, joined by " || " and
        terminated with a newline
    """
    field_order = ('code', 'title', 'description', 'prerequisite', 'credits')
    fields = [str(course[key]) for key in field_order]
    return " || ".join(fields) + "\n"

In [18]:
# Fetch one live course-listing page and parse it, as a quick check of the parser.
url = "https://app.testudo.umd.edu/soc/202001/INST"
# the context manager closes the HTTP response once the body has been read
with urllib.request.urlopen(url) as fhand:
    l = html_to_course_list(fhand.read())
l

[{'code': 'INST126',
  'title': 'Introduction to Programming for Information Science',
  'prerequisite': 'Minimum grade of C- in MATH115; or must have math eligibility of MATH140 or higher; or permission of instructor. ',
  'description': 'An introduction to computer programming for students with very limited or no previous programming experience. Topics include fundamental programming concepts such as variables, data types, assignments, arrays, conditionals, loops, functions, and I/O operations.',
  'credits': '3'},
 {'code': 'INST155',
  'title': 'Social Networking',
  'prerequisite': 'None',
  'description': 'Introduces methods for analyzing and understanding how people use social media - social networking websites, blogging and microblogging, and other forms of online interaction and content generation - and their societal implications. Introduces students to the science and social science of network analysis. Through real world examples, including analysis of their own social netw

In [19]:
# get input directory path
INPUT_DIR = "data/fall2020-courses"

# list to hold all course info
courses_all = []

# iterate over files in directory
# assume each file is an html page that lists all courses for a given area
print("processing html pages in %s" %INPUT_DIR)
# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and therefore the row order of df) the same on every run
for FILENAME in sorted(os.listdir(INPUT_DIR)):
    if FILENAME.endswith(".html"):
        print("Processing courses for %s" %FILENAME.replace(".html", ""))
        # get filepath
        FILEPATH = os.path.join(INPUT_DIR, FILENAME)
        # read the page; the with-block closes the file handle right away
        with open(FILEPATH) as html_file:
            html = html_file.read()
        # parse html to course list
        this_course_list = html_to_course_list(html)
        # add to master course list
        courses_all += this_course_list

# (optional) a plain-text dump can be produced by writing
# course_d_to_text(course) for each course in courses_all

# one row per course entry, one column per key of the course dicts
df = pd.DataFrame(courses_all)
df.head()

processing html pages in data/fall2020-courses
Processing courses for AMST
Processing courses for BMGT
Processing courses for BUMK
Processing courses for CMSC
Processing courses for COMM
Processing courses for DATA
Processing courses for ECON
Processing courses for ENSP
Processing courses for ENTS
Processing courses for INFM
Processing courses for INST
Processing courses for MATH
Processing courses for PHSC
Processing courses for PLCY
Processing courses for PSYC
Processing courses for SPHL
Processing courses for STAT
Processing courses for URSP


Unnamed: 0,code,credits,description,prerequisite,title
0,AMST101,3,Introduces students to the interdisciplinary f...,,Introduction American Studies
1,AMST298C,3,The aggregate experience of Asian Pacific Amer...,,Introduction to Asian American Studies
2,AMST340,3,Introduction to the process of interdisciplina...,Must have completed AMST201; and 2 courses in ...,"Introduction to History, Theories and Methods ..."
3,AMST418N,3,,,Asian American Public Policy
4,AMST450,3,Developments in theories and methods of Americ...,AMST201 and AMST340; and 1 course in AMST.,Seminar in American Studies


In [5]:
# Collapse the course table into a dict keyed by course code, ready for JSON output.
cols = ['credits', 'description', 'prerequisite', 'title']
jsonlike = {}
for code, codedata in df.groupby("code"):
    # a code can span several rows; keep the values from its first row
    jsonlike[code] = {col: codedata[col].values[0] for col in cols}

In [7]:
import json

In [8]:
# Write the course dict as pretty-printed JSON; the with-block guarantees the
# file is flushed and closed before the cell finishes.
with open("data/testudo-courses-large.json", "w") as json_out:
    n_chars_written = json_out.write(json.dumps(jsonlike, indent=2))
# show the number of characters written as the cell's output
n_chars_written

216245