# Programming for the Web (teaser trailer demo)

In [1]:
# libraries for getting/retreieving data from the Web
import urllib.request, urllib.parse, urllib.error
# library for parsing the data we get from the Web
import bs4
import json
import pandas as pd

## Using urllib to *CONNECT* to data from another computer

In [46]:
# url we want (it's like a file path!)
# course_list_url = "https://app.testudo.umd.edu/soc/202001/INST"
# course_list_url = "https://app.testudo.umd.edu/soc/search?courseId=INST&sectionId=&termId=202208&_openSectionsOnly=on&creditCompare=%3E%3D&credits=0.0&courseLevelFilter=ALL&instructor=&_facetoface=on&_blended=on&_online=on&courseStartCompare=&courseStartHour=&courseStartMin=&courseStartAM=&courseEndHour=&courseEndMin=&courseEndAM=&teachingCenter=ALL&_classDay1=on&_classDay2=on&_classDay3=on&_classDay4=on&_classDay5=on"
# make a request to the server computer, and store the response data packet in a variable
response = urllib.request.urlopen(course_list_url)

In [47]:
# response object behaves a bit like a file handler (remember?)
course_list_html = response.read()
course_list_html

b'\n\n\n\n\n<!doctype html>\n<!--[if lt IE 7]>\t<html lang="en-us" class="no-js ie6">\t<![endif]--> \n<!--[if IE 7]>\t\t<html lang="en-us" class="no-js ie7">\t<![endif]--> \n<!--[if IE 8]>\t\t<html lang="en-us" class="no-js ie8">\t<![endif]--> \n<!--[if gte IE 9]>\t<html lang="en-us" class="no-js ie9">\t<![endif]-->\n<!--[if !IE]> -->\t<html lang="en-us" class="no-js">\t<!-- <![endif]-->\n\t<head>\n\t\t\n\t\t<meta http-equiv="X-UA-Compatible" content="IE=edge" />\n\t\t<meta charset="utf-8" />\n\t\t\n\t\t<title>\n\t\t\tSchedule of Classes\n\t\t</title>\n\t\t\n\t\t<meta name="viewport" content="width=device-width, initial-scale=1.0" />\n\n\t\t<link rel="icon" type="image/x-icon" href="/soc/resources/images/favicon.ico">\n\t\t\n\t\t<link rel="stylesheet" href="/soc/resources/css/lib/jquery-ui-1.8.23-umd/jquery-ui-1.8.23-umd.css" type="text/css" media="screen" />\n\t\t<link rel="stylesheet" href="/soc/resources/css/umd.css" type="text/css" media="screen" />\n\t\t\n\t\t\n\t\t\t\n\t\t\t\n\t\

## Using beautifulsoup to *PARSE* an HTML document we get from another computer

In [43]:
def html_to_course_list(html):
    """
    Get html page and parse out course info and dump into list of course entries
    Params:
    - html (str) - a str representation of an html page that lists courses from testudo
    Returns: courses: list of course entries, which can then be printed
    """
    
    print("Initializing html parser")
    # init parser
    soup = bs4.BeautifulSoup(html, 'html.parser')

    # to hold the course data
    courses = []

    print("Parsing all the courses...")
    # grab all the divs that have class 'course' (these are all the courses)
    # and iterate through them
    course_packets = soup.find_all(class_="course")
    print(course_packets)
    for course in course_packets:
        # code is the id of this div
        code = course.get('id')
        # title is in text of div that has class 'course-title'
        title = course.find(class_="course-title").text

        # grab the description and prereqs
        # all are of class `approved-course-text` with only in-text (not html) distinctions of type :/
        # so we gotta iterate through all the course texts and check for the prereq
        description = ""
        prereq = "None"
        # get all the divs that have class `approved-course-text` 
        # and iterate through them
        for descr in course.find_all(class_="approved-course-text"):
            # get the text of the div
            d_text = descr.text
            # it's prereq if it has `Prerequisite:` in it
            if "Prerequisite:" in d_text:
                # if it is, then set the value of prereq to this text
                prereq = d_text
            else: 
                # otherwise, add to the general course description text bundle
                description += d_text
        # credits in text of div that has class `course-min-credits`
        credits = course.find(class_="course-min-credits").text

        # ok now stitch all the data together into a dict to add 
        # as an entry for this course to our course data list
        courses.append({
            'Code': code,
            'Title': title,
            'Description': description,
            'Prereqs': prereq,
            'Credits': credits
        })
    return courses

In [44]:
# parse the html
course_list = html_to_course_list(course_list_html)
# display it
course_list[:5]

Initializing html parser
Parsing all the courses...
[<div class="course" id="INST101">
<input name="courseId" type="hidden" value="INST101"/>
<div class="row">
<div class="course-id-container one columns">
<div class="course-id">INST101</div>
</div>
<div class="course-info-container eleven columns">
<div class="course-basic-info-container sixteen colgrid">
<div class="row">
<div class="thirteen columns">
<span class="course-title">Bits and Bytes of Computer and Information Sciences</span>
</div>
<div class="two columns">
<fieldset class="syllabus-fieldset">
<legend>
<a class="toggle-syllabus-link" href="">
<span class="toggle-sections-arrow ui-icon ui-icon-triangle-1-e"></span>
<span class="toggle-sections-link-text">Syllabus Repository  </span>
<span>(0)</span>
<input name="courseId" type="hidden" value="INST101"/>
</a>
</legend>
<div id="INST101-syllabus-container"></div>
</fieldset>
</div>
<!-- @@@@@@@@@ -->
<div class="course-action-links-container one columns">
<a class="saved-cou

[{'Code': 'INST101',
  'Title': 'Bits and Bytes of Computer and Information Sciences',
  'Description': 'Restriction: For first time freshmen and first time transfer students. Cross-listed with: CMSC100. Credit only granted for: CMSC100 or INST101.Students are introduced to the fields (and disciplines) of computer science and information science within a small classroom setting. They will learn to make a successful transition from high school to the university, while exploring study skills, student success plans and research opportunities.',
  'Prereqs': 'None',
  'Credits': '1'},
 {'Code': 'INST104',
  'Title': 'Design Across Campus',
  'Description': 'What is design, who does it, and how is it done? There is no one answer to this question--it depends on who you ask. The answers to these questions vary across disciplines and across the University campus. This course, designed with modules from contributors in UMD programs including Information Studies, Human-Computer Interaction, Grap

We can then turn this into a dataframe for analysis! :) This is actually how I made the data file for our exercises this semester.

In [45]:
# turn into pandas df and show
course_list_df = pd.DataFrame(course_list)
course_list_df.head()

Unnamed: 0,Code,Title,Description,Prereqs,Credits
0,INST101,Bits and Bytes of Computer and Information Sci...,Restriction: For first time freshmen and first...,,1
1,INST104,Design Across Campus,"What is design, who does it, and how is it don...",,3
2,INST123,Databases for All,Restriction: Must not have completed or be cur...,,3
3,INST126,Introduction to Programming for Information Sc...,An introduction to computer programming for st...,Prerequisite: Math placement of STAT100 or hig...,3
4,INST151,Becoming A Social Media Influencer,Credit only granted for: INST408N or INST151. ...,,3


## Using urllib and the json library to parse json data from another computer

In [26]:
response2 = urllib.request.urlopen('http://joelchan.me/contacts.json') # this is my server computer!

In [27]:
rawdata = response2.read()
parsedcontacts = json.loads(rawdata)
parsedcontacts

[{'name': 'Chuck',
  'phone': {'type': 'intl', 'number': '+1 734 303 4456'},
  'email': {'email': 'ck@umd.edu', 'hide': 'yes'}},
 {'name': 'Joel',
  'phone': {'type': 'local', 'number': '479 647 9905'},
  'email': {'email': 'jc@umd.edu', 'hide': 'no'}},
 {'name': 'Zara',
  'phone': {'type': 'local', 'number': '678 321 4456'},
  'email': {'email': 'zh@umd.edu', 'hide': 'yes'}}]

In [28]:
first_item = parsedcontacts[0]

In [29]:
first_item

{'name': 'Chuck',
 'phone': {'type': 'intl', 'number': '+1 734 303 4456'},
 'email': {'email': 'ck@umd.edu', 'hide': 'yes'}}

In [30]:
first_item.get("name")

'Chuck'

In [31]:
first_item["name"]

'Chuck'

In [32]:
contacts = pd.read_json(rawdata)

In [33]:
contacts

Unnamed: 0,name,phone,email
0,Chuck,"{'type': 'intl', 'number': '+1 734 303 4456'}","{'email': 'ck@umd.edu', 'hide': 'yes'}"
1,Joel,"{'type': 'local', 'number': '479 647 9905'}","{'email': 'jc@umd.edu', 'hide': 'no'}"
2,Zara,"{'type': 'local', 'number': '678 321 4456'}","{'email': 'zh@umd.edu', 'hide': 'yes'}"
