# Programming for the Web (teaser trailer demo)

In [None]:
# libraries for getting/retrieving data from the Web
import urllib.request, urllib.parse, urllib.error
# library for parsing the data we get from the Web
import bs4
import io
import json
import pandas as pd

## Using urllib to *CONNECT* to data from another computer

In [None]:
# URL of the page we want (it's like a file path, but pointing at another computer!)
course_list_url = "https://app.testudo.umd.edu/soc/202001/INST"
# make a request to the server computer, and store the response object in a variable;
# the page body itself is pulled out of it with .read() in the next cell
response = urllib.request.urlopen(course_list_url)

In [None]:
# the response object behaves a bit like a file handle (remember?):
# .read() hands back the page body, here as raw bytes (note the b'...' prefix in the output)
course_list_html = response.read()
# last expression in the cell, so the notebook displays its value
course_list_html

b'\n\n\n\n\n<!doctype html>\n<!--[if lt IE 7]>\t<html lang="en-us" class="no-js ie6">\t<![endif]--> \n<!--[if IE 7]>\t\t<html lang="en-us" class="no-js ie7">\t<![endif]--> \n<!--[if IE 8]>\t\t<html lang="en-us" class="no-js ie8">\t<![endif]--> \n<!--[if gte IE 9]>\t<html lang="en-us" class="no-js ie9">\t<![endif]-->\n<!--[if !IE]> -->\t<html lang="en-us" class="no-js">\t<!-- <![endif]-->\n\t<head>\n\t\t\n\t\t<meta http-equiv="X-UA-Compatible" content="IE=edge" />\n\t\t<meta charset="utf-8" />\n\t\t\n\t\t<title>\n\t\t\tSchedule of Classes\n\t\t</title>\n\t\t\n\t\t<meta name="viewport" content="width=device-width, initial-scale=1.0" />\n\n\t\t<link rel="icon" type="image/x-icon" href="/soc/resources/images/favicon.ico">\n\t\t\n\t\t<link rel="stylesheet" href="/soc/resources/css/lib/jquery-ui-1.8.23-umd/jquery-ui-1.8.23-umd.css" type="text/css" media="screen" />\n\t\t<link rel="stylesheet" href="/soc/resources/css/umd.css" type="text/css" media="screen" />\n\t\t\n\t\t\n\t\t\t\n\t\t\t\n\t\

## Using beautifulsoup to *PARSE* an HTML document we get from another computer

In [None]:
def html_to_course_list(html):
    """
    Parse a testudo course-listing page into a list of course records.

    Params:
    - html (str or bytes) - the html of a page that lists courses from testudo
      (this notebook passes the raw bytes returned by response.read())

    Returns: courses: list of dicts, one per course that has a description,
    with the keys 'code', 'title', 'prerequisite', 'description' and 'credits'
    (all values are strings; 'prerequisite' is the string "None" when the
    course lists no prerequisite)
    """

    # build the searchable document tree with Python's built-in html parser
    parsed_page = bs4.BeautifulSoup(html, 'html.parser')

    courses = []
    # every element carrying class="course" is treated as one course entry
    for entry in parsed_page.find_all(attrs={"class": "course"}):
        text_blocks = entry.find_all(attrs={"class": "approved-course-text"})
        title_tag = entry.find(attrs={"class": "course-title"})
        record = {'code': entry['id'], 'title': title_tag.text}

        # entries with fewer than two text blocks have no description: skip them
        if len(text_blocks) <= 1:
            continue

        # first text block holds the requirements; keep the last div mentioning
        # a prerequisite, falling back to the string "None" if there is none
        prerequisite = "None"
        for requirement in text_blocks[0].find_all('div'):
            requirement_text = requirement.text
            if "Prerequisite" in requirement_text:
                prerequisite = requirement_text.replace("Prerequisite: ", "")
        record['prerequisite'] = prerequisite

        # second text block is the course description
        record['description'] = text_blocks[1].text
        record['credits'] = entry.find(attrs={"class": "course-min-credits"}).text
        courses.append(record)

    return courses

In [None]:
# parse the html we downloaded above into a list of course dicts
course_list = html_to_course_list(course_list_html)
# display just the first five entries rather than dumping the whole list
course_list[:5]

[{'code': 'INST126',
  'title': 'Introduction to Programming for Information Science',
  'prerequisite': 'Minimum grade of C- in MATH115; or must have math eligibility of MATH140 or higher; or permission of instructor. ',
  'description': 'An introduction to computer programming for students with very limited or no previous programming experience. Topics include fundamental programming concepts such as variables, data types, assignments, arrays, conditionals, loops, functions, and I/O operations.',
  'credits': '3'},
 {'code': 'INST155',
  'title': 'Social Networking',
  'prerequisite': 'None',
  'description': 'Introduces methods for analyzing and understanding how people use social media - social networking websites, blogging and microblogging, and other forms of online interaction and content generation - and their societal implications. Introduces students to the science and social science of network analysis. Through real world examples, including analysis of their own social netw

We can then turn this into a dataframe for analysis! :) This is actually how I made the data file for our exercises this semester.

In [None]:
# turn the list of dicts into a pandas DataFrame: one row per course, one column per dict key
course_list_df = pd.DataFrame(course_list)
# show only the first five rows
course_list_df.head()

Unnamed: 0,code,credits,description,prerequisite,title
0,INST126,3,An introduction to computer programming for st...,Minimum grade of C- in MATH115; or must have m...,Introduction to Programming for Information Sc...
1,INST155,3,Introduces methods for analyzing and understan...,,Social Networking
2,INST201,3,Examining the effects of new information techn...,,Introduction to Information Science
3,INST311,3,"Examines the theories, concepts, and principle...",,Information Organization
4,INST314,3,Basic concepts in statistics including measure...,Minimum grade of C- in STAT100 and MATH115 (or...,Statistics for Information Science


## Using urllib and the json library to parse json data from another computer

In [None]:
# same urlopen pattern as before, but this URL serves JSON data instead of an HTML page
response2 = urllib.request.urlopen('http://joelchan.me/contacts.json') # this is my server computer!

In [None]:
# read the raw JSON body out of the response
rawdata = response2.read()
# json.loads converts the JSON text into Python objects (here a list of dicts, as the output shows)
parsedcontacts = json.loads(rawdata)
parsedcontacts

[{'email': {'email': 'ck@umd.edu', 'hide': 'yes'},
  'name': 'Chuck',
  'phone': {'number': '+1 734 303 4456', 'type': 'intl'}},
 {'email': {'email': 'jc@umd.edu', 'hide': 'no'},
  'name': 'Joel',
  'phone': {'number': '479 647 9905', 'type': 'local'}},
 {'email': {'email': 'zh@umd.edu', 'hide': 'yes'},
  'name': 'Zara',
  'phone': {'number': '678 321 4456', 'type': 'local'}}]

In [None]:
# parsedcontacts is a regular Python list, so normal indexing gives us the first contact
first_item = parsedcontacts[0]

In [None]:
# display the first contact: a dict with nested dicts for 'phone' and 'email'
first_item

{'name': 'Chuck',
 'phone': {'type': 'intl', 'number': '+1 734 303 4456'},
 'email': {'email': 'ck@umd.edu', 'hide': 'yes'}}

In [None]:
# dict lookup with .get(): returns None instead of raising if the key is missing
first_item.get("name")

'Chuck'

In [None]:
# dict lookup with square brackets: same value, but raises KeyError if the key is missing
first_item["name"]

'Chuck'

In [None]:
# Hand pandas a file-like object instead of the raw JSON itself: passing literal
# JSON directly to pd.read_json is deprecated in recent pandas releases.
# NOTE: assumes the server sends UTF-8 encoded JSON (the default encoding for JSON).
contacts = pd.read_json(io.StringIO(rawdata.decode("utf-8")))

In [None]:
# display the DataFrame; the nested 'email' and 'phone' objects stay as dicts inside their cells
contacts

Unnamed: 0,email,name,phone
0,"{'email': 'ck@umd.edu', 'hide': 'yes'}",Chuck,"{'type': 'intl', 'number': '+1 734 303 4456'}"
1,"{'email': 'jc@umd.edu', 'hide': 'no'}",Joel,"{'type': 'local', 'number': '479 647 9905'}"
2,"{'email': 'zh@umd.edu', 'hide': 'yes'}",Zara,"{'type': 'local', 'number': '678 321 4456'}"
