# http://people.duke.edu/~ccc14/sta-663-2017/

# IO

In [1]:
%%file ./data/animals.txt
name|species|age|weight
arun|cat|5|7.3
bob|bird|2|1.5
coco|cat|2|5.5
dumbo|elephant|23|454
elmo|dog|5|11
fido|dog|3|24.5
gumba|bird|2|2.7

Overwriting ./data/animals.txt


In [2]:
# Echo every line that contains the substring 'cat', stripped of surrounding whitespace.
with open('./data/animals.txt') as animals_file:
    matching_rows = (row.strip() for row in animals_file if 'cat' in row)
    for row in matching_rows:
        print(row)

arun|cat|5|7.3
coco|cat|2|5.5


In [3]:
# Load the complete file contents into one string, then echo it.
with open('./data/animals.txt') as animals_file:
    text = animals_file.read()
print(text)

name|species|age|weight
arun|cat|5|7.3
bob|bird|2|1.5
coco|cat|2|5.5
dumbo|elephant|23|454
elmo|dog|5|11
fido|dog|3|24.5
gumba|bird|2|2.7



In [4]:
import pandas as pd

# The file is pipe-delimited, so the separator is passed explicitly.
df = pd.read_csv('./data/animals.txt', sep='|')
df

Unnamed: 0,name,species,age,weight
0,arun,cat,5,7.3
1,bob,bird,2,1.5
2,coco,cat,2,5.5
3,dumbo,elephant,23,454.0
4,elmo,dog,5,11.0
5,fido,dog,3,24.5
6,gumba,bird,2,2.7


## Note that pandas has intelligently guessed the appropriate type of each column

In [5]:
df.dtypes

name        object
species     object
age          int64
weight     float64
dtype: object

## Saving a text file

In [6]:
# Pipe-delimited sample table kept in memory as a single string.
# The literal opens with a newline right after the triple quote, so the text
# (and any file written from it) starts with a blank line.
s = """
name|species|age|weight
arun|cat|5|7.3
bob|bird|2|1.5
coco|cat|2|5.5
dumbo|elephant|23|454
elmo|dog|5|11
fido|dog|3|24.5
gumba|bird|2|2.7
"""

In [7]:
# Persist the in-memory table to disk; mode 'w' replaces any existing file.
with open('./data/animals2.txt', 'w') as out_file:
    out_file.write(s)

In [8]:
!cat './data/animals2.txt'


name|species|age|weight
arun|cat|5|7.3
bob|bird|2|1.5
coco|cat|2|5.5
dumbo|elephant|23|454
elmo|dog|5|11
fido|dog|3|24.5
gumba|bird|2|2.7


## Web resources

In [9]:
import requests

In [10]:
# Only download once - Project Gutenberg will block you if you do this repeatedly.
# The local copy acts as a cache: it is read when present and created otherwise.

try:
    # Explicit UTF-8 so reading does not depend on the platform default encoding.
    with open('./data/Ulysses.txt', encoding='utf-8') as f:
        text = f.read()
except FileNotFoundError:
    # Only a missing cache file triggers a download; other I/O errors propagate.
    url = 'http://www.gutenberg.org/cache/epub/4300/pg4300.txt'
    # A timeout keeps the cell from hanging indefinitely on an unresponsive server.
    resp = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of caching an error page as the book.
    resp.raise_for_status()
    text = resp.text

    with open('./data/Ulysses.txt', 'w', encoding='utf-8') as f:
        f.write(text)

In [11]:
print(text[:550])

﻿The Project Gutenberg EBook of Ulysses, by James Joyce

This eBook is for the use of anyone anywhere at no cost and with almost
no restrictions whatsoever. You may copy it, give it away or re-use
it under the terms of the Project Gutenberg License included with this
eBook or online at www.gutenberg.org


Title: Ulysses

Author: James Joyce

Release Date: August 1, 2008 [EBook #4300]
Last Updated: January 29, 2016

Language: English


*** START OF THIS PROJECT GUTENBERG EBOOK ULYSSES ***



Produced by Col Choat, and David Widger



ULYSSES

by


## Getting a table from a URL

In [14]:
import io
import ssl
from urllib import request

url = 'http://www.marketwatch.com/investing/stock/aapl/financials'
# NOTE(review): ssl._create_unverified_context() is a private helper that disables
# TLS certificate verification. It is kept to preserve the existing behavior, but
# should be replaced with ssl.create_default_context() if the site's certificate
# validates. The URL is plain http, so the context presumably only matters if the
# server redirects to https - confirm.
context = ssl._create_unverified_context()

# Use the response as a context manager so the connection is closed once the
# body has been read.
with request.urlopen(url, context=context) as response:
    html = response.read()

# Hand pandas a file-like object rather than the raw bytes; passing literal HTML
# directly is deprecated in recent pandas releases.
pd.read_html(io.BytesIO(html))[0]

Unnamed: 0,Fiscal year is October-September. All values USD millions.,2014,2015,2016,2017,2018,5-year trend
0,Sales/Revenue,183.24B,231.28B,214.23B,228.57B,265.81B,
1,Sales Growth,-,26.22%,-7.37%,6.70%,16.29%,
2,Cost of Goods Sold (COGS) incl. D&A,112.55B,142.26B,131.51B,141.7B,163.83B,
3,COGS excluding D&A,104.55B,131.76B,121.71B,132.3B,154.53B,
4,Depreciation & Amortization Expense,8B,10.5B,9.8B,9.4B,9.3B,
5,Depreciation,6.9B,9.2B,8.3B,8.2B,9.3B,
6,Amortization of Intangibles,1.1B,1.3B,1.5B,1.2B,-,
7,COGS Growth,-,26.39%,-7.56%,7.75%,15.61%,
8,Gross Income,70.69B,89.03B,82.72B,86.87B,101.98B,
9,Gross Income Growth,-,25.94%,-7.08%,5.02%,17.40%,


In [16]:
import json

# Parse the JSON file into Python objects.
with open('./data/north_carolina_bicycle_crash_data_heatmap_.json') as json_file:
    data = json.loads(json_file.read())

In [17]:
len(data)

5716

In [18]:
data[0]

{'datasetid': 'north_carolina_bicycle_crash_data_heatmap_',
 'recordid': '696348a220ddd21dfcbd30cdc744147464ffd639',
 'fields': {'drvr_age': 66,
  'rd_defects': 'None',
  'crsh_sevri': 'C: Possible Injury',
  'objectid': 19,
  'crash_ty_1': 353311,
  'ambulancer': 'No',
  'excsspdind': 'No',
  'county': 'Durham',
  'speed_limi': '20 - 25  MPH',
  'rural_urba': 'Urban',
  'bike_injur': 'C: Possible Injury',
  'bike_race': 'Black',
  'drvr_vehty': 'Pickup',
  'crash_type': 'Bicyclist Ride Out - Residential Driveway',
  'bike_dir': 'Not Applicable',
  'city': 'Durham',
  'workzone_i': 'No',
  'rd_class': 'Local Street',
  'rd_config': 'Two-Way, Not Divided',
  'num_lanes': '2 lanes',
  'rd_feature': 'No Special Feature',
  'bike_age': 6,
  'location': [36.002743, -78.8785],
  'drvr_injur': 'O: No Injury',
  'crash_loc': 'Non-Intersection',
  'rd_charact': 'Straight - Level',
  'drvr_alc_d': 'No',
  'drvrage_gr': '60-69',
  'i_fid': 18,
  'light_cond': 'Daylight',
  'drvr_sex': 'Male',
  '

# Flatten the nested dictionaries recursively

In [20]:
def flatten(d, parent='', sep='_'):
    """Collapse a nested dictionary into a single-level dictionary.

    Each key of the result is built by joining the chain of parent keys and the
    child key with ``sep``. A non-empty ``parent`` is used as a prefix for every
    key. Any value that has no ``items`` method is stored as a leaf unchanged.
    """
    flat = {}
    for name, value in d.items():
        path = name if not parent else sep.join([parent, name])
        try:
            # Values that behave like dictionaries are flattened recursively.
            nested = flatten(value, path, sep=sep)
        except AttributeError:
            # No .items() on the value, so it is a leaf.
            flat[path] = value
        else:
            flat.update(nested)
    return flat

In [22]:
flatten(data[0])

{'datasetid': 'north_carolina_bicycle_crash_data_heatmap_',
 'recordid': '696348a220ddd21dfcbd30cdc744147464ffd639',
 'fields_drvr_age': 66,
 'fields_rd_defects': 'None',
 'fields_crsh_sevri': 'C: Possible Injury',
 'fields_objectid': 19,
 'fields_crash_ty_1': 353311,
 'fields_ambulancer': 'No',
 'fields_excsspdind': 'No',
 'fields_county': 'Durham',
 'fields_speed_limi': '20 - 25  MPH',
 'fields_rural_urba': 'Urban',
 'fields_bike_injur': 'C: Possible Injury',
 'fields_bike_race': 'Black',
 'fields_drvr_vehty': 'Pickup',
 'fields_crash_type': 'Bicyclist Ride Out - Residential Driveway',
 'fields_bike_dir': 'Not Applicable',
 'fields_city': 'Durham',
 'fields_workzone_i': 'No',
 'fields_rd_class': 'Local Street',
 'fields_rd_config': 'Two-Way, Not Divided',
 'fields_num_lanes': '2 lanes',
 'fields_rd_feature': 'No Special Feature',
 'fields_bike_age': 6,
 'fields_location': [36.002743, -78.8785],
 'fields_drvr_injur': 'O: No Injury',
 'fields_crash_loc': 'Non-Intersection',
 'fields_rd

## Now we can easily put the flattened records into a DataFrame for analysis

In [24]:
# Flatten every record first, then build the frame from the flat dictionaries.
flat_records = [flatten(record) for record in data]
df = pd.DataFrame(flat_records)
df.head()

Unnamed: 0,datasetid,recordid,fields_drvr_age,fields_rd_defects,fields_crsh_sevri,fields_objectid,fields_crash_ty_1,fields_ambulancer,fields_excsspdind,fields_county,...,fields_num_units,fields_rd_surface,fields_hit_run,fields_drvr_estsp,fields_crashalcoh,geometry_type,geometry_coordinates,record_timestamp,fields_bikeage_gr,fields_crash_mont
0,north_carolina_bicycle_crash_data_heatmap_,696348a220ddd21dfcbd30cdc744147464ffd639,66.0,,C: Possible Injury,19,353311.0,No,No,Durham,...,2,Smooth Asphalt,No,11-15 mph,No,Point,"[-78.8785, 36.002743]",2015-04-21T05:55:44-04:00,,
1,north_carolina_bicycle_crash_data_heatmap_,9e89f7103e3a849d289b1015e80c3d3f7a74d058,34.0,,C: Possible Injury,30,211180.0,Yes,No,Pitt,...,2,Smooth Asphalt,No,0-5 mph,No,Point,"[-77.39265, 35.612984]",2015-04-21T05:55:44-04:00,50-59,
2,north_carolina_bicycle_crash_data_heatmap_,ed2998fdddd54956266ac4588b5e3e734ba6a419,52.0,,O: No Injury,34,111144.0,No,No,Pitt,...,2,Smooth Asphalt,No,21-25 mph,No,Point,"[-77.59074, 35.595676]",2015-04-21T05:55:44-04:00,,
3,north_carolina_bicycle_crash_data_heatmap_,77d8284533f319b61986fd4449ad5ea60b4f663a,33.0,,B: Evident Injury,36,119139.0,Yes,No,Mecklenburg,...,3,Smooth Asphalt,No,46-50 mph,No,Point,"[-80.7728, 35.076767]",2015-04-21T05:55:44-04:00,16-19,
4,north_carolina_bicycle_crash_data_heatmap_,5accb69983a9ee89d7a184b1e4601b6baac7241a,,,O: No Injury,50,112114.0,No,No,Mecklenburg,...,2,Smooth Asphalt,Yes,16-20 mph,No,Point,"[-80.75713, 35.19999]",2015-04-21T05:55:44-04:00,,


## Download file if necessary

In [36]:
import os

import xmltodict

# reed.xml is cached locally; the download only runs when the file is missing.
if not os.path.exists('./data/reed.xml'):
    import requests
    # A timeout keeps the cell from hanging indefinitely on an unresponsive server.
    resp = requests.get(
        'http://www.cs.washington.edu/research/xmldatasets/data/courses/reed.xml',
        timeout=30,
    )
    # Abort on an HTTP error so an error page is never saved as reed.xml;
    # a bad file would otherwise be treated as a valid cache on every later run.
    resp.raise_for_status()

    with open('./data/reed.xml', 'w') as f:
        f.write(resp.text)

In [37]:
# Read the cached XML text, then parse it once the file has been closed.
with open('./data/reed.xml') as xml_file:
    xml = xml_file.read()
d = xmltodict.parse(xml)

In [47]:
d.keys()

odict_keys(['root'])

In [48]:
d['root'].keys()

odict_keys(['course'])

In [49]:
courses = d['root']['course']

In [50]:
len(courses)

703

In [52]:
# Flatten every course entry first, then build the frame from the flat dictionaries.
course_rows = [flatten(course) for course in courses]
df = pd.DataFrame(course_rows)
df.head()

Unnamed: 0,reg_num,subj,crse,sect,title,units,instructor,days,time_start_time,time_end_time,place_building,place_room
0,10577,ANTH,211,F01,Introduction to Anthropology,1.0,Brightman,M-W,03:10PM,04:30,ELIOT,414
1,20573,ANTH,344,S01,Sex and Gender,1.0,Makley,T-Th,10:30AM,11:50,VOLLUM,120
2,10624,BIOL,431,F01,Field Biology of Amphibians,0.5,Kaplan,T,06:10PM,08:00,PHYSIC,240A
3,10626,BIOL,431,F03,Bacterial Pathogenesis,0.5,,,,,,Mellies RESCHEDULED TO OTHER SEMESTER
4,20626,BIOL,431,S04,Seminar in Biology,0.5,Yezerinac,Th,06:10PM,08:00,BIOL,200A
