In [1]:
# Standard Includes
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# "magic" to display graphs in the notebook
%matplotlib inline


In [2]:
# jtplot is the plotting helper submodule shipped with jupyterthemes
from jupyterthemes import jtplot

# Called with no arguments: the currently installed theme will be used
# to set the plot style
jtplot.style()

In [3]:
# Small hand-written XML fixture used by the parsing examples below.
# The three <person> records are deliberately irregular:
#   * Bill  - <phone> text is padded with whitespace/newlines; <email> is an
#             empty element that carries only an attribute (no text)
#   * Doug  - has both <phone> and <email> text
#   * Sally - has no <phone> element at all
xml_str = '''
<employees>
  <person>
    <name>Bill</name>
    <phone type="intl" other="stuff">
       +1 734 303 4456
     </phone>
     <email hide="yes"/>
  </person>
  <person>
    <name>Doug</name>
    <phone type="intl">
       +1 777 777 7777
     </phone>
     <email>doug@email.com</email>
  </person>
  <person>
    <name>Sally</name>
    <email>sally@email.com</email>
  </person>
</employees>'''

In [4]:
# Two equivalent ways to describe the same table for pd.DataFrame:
#   dl - "dict of lists": one key per column, every list the same length,
#        so a missing value has to be spelled out explicitly.
#   ld - "list of dicts": one dict per row; a key that is absent from a row
#        (Sally has no 'phone') is filled in as NaN by pandas.
# np.nan is used rather than the np.NaN alias, which was removed in NumPy 2.0.
dl = {
    'name' : ['Bill', 'Doug', 'Sally'],
    'phone' : ['1111', '2222', np.nan],
    'email' : ['bill@umd.edu', 'doug@umd.edu', 'sally@umd.edu']
}
ld = [{'name': 'Bill', 'email': 'bill@email.com', 'phone': '1111'},
      {'name': 'Doug', 'phone': '2222', 'email': 'doug@email.com'},
      {'name': 'Sally', 'email': 'sally@email.com'}]

In [5]:
pd.DataFrame(dl)

Unnamed: 0,email,name,phone
0,bill@umd.edu,Bill,1111.0
1,doug@umd.edu,Doug,2222.0
2,sally@umd.edu,Sally,


In [6]:
pd.DataFrame(ld)

Unnamed: 0,email,name,phone
0,bill@email.com,Bill,1111.0
1,doug@email.com,Doug,2222.0
2,sally@email.com,Sally,


### ElementTree

In [7]:
import xml.etree.ElementTree as ET

In [8]:
ex = ET.fromstring(xml_str)

In [31]:
ex

<Element 'employees' at 0x0A8E5DE0>

In [32]:
type(ex)

xml.etree.ElementTree.Element

In [10]:
# Walk the two levels below the root: each child element, then each of its
# own children, printing tag, text and attributes.
# Elements are iterated directly because Element.getchildren() was removed
# from xml.etree.ElementTree in Python 3.9.
for em in ex:
    print(em.tag)
    for c in em:
        print(c.tag, c.text, c.attrib)

person
name Bill {}
phone 
       +1 734 303 4456
      {'type': 'intl', 'other': 'stuff'}
email None {'hide': 'yes'}
person
name Doug {}
phone 
       +1 777 777 7777
      {'type': 'intl'}
email doug@email.com {}
person
name Sally {}
email sally@email.com {}


In [11]:
def employeeParser(xml_obj):
    """Flatten a two-level XML element into a DataFrame, one row per child.

    Parameters
    ----------
    xml_obj : Element
        Root element whose direct children are the records (e.g. ``<person>``)
        and whose grandchildren are the fields. Any element that supports
        iteration over its children works, including ``xml.etree.ElementTree``
        and ``lxml.etree`` elements.

    Returns
    -------
    pandas.DataFrame
        One row per child of ``xml_obj``. Columns are the field tags; values
        are the field text with surrounding whitespace stripped. Fields with
        no text (empty elements) are skipped, so they show up as missing
        values in the resulting frame.
    """
    emp_list = []
    # Iterate the elements directly: Element.getchildren() was removed from
    # xml.etree.ElementTree in Python 3.9 and is deprecated in lxml.
    for person in xml_obj:
        person_dict = {
            child.tag: child.text.strip()
            for child in person
            if child.text is not None  # e.g. <email hide="yes"/> has no text
        }
        emp_list.append(person_dict)
    return pd.DataFrame(emp_list)

In [12]:
employeeParser(ex)

Unnamed: 0,email,name,phone
0,,Bill,+1 734 303 4456
1,doug@email.com,Doug,+1 777 777 7777
2,sally@email.com,Sally,


In [13]:
df1 = employeeParser(ex)

In [14]:
df1

Unnamed: 0,email,name,phone
0,,Bill,+1 734 303 4456
1,doug@email.com,Doug,+1 777 777 7777
2,sally@email.com,Sally,


### LXML

In [15]:
import lxml.etree as LET

In [16]:
lx = LET.fromstring(xml_str)

In [17]:
df2 = employeeParser(lx)

In [18]:
df2

Unnamed: 0,email,name,phone
0,,Bill,+1 734 303 4456
1,doug@email.com,Doug,+1 777 777 7777
2,sally@email.com,Sally,


In [19]:
df2.memory_usage().sum()

76

In [20]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
email    2 non-null object
name     3 non-null object
phone    2 non-null object
dtypes: object(3)
memory usage: 76.0+ bytes


## MoCo Traffic Violations - Jan 2018

XML data file for Traffic Violations in Montgomery County for January 2018
https://data.montgomerycountymd.gov/Public-Safety/Traffic-Violations/4mse-ku6q

In [21]:
mxml = LET.parse('moco_traffic_Jan2018.xml')


In [22]:
# Preview the document structure: print every element in document order and
# stop once the 4th <row> tag is reached (the outer wrapper <row> counts as
# the first one).
# iter() replaces getiterator(), which is deprecated in lxml and was removed
# from xml.etree.ElementTree in Python 3.9.
i = 0
for c in mxml.iter():
    if c.tag == 'row':
        i += 1
    if i > 3:
        break
    print(c.tag, c.attrib, c.text)

response {} 
  
row {} 
    
row {'_id': 'row-2ga7.ifin_4q23', '_uuid': '00000000-0000-0000-6460-E73B33438C41', '_position': '0', '_address': 'https://data.montgomerycountymd.gov/resource/ms8i-8ux3/row-2ga7.ifin_4q23'} 
      
date_of_stop {} 2018-01-18T00:00:00
time_of_stop {} 18:36:00
agency {} MCP
subagency {} 4th district, Wheaton
description {} TAILLIGHTS (*)
location {} S/B GEORGIA AVE. @ UNIVERSITY BLV. W.
latitude {} 39.04033
longitude {} -77.0514933333333
accident {} No
belts {} No
personal_injury {} No
property_damage {} No
fatal {} No
commercial_license {} No
hazmat {} No
commercial_vehicle {} No
alcohol {} No
work_zone {} No
state {} MD
vehicle_type {} 02 - Automobile
year {} 2013
make {} HYUND
model {} 4S
color {} RED
violation_type {} ESERO
charge {} 56*
contributed_to_accident {} No
race {} WHITE
gender {} F
driver_city {} BETHESDA
driver_state {} MD
dl_state {} MD
arrest_type {} A - Marked Patrol
geolocation {'latitude': '39.04033', 'longitude': '-77.051493333333'} None

In [23]:
def trafficParser(xml_obj):
    """Convert the traffic-violations XML root into a DataFrame.

    The expected layout is ``root > row (wrapper) > row (record) > fields``:
    every direct child of ``xml_obj`` tagged ``row`` is treated as a wrapper
    whose children are the individual records.

    Parameters
    ----------
    xml_obj : Element
        Root element of the document (the ``<response>`` element in this
        notebook). Any element that supports iteration over its children
        works, including ``lxml.etree`` and ``xml.etree.ElementTree`` elements.

    Returns
    -------
    pandas.DataFrame
        One row per record. Columns are the field tags; values are the field
        text with surrounding whitespace stripped. Fields without text are
        skipped and therefore appear as missing values.

    Side effects
    ------------
    Prints ``"Num Rows: <n>"`` with the number of records parsed.
    """
    rows = []
    # Iterate the elements directly instead of calling the deprecated
    # getchildren() (removed from xml.etree.ElementTree in Python 3.9).
    for obj in xml_obj:
        if obj.tag != 'row':
            continue
        for r in obj:
            rows.append(
                {t.tag: t.text.strip() for t in r if t.text is not None}
            )
    # The record count is simply the number of dicts collected.
    print("Num Rows: " + str(len(rows)))
    return pd.DataFrame(rows)

In [24]:
mxml

<lxml.etree._ElementTree at 0xce55670>

In [25]:
response = mxml.getroot()

In [26]:
response

<Element response at 0xce551c0>

In [27]:
moco = trafficParser(response)

Num Rows: 17577


In [28]:
moco.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17577 entries, 0 to 17576
Data columns (total 34 columns):
accident                   17577 non-null object
agency                     17577 non-null object
alcohol                    17577 non-null object
arrest_type                17577 non-null object
article                    16699 non-null object
belts                      17577 non-null object
charge                     17577 non-null object
color                      17577 non-null object
commercial_license         17577 non-null object
commercial_vehicle         17577 non-null object
contributed_to_accident    17577 non-null object
date_of_stop               17577 non-null object
description                17577 non-null object
dl_state                   17577 non-null object
driver_city                17577 non-null object
driver_state               17577 non-null object
fatal                      17577 non-null object
gender                     17577 non-null object
hazmat   

In [29]:
moco.isnull().sum()

accident                     0
agency                       0
alcohol                      0
arrest_type                  0
article                    878
belts                        0
charge                       0
color                        0
commercial_license           0
commercial_vehicle           0
contributed_to_accident      0
date_of_stop                 0
description                  0
dl_state                     0
driver_city                  0
driver_state                 0
fatal                        0
gender                       0
hazmat                       0
latitude                   412
location                     0
longitude                  412
make                         0
model                        0
personal_injury              0
property_damage              0
race                         0
state                        0
subagency                    0
time_of_stop                 0
vehicle_type                 0
violation_type               0
work_zon