# Parsing Structured JSON and XML data

## JSON format

### Step 1. Loading the needed packages

In [1]:
import pandas as pd
import json
from datetime import datetime

### Step 2. Open the structured data file and read it into a Python dictionary

In [2]:
# Read the JSON log into a Python dictionary.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, whereas JSON files are normally UTF-8.
with open('structured_example_log.json', encoding='utf-8') as f:
    txt = json.load(f)

In [3]:
# inspect the parsed result: a dict whose 'data' key holds a list of event records
txt

{'data': [{'session_time': '2013-05-15 14:17:26',
   'event_name': 'Session Start',
   'event_attribute': 'NaN'},
  {'session_time': '2013-05-15 14:17:26',
   'event_name': 'Leaving sequence',
   'event_attribute': 'loadXML, moving forward.'},
  {'session_time': '2013-05-15 14:17:30',
   'event_name': 'Player submitted name',
   'event_attribute': 'Carl'},
  {'session_time': '2013-05-15 14:17:30',
   'event_name': 'Leaving sequence',
   'event_attribute': 'InputNameScreen, moving forward.'},
  {'session_time': '2013-05-15 14:17:31',
   'event_name': 'Player submitted name',
   'event_attribute': 'Carl'},
  {'session_time': '2013-05-15 14:17:31',
   'event_name': 'Leaving sequence',
   'event_attribute': 'startScreen, moving forward.'},
  {'session_time': '2013-05-15 14:17:50',
   'event_name': 'Player submitted name',
   'event_attribute': 'Carl'},
  {'session_time': '2013-05-15 14:17:50',
   'event_name': 'Leaving sequence',
   'event_attribute': 'slide2, moving forward.'},
  {'sessio

### Step 3. Convert the list of dictionaries to a data frame

In [4]:
# pull out the list of event records stored under 'data', then let pandas
# turn each record (dict) into one row of the data frame
event_records = txt.get('data')
pd.DataFrame(event_records)

Unnamed: 0,session_time,event_name,event_attribute
0,2013-05-15 14:17:26,Session Start,
1,2013-05-15 14:17:26,Leaving sequence,"loadXML, moving forward."
2,2013-05-15 14:17:30,Player submitted name,Carl
3,2013-05-15 14:17:30,Leaving sequence,"InputNameScreen, moving forward."
4,2013-05-15 14:17:31,Player submitted name,Carl
5,2013-05-15 14:17:31,Leaving sequence,"startScreen, moving forward."
6,2013-05-15 14:17:50,Player submitted name,Carl
7,2013-05-15 14:17:50,Leaving sequence,"slide2, moving forward."
8,2013-05-15 14:17:55,Player submitted name,Carl
9,2013-05-15 14:17:55,Leaving sequence,"slide2b, moving forward."


## XML format (not included in the book's code snippet; provided here as an extra bonus)

### Step 1. Load needed packages

In [5]:
import xml.etree.ElementTree as et  # package for xml parsing

### Step 2. Specify XML file name

In [6]:
# name of the XML log file to parse
# (note that this XML is structured to comply with ETS' VPA data model)
xml_file_name = 'structured_example_log.xml'

### Step 3. Parsing the xml tree

In [7]:
# parse the XML file into an element tree and keep a handle on its root element
tree = et.parse(xml_file_name)
root = tree.getroot()

In [8]:
# check how many children the root element has
len(root)

1

In [9]:
# list the children of the root's only child: tag, text, and number of sub-elements
for child in root[0]:
    print(child.tag, child.text, len(child), sep=' , ')

sessionID , 7369 , 0
teamID , hao_jiangang , 0
playerID , None , 4
attemptID , 17 , 0
sessionExtData , None , 5
eventSequence , None , 45


In [10]:
# look at the playerID element (the third child of root[0])
root[0][2]

<Element 'playerID' at 0x7fe1e5037ae0>

In [11]:
# list the children of playerID: tag, text, and number of sub-elements
for child in root[0][2]:
    print(child.tag, child.text, len(child), sep=' , ')

pair , None , 2
pair , None , 2
pair , None , 2
pair , None , 2


In [12]:
# look at the first child of playerID
root[0][2][0]

<Element 'pair' at 0x7fe1e5037b30>

In [13]:
# look at the eventSequence element (the sixth child of root[0])
root[0][5]

<Element 'eventSequence' at 0x7fe1e247c400>

In [14]:
# list every child of eventSequence: tag, text, and number of sub-elements
for child in root[0][5]:
    print(child.tag, child.text, len(child), sep=' , ')

event , None , 7
event , None , 8
event , None , 8
event , None , 7
event , None , 7
event , None , 7
event , None , 7
event , None , 8
event , None , 8
event , None , 7
event , None , 7
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 7
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 7
event , None , 7
event , None , 7
event , None , 7
event , None , 7
event , None , 7
event , None , 7
event , None , 7
event , None , 7
event , None , 7
event , None , 7
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 8
event , None , 8


In [15]:
# look at the first event in the event sequence
root[0][5][0]

<Element 'event' at 0x7fe1e247c450>

In [16]:
# list the children of the first event: tag, text, and number of sub-elements
for field in root[0][5][0]:
    print(field.tag, field.text, len(field), sep=' , ')

eventName , chat , 0
eventStartTime , 2019-11-06T14:18:31Z , 0
eventEndTime , 2019-11-06T14:18:31Z , 0
eventBy , jiangang , 0
eventTo , others , 0
eventResult , hi , 0
eventLocation , slide1-step0 , 0


#### The code above shows how to reach each leaf in the XML tree. You can then collect the leaf values into a dataframe.

# Bonus - Parsing HTML

In [3]:
#!pip install beautifulsoup4
from bs4 import BeautifulSoup

In [4]:
# Parse the HTML file with Python's built-in 'html.parser'.
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, which can garble non-ASCII characters.
with open("html_example.html", encoding="utf-8") as f:
    soup = BeautifulSoup(f, 'html.parser')

In [5]:
# print the parsed document as HTML markup
print(soup)

<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="title"> <b>The Dormouse's story</b> </p>
<p class="story">Once upon a time there were three little sisters; and their names were <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>; and they lived at the bottom of a well.</p>
</body>
</html>


In [6]:
# get the <title> element
soup.title

<title>The Dormouse's story</title>

In [7]:
# get all the text in the document, with the tags stripped out
soup.get_text()

"\n\nThe Dormouse's story\n\n\n The Dormouse's story \nOnce upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.\n\n"

In [8]:
# get the <head> element
soup.head

<head>
<title>The Dormouse's story</title>
</head>

In [9]:
# get the <body> element
soup.body

<body>
<p class="title"> <b>The Dormouse's story</b> </p>
<p class="story">Once upon a time there were three little sisters; and their names were <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>; and they lived at the bottom of a well.</p>
</body>

In [10]:
# access one element: the first <a> tag in the document
soup.a

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [11]:
# name of the first <a> tag
soup.a.name

'a'

In [12]:
# text content of the first <a> tag
soup.a.text

'Elsie'

In [13]:
# attributes of the first <a> tag, as a dictionary
soup.a.attrs

{'class': ['sister'], 'href': 'http://example.com/elsie', 'id': 'link1'}

In [14]:
# get the value of the href attribute
soup.a.attrs['href']

'http://example.com/elsie'

In [15]:
# find all <a> tags
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [16]:
# find all <a> tags whose id attribute equals "link2"
soup.find_all('a',{"id":"link2"})

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [17]:
soup.find_all('a',text='Lacie')

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]