# Working with JSON and HTML

Importing simple JSON data<br>
Importing more complicated JSON data from an API<br>
Importing data from web pages<br>
Persisting JSON data<br>

### Importing simple JSON data<br>
JSON is a popular format for exchanging data because of three properties:<br>
Being readable by humans<br>
Being consumable by most client devices<br>
Not being limited in structure<br>

In [1]:
import pandas as pd
import numpy as np
import json
import pprint
from collections import Counter

In [29]:
# 1. Load the JSON data and look for potential issues.
# The encoding is stated explicitly so the result does not depend on the
# platform's default text encoding (JSON files are normally UTF-8, and the
# story titles printed below include curly quotes that may be stored as raw
# non-ASCII characters).
with open('data/allcandidatenewssample.json', encoding='utf-8') as f:
    candidatenews = json.load(f)
    

In [3]:
len(candidatenews)

60000

In [4]:
pprint.pprint(candidatenews[0:2])

[{'date': '2019-12-25 10:00:00',
  'domain': 'www.nbcnews.com',
  'panel_position': 1,
  'query': 'Michael Bloomberg',
  'source': 'NBC News',
  'story_position': 6,
  'time': '18 hours ago',
  'title': 'Bloomberg cuts ties with company using prison inmates to make '
           'campaign calls',
  'url': 'https://www.nbcnews.com/politics/2020-election/bloomberg-cuts-ties-company-using-prison-inmates-make-campaign-calls-n1106971'},
 {'date': '2019-11-09 08:00:00',
  'domain': 'www.townandcountrymag.com',
  'panel_position': 1,
  'query': 'Amy Klobuchar',
  'source': 'Town & Country Magazine',
  'story_position': 3,
  'time': '18 hours ago',
  'title': "Democratic Candidates React to Michael Bloomberg's Potential Run",
  'url': 'https://www.townandcountrymag.com/society/politics/a29739854/michael-bloomberg-democratic-candidates-campaign-reactions/'}]


In [5]:
# 2. Check for differences in the structure of the dictionaries:
# tally how many keys each record has.
Counter(map(len, candidatenews))

Counter({9: 57202, 2: 2382, 10: 416})

In [6]:
pprint.pprint(next(item for item in candidatenews if len(item)<9))

{'date': '2019-09-11 18:00:00', 'reason': 'Not collected'}


In [7]:
pprint.pprint(next(item for item in candidatenews if len(item)>9))

{'category': 'Satire',
 'date': '2019-08-21 04:00:00',
 'domain': 'politics.theonion.com',
 'panel_position': 1,
 'query': 'John Hickenlooper',
 'source': 'Politics | The Onion',
 'story_position': 8,
 'time': '4 days ago',
 'title': '‘And Then There Were 23,’ Says Wayne Messam Crossing Out '
          'Hickenlooper Photo \n'
          'In Elaborate Grid Of Rivals',
 'url': 'https://politics.theonion.com/and-then-there-were-23-says-wayne-messam-crossing-ou-1837311060'}


In [8]:
pprint.pprint([item for item in candidatenews if len(item)==2][0:10])


[{'date': '2019-09-11 18:00:00', 'reason': 'Not collected'},
 {'date': '2019-07-24 00:00:00', 'reason': 'No Top stories'},
 {'date': '2019-08-19 20:00:00', 'reason': 'Not collected'},
 {'date': '2019-09-13 16:00:00', 'reason': 'Not collected'},
 {'date': '2019-10-16 20:00:00', 'reason': 'No Top stories'},
 {'date': '2019-10-17 18:00:00', 'reason': 'Not collected'},
 {'date': '2019-08-02 14:00:00', 'reason': 'Not collected'},
 {'date': '2019-05-27 12:00:00', 'reason': 'Not collected'},
 {'date': '2019-12-03 12:00:00', 'reason': 'No Top stories'},
 {'date': '2019-01-03 00:00:00', 'reason': 'No Top stories'}]


In [9]:
# Keep only the records that have more than two keys.
candidatenews = list(filter(lambda record: len(record) > 2, candidatenews))


In [10]:
len(candidatenews)

57618

In [11]:
# 3. Generate counts from the JSON data:
# collect the records whose source is exactly "Politico".
politico = list(filter(lambda record: record["source"] == "Politico", candidatenews))

In [12]:
len(politico)

2732

In [13]:
pprint.pprint(politico[0:2])

[{'date': '2019-05-18 18:00:00',
  'domain': 'www.politico.com',
  'panel_position': 1,
  'query': 'Marianne Williamson',
  'source': 'Politico',
  'story_position': 7,
  'time': '1 week ago',
  'title': 'Marianne Williamson reaches donor threshold for Dem debates',
  'url': 'https://www.politico.com/story/2019/05/09/marianne-williamson-2020-election-1315133'},
 {'date': '2018-12-27 06:00:00',
  'domain': 'www.politico.com',
  'panel_position': 1,
  'query': 'Julian Castro',
  'source': 'Politico',
  'story_position': 1,
  'time': '1 hour ago',
  'title': "O'Rourke and Castro on collision course in Texas",
  'url': 'https://www.politico.com/story/2018/12/27/orourke-julian-castro-collision-texas-election-1073720'}]


In [14]:
# 4. Get the source data and confirm that it has the anticipated length.
# dict.get yields None for any record that has no 'source' key.
sources = [record.get('source') for record in candidatenews]


In [15]:
type(sources)

list

In [16]:
len(sources)

57618

In [17]:
sources[0:5]

['NBC News', 'Town & Country Magazine', 'TheHill', 'CNBC.com', 'Fox News']

In [18]:
pprint.pprint(Counter(sources).most_common(10))

[('Fox News', 3530),
 ('CNN.com', 2750),
 ('Politico', 2732),
 ('TheHill', 2383),
 ('The New York Times', 1804),
 ('Washington Post', 1770),
 ('Washington Examiner', 1655),
 ('The Hill', 1342),
 ('New York Post', 1275),
 ('Vox', 941)]


In [20]:
# 5. Fix any errors in the values in the dictionary.
# Records whose source is spelled "TheHill" are rewritten to "The Hill" so that
# both spellings are counted together. A plain key assignment avoids updating
# a dict while iterating over its own items.
for newsdict in candidatenews:
    if newsdict.get("source") == "TheHill":
        newsdict["source"] = "The Hill"
    

In [21]:
# Rebuild the list of sources from the current contents of candidatenews.
sources = [record.get('source') for record in candidatenews]

In [22]:
pprint.pprint(Counter(sources).most_common(10))

[('The Hill', 3725),
 ('Fox News', 3530),
 ('CNN.com', 2750),
 ('Politico', 2732),
 ('The New York Times', 1804),
 ('Washington Post', 1770),
 ('Washington Examiner', 1655),
 ('New York Post', 1275),
 ('Vox', 941),
 ('Breitbart', 799)]


In [23]:
# 6. Create a pandas DataFrame from the list of news dictionaries.
candidatenewsdf = pd.DataFrame(data=candidatenews)


In [24]:
candidatenewsdf.dtypes

title             object
url               object
source            object
time              object
date              object
query             object
story_position     int64
panel_position    object
domain            object
category          object
dtype: object

In [26]:
#7. Rename the 'date' column to 'storydate' before confirming the values for source.
# rename() returns a new frame; reassigning it keeps the cell free of in-place
# mutation and safe to re-run.
candidatenewsdf = candidatenewsdf.rename(columns={'date':'storydate'})

In [27]:
# Convert the storydate strings to datetime64[ns] values.
candidatenewsdf['storydate'] = candidatenewsdf['storydate'].astype('datetime64[ns]')

In [28]:
candidatenewsdf.shape

(57618, 10)

In [30]:
candidatenewsdf.source.value_counts(sort=True).head(10)

The Hill               3725
Fox News               3530
CNN.com                2750
Politico               2732
The New York Times     1804
Washington Post        1770
Washington Examiner    1655
New York Post          1275
Vox                     941
Breitbart               799
Name: source, dtype: int64

## Importing more complicated JSON data from an API

#### Create a DataFrame from the museum's collections data with one row for each citation, and the title and creation_date duplicated

In [31]:
# 1. Import the json, requests, and pprint libraries

import pandas as pd
import numpy as np
import json
import pprint
import requests

In [32]:
#2. Use an API to load the JSON data
# The timeout stops the cell from hanging indefinitely on an unresponsive
# server, and raise_for_status() reports an HTTP error here instead of letting
# it show up later as a JSON parsing failure.
response = requests.get(
    "https://openaccess-api.clevelandart.org/api/artworks/?african_american_artists",
    timeout=30,
)
response.raise_for_status()

In [33]:
# Decode the text of the HTTP response body from JSON into Python objects.
response_text = response.text
camcollections = json.loads(response_text)

In [34]:
print(len(camcollections['data']))

796


In [35]:
pprint.pprint(camcollections['data'][0])

{'accession_number': '2007.158',
 'catalogue_raisonne': None,
 'citations': [{'citation': 'Annual Exhibition: Sculpture, Paintings, '
                            'Watercolors, Drawings (New York: Whitney Museum '
                            'of American Art, 1958).',
                'page_number': 'Unpaginated, [8],[12]',
                'url': None},
               {'citation': '"Moscow to See Modern U.S. Art,"<em> New York '
                            'Times</em> (May 31, 1959).',
                'page_number': 'P. 60',
                'url': None},
               {'citation': 'Baur, John I. H. <em>Between the Fairs: 25 Years '
                            'of America Art, 1939-1964.</em> New York. Whitney '
                            'Museum. Catalogues. 1962-64. New York: Published '
                            'for the Whitney Museum of American Art by F.A. '
                            'Praeger, 1964.',
                'page_number': 'P. 88',
                'url': None},
      

In [36]:
#3. Flatten the JSON data: one row per entry in each item's 'citations' list,
# with the listed item-level fields repeated on every row.
citation_meta = ['accession_number', 'title', 'creation_date', 'collection', 'creators', 'type']
camcollectionsdf = pd.json_normalize(
    camcollections['data'],
    record_path='citations',
    meta=citation_meta,
)


In [37]:
camcollectionsdf.head(2).T

Unnamed: 0,0,1
citation,"Annual Exhibition: Sculpture, Paintings, Water...","""Moscow to See Modern U.S. Art,""<em> New York ..."
page_number,"Unpaginated, [8],[12]",P. 60
url,,
accession_number,2007.158,2007.158
title,Fulton and Nostrand,Fulton and Nostrand
creation_date,1958,1958
collection,American - Painting,American - Painting
creators,"[{'id': 4050, 'description': 'Jacob Lawrence (...","[{'id': 4050, 'description': 'Jacob Lawrence (..."
type,Painting,Painting


In [38]:
#4. Pull the birth_year value from creators.
# Start by taking the 'creators' value of the row labelled 0 for inspection.
creator = camcollectionsdf.loc[0, 'creators']


In [39]:
type(creator[0])

dict

In [40]:
pprint.pprint(creator)

[{'biography': 'Jacob Lawrence (born 1917) has been a prominent artist since '
               '1941 when, at age 24, he became the first African American to '
               'have a work in the permanent collection of the Museum of '
               'Modern Art in New York. His career, now spanning seven '
               'decades, has been devoted to documenting African-American life '
               'and history, from everyday scenes to the universal struggle '
               'for freedom, social justice, and human dignity. Moving to '
               'Harlem as a teenager in 1930, Lawrence was influenced by the '
               'artists, writers, and philosophers of the Harlem '
               'Renaissance-among them Romare Bearden, Langston Hughes, and '
               'W.E.B. DuBois-who fostered pride in African-American culture. '
               "Lawrence's subjects include the legendary abolitionist heroes "
               'Frederick Douglass, Harriet Tubman, and John Brown, and th

In [41]:
# Take 'birth_year' from the first entry of each row's creators list.
camcollectionsdf['birthyear'] = camcollectionsdf['creators'].apply(
    lambda creators: creators[0]['birth_year']
)

In [42]:
camcollectionsdf.birthyear.value_counts().sort_index().head()

1821    18
1886     2
1888     1
1892    13
1899    17
Name: birthyear, dtype: int64

In [43]:
# NOTE(review): this repeats the earlier birthyear assignment verbatim and looks
# like a leftover cell; running it simply recomputes the same column.
camcollectionsdf['birthyear'] = camcollectionsdf.creators.apply(lambda x: x[0]['birth_year'])

## Importing data from web pages

We scrape the COVID data from the website and do some routine data checks:

In [44]:
import pandas as pd
import numpy as np
import json
import pprint
import requests
from bs4 import BeautifulSoup

In [45]:
#2. Parse the web page and get the header row of the table.
# First download the page. The timeout stops the cell from hanging indefinitely
# on an unresponsive server, and raise_for_status() reports an HTTP error here
# instead of letting an error page be parsed as if it were the data table.
webpage = requests.get(
    "http://www.alrb.org/datacleaning/covidcaseoutliers.html",
    timeout=30,
)
webpage.raise_for_status()

In [46]:
# Parse the downloaded HTML with Python's built-in 'html.parser' backend.
bs = BeautifulSoup(markup=webpage.text, features='html.parser')

In [48]:
# Locate the table with id 'tblDeaths' and collect the <th> cells of its header.
deaths_table = bs.find('table', id='tblDeaths')
theadrows = deaths_table.thead.find_all('th')

In [49]:
type(theadrows)

bs4.element.ResultSet

In [50]:
# Extract the text of each header cell to use as column labels.
labelcols = [th.get_text() for th in theadrows]

In [51]:
# Overwrite the first header label with an explicit name for the row-label column.
labelcols[0] = "rowheadings"

In [52]:
labelcols

['rowheadings',
 'Cases',
 'Deaths',
 'Cases per Million',
 'Deaths per Million',
 'population',
 'population_density',
 'median_age',
 'gdp_per_capita',
 'hospital_beds_per_100k']

In [53]:
#3. Get the data from the table cells.
# Collect every <tr> in the body of the table with id 'tblDeaths'.
deaths_tbody = bs.find('table', id='tblDeaths').tbody
rows = deaths_tbody.find_all('tr')


In [54]:
# Walk the table rows, gathering row labels and data-cell text in two lists.
# NOTE(review): labels and data are filtered by independent conditions, so the
# two lists stay aligned only if every kept label belongs to a row that also
# has data cells (and vice versa) - confirm against the page.
datarows, labelrows = [], []

for tr in rows:
    label = tr.find('th').get_text()
    datacells = tr.find_all('td', {'class':'data'})

    # Keep only labels longer than three characters.
    if len(label) > 3:
        labelrows.append(label)

    # Keep the text of the data cells for rows that have any.
    if datacells:
        datarows.append([td.get_text() for td in datacells])
        

In [55]:
pprint.pprint(datarows[0:2])

[['9,394', '653', '214', '15', '43,851,043', '17', '29', '13,914', '1.9'],
 ['16,642', '668', '1848', '74', '9,006,400', '107', '44', '45,437', '7.4']]


In [56]:
pprint.pprint(labelrows[0:2])

['Algeria', 'Austria']


In [57]:
# Prepend each row's label to its list of cell values so the label becomes the
# first column. The check on the first element makes the cell safe to re-run:
# a row that already starts with its label is left untouched instead of
# receiving the label a second time.
for i, rowvalues in enumerate(datarows):
    label = labelrows[i]
    if not rowvalues or rowvalues[0] != label:
        rowvalues.insert(0, label)
    

In [58]:
pprint.pprint(datarows[0:1])

[['Algeria',
  '9,394',
  '653',
  '214',
  '15',
  '43,851,043',
  '17',
  '29',
  '13,914',
  '1.9']]


In [59]:
#4. Load the data into pandas, one row per entry in datarows,
# using the header labels as column names.
totaldeaths = pd.DataFrame(data=datarows, columns=labelcols)


In [60]:
totaldeaths.head()

Unnamed: 0,rowheadings,Cases,Deaths,Cases per Million,Deaths per Million,population,population_density,median_age,gdp_per_capita,hospital_beds_per_100k
0,Algeria,9394,653,214,15,43851043,17,29,13914,1.9
1,Austria,16642,668,1848,74,9006400,107,44,45437,7.4
2,Bangladesh,47153,650,286,4,164689383,1265,28,3524,0.8
3,Belgium,58381,9467,5037,817,11589616,376,42,42659,5.6
4,Brazil,514849,29314,2422,138,212559409,25,34,14103,2.2


In [61]:
totaldeaths.dtypes

rowheadings               object
Cases                     object
Deaths                    object
Cases per Million         object
Deaths per Million        object
population                object
population_density        object
median_age                object
gdp_per_capita            object
hospital_beds_per_100k    object
dtype: object

In [62]:
#5. Fix the column names and convert the data to numeric values.
# Column names: spaces become underscores and everything is lower-cased.
totaldeaths.columns = [name.replace(" ", "_").lower() for name in totaldeaths.columns]


In [63]:
# Strip every non-digit character (e.g. thousands separators) from all columns
# except the first and the last, then cast them to int64.
# regex=True is required: since pandas 2.0, str.replace treats the pattern as a
# literal string by default, which would leave the commas in place and make the
# int64 cast fail. astype(str) keeps the cell re-runnable after the columns have
# already been converted to integers.
for col in totaldeaths.columns[1:-1]:
    digits_only = totaldeaths[col].astype(str).str.replace("[^0-9]", "", regex=True)
    totaldeaths[col] = digits_only.astype('int64')
    

In [64]:
# The last column holds decimal values, so it is cast to float rather than int.
beds_col = 'hospital_beds_per_100k'
totaldeaths[beds_col] = totaldeaths[beds_col].astype('float')


In [65]:
totaldeaths.head()

Unnamed: 0,rowheadings,cases,deaths,cases_per_million,deaths_per_million,population,population_density,median_age,gdp_per_capita,hospital_beds_per_100k
0,Algeria,9394,653,214,15,43851043,17,29,13914,1.9
1,Austria,16642,668,1848,74,9006400,107,44,45437,7.4
2,Bangladesh,47153,650,286,4,164689383,1265,28,3524,0.8
3,Belgium,58381,9467,5037,817,11589616,376,42,42659,5.6
4,Brazil,514849,29314,2422,138,212559409,25,34,14103,2.2


In [66]:
totaldeaths.dtypes

rowheadings                object
cases                       int64
deaths                      int64
cases_per_million           int64
deaths_per_million          int64
population                  int64
population_density          int64
median_age                  int64
gdp_per_capita              int64
hospital_beds_per_100k    float64
dtype: object

In [67]:
# NOTE(review): repeats the earlier header lookup verbatim; looks like a
# leftover cell that can probably be removed.
theadrows = bs.find('table', {'id':'tblDeaths'}).thead.find_all('th')

In [68]:
# NOTE(review): repeats the earlier labelcols cell; rebuilding the list here
# discards the "rowheadings" name assigned to labelcols[0] earlier. Looks like
# a leftover cell that can probably be removed.
labelcols = [j.get_text() for j in theadrows]

In [69]:
# NOTE(review): this repeats the earlier label-insertion cell and looks like a
# leftover. The check on the first element stops it from prepending each label
# a second time when the notebook is run top to bottom.
for i, rowvalues in enumerate(datarows):
    label = labelrows[i]
    if not rowvalues or rowvalues[0] != label:
        rowvalues.insert(0, label)

## Persisting JSON data

### We will serialize the JSON data using two different methods: Python's json library and msgpack

In [70]:
#1.Load the pandas, json, pprint, requests, and msgpack libraries

import pandas as pd
import json
import pprint
import requests
import msgpack

In [71]:
#2. Load the JSON data from an API. I have abbreviated the JSON output.
# The timeout stops the cell from hanging indefinitely on an unresponsive
# server, and raise_for_status() reports an HTTP error here instead of letting
# it show up later as a JSON parsing failure.
response = requests.get(
    "https://openaccess-api.clevelandart.org/api/artworks/?african_american_artists",
    timeout=30,
)
response.raise_for_status()

In [72]:
# Decode the text of the HTTP response body from JSON into Python objects.
response_text = response.text
camcollections = json.loads(response_text)

In [73]:
print(len(camcollections['data']))

796


In [74]:
pprint.pprint(camcollections['data'][0])

{'accession_number': '2007.158',
 'catalogue_raisonne': None,
 'citations': [{'citation': 'Annual Exhibition: Sculpture, Paintings, '
                            'Watercolors, Drawings (New York: Whitney Museum '
                            'of American Art, 1958).',
                'page_number': 'Unpaginated, [8],[12]',
                'url': None},
               {'citation': '"Moscow to See Modern U.S. Art,"<em> New York '
                            'Times</em> (May 31, 1959).',
                'page_number': 'P. 60',
                'url': None},
               {'citation': 'Baur, John I. H. <em>Between the Fairs: 25 Years '
                            'of America Art, 1939-1964.</em> New York. Whitney '
                            'Museum. Catalogues. 1962-64. New York: Published '
                            'for the Whitney Museum of American Art by F.A. '
                            'Praeger, 1964.',
                'page_number': 'P. 88',
                'url': None},
      

In [75]:
#3. Save and reload the JSON file using Python's json library.
# Serialize to a JSON string first, then write that string to disk.
with open("data/camcollections.json","w") as jsonfile:
    jsonfile.write(json.dumps(camcollections))

In [76]:
# Read the saved file back and decode its JSON text into Python objects.
with open("data/camcollections.json","r") as jsonfile:
    camcollections = json.loads(jsonfile.read())
    

In [77]:
pprint.pprint(camcollections['data'][0]['creators'])

[{'biography': 'Jacob Lawrence (born 1917) has been a prominent artist since '
               '1941 when, at age 24, he became the first African American to '
               'have a work in the permanent collection of the Museum of '
               'Modern Art in New York. His career, now spanning seven '
               'decades, has been devoted to documenting African-American life '
               'and history, from everyday scenes to the universal struggle '
               'for freedom, social justice, and human dignity. Moving to '
               'Harlem as a teenager in 1930, Lawrence was influenced by the '
               'artists, writers, and philosophers of the Harlem '
               'Renaissance-among them Romare Bearden, Langston Hughes, and '
               'W.E.B. DuBois-who fostered pride in African-American culture. '
               "Lawrence's subjects include the legendary abolitionist heroes "
               'Frederick Douglass, Harriet Tubman, and John Brown, and th

In [78]:
#4. Save and reload the JSON file using msgpack.
# packb() returns the serialized bytes, which are written to a binary file.
with open("data/camcollections.msgpack", "wb") as outfile:
    outfile.write(msgpack.packb(camcollections))
    

In [79]:

# Read the msgpack file back as raw bytes.
with open("data/camcollections.msgpack", "rb") as packedfile:
    msgbytes = packedfile.read()

In [80]:
# Deserialize the msgpack bytes back into Python objects.
# NOTE(review): string keys come back as str only with msgpack >= 1.0 defaults;
# older versions presumably need raw=False - confirm the installed version.
camcollections = msgpack.unpackb(msgbytes)

In [81]:
pprint.pprint(camcollections['data'][0]['creators'])

[{'biography': 'Jacob Lawrence (born 1917) has been a prominent artist since '
               '1941 when, at age 24, he became the first African American to '
               'have a work in the permanent collection of the Museum of '
               'Modern Art in New York. His career, now spanning seven '
               'decades, has been devoted to documenting African-American life '
               'and history, from everyday scenes to the universal struggle '
               'for freedom, social justice, and human dignity. Moving to '
               'Harlem as a teenager in 1930, Lawrence was influenced by the '
               'artists, writers, and philosophers of the Harlem '
               'Renaissance-among them Romare Bearden, Langston Hughes, and '
               'W.E.B. DuBois-who fostered pride in African-American culture. '
               "Lawrence's subjects include the legendary abolitionist heroes "
               'Frederick Douglass, Harriet Tubman, and John Brown, and th