### Reading and parsing JSON files containing HTML content
See the [BeautifulSoup documentation](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) 

In [1]:
import json # to work with json file format
from bs4 import BeautifulSoup # to parse html

In [2]:
# TODO: this doesn't scale well and doesn't help with looking at the data.
#       Possible solutions: load into pandas dataframe or mongodb, maybe both.
#
# The file is read as JSON Lines: every non-empty line is parsed as one
# standalone JSON document and appended to `records` in file order.
records = []
# Explicit encoding so the result does not depend on the platform default.
# NOTE(review): assumes the export is UTF-8 (the usual JSON encoding) -- confirm.
with open('data/bigfoot_first100records.json', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at end of file), which
        # would otherwise make json.loads raise JSONDecodeError.
        if line.strip():
            records.append(json.loads(line))

In [3]:
# Show the first record: a dict with '_id', 'url', 'html' and 'time' keys,
# where 'html' holds the raw page source as a single string.
records[0]

{'_id': {'$oid': '5939abbd2acdf6607095366c'},
 'url': 'show_report.asp?id=13038',
 'html': '\r\n<!doctype html public "-//w3c//dtd html 4.0 transitional//en">\r\n<HTML>\r\n\t<HEAD>\r\n        <meta http-equiv="X-UA-Compatible" content="IE=EmulateIE7" />\r\n        <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">\r\n    <meta name="KEYWORDS" content="bigfoot, big foot, BigFoot, BFRO, Bigfoot research, sightings database, organizations, Organization, sightings, lists, locations, areas, reports, report, ape, apes, forest, giant, giants, monster, monsters, swamp, creek, Florida, Skunk Ape Florida Skuk Ape, Abominable Snowman, Agogwe, Almasti, Batutut-Borneo, Didi, Dwendi, Kakundakari, Kung-Lu, Mapinguary, Muhalu, sasquatch, sasquach, Sedapa, Orang Pendek, Shiru, Sisimite, Skunk Ape, Tok- Burma, Yeti, Yowie, cryptozoology, animals, strange, glowing eyes, foot prints, cast, tracks, fottprint cast, hairy, beast, mystery, paranormal, ufo, x-files, xfiles, bionic woman, 

In [4]:
# Inspect the first record: list its top-level keys, then print every
# key/value pair with an f-string, leaving a blank line between entries.
first_record = records[0]
print(first_record.keys(), '\n')

for field_name, field_value in first_record.items():
    print(f'{field_name}: {field_value} \n')

dict_keys(['_id', 'url', 'html', 'time']) 

_id: {'$oid': '5939abbd2acdf6607095366c'} 

url: show_report.asp?id=13038 

html: 
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<HTML>
	<HEAD>
        <meta http-equiv="X-UA-Compatible" content="IE=EmulateIE7" />
        <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
    <meta name="KEYWORDS" content="bigfoot, big foot, BigFoot, BFRO, Bigfoot research, sightings database, organizations, Organization, sightings, lists, locations, areas, reports, report, ape, apes, forest, giant, giants, monster, monsters, swamp, creek, Florida, Skunk Ape Florida Skuk Ape, Abominable Snowman, Agogwe, Almasti, Batutut-Borneo, Didi, Dwendi, Kakundakari, Kung-Lu, Mapinguary, Muhalu, sasquatch, sasquach, Sedapa, Orang Pendek, Shiru, Sisimite, Skunk Ape, Tok- Burma, Yeti, Yowie, cryptozoology, animals, strange, glowing eyes, foot prints, cast, tracks, fottprint cast, hairy, beast, mystery, paranormal, ufo, x-files, xfiles,

In [5]:
# Pull the raw HTML string out of the first record and hand it to
# BeautifulSoup, selecting the 'html.parser' backend explicitly.
first_html = records[0]['html']
soup = BeautifulSoup(first_html, features='html.parser')

In [6]:
# Print the parsed document re-indented by nesting level so the tag
# structure is easier to read than the raw 'html' string.
print(soup.prettify())

<!DOCTYPE doctype html public "-//w3c//dtd html 4.0 transitional//en">
<html>
 <head>
  <meta content="IE=EmulateIE7" http-equiv="X-UA-Compatible"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="bigfoot, big foot, BigFoot, BFRO, Bigfoot research, sightings database, organizations, Organization, sightings, lists, locations, areas, reports, report, ape, apes, forest, giant, giants, monster, monsters, swamp, creek, Florida, Skunk Ape Florida Skuk Ape, Abominable Snowman, Agogwe, Almasti, Batutut-Borneo, Didi, Dwendi, Kakundakari, Kung-Lu, Mapinguary, Muhalu, sasquatch, sasquach, Sedapa, Orang Pendek, Shiru, Sisimite, Skunk Ape, Tok- Burma, Yeti, Yowie, cryptozoology, animals, strange, glowing eyes, foot prints, cast, tracks, fottprint cast, hairy, beast, mystery, paranormal, ufo, x-files, xfiles, bionic woman, sightings, sighting, IVBC, Byrne, report, howl, scream, primate, everglades, Florida Everglades, swamp, Gigantopithecus, missing link, miss

In [7]:
# Look up the <title> tag once, then print the full tag followed by
# just the text it contains.
title_tag = soup.title
print(title_tag, title_tag.text, sep='\n')

<title>BFRO Report 13038: Snowmobiler has encounter in deep snow near Potter, AK</title>
BFRO Report 13038: Snowmobiler has encounter in deep snow near Potter, AK


In [8]:
# Extract all text in the document as one string. As the output shows, this
# also includes the contents of inline <script> blocks and the original
# whitespace (\n, \r, \t), so the result is noisy as-is.
soup.get_text()

'\n\n\n\n\n\n\nBFRO Report 13038: Snowmobiler has encounter in deep snow near Potter, AK\n\n\r\n\t\tfunction runLogo() {\r\n\t\t\tvar logo = document.getElementById("logo");\r\n\t\t\tif(logo != null)\r\n\t\t\t{\r\n\t\t\t\tlogo.src=logo.src + \'a;\'\r\n\t\t\t}\r\n\t\t}\r\n\t\t\r\n\t\t\n\n\n\n\n\n\r\n\t\t\t<!--\r\n\t\t\t\tif (document.images) \r\n\t\t\t\t{\r\n\t\t\t\t\tb_Media_Articles = new Image();\r\n\t\t\t\t\tb_Media_Articles.src = "/images/templates/b-Media%20Articles_hl.jpg";\r\n\t\t\t\t\tb_Media_Articles_orig = new Image();\r\n\t\t\t\t\tb_Media_Articles_orig.src = "/images/templates/b-Media%20Articles.jpg";\r\n\t\t\t\t\tb_ReportForm = new Image();\r\n\t\t\t\t\tb_ReportForm.src = "/images/templates/b-Report%20Form_hl.jpg";\r\n\t\t\t\t\tb_ReportForm_orig = new Image();\r\n\t\t\t\t\tb_ReportForm_orig.src = "/images/templates/b-Report%20Form.jpg";\r\n\t\t\t\t\tb_Home = new Image();\r\n\t\t\t\t\tb_Home.src = "/images/templates/b-Home_hl.jpg";\r\n\t\t\t\t\tb_Home_orig = new Image();\r\n

In [9]:
# Example: collect every <span> tag in the parsed page.
# find_all returns a list of all matching tags, not a single element.
soup.find_all('span')

[<span style="font-size: 10pt"><a href="/gdb/">Geographical Index</a> &gt; <a href="/gdb/#usa">United States</a> &gt; <a href="/GDB/state_listing.asp?state=AK">Alaska</a> &gt; <a href="/GDB/show_county_reports.asp?state=AK&amp;county=Anchorage">Anchorage County</a> &gt; Report # 13038<br/> <br/></span>,
 <span class="reportheader">Report # 13038</span>,
 <span class="reportclassification">(Class A)</span>,
 <span class="field">Submitted  by  witness   on Saturday, November 12, 2005.</span>,
 <span class="field">Snowmobiler has encounter in deep snow near Potter, AK</span>,
 <span style="font-size:10pt"><a href="/GDB/show_report.asp?ID=13038&amp;PrinterFriendly=True">(Show Printer-friendly Version)</a></span>,
 <span class="field">YEAR:</span>,
 <span class="field">SEASON:</span>,
 <span class="field">MONTH:</span>,
 <span class="field">STATE:</span>,
 <span class="field">COUNTY:</span>,
 <span class="field">LOCATION DETAILS:</span>,
 <span class="field">NEAREST TOWN:</span>,
 <span cla

In [10]:
# Narrow the search by tag name and CSS class: keep only the <span> tags
# whose class is "field" (class_ is the keyword form of an attrs class filter).
soup.find_all('span', class_='field')

[<span class="field">Submitted  by  witness   on Saturday, November 12, 2005.</span>,
 <span class="field">Snowmobiler has encounter in deep snow near Potter, AK</span>,
 <span class="field">YEAR:</span>,
 <span class="field">SEASON:</span>,
 <span class="field">MONTH:</span>,
 <span class="field">STATE:</span>,
 <span class="field">COUNTY:</span>,
 <span class="field">LOCATION DETAILS:</span>,
 <span class="field">NEAREST TOWN:</span>,
 <span class="field">NEAREST ROAD:</span>,
 <span class="field">OBSERVED:</span>,
 <span class="field">ALSO NOTICED:</span>,
 <span class="field">OTHER WITNESSES:</span>,
 <span class="field">OTHER STORIES:</span>,
 <span class="field">TIME AND CONDITIONS:</span>,
 <span class="field">ENVIRONMENT:</span>]