In [1]:
import json
import pandas as pd

# Importing Country Traveling Info
with open('countrytravelinfo.json', 'r') as f:
    country_data = pd.DataFrame(json.load(f))

In [2]:
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.data = ""
        
    def handle_starttag(self, tag, attrs):
        for attr in attrs:
            if attr[0] == "href":
                self.data += "(" + attr[1] + ")"
    
    def handle_endtag(self, tag):
        if tag == "p":
            self.data += "\n"
                
    def feed(self, *args, **kwargs):
        super().feed(*args, **kwargs)
        self.data += "\n\n"

    def reset(self):
        super().reset()
        self.data = ""
        
    def handle_data(self, data):
        self.data += data
        
parser = MyHTMLParser()

In [69]:
def parse_text_data(text_data):
    temp_dict = {}
    last_key = ""
    
    text_data = text_data.replace(u'\xa0', u' ')
    text_data = text_data.replace(":\n", ": ")
    
    for string in text_data.strip().split("\n"):
        split = string.split(": ")
        if len(split) == 1 and last_key != "":
            temp_dict[last_key] += split[0]
        elif len(split) == 2:
            temp_dict[split[0]] = split[1]
            last_key = split[0]
        else:
            temp_dict["error"] = string

    return temp_dict

def parse_cell(html_data):
    parser.reset()
    parser.feed(html_data)
    
    return parse_text_data(parser.data)

def get_unique_keys_from_text(text):
    return [key for key in parse_cell(text)]

In [None]:
ana = country_data[[
    'travel_transportation', 'health', 'local_laws_and_special_circumstances', 'safety_and_security', 
    'entry_exit_requirements', 'destination_description', 'travel_embassyAndConsulate']].applymap(parse_cell)

# Makes sure there is a dictionary in each text cell
# Note: Would rather use an "and" aggregate function, but this is a hacky way to get around that
ana[ana.applymap(lambda x: isinstance(x, dict)).aggregate('sum', axis='columns') != 7]

In [4]:
country_data.head()

Unnamed: 0,tag,geopoliticalarea,travel_transportation,health,local_laws_and_special_circumstances,safety_and_security,entry_exit_requirements,destination_description,iso_code,travel_embassyAndConsulate,last_update_date
0,AF,Afghanistan,"<div style=""margin-top:5px""> <p><p><b>...","<div style=""margin-top:5px""> <p><p>Bas...","<div style=""margin-top:5px""> <p><p><b>...","<div style=""margin-top:5px""> <p><p>The...","<div style=""margin-top:5px""> <p><p><b>...","<div style=""margin-top:5px""> <p><p>Afg...",AF,"<div style=""margin-top:5px""> <p><h2><a...","Last Updated: May 22, 2019"
1,AL,Albania,"<div style=""margin-top:5px""> <p><p><b>...","<div style=""margin-top:5px""> <p><p><b>...","<div style=""margin-top:5px""> <p><p><b>...","<div style=""margin-top:5px""> <p><p>Cre...","<div style=""margin-top:5px""> <p><ul><l...","<div style=""margin-top:5px""> <p><p>See...",AL,"<div style=""margin-top:5px""> <p><h2><a...","Last Updated: July 3, 2019"
2,DZ,Algeria,"<div style=""margin-top:5px""> <p><p><b>...","<div style=""margin-top:5px""> <p><p>Whi...","<div style=""margin-top:5px""> <p><p><b>...","<div style=""margin-top:5px""> <p><p><b>...","<div style=""margin-top:5px""> <p><p>See...","<div style=""margin-top:5px""> <p><p>See ...",DZ,"<div style=""margin-top:5px""> <p><h2><a...","Last Updated: January 8, 2019"
3,AD,Andorra,"<div style=""margin-top:5px""> <p><p><b>...","<div style=""margin-top:5px""> <p><p><b>...","<div style=""margin-top:5px""> <p><p><b>...","<div style=""margin-top:5px""> <p><p>Cre...","<div style=""margin-top:5px""> <p><p>And...","<div style=""margin-top:5px""> <p><p>Read...",AD,"<div style=""margin-top:5px""> <p><h2><a...","Last Updated: August 27, 2019"
4,AO,Angola,"<div style=""margin-top:5px""> <p><p><b>...","<div style=""margin-top:5px""> <p><p>Med...","<div style=""margin-top:5px""> <p><p><b>...","<div style=""margin-top:5px""> <p><p>Lan...","<div style=""margin-top:5px""> <p><p><b>...","<div style=""margin-top:5px""> <p><p>See ...",AO,"<div style=""margin-top:5px""> <p><h3><a...","Last Updated: June 17, 2019"


In [109]:
column = 'health'
idx = 12
print(country_data.iloc[idx][column], "\n\n")

# Clears the buffer and the parsed data so far
parser.reset()
parser.feed(country_data.iloc[idx][column])
print(parser.data)

exp_data = parse_text_data(parser.data)
print(exp_data)


<div style="margin-top:5px">         <p><p>Local hospitals will not settle accounts directly with American insurance companies. <b>You must pay the bill to the local hospital and later claim a refund from the insurance carrier in the United States. </b>The Austrian Medicine Import Act generally prohibits the import of prescription drugs into Austria, with two exceptions: 1) Non- European Union residents are allowed medicines as part of their personal luggage, but only the quantity required during the course of the stay, 2) Travelers may receive medicines by mail for personal use while staying in Austria. However, the quantity is limited to the length of their stay in Austria and cannot exceed three packages.</p><ul><li>Carry <b>prescription medication</b> in original packaging, along with your doctor’s prescription.&nbsp;</li><li><b>Obtain&nbsp;</b><a href="http://travel.state.gov/content/passports/en/go/health/insurance-providers.html"><b>supplemental insurance</b></a>&nbsp;to cover m

In [72]:
country_data['travel_transportation_parsed'] = country_data['travel_transportation'].apply(parse_cell)

In [107]:
ana

Unnamed: 0,travel_transportation,health,local_laws_and_special_circumstances,safety_and_security,entry_exit_requirements,destination_description,travel_embassyAndConsulate
0,{'Traffic Safety and Road Conditions': 'Road c...,{'error': 'Travelers are strongly urged to obt...,{'Criminal Penalties': 'U.S. citizens in Afgha...,{'error': 'The Afghanistan (http://travel.stat...,{'Passport and Visas': 'U.S. Citizens must hav...,{'error': 'Afghanistan remains an extremely da...,{'error': '(https://af.usembassy.gov/embassy/k...
1,{'Road Conditions and Safety': 'Exercise stron...,{'Medical Care': 'Medical care at most hospita...,{'Criminal Penalties': 'You are subject to loc...,{'error': 'We can: help you find appropriate m...,{'error': 'Find information on (https://travel...,{'error': 'See the Department of State’s (http...,{'error': '(https://al.usembassy.gov/embassy/t...
2,{'Road Conditions and Safety': 'Algeria suffer...,{'error': 'Emergency services are satisfactory...,{'Criminal Penalties': 'You are subject to loc...,{'Terrorist Activity': 'While Algeria has long...,{'error': 'See the Embassy of the People's Dem...,{'error': 'See our (https://www.state.gov/r/pa...,{'error': '(https://dz.usembassy.gov/)U.S. Emb...
3,{'Road Conditions and Safety': 'While in Andor...,{'error': 'You can also dial the Europe-wide e...,{'Criminal Penalties': 'While you are travelin...,{'error': 'We can: help you find appropriate m...,{'error': 'Find information on (https://travel...,{'error': 'Read the Department of State’s (htt...,"{'error': 'For assistance in Andorra, please c..."
4,{'Road Conditions and Safety': 'Major roads be...,{'error': 'The following diseases are prevalen...,{'Criminal Penalties': 'You are subject to loc...,{'error': 'We can: help you find appropriate m...,{'Requirements for Entry': 'PassportVisa or Vi...,{'error': 'See our (http://www.state.gov/r/pa/...,{'error': '(https://ao.usembassy.gov/embassy/l...
5,{'Road Conditions and Safety': 'Please be awar...,{'error': 'Anguilla has reported past or curre...,{'error': 'Faith-Based Travelers: See the fol...,{'error': 'We can: help you find appropriate m...,{'Passports and visa': 'U.S. citizens must hav...,{'error': 'See the Department of State’s (http...,{'error': '(https://bb.usembassy.gov/)U.S. Emb...
6,{'error': 'There is no direct air service from...,"{'error': 'Antarctica has no public hospitals,...","{'Criminal Penalties': 'Some Treaty Parties, i...",{'error': 'See our webpage on (http://travel.s...,{'Passports and Visas': 'A U.S. passport is re...,{'error': 'See the Department of State’s (http...,{'error': 'The United States does not maintain...
7,{'Road Conditions and Safety': 'Be extremely c...,{'error': 'The following diseases are present:...,{'Criminal Penalties': 'You are subject to loc...,"{'Crime': 'Crimes, including murder, rape, arm...",{'error': 'Find information on (https://travel...,{'error': 'See the Department of State’s (http...,{'error': 'ConsulatesU.S. Consular Agent - Ant...
8,{'Road Conditions and Safety': 'Driving is mor...,{'error': 'Health Concerns: The following dise...,{'Criminal Penalties': 'You are subject to loc...,{'error': 'We can: help you find appropriate m...,{'Entry/Exit Requirements for U.S. citizens': ...,{'error': 'See the Department of State’s (http...,{'error': '(https://ar.usembassy.gov/embassy/)...
9,{'Road Conditions and Safety': 'The informatio...,{'error': 'Most prescription medications are a...,{'Criminal Penalties': 'You are subject to loc...,{'error': 'We can: help you find appropriate m...,{'error': 'You need a valid passport to enter ...,{'error': 'See the Department of State’s (http...,{'error': '(https://am.usembassy.gov/embassy/y...
