# Creating Data Dictionaries for Crashes, Vehicles, and People Datasets

In [1]:
# HTML parsing
from bs4 import BeautifulSoup

In [2]:
def write_datadict(path, title):
    """
    Create a data dictionary Markdown file from an HTML table file

    The first ``<table>`` of the HTML document is searched for ``<td>`` cells
    carrying the classes ``column-name``, ``column-description`` and
    ``column-type``. The stripped text of those cells is written, one section
    per column, to ``datadict_<title in lower case>.md`` (a relative path, so
    the file lands in the current working directory).

    Parameters
    ----------
    path : str
        file path for the HTML file
    title : str
        title for the data dictionary Markdown file

    Raises
    ------
    ValueError
        If the document has no ``<table>``, or if the numbers of name,
        description and type cells differ (pairing them up would then
        attach descriptions/types to the wrong column names).
    """

    # Read raw bytes so BeautifulSoup works out the document encoding itself
    # rather than depending on the platform's default text encoding.
    with open(path, 'rb') as f:
        soup = BeautifulSoup(f, 'lxml')

    table = soup.table
    if table is None:
        raise ValueError(f"No <table> element found in {path!r}")

    # Fall back to the table itself when it has no explicit <tbody>.
    body = table.tbody
    if body is None:
        body = table

    # Collect the stripped text of every cell, one list per cell class
    columns = []
    for css_class in ('column-name', 'column-description', 'column-type'):
        cells = body.find_all('td', attrs={'class': css_class})
        columns.append([cell.text.strip() for cell in cells])
    feature_names, descriptions, col_types = columns

    # zip() would silently truncate and misalign on unequal lengths
    if not len(feature_names) == len(descriptions) == len(col_types):
        raise ValueError(
            f"Mismatched cell counts in {path!r}: "
            f"{len(feature_names)} names, {len(descriptions)} descriptions, "
            f"{len(col_types)} types"
        )

    # Zip lists into 3-tuples
    metadata = zip(feature_names, descriptions, col_types)

    # Write and format Markdown file (UTF-8 so any character can be written)
    out_path = f"datadict_{title.lower()}.md"
    with open(out_path, 'w', encoding='utf-8') as datadict_file:
        datadict_file.write(f"# {title} Data Dictionary\n___\n")
        datadict_file.writelines(
            f"\n## {name}: _{col_type}_\n\t{description}\n"
            for name, description, col_type in metadata
        )

In [3]:
# Map each metadata HTML file to the title of the data dictionary
# that will be generated from it
paths_titles = dict([
    ('metadata_htmls/metadata_crashes.html', 'Crashes'),
    ('metadata_htmls/metadata_people.html', 'People'),
    ('metadata_htmls/metadata_vehicles.html', 'Vehicles'),
])

# Write one Markdown data dictionary per HTML file, in insertion order
for path in paths_titles:
    title = paths_titles[path]
    write_datadict(path, title)