In [22]:
import sys
!{sys.executable} -m pip install numpy pandas geopandas requests beautifulsoup4

import zipfile
import requests
import os
import shutil
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
import geopandas as gpd
from bs4 import BeautifulSoup



In [55]:
# Configure settings values
data_dir = '/Volumes/EXTERNAL/mdst_data'
pull_zip_files = False
process_zip_files = False
combine_zip_files = False

In [56]:
# Hardcode namespaces for xml processing
ns = {'gmx':"http://www.isotc211.org/2005/gmx",
      'gco':"http://www.isotc211.org/2005/gco",
      'gmd':"http://www.isotc211.org/2005/gmd",
      'xlink':"http://www.w3.org/1999/xlink",
      'gml':"http://www.opengis.net/gml/3.2",
      'xsi':"http://www.w3.org/2001/XMLSchema-instance",
      'gfc':"http://www.isotc211.org/2005/gfc"}

def read_xml(filename):
    # Read in xml data from file
    with open(filename) as fp:
        xml = ET.fromstring(fp.read())

    # Get all listed features
    features = [e for e in xml.iter() if e.find('./gfc:memberName/gco:LocalName', ns) is not None]

    # Retrieve the route types and feature class codes from the listed features
    for feature in features:
        if feature.find('./gfc:memberName/gco:LocalName', ns).text == 'RTTYP':
            route_type_codes = list(map(lambda x: x.text,
                                        feature.findall('./gfc:listedValue/gfc:FC_ListedValue/gfc:label/gco:CharacterString', ns)))
            route_type_values = list(map(lambda x: x.text,
                                         feature.findall('./gfc:listedValue/gfc:FC_ListedValue/gfc:definition/gco:CharacterString', ns)))
            route_types = pd.DataFrame({'route_type_code':route_type_codes, 'route_type':route_type_values})
        elif feature.find('./gfc:memberName/gco:LocalName', ns).text == 'MTFCC':
            feature_class_codes = feature.findall('./gfc:listedValue', ns)
            mtfcc_codes = list(map(lambda x: x.text,
                                        feature.findall('./gfc:listedValue/gfc:FC_ListedValue/gfc:label/gco:CharacterString', ns)))
            mtfcc_values = list(map(lambda x: x.text,
                                         feature.findall('./gfc:listedValue/gfc:FC_ListedValue/gfc:definition/gco:CharacterString', ns)))
            mtfcc = pd.DataFrame({'mtfcc_code':mtfcc_codes, 'mtfcc':mtfcc_values})
            
    return (route_types, mtfcc)

In [57]:
if pull_zip_files:
    # Get the HTML to scrape for zip files
    url = 'https://www2.census.gov/geo/tiger/TIGER2019/ROADS/'
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Pull all the zip files
    filenames = [a.text.rstrip('.zip') for a in soup.find_all('a') if 'tl_2019_' in a.text]
    for i in range(len(filenames)):
        full_path = data_dir + '/zips/' + filenames[i] + '.zip'
        if not os.path.isfile(full_path):
            content = requests.get(url + filenames[i] + '.zip', allow_redirects=True).content
            with open(full_path, 'wb') as writer:
                writer.write(content)

        if ((i+1) % 10 == 0): print(str(i+1) + ' zip files pulled')

In [58]:
if process_zip_files:
    metadata_route_types = None
    metadata_mtfcc = None
    for i in range(len(filenames)):
        # Define local paths
        full_zip_path = data_dir + '/zips/' + filenames[i] + '.zip'
        folder_path = data_dir + '/' + filenames[i]
        metadata_file_path = folder_path + '/' + filenames[i] + '.shp.ea.iso.xml'
        shapefile_file_path = folder_path + '/' + filenames[i] + '.shp'
        final_csv_path = data_dir + '/csv/' + filenames[i] + '.csv'

        if not os.path.isfile(final_csv_path):
            # Extract the zip locally
            with zipfile.ZipFile(full_zip_path, 'r') as zip_ref:
                zip_ref.extractall(folder_path)

            # Extract metadata from .shp.ea.iso.xml file
            route_types, mtfcc = read_xml(metadata_file_path)
            metadata_route_types = route_types.copy() if metadata_route_types is None else metadata_route_types.append(route_types).drop_duplicates()
            metadata_mtfcc = mtfcc.copy() if metadata_mtfcc is None else metadata_mtfcc.append(mtfcc).drop_duplicates()

            # Pull data from shapefile and merge into single df
            shapefile = gpd.read_file(shapefile_file_path)
            shapefile = shapefile.drop(['LINEARID', 'geometry'], axis=1)
            shapefile.columns = ['full_name', 'route_type_code', 'mtfcc_code']

            # Save the data to a csv that can be loaded later
            shapefile.to_csv(final_csv_path, index=False)

            # Delete folder with unzipped data
            shutil.rmtree(folder_path)

        if ((i+1) % 10 == 0): print(str(i+1) + ' zip files processed')

    if metadata_route_types is not None: metadata_route_types.to_csv(data_dir + '/csv/metadata_route_types.csv', index=False)
    if metadata_mtfcc is not None: metadata_mtfcc.to_csv(data_dir + '/csv/metadata_mtfcc.csv', index=False)

In [66]:
# Read in metadata
metadata_route_types = pd.read_csv(data_dir + '/csv/metadata_route_types.csv')
metadata_mtfcc = pd.read_csv(data_dir + '/csv/metadata_mtfcc.csv')

if combine_road_data:
    # Combine all road data into single csv
    df = None
    csv_filenames = [fn for fn in os.listdir(data_dir + '/csv') if 'tl_2019_' in fn]
    for i in range(len(csv_filenames)):
        zip_code = filenames[i].split('_')[2]
        csv_data = pd.read_csv(data_dir + '/csv/' + csv_filenames[i])
        csv_data['zip'] = zip_code

        if df is None:
            df = csv_data.copy()
        else:
            df = df.append(csv_data, ignore_index=True)

        if ((i+1) % 100 == 0): print(str(i+1) + ' csv files processed')
        
    #display(df)
    df.to_csv(data_dir + '/master_road_list.csv', index=False)

100 csv files processed
200 csv files processed
300 csv files processed
400 csv files processed
500 csv files processed
600 csv files processed
700 csv files processed
800 csv files processed
900 csv files processed
1000 csv files processed
1100 csv files processed
1200 csv files processed
1300 csv files processed
1400 csv files processed
1500 csv files processed
1600 csv files processed
1700 csv files processed
1800 csv files processed
1900 csv files processed
2000 csv files processed
2100 csv files processed
2200 csv files processed
2300 csv files processed
2400 csv files processed
2500 csv files processed
2600 csv files processed
2700 csv files processed
2800 csv files processed
2900 csv files processed
3000 csv files processed
3100 csv files processed
3200 csv files processed
CPU times: user 16min 42s, sys: 6min 11s, total: 22min 54s
Wall time: 24min 38s
