In [82]:
#### Import libraries
import requests
import os
from zipfile import ZipFile
import xml.etree.ElementTree as ET
import pandas as pd
import glob
import numpy as np

In [2]:
# Show the current working directory: the relative "zip" and "output" paths
# used by the later cells are resolved against it.
os.getcwd()

'/Users/Chris/Desktop/Jupyter/deepzoom'

# 1 - Download and Unzip Coast Pilot Charts

# 2 - Loop through XML files and parse out locations

# 3 - Export files

In [3]:
# Build the download URL of every Coast Pilot volume (10 publications, numbered 1-10)
url_list = [
    'https://nauticalcharts.noaa.gov/publications/coast-pilot/files/cp{0}/CPB{0}_WEB.zip'.format(volume)
    for volume in range(1, 11)
]
url_list

['https://nauticalcharts.noaa.gov/publications/coast-pilot/files/cp1/CPB1_WEB.zip',
 'https://nauticalcharts.noaa.gov/publications/coast-pilot/files/cp2/CPB2_WEB.zip',
 'https://nauticalcharts.noaa.gov/publications/coast-pilot/files/cp3/CPB3_WEB.zip',
 'https://nauticalcharts.noaa.gov/publications/coast-pilot/files/cp4/CPB4_WEB.zip',
 'https://nauticalcharts.noaa.gov/publications/coast-pilot/files/cp5/CPB5_WEB.zip',
 'https://nauticalcharts.noaa.gov/publications/coast-pilot/files/cp6/CPB6_WEB.zip',
 'https://nauticalcharts.noaa.gov/publications/coast-pilot/files/cp7/CPB7_WEB.zip',
 'https://nauticalcharts.noaa.gov/publications/coast-pilot/files/cp8/CPB8_WEB.zip',
 'https://nauticalcharts.noaa.gov/publications/coast-pilot/files/cp9/CPB9_WEB.zip',
 'https://nauticalcharts.noaa.gov/publications/coast-pilot/files/cp10/CPB10_WEB.zip']

In [4]:
# Big loop: for each Coast Pilot volume, download its zip archive, extract it,
# and write one CSV of CP_GEO_LOC records per chapter XML file.
for url in url_list:
    # The 7th "/"-separated piece of the URL is the volume folder, e.g. "cp1"
    coastpilot_number = url.split("/")[6]
    print("Working on " + coastpilot_number)

    output_zip_path = coastpilot_number + '.zip'
    output_folder = os.path.join("zip", coastpilot_number)  # Unzipped data goes here

    # makedirs (unlike mkdir) also creates the parent "zip" folder on a first
    # run, and exist_ok=True makes re-running the cell safe.
    os.makedirs(output_folder, exist_ok=True)

    # Download the file
    print("Requesting URL")
    rq = requests.get(url, allow_redirects=True)
    # Stop on an HTTP error rather than saving an error page as a .zip
    rq.raise_for_status()

    print("Downloading URL")
    # The context manager guarantees the handle is flushed and closed before unzipping
    with open(output_zip_path, 'wb') as zip_file:
        zip_file.write(rq.content)

    # Unzip the file
    print("Unzipping")
    with ZipFile(output_zip_path, 'r') as zipObj:
        zipObj.extractall(output_folder)

    # To save space, delete the zipped file
    if os.path.exists(output_zip_path):
        os.remove(output_zip_path)

    # Get the names of all chapter files within this folder
    f_list = sorted(glob.glob(os.path.join(output_folder, "*_C*.xml")))

    print("Parsing XML")
    # MAIN LOOP: one CSV per chapter file
    for file in f_list:
        # Cut out the "CXX" part of the file name (Chapter). Splitting only the
        # base name keeps underscores in directory names from shifting the index.
        chapter_short = os.path.basename(file).split("_")[1]

        # Open the .xml
        doc_xml = ET.parse(file)
        root = doc_xml.getroot()

        # The chapter title is taken from the serialized XML: the text between
        # the first "chapterTitle>" and the next "</". A file without that
        # marker gets an empty title instead of raising IndexError.
        xml_text = ET.tostring(root, encoding='utf8').decode('utf8')
        _, title_marker, after_marker = xml_text.partition("chapterTitle>")
        chapter_title = after_marker.split("</")[0] if title_marker else ""

        # Book-level attributes stored on the root element
        booktitlenum = root.attrib["Number"]
        booktitle = root.attrib['Title']
        bookyear = root.attrib['Year']
        # NOTE(review): the original read 'Year' here as well, which looks like
        # a copy-paste slip. Assumes the root may carry an 'Edition' attribute;
        # falls back to the year (the previous behaviour) when it does not.
        bookedition = root.attrib.get('Edition', bookyear)
        bookchapternum = root.attrib['ChapterNo']

        # We only want the 'CP_GEO_LOC' elements. Attribute names are not
        # consistently cased, so lowercase them to get uniform column names.
        output_list_c = [
            {k.lower(): v for k, v in el.attrib.items()}
            for el in root.iter('CP_GEO_LOC')
        ]

        # Convert the list of records into a dataframe
        output_df = pd.DataFrame(output_list_c)

        # Add some other fields to tell us where this came from, what edition, etc.
        output_df["chapter_title"] = chapter_title
        output_df["book_title"] = booktitle
        output_df["book_year"] = bookyear
        output_df["book_edition"] = bookedition
        output_df["book_chapter_number"] = bookchapternum

        # Make a folder to output it (creates the parent "output" folder too)
        output_folder_path = os.path.join("output", "CP" + booktitlenum)
        output_file_path = "CP" + booktitlenum + "_" + chapter_short + ".csv"
        os.makedirs(output_folder_path, exist_ok=True)

        # Write the csv
        output_df.to_csv(os.path.join(output_folder_path, output_file_path), index=False)

    

Working on cp1
Requesting URL
Downloading URL
Unzipping
working on C01
working on C02
working on C03
working on C04
working on C05
working on C06
working on C07
working on C08
working on C09
working on C10
working on C11
working on C12
Working on cp2
Requesting URL
Downloading URL
Unzipping
working on C01
working on C02
working on C03
working on C04
working on C05
working on C06
working on C07
working on C08
working on C09
working on C10
working on C11
working on C12
Working on cp3
Requesting URL
Downloading URL
Unzipping
working on C01
working on C02
working on C03
working on C04
working on C05
working on C06
working on C07
working on C08
working on C09
working on C10
working on C11
working on C12
working on C13
working on C14
working on C15
Working on cp4
Requesting URL
Downloading URL
Unzipping
working on C01
working on C02
working on C03
working on C04
working on C05
working on C06
working on C07
working on C08
working on C09
working on C10
working on C11
working on C12
Working on 

# Now, collect all of the .csv files together, being sure to skip empty ones

In [51]:
# Collect every chapter CSV below the "output" folder (all volumes), in sorted order
csv_pattern = 'output' + '/**/*.csv'
csv_list = glob.glob(csv_pattern, recursive=True)
csv_list.sort()

In [122]:
# Read each chapter CSV exactly once and keep only those that contain rows.
# (The frames are gathered under their own name so master_df is only ever
# the combined DataFrame, never a list.)
chapter_frames = []

for csv in csv_list:
    current_csv = pd.read_csv(csv)
    if current_csv.shape[0] > 0:
        chapter_frames.append(current_csv)

# Stack all non-empty chapters into one table with a fresh 0..n-1 index
master_df = pd.concat(chapter_frames, ignore_index=True)

In [123]:
# Preview the combined table. The source data occasionally has the
# county_name and county_numeric values switched.
master_df.head(n=5)

Unnamed: 0,lat_dec,long_dec,elev_in_m,source,source_id,source_date,feature_name,feature_class,state_alpha,state_numeric,county_name,county_numeric,map_name,chapter_title,book_title,book_year,book_edition,book_chapter_number
0,44.5723,-67.3103,0.0,GNIS,567378,9/30/1980,Gulf of Maine,Bay,ME,23,29,Washington,Cross Island,Eastport to Cape Cod,Coast Pilot 1,2020,2020,3
1,43.2232,-70.6919,209.0,GNIS,561108,3/1/2011,Mount Agamenticus,Summit,ME,23,York,31,York Harbor,Eastport to Cape Cod,Coast Pilot 1,2020,2020,3
2,41.9834,-70.6245,0.0,GNIS,614959,6/30/2011,Browns Bank,Bar,MA,25,23,Plymouth,Plymouth,Eastport to Cape Cod,Coast Pilot 1,2020,2020,3
3,43.4168,-68.6661,0.0,GNIS,582265,12/4/2013,Jeffreys Bank,Bar,ME,23,,,Unknown,Eastport to Cape Cod,Coast Pilot 1,2020,2020,3
4,43.0168,-70.1662,0.0,GNIS,582264,7/1/1990,Jeffreys Ledge,Bar,ME,23,,,Unknown,Eastport to Cape Cod,Coast Pilot 1,2020,2020,3


In [119]:
# A function to see if the string is a digit or not
def test_num(x):
    if str(x).isdigit() == True:
        return(int(x))
    else:
        return(np.nan)

# Extract the numeric county code from whichever of the two columns holds it:
# take the value found in county_name first, otherwise the one in county_numeric
c_num1, c_num2 = (
    master_df[column].apply(test_num)
    for column in ("county_name", "county_numeric")
)
num_out = c_num1.fillna(value=c_num2)

In [124]:
# Same, but testing to see if it's a name
def test_name(x):
    if str(x).isdigit() == True:
        return(np.nan)
    else:
        return(str(x))

# Extract the county name from whichever of the two columns holds it:
# take the value found in county_name first, otherwise the one in county_numeric
c_name1, c_name2 = (
    master_df[column].apply(test_name)
    for column in ("county_name", "county_numeric")
)
name_out = c_name1.fillna(value=c_name2)

In [125]:
# Replace both county columns with their untangled versions, then preview the result
master_df = master_df.assign(county_name=name_out, county_numeric=num_out)
master_df.head(n=20)

Unnamed: 0,lat_dec,long_dec,elev_in_m,source,source_id,source_date,feature_name,feature_class,state_alpha,state_numeric,county_name,county_numeric,map_name,chapter_title,book_title,book_year,book_edition,book_chapter_number
0,44.5723,-67.3103,0.0,GNIS,567378,9/30/1980,Gulf of Maine,Bay,ME,23,Washington,29.0,Cross Island,Eastport to Cape Cod,Coast Pilot 1,2020,2020,3
1,43.2232,-70.6919,209.0,GNIS,561108,3/1/2011,Mount Agamenticus,Summit,ME,23,York,31.0,York Harbor,Eastport to Cape Cod,Coast Pilot 1,2020,2020,3
2,41.9834,-70.6245,0.0,GNIS,614959,6/30/2011,Browns Bank,Bar,MA,25,Plymouth,23.0,Plymouth,Eastport to Cape Cod,Coast Pilot 1,2020,2020,3
3,43.4168,-68.6661,0.0,GNIS,582265,12/4/2013,Jeffreys Bank,Bar,ME,23,,,Unknown,Eastport to Cape Cod,Coast Pilot 1,2020,2020,3
4,43.0168,-70.1662,0.0,GNIS,582264,7/1/1990,Jeffreys Ledge,Bar,ME,23,,,Unknown,Eastport to Cape Cod,Coast Pilot 1,2020,2020,3
5,42.3001,-70.2995,0.0,GNIS,1802774,10/22/1998,Stellwagen Bank,Bar,MA,25,Barnstable,1.0,Unknown,Eastport to Cape Cod,Coast Pilot 1,2020,2020,3
6,41.6668,-67.7494,0.0,GNIS,617045,12/4/2013,Georges Bank,Bar,MA,25,,,Unknown,Eastport to Cape Cod,Coast Pilot 1,2020,2020,3
7,41.1918,-70.7939,0.0,GNIS,600157,12/4/2013,Nantucket Shoals,Bar,MA,25,Nantucket,19.0,Squibnocket OE S,Eastport to Cape Cod,Coast Pilot 1,2020,2020,3
8,41.3431,-66.1433,0.0,GNIS,617038,3/14/2012,Corsair Canyon,Valley,MA,25,,,Unknown,Eastport to Cape Cod,Coast Pilot 1,2020,2020,3
9,44.8131,-66.9628,45.0,GNIS,578209,9/30/1980,West Quoddy Head,Cape,ME,23,Washington,29.0,Lubec,"Quoddy Narrows to Calais, Maine",Coast Pilot 1,2020,2020,4


In [126]:
# Save the combined table as a single CSV in the working directory,
# leaving out the row index
master_df.to_csv(path_or_buf="output_all.csv", index=False)