In [1]:
import dask
import dask.dataframe as dd
from bs4 import BeautifulSoup
import re
import requests
from zipfile import ZipFile
import os

In [2]:
DATA_DIR = './data'
XML_DATA_URL = 'https://afdata.s3.us-gov-west-1.amazonaws.com'

In [3]:
def download_url(url, save_path, chunk_size=128):
    r = requests.get(url, stream=True)
    with open(save_path, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=chunk_size):
            fd.write(chunk)
            
def read_files(path, ext):
    file_list = []
    for root, folders, docs in os.walk(path):
        file_list.extend( [os.path.join(root, doc) for doc in docs if ext in doc] )

    return file_list

In [4]:
# Download XML list of data sources.
xml_data_path = DATA_DIR+'/data_sources.xml'
xml_data = download_url(XML_DATA_URL, xml_data_path)

In [34]:
# Download all zip data files from the XML source.
with open(xml_data_path, 'r') as xml:
    soup = BeautifulSoup(xml, 'xml')
    
    # XML structure: <Contents><Key>filename</Key><Size>bytes</Size></Contents>
    contents_elements = soup.find_all('Contents')
    
    for contents in contents_elements:
        key = contents.find('Key')
        filename = str(key.text)
        if not re.search(r'\.zip$', filename):
            continue
        
        save_path = str.format('{0}/{1}', DATA_DIR, filename)
        url = str.format('{0}/{1}', XML_DATA_URL, filename)     
        expected_size = int(str(contents.find('Size').text))
        
        # Only download if the file doesn't exist with the expected size in bytes.
        if os.path.exists(save_path):
            actual_size = os.path.getsize(save_path)
            if expected_size == actual_size:
                print(str.format('{0} at {1} bytes already exists.', filename, expected_size))
                continue
        
        print(str.format('Downloading {0}...', url))
        
        download_url(url, save_path)
        actual_size = os.path.getsize(save_path)
        
        if actual_size != expected_size:
            print(str.format('WARNING: File size for {0} at {1} bytes does not match the expected size of {2} bytes.',
                            filename, actual_size, expected_size))
        else:  
            print(str.format('Successfully downloaded {0} to {1}. Filesize: {2} bytes.'
                             , url, save_path, actual_size))
        
    xml.close()

Scenario_Data/AIS/AIS_2015_01_Zone01.zip at 699204 bytes already exists.
Scenario_Data/AIS/AIS_2015_01_Zone02.zip at 4800320 bytes already exists.
Scenario_Data/AIS/AIS_2015_01_Zone03.zip at 50471539 bytes already exists.
Scenario_Data/AIS/AIS_2016_01_Zone01.zip at 434183 bytes already exists.
Scenario_Data/AIS/AIS_2016_01_Zone02.zip at 3169488 bytes already exists.
Scenario_Data/AIS/AIS_2016_01_Zone03.zip at 55155734 bytes already exists.
Scenario_Data/AIS/AIS_2017_01_Zone01.zip at 176297 bytes already exists.
Scenario_Data/AIS/AIS_2017_01_Zone02.zip at 3566591 bytes already exists.
Scenario_Data/AIS/AIS_2017_01_Zone03.zip at 52487368 bytes already exists.
Scenario_Data/AIS/Zone10_2009_01.zip at 520366084 bytes already exists.
Scenario_Data/AIS/Zone10_2010_01.zip at 503698861 bytes already exists.
Scenario_Data/AIS/Zone10_2011_01.gdb.zip at 425399907 bytes already exists.
Scenario_Data/AIS/Zone10_2012_01.gdb.zip at 603860220 bytes already exists.
Scenario_Data/AIS/Zone10_2013_01.gdb.z

In [42]:
# Unzip all data files.
for path in read_files(path=DATA_DIR, ext='.zip'):
    extract_path = '/'.join(str(path).rsplit('/')[:-1]) + '/unzipped/'
    with ZipFile(path, 'r') as zipfile:
        zipfile.extractall(extract_path)

In [87]:
# Clean TLE data, Save as pipe delimitted datasets.
tle_files = read_files('./data/Scenario_Data/TLE/unzipped', '.txt')
for path in tle_files:
    filename = re.sub(r'\.txt$', '.clean.txt', path)
    with open(path, 'r') as file:
        with open(filename, 'w') as newfile:
            while True:
                line = file.readline()
                if not line:
                    file.close()
                    newfile.close()
                    print(filename)
                    break
                
                newline = ''
                line2 = False
                if re.search(r'^2', line):
                    newline = '\r\n'
                    line2 = True
   
                line = re.sub(r'^(1|2)\s+', '', line)
                line = re.sub(r'(^\s+|\s+$)', '', line)
                line = re.sub(r'\\', '|', line)
                line = re.sub(r'\s+', '|', line)   
                last_num = re.findall(rx, line) 

                # Sometimes Mean Motion and Revolution number are crammed together. Separate.
                rx = r'(?<=\|)\d{1,2}\.\d{13}'
                if line2 and re.search(rx, line):
                    last_num = re.findall(rx, line)[0]
                    mean_motion = last_num[:-5]
                    rev_num = last_num[-5:]
                    # print(mean_motion, rev_num)
                    line = re.sub(rx, mean_motion+'|'+rev_num, line)
                    
                line = line + newline
                newfile.write(line)

./data/Scenario_Data/TLE/unzipped/tle2004_1of8.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2004_2of8.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2004_3of8.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2004_4of8.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2004_5of8.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2004_6of8.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2004_7of8.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2004_8of8.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2005.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2006.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2007.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2008.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2009.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2010.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2011.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2012.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2013.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2014.clean.txt
./data/Scenario_Data/TLE

In [88]:
tle_columns = ['SatNum', 'IntnlDesignator', 'EpochYear', 'JulianDayFrac', 'BallisticCoef', 'MeanMotion'
                , 'DragTerm', 'EphemerisType' 'ElemNumCheckSum', 'SatNum2', 'Inclination'
                , 'RightAscension', 'Eccentricity', 'ArgPerigree', 'MeanAnomaly', 'MeanMotion'
                , 'RevNumEpochCheckSum']
tle_data = dd.read_csv('./data/Scenario_Data/TLE/unzipped/*.clean.txt', sep='|', header=None)
tle_data.head()

  result = _execute_task(task, cache)


ValueError: Mismatched dtypes found in `pd.read_csv`/`pd.read_table`.

+--------+---------+----------+
| Column | Found   | Expected |
+--------+---------+----------+
| 11     | float64 | int64    |
| 15     | float64 | int64    |
| 3      | object  | float64  |
| 8      | float64 | int64    |
+--------+---------+----------+

The following columns also raised exceptions on conversion:

- 3
  ValueError("could not convert string to float: '00000-0'")

Usually this is due to dask's dtype inference failing, and
*may* be fixed by specifying dtypes manually by adding:

dtype={11: 'float64',
       15: 'float64',
       3: 'object',
       8: 'float64'}

to the call to `read_csv`/`read_table`.