In [92]:
import dask
import dask.dataframe as dd
import pandas as pd
from bs4 import BeautifulSoup
import re
import requests
from zipfile import ZipFile
import os
import numpy as np
import shutil
from datetime import datetime, timedelta
import math

In [93]:
DATA_DIR = './data'
XML_DATA_URL = 'https://afdata.s3.us-gov-west-1.amazonaws.com'

In [94]:
def download_url(url, save_path, chunk_size=128):
    r = requests.get(url, stream=True)
    with open(save_path, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=chunk_size):
            fd.write(chunk)
            
def read_files(path, ext):
    file_list = []
    for root, folders, docs in os.walk(path):
        file_list.extend( [os.path.join(root, doc) for doc in docs if ext in doc] )

    return file_list

In [95]:
# Download XML list of data sources.
xml_data_path = DATA_DIR+'/data_sources.xml'
xml_data = download_url(XML_DATA_URL, xml_data_path)

In [None]:
# Download all zip data files from the XML source.
with open(xml_data_path, 'r') as xml:
    soup = BeautifulSoup(xml, 'xml')
    
    # XML structure: <Contents><Key>filename</Key><Size>bytes</Size></Contents>
    contents_elements = soup.find_all('Contents')
    
    for contents in contents_elements:
        key = contents.find('Key')
        filename = str(key.text)
        if not re.search(r'\.zip$', filename):
            continue
        
        save_path = str.format('{0}/{1}', DATA_DIR, filename)
        url = str.format('{0}/{1}', XML_DATA_URL, filename)     
        expected_size = int(str(contents.find('Size').text))
        
        # Only download if the file doesn't exist with the expected size in bytes.
        if os.path.exists(save_path):
            actual_size = os.path.getsize(save_path)
            if expected_size == actual_size:
                print(str.format('{0} at {1} bytes already exists.', filename, expected_size))
                continue
        
        print(str.format('Downloading {0}...', url))
        
        download_url(url, save_path)
        actual_size = os.path.getsize(save_path)
        
        if actual_size != expected_size:
            print(str.format('WARNING: File size for {0} at {1} bytes does not match the expected size of {2} bytes.',
                            filename, actual_size, expected_size))
        else:  
            print(str.format('Successfully downloaded {0} to {1}. Filesize: {2} bytes.'
                             , url, save_path, actual_size))
        
    xml.close()

In [96]:
# Unzip all data files.
for path in read_files(path=DATA_DIR, ext='.zip'):
    extract_path = '/'.join(str(path).rsplit('/')[:-1]) + '/unzipped/'
    with ZipFile(path, 'r') as zipfile:
        zipfile.extractall(extract_path)

In [100]:
def julian_to_iso(julian):
    yr = int(julian[:2])
    yr = (2000+yr) if yr < 21 else (1900+yr)
    day = math.floor(float(julian[2:]))
    fraction = float('.'+julian.split('.')[1])
    dec_hours = fraction*24
    startdate = datetime(year=yr, month=1, day=1)
    startdate += timedelta(days=day)
    startdate += timedelta(hours=dec_hours)
    return startdate.isoformat()
    
def clean_intnl_designator(line):
    """
    Extract LaunchYear, NthLaunch, CharLaunchObject from International Designator.
        e.g. International Designator = '84123A' 
        where '84' is launch yr, 
        '123' is nth launch, 
        and 'A' is nth object resulting from this launch.
    """
    intnl_desig = re.findall(r'(?<=\|)\d{5}[A-Z]+}(?=\|)', line)
    if len(intnl_desig) > 0:
        val = intnl_desig[0]
        launch_year = int(val[:2])
        launch_year = 2000 + launch_year if (launch_year < 21) else (1900 + launch_year)
        nth_launch = int(val[2:5])
        char_launch_obj = re.sub(r'[^A-Z]', '', val)
        return line.replace(val, str.format("{0}|{1}|{2}", launch_year, nth_launch, char_launch_obj))
    
    return line
 
def clean_mean_motion(line):
    """
    Sometimes Mean Motion and Revolution number are crammed together. Separate them.
    """
    last_num = re.findall(r'(?<=\|)\d{1,2}\.\d{13}', line)
    if len(last_num) > 0:
        n = last_num[0]
        mean_motion = n[:-5]
        rev_num = n[-5:]
        return line.replace(n, mean_motion+'|'+rev_num)
    
    return line
        
def clean_tle_line(line):
    newline = ''
    line2 = False
    if re.search(r'^2', line):
        newline = '\r\n'
        line2 = True

    line = re.sub(r'^(1|2)\s+', '', line)
    line = re.sub(r'(^\s+|\s+$)', '', line)
    line = re.sub(r'\\', '|', line)
    line = re.sub(r'\s+', '|', line) 

    # Separate Mean Motion and Revolution Number.
    if line2:
        line = clean_mean_motion(line)

    # Get International Designator parts
    line = clean_intnl_designator(line)
    
    # Convert Julian date to ISO
    if not line2:
        rx_julian = re.findall(r'(?<=\|)\d{3,5}\.\d{8}(?=\|)', line)
        if len(rx_julian) > 0:
            julian_date = rx_julian[0]
            iso = julian_to_iso(julian_date)
            line = line.replace(julian_date, iso)
    
    return line + newline

# Clean TLE data. Save as pipe delimitted datasets.
tle_files = read_files('./data/Scenario_Data/TLE/unzipped', '.txt')
# tle_files = ['./data/Scenario_Data/TLE/test.txt']

for path in tle_files:
    filename = re.sub(r'\.txt$', '.clean.txt', path)
    with open(path, 'r') as file:
        with open(filename, 'w', encoding='utf-8') as newfile:
            while True:
                line = file.readline()
                if not line:
                    file.close()
                    newfile.close()
                    print('Cleaned', filename)
                    break

                cleaned_line = clean_tle_line(line)
                newfile.write(cleaned_line)

Cleaned ./data/Scenario_Data/TLE/unzipped/tle2004_1of8.clean.txt
Cleaned ./data/Scenario_Data/TLE/unzipped/tle2004_2of8.clean.txt
Cleaned ./data/Scenario_Data/TLE/unzipped/tle2004_3of8.clean.txt
Cleaned ./data/Scenario_Data/TLE/unzipped/tle2004_4of8.clean.txt
Cleaned ./data/Scenario_Data/TLE/unzipped/tle2004_5of8.clean.txt
Cleaned ./data/Scenario_Data/TLE/unzipped/tle2004_6of8.clean.txt
Cleaned ./data/Scenario_Data/TLE/unzipped/tle2004_7of8.clean.txt
Cleaned ./data/Scenario_Data/TLE/unzipped/tle2004_8of8.clean.txt
Cleaned ./data/Scenario_Data/TLE/unzipped/tle2005.clean.txt
Cleaned ./data/Scenario_Data/TLE/unzipped/tle2006.clean.txt
Cleaned ./data/Scenario_Data/TLE/unzipped/tle2007.clean.txt
Cleaned ./data/Scenario_Data/TLE/unzipped/tle2008.clean.txt
Cleaned ./data/Scenario_Data/TLE/unzipped/tle2009.clean.txt
Cleaned ./data/Scenario_Data/TLE/unzipped/tle2010.clean.txt
Cleaned ./data/Scenario_Data/TLE/unzipped/tle2011.clean.txt
Cleaned ./data/Scenario_Data/TLE/unzipped/tle2012.clean.txt


In [101]:
tle_dtypes = {
    'SatID': 'object'
    , 'LaunchYear': np.int
    , 'NthLaunch': np.int
    , 'CharLaunchObject': 'object'
    , 'EpochYear': 'object'
    , 'BallisticCoef': 'object'
    , 'SecDerivMeanMotion': 'object'
    , 'DragTerm': 'object'
    , 'EphemerisType': np.int
    , 'ElemNumCheckSum': np.int
    , 'SatNumID': 'object'
    , 'Inclination': np.float
    , 'RightAscension': np.float
    , 'Eccentricity': np.float
    , 'ArgPerigree': np.float
    , 'MeanAnomaly': np.float
    , 'MeanMotion': np.float
    , 'RevNumEpochCheckSum': np.float
}

tle_columns = [col for col in tle_dtypes]

tle_data = dd.read_csv('./data/Scenario_Data/TLE/unzipped/*.clean.txt'
                       , names=tle_columns
                       , dtype=tle_dtypes
                       , sep='|'
                       , encoding='utf-8'
                      )

tle_data.head()

ValueError: invalid literal for int() with base 10: '94029ACD'

In [None]:
# 2018 TLE data
tle_data = dd.read_csv('./data/Scenario_Data/TLE/unzipped/*.clean.txt'
                       , names=tle_columns
                       , dtype=tle_dtypes
                       , sep='|'
                       , encoding='utf-8'
                      )

tle_data.head()

In [None]:
# Get AIS data

# Relocate CSV files to main AIS directory.
ais_files = read_files('./data/Scenario_Data/AIS/unzipped/AIS_ASCII_by_UTM_Month', '.csv')
for path in ais_files:
    filename = os.path.basename(path)
    print(filename)
    shutil.move(path, './data/Scenario_Data/AIS/'+filename)

In [14]:
# combine AIS CSV files.
ais_data = dd.read_csv('./data/Scenario_Data/AIS/*.csv', assume_missing=True)
ais_data.head()

Unnamed: 0,MMSI,BaseDateTime,LAT,LON,SOG,COG,Heading,VesselName,IMO,CallSign,VesselType,Status,Length,Width,Draft,Cargo
0,235091871.0,2015-01-01T00:08:26,52.78763,-175.62761,10.3,74.5,86.0,EVA BULKER,IMO9544164,2FJU4,70.0,under way using engine,185.0,31.0,6.6,70.0
1,247119100.0,2015-01-01T05:36:17,52.87994,-176.21738,10.7,-148.8,263.0,POLE,IMO9128245,IBTE,70.0,under way using engine,224.0,32.0,-12.8,70.0
2,247119100.0,2015-01-01T06:28:57,52.83234,-176.46662,11.0,-160.8,254.0,POLE,IMO9128245,IBTE,70.0,under way using engine,224.0,32.0,-12.8,70.0
3,247119100.0,2015-01-01T06:32:27,52.82851,-176.48291,11.0,-160.6,254.0,POLE,IMO9128245,IBTE,70.0,under way using engine,224.0,32.0,-12.8,70.0
4,247119100.0,2015-01-01T06:36:07,52.82446,-176.50022,11.0,-160.0,254.0,POLE,IMO9128245,IBTE,70.0,under way using engine,224.0,32.0,-12.8,70.0
