In [1]:
import dask
import dask.dataframe as dd
import pandas as pd
from bs4 import BeautifulSoup
import re
import requests
from zipfile import ZipFile
import os
import numpy as np

In [2]:
# Local folder that receives the downloaded listing and data files.
DATA_DIR = "./data"

# Base URL of the data bucket; the notebook downloads this URL itself as the
# XML listing and appends file keys to it to build per-file download URLs.
XML_DATA_URL = "https://afdata.s3.us-gov-west-1.amazonaws.com"

In [3]:
def download_url(url, save_path, chunk_size=128, timeout=60):
    """Stream the resource at ``url`` into the file ``save_path``.

    Args:
        url: Address to fetch with an HTTP GET request.
        save_path: Path of the file to create or overwrite with the body.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Seconds to wait on the server before giving up; passed
            straight through to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with an error status. This
            is checked before ``save_path`` is opened, so an error page is
            never written to disk as if it were the requested file.
    """
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()
        with open(save_path, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=chunk_size):
                fd.write(chunk)
            
def read_files(path, ext):
    """Recursively collect the files under ``path`` whose names end with ``ext``.

    Args:
        path: Root directory to walk.
        ext: File-name suffix to keep, e.g. ``'.zip'`` (case-sensitive).

    Returns:
        A list of file paths, each joined onto the directory it was found in,
        in the order ``os.walk`` yields them.
    """
    file_list = []
    for root, _folders, docs in os.walk(path):
        # Compare the suffix rather than any substring, so a name such as
        # 'archive.zip.part' is not mistaken for a '.zip' file.
        file_list.extend(os.path.join(root, doc) for doc in docs if doc.endswith(ext))

    return file_list

In [4]:
# Download XML list of data sources.
# Create the data folder first: on a fresh checkout it does not exist yet and
# opening the target file for writing would fail.
os.makedirs(DATA_DIR, exist_ok=True)
xml_data_path = DATA_DIR+'/data_sources.xml'
# download_url writes to disk and has no return statement, so xml_data is
# None; the listing is read back from xml_data_path by the next cell.
xml_data = download_url(XML_DATA_URL, xml_data_path)

In [None]:
# Download all zip data files from the XML source.
with open(xml_data_path, 'r') as xml:
    # The listing is parsed completely here, so the file is only needed inside
    # this block; the `with` statement closes it (no explicit close required).
    soup = BeautifulSoup(xml, 'xml')

# XML structure: <Contents><Key>filename</Key><Size>bytes</Size></Contents>
contents_elements = soup.find_all('Contents')

for contents in contents_elements:
    key = contents.find('Key')
    filename = str(key.text)
    if not re.search(r'\.zip$', filename):
        continue

    save_path = '{0}/{1}'.format(DATA_DIR, filename)
    url = '{0}/{1}'.format(XML_DATA_URL, filename)
    expected_size = int(str(contents.find('Size').text))

    # Only download if the file doesn't exist with the expected size in bytes.
    if os.path.exists(save_path):
        actual_size = os.path.getsize(save_path)
        if expected_size == actual_size:
            print('{0} at {1} bytes already exists.'.format(filename, expected_size))
            continue

    print('Downloading {0}...'.format(url))

    # A key can contain folder components (later cells read from
    # ./data/Scenario_Data/...), so make sure the target folder exists before
    # the file is opened for writing.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    download_url(url, save_path)
    actual_size = os.path.getsize(save_path)

    # A size mismatch is only reported; the file is left in place and will be
    # downloaded again on the next run because the size check above fails.
    if actual_size != expected_size:
        print('WARNING: File size for {0} at {1} bytes does not match the expected size of {2} bytes.'.format(
            filename, actual_size, expected_size))
    else:
        print('Successfully downloaded {0} to {1}. Filesize: {2} bytes.'.format(
            url, save_path, actual_size))

In [42]:
# Unzip all data files.
for path in read_files(path=DATA_DIR, ext='.zip'):
    # Extract into an 'unzipped' folder next to the archive. os.path handles
    # whichever separator os.path.join produced and, unlike splitting on '/',
    # never resolves to '/unzipped/' at the filesystem root for a bare name.
    # The trailing '' keeps the trailing separator the path always had.
    extract_path = os.path.join(os.path.dirname(path), 'unzipped', '')
    with ZipFile(path, 'r') as archive:
        archive.extractall(extract_path)

In [35]:
# Clean TLE data, save as pipe-delimited datasets.

# Patterns are compiled once instead of on every input line.
_TLE_LINE_NUMBER = re.compile(r'^(1|2)\s+')
_TLE_OUTER_SPACE = re.compile(r'(^\s+|\s+$)')
_TLE_BACKSLASH = re.compile(r'\\')
_TLE_INNER_SPACE = re.compile(r'\s+')
# Sometimes Mean Motion and Revolution number are crammed together:
# 1-2 digits, a dot and 13 digits directly after a field separator.
_TLE_CRAMMED = re.compile(r'(?<=\|)\d{1,2}\.\d{13}')


def _clean_tle_line(line):
    """Convert one raw input line into pipe-delimited text.

    Steps, in order: drop a leading "1"/"2" marker and the whitespace after
    it, trim surrounding whitespace, turn backslashes into '|', then turn
    every remaining whitespace run into '|'.

    A line that starts with "2" closes a record: a crammed mean-motion /
    revolution number is split into two fields (the last five characters of
    the match become the second field) and '\\r\\n' is appended. Any other
    line gets no terminator, so the next line continues the same output row.

    Args:
        line: One line as read from the source file, including its newline.

    Returns:
        The cleaned text to write to the output file.
    """
    is_line2 = line.startswith('2')

    line = _TLE_LINE_NUMBER.sub('', line)
    line = _TLE_OUTER_SPACE.sub('', line)
    line = _TLE_BACKSLASH.sub('|', line)
    line = _TLE_INNER_SPACE.sub('|', line)

    if is_line2:
        crammed = _TLE_CRAMMED.search(line)
        if crammed:
            last_num = crammed.group(0)
            mean_motion = last_num[:-5]
            rev_num = last_num[-5:]
            line = _TLE_CRAMMED.sub(mean_motion + '|' + rev_num, line)
        line = line + '\r\n'

    return line


tle_files = read_files('./data/Scenario_Data/TLE/unzipped', '.txt')
for path in tle_files:
    # Skip the output of an earlier run ('*.clean.txt'); re-cleaning it would
    # create '*.clean.clean.txt' files that the later '*.clean.txt' glob also
    # picks up. Also skip names that do not end in '.txt': for those the
    # substitution below leaves the name unchanged, and opening the same file
    # for writing would truncate the source.
    if path.endswith('.clean.txt') or not path.endswith('.txt'):
        continue

    filename = re.sub(r'\.txt$', '.clean.txt', path)
    # Both files are closed by the `with` statement.
    with open(path, 'r') as file, open(filename, 'w', encoding='utf-8') as newfile:
        for line in file:
            newfile.write(_clean_tle_line(line))
    print(filename)

./data/Scenario_Data/TLE/unzipped/tle2004_1of8.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2004_2of8.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2004_3of8.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2004_4of8.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2004_5of8.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2004_6of8.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2004_7of8.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2004_8of8.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2005.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2006.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2007.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2008.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2009.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2010.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2011.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2012.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2013.clean.txt
./data/Scenario_Data/TLE/unzipped/tle2014.clean.txt
./data/Scenario_Data/TLE

In [60]:
# Column name -> dtype for the cleaned TLE files, listed in file column order.
# np.int / np.float were deprecated aliases of the builtins and were removed
# in NumPy 1.24; the explicit 64-bit types are used instead.
tle_dtypes = {
    'SatNum': 'object',
    'IntnlDesignator': 'object',
    'EpochYear': np.float64,
    'BallisticCoef': 'object',
    'SecDerivMeanMotion': 'object',
    'DragTerm': 'object',
    'EphemerisType': np.int64,
    'ElemNumCheckSum': np.int64,
    'SatNum2': 'object',
    'Inclination': np.float64,
    'RightAscension': np.float64,
    # NOTE(review): read as a plain number, so leading zeros are dropped;
    # presumably the raw field has an implied decimal point - confirm that
    # downstream code rescales it.
    'Eccentricity': np.float64,
    'ArgPerigree': np.float64,
    'MeanAnomaly': np.float64,
    'MeanMotion': np.float64,
    'RevNumEpochCheckSum': np.float64,
}

# Column names in the same order as the dtype mapping above.
tle_columns = list(tle_dtypes)

# Read every cleaned, pipe-delimited TLE file into one dask dataframe, using
# the column names and dtypes declared above.
tle_data = dd.read_csv(
    './data/Scenario_Data/TLE/unzipped/*.clean.txt',
    names=tle_columns,
    dtype=tle_dtypes,
    sep='|',
    # utf-8 is what the cleaning step writes; ISO-8859-1 was noted next to
    # this call, presumably as an alternative encoding that was tried.
    encoding='utf-8',
)

# Show the first rows as a quick check of the parsed columns.
tle_data.head()

Unnamed: 0,SatNum,IntnlDesignator,EpochYear,BallisticCoef,SecDerivMeanMotion,DragTerm,EphemerisType,ElemNumCheckSum,SatNum2,Inclination,RightAscension,Eccentricity,ArgPerigree,MeanAnomaly,MeanMotion,RevNumEpochCheckSum
0,26619U,00075A,4118.833903,-0.00012193,00000-0,-27028-2,0,720,26619,98.2038,186.7557,1921.0,94.7873,265.3585,14.570848,182472.0
1,18549U,68091DE,4118.596391,1.801e-05,00000-0,25919-2,0,283,18549,62.2415,180.1561,704892.0,265.6761,86.2771,12.852684,585614.0
2,18727U,87020E,4118.666444,-2e-08,00000-0,10000-3,0,4084,18727,73.36,345.6887,88152.0,270.3999,88.6911,12.642166,754869.0
3,18792U,88002E,4118.818402,2.9e-07,00000-0,10000-3,0,6838,18792,82.6017,352.4844,14698.0,138.3284,221.893,12.655116,752224.0
4,19027U,81053MK,4118.823075,1.28e-05,00000-0,10755-2,0,7395,19027,83.0239,250.9465,84934.0,184.3222,175.7249,13.856401,953590.0
