In [None]:
# 1 Create the folder Extract --> Get extract path
# 2 Go through every file for searching flights to HKG
# 2.1 Check if the flight has its own csv extracted file.
#     If not, create a file, add that flight to dictionary
# 2.2 Write a new row

In [15]:
# Create directary to store data
# Clean up the existing directary first then make a new one

import os
import shutil

extract_path = os.getcwd() + '/Extract'

try:
    shutil.rmtree(extract_path)
except OSError as e:
    print("Error: %s - %s." % (e.filename, e.strerror))

os.makedirs(extract_path)

Error: /home/kc/Research/air_traffic/Extract - No such file or directory.


In [19]:
from datetime import datetime
import pytz
import csv

HK_time_zone = pytz.timezone('Hongkong')

def new_record(flight_ID, flight_detail):
    # flight_ID should be the first column
    # flight_details is a list containing the remaining columns
    # e.g. "c0ae2e9": [ ... flight_detail ...]
    #
    # Global variables:
    #   flight_dict - containing stored flights
    #   extract_path - where to store the data
    #   HK_time_zone - "constant" for time zone
    #
    # Read a row of json file, and write that to trajectory csv in Opensky format 
    #
    
    file_path = ''
    created = False

    # 1. Check if flight is already in the flight dict
    if(flight_ID in flight_dict):  # flight_dict is a global variable
        created = True
        file_path = flight_dict[flight_ID]
    else:
        # 2. Change time to UTC+8, create date folder if not exist
        # folder_path = os.path.join(extract_path, \
        #                            datetime.utcfromtimestamp(flight_detail[10]).replace(tzinfo=pytz.utc).\
        #                            astimezone(HK_time_zone).strftime('%Y-%m-%d'))
        folder_path = os.path.join(extract_path, \
                                   datetime.utcfromtimestamp(flight_detail[10]).strftime('%Y-%m-%d'))


        if not os.path.exists(folder_path):
            os.mkdir(folder_path)
        
        # 3. Skip if it has no callsign ... callsign is used as filename
        if not flight_detail[13]:
            return
        
        # 4. Put new flight in flight_dict
        # file_path = os.path.join(folder_path, str(flight_detail[13] + '.csv'))
        # flight_dict[flight_ID] = file_path

        # if not flight_detail[16]:
        #     return
        
        file_path = os.path.join(folder_path, str(flight_detail[16] + '.csv'))
        flight_dict[flight_ID] = file_path
        
    # 5. Create / open trajectory file; write new data row
    with open(file_path, 'a') as f:
        writer = csv.writer(f)
        if not created:
            header = ['timestamp','lat', 'long', 'alt', 'hangle','gspeed','icao24','origin','destination','Airliner']
            writer.writerow(header)
            
        writer.writerow([flight_detail[10], flight_detail[1], flight_detail[2],\
                         flight_detail[4], flight_detail[3], flight_detail[5],\
                        flight_detail[0], flight_detail[11], flight_detail[12],flight_detail[17]])
    

In [13]:
import json
def process(file):
    # 1. Load FR24 json file
    # 2. Read data row by row
    # 3. Write data to csv if the flight is heading to HKG

    with open(file, 'rb') as f:
        content = f.read()
        parsed = {}
        try:
            parsed = json.loads(content)
        except Exception as e:
            print(e)
    
        # Each json file contains some useless meta-data
        for item in ["full_count", "version","stats"]:
            if item in parsed:
                del parsed[item]
            
        for record in parsed:
            
            detail = parsed[record]
            
            if detail[12] == 'HKG':
                new_record(record, detail)

In [20]:
# Use glob to generate a list of all files under the directory
# Then pass the data file to the process function one-by-one

import glob

flight_dict = {}
# path = os.getcwd() + '/Data'
# path = "/mnt/Passport/Lishuai_data/china/2017/201701"
path = "/home/kc/Research/air_traffic/data/fr24_profile/201701"
list_of_files = sorted(filter(os.path.isfile, glob.glob(path + '/**/*', recursive=True)))
for filename in list_of_files:
    print(filename)
    with open(os.path.join(path, filename), 'r') as f:
        process(filename)

/home/kc/Research/air_traffic/data/fr24_profile/201701/20170101.zip
'utf-8' codec can't decode byte 0xb0 in position 53: invalid start byte
/home/kc/Research/air_traffic/data/fr24_profile/201701/20170101/fr24_China_20170101_Sun_0000_1.txt
/home/kc/Research/air_traffic/data/fr24_profile/201701/20170101/fr24_China_20170101_Sun_0000_2.txt
/home/kc/Research/air_traffic/data/fr24_profile/201701/20170101/fr24_China_20170101_Sun_0000_3.txt
/home/kc/Research/air_traffic/data/fr24_profile/201701/20170101/fr24_China_20170101_Sun_0001_1.txt
/home/kc/Research/air_traffic/data/fr24_profile/201701/20170101/fr24_China_20170101_Sun_0001_2.txt
/home/kc/Research/air_traffic/data/fr24_profile/201701/20170101/fr24_China_20170101_Sun_0001_3.txt
/home/kc/Research/air_traffic/data/fr24_profile/201701/20170101/fr24_China_20170101_Sun_0002_1.txt
/home/kc/Research/air_traffic/data/fr24_profile/201701/20170101/fr24_China_20170101_Sun_0002_2.txt
/home/kc/Research/air_traffic/data/fr24_profile/201701/20170101/fr24

Clean extracted data

In [8]:
import pandas as pd

# Generate a list of all extracted file
list_of_extracted_files = sorted(filter(os.path.isfile, glob.glob(extract_path + '/**/*', recursive=True)))

for file_path in list_of_extracted_files:
    df = pd.read_csv(file_path)

    # This means a possible problem is have timestamp recorded as "timestamp"?
    list_of_index_of_string = df[df['timestamp'] == 'timestamp'].index.tolist()

    # Drop all row up to the biggest index with the "timestamp" problem
    # But won't that be an overkill?
    if len(list_of_index_of_string):
        max_index = max(df[df['timestamp'] == 'timestamp'].index.tolist())
        df.drop(df.index[range(max_index+1)],0,inplace=True)
        df.to_csv(file_path, index=False)

  df.drop(df.index[range(max_index+1)],0,inplace=True)


In [None]:
import pandas as pd

list_of_extracted_files = sorted(filter(os.path.isfile, glob.glob(extract_path + '/**/*', recursive=True)))
for filename in list_of_extracted_files:
    print(filename)
        
file_path = os.path.join(extract_path, '2021-03-03', 'AM9020.csv')
df = pd.read_csv(file_path)
list_of_index_of_string = df[df['timestamp'] == 'timestamp'].index.tolist()
if len(list_of_index_of_string):
    max_index = max(df[df['timestamp'] == 'timestamp'].index.tolist())
    df.drop(df.index[range(max_index+1)],0,inplace=True)
    df.to_csv(file_path, index=False)