In [1]:
from requests import get
from time import sleep
import json
import pickle
from multiprocessing.pool import ThreadPool as Pool
import os

import numpy as np
import pandas as pd

### Get all live flights
For now this only covers Europe. We will need to parse through the entire map, as a single return is capped at 1500 flights and there are usually 15-20k flights live.

In [15]:
# Browser-like User-Agent header sent with the request below.
headers = {'User-agent' : 'Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.1.5) Gecko/20091102 Firefox/3.5.5'}

# Live-feed URL; the `bounds` query parameter presumably selects the map area to scrape.
link_above_HU = 'https://data-live.flightradar24.com/zones/fcgi/feed.js?faa=1&bounds=53.357%2C39.248%2C2.81%2C29.022&satellite=1&mlat=1&flarm=1&adsb=1&gnd=1&air=1&vehicles=1&estimated=1&maxage=14400&gliders=1&stats=1'

# Fail fast: a timeout avoids hanging forever on a stalled connection, and the
# status check avoids parsing an HTTP error response as if it were flight data.
response = get(link_above_HU, headers = headers, timeout = 30)
response.raise_for_status()
all_flights = response.json()

Scraping flights from this area:

<img src="area_covered.png" width="600"/>

In [18]:
# Read both counters from the feed first, then report them.
total_in_air = all_flights['full_count']
visible_in_area = sum(all_flights['stats']['visible'].values())

print('Total currently in Air:', total_in_air)
print('Visible:', visible_in_area)

Total currently in Air: 15957
Visible: 1144


In [19]:
# Every top-level key of the feed is kept as a flight key except these three entries.
non_flight_keys = ('full_count', 'version', 'stats')
flight_keys = [key for key in all_flights if key not in non_flight_keys]
print(len(flight_keys))

1144


### Get flights 1-by-1

In [20]:
def get_flight_data(current_flight):
    """Download the detail record of one flight, trim it and pickle it to disk.

    Parameters
    ----------
    current_flight : str
        Flight id; it is appended to the click-handler URL and also used as
        the pickle file name (``data/<current_flight>.pkl``).

    Returns
    -------
    dict
        The decoded JSON payload with the unused entries removed.

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out or the server answers with an HTTP error status.
    """
    current_link = 'https://data-live.flightradar24.com/clickhandler/?version=1.5&flight=' + current_flight

    # The timeout keeps a stalled connection from blocking the calling thread
    # forever; the status check keeps HTTP error responses out of the pickles.
    response = get(current_link, headers = headers, timeout = 30)
    response.raise_for_status()
    current_data = response.json()

    # Pause after every request to throttle the scraper.
    sleep(1)

    # Top-level entries that are not needed downstream.
    drop_keys = ['level', 'promote', 'owner', 'airspace', 'flightHistory', 'ems', 'availability', 'trail', 's', 'firstTimestamp']
    for key in drop_keys:
        current_data.pop(key, None)

    # Entries removed from the nested 'aircraft' record, when there is one.
    del_keys_aircraft = ['msn', 'images']
    aircraft = current_data.get('aircraft')
    if isinstance(aircraft, dict):
        for key in del_keys_aircraft:
            aircraft.pop(key, None)

    # Make sure the output directory exists before writing into it.
    os.makedirs('data', exist_ok = True)
    pkl_filename = 'data/' + current_flight + '.pkl'
    with open(pkl_filename, 'wb') as file:
        pickle.dump(current_data, file)

    return current_data

### Run scraper

In [21]:
%%time

flight_dicts = []
# (flight id, exception) pairs for downloads that raised inside a worker.
failed_flights = []

pool = Pool(4)

for i in flight_keys:
    pool.apply_async(
        get_flight_data,
        (i,),
        callback = flight_dicts.append,
        # Without an error_callback, an exception raised in a worker is silently
        # discarded. The default argument binds the current flight id to the lambda.
        error_callback = lambda exc, flight_id = i: failed_flights.append((flight_id, exc)),
    )

# No more tasks will be submitted; wait for all workers to finish.
pool.close()
pool.join()

if failed_flights:
    print('Failed downloads:', len(failed_flights))

Wall time: 5min 23s


### Check missing

In [22]:
# Flight ids that already have a pickle on disk: only '*.pkl' entries are
# considered, and only the trailing extension is stripped from the file name.
scraped = [os.path.splitext(i)[0] for i in os.listdir('data/') if i.endswith('.pkl')]

In [24]:
# np.setdiff1d(flight_keys, scraped)

In [25]:
# TODO: use a longer sleep between requests

In [None]:
%%time

# Only '*.pkl' entries are loaded, so stray files in the directory (hidden or
# checkpoint entries, for example) cannot break the loop. Sorting makes the
# resulting row order reproducible.
pickle_names = sorted(name for name in os.listdir('data/') if name.endswith('.pkl'))

# NOTE: unpickling can execute arbitrary code - only load files written by this notebook.
flight_dicts = [pd.read_pickle('data/' + name) for name in pickle_names]

### Put together data

In [113]:
# Flatten the nested flight dicts into a table, then turn the literal
# string 'None' into a real missing value.
data = pd.json_normalize(flight_dicts).replace({'None' : np.nan})

In [114]:
# Share of missing values per column.
null_ratio = data.isnull().sum() / data.shape[0]
# Columns that are more than 90% empty.
drop_due_to_null = null_ratio[null_ratio > 0.9].index.tolist()

# Use the `columns` keyword: passing the axis positionally (`drop(labels, 1)`)
# is no longer accepted by pandas 2.0.
data = data.drop(columns = drop_due_to_null)

In [115]:
# Columns that are not used in the rest of the notebook.
drop_cols = ['status.text', 'status.live', 'status.ambiguous', 'airport.origin.code.icao',
        'airport.origin.position.longitude', 'airport.origin.position.latitude', 'airport.origin.position.altitude',
        'airport.origin.position.country.code', 'airport.origin.timezone.name', 'airport.origin.timezone.offset',
        'airport.origin.timezone.abbrName', 'airport.origin.timezone.isDst', 'airport.origin.visible', 'airport.origin.website',
        'airport.origin.info.terminal', 'airport.origin.info.gate', 'airport.destination.code.icao',
        'airport.destination.position.longitude', 'airport.destination.position.latitude', 'airport.destination.position.altitude',
        'airport.destination.position.country.code', 'airport.destination.timezone.name', 'airport.destination.timezone.offset',
        'airport.destination.timezone.abbrName', 'airport.destination.timezone.isDst', 'airport.destination.visible', 'airport.destination.website',
        'airport.destination.info.terminal', 'airport.destination.info.gate', 'airport.destination.info.baggage', 'time.real.arrival',
        'aircraft.countryId', 'identification.row', 'status.generic.status.color', 'status.generic.status.type',
        'aircraft.hex', 'time.other.updated', 'airline.url', 'airline.code.icao', 'identification.number.alternative',
        'status.generic.eventTime.utc', 'status.generic.eventTime.local']

# errors='ignore' skips names that are not present (they may already have been
# removed by an earlier step), replacing the per-column membership check.
# The `columns` keyword also avoids the positional axis removed in pandas 2.0.
data = data.drop(columns = drop_cols, errors = 'ignore')

Calculate time features

In [116]:
# Fall back to 'time.other.eta' wherever the estimated arrival is missing.
# The result is assigned back explicitly: an in-place fillna on a selected column
# is not guaranteed to write through to `data` (it does not under Copy-on-Write).
data['time.estimated.arrival'] = data['time.estimated.arrival'].fillna(data['time.other.eta'])

In [117]:
# Where the scheduled departure is 0, substitute the real departure of the same row.
no_scheduled_departure = data['time.scheduled.departure'] == 0.000000e+00
data.loc[no_scheduled_departure, 'time.scheduled.departure'] = data.loc[no_scheduled_departure, 'time.real.departure']

In [118]:
# Keep only rows whose scheduled arrival and ETA are both different from 0.
has_scheduled_arrival = data['time.scheduled.arrival'] != 0.000000e+00
has_eta = data['time.other.eta'] != 0.000000e+00

# .copy() detaches the result from the unfiltered frame, so the column
# assignments made later do not raise SettingWithCopyWarning.
data = data[has_scheduled_arrival & has_eta].copy()

In [119]:
# Name the raw timestamp columns once so the formulas below read cleanly.
scheduled_departure = data['time.scheduled.departure']
scheduled_arrival = data['time.scheduled.arrival']
real_departure = data['time.real.departure']
estimated_arrival = data['time.estimated.arrival']

# Flight times are differences divided by 60 twice, delays are divided by 60 once
# (hours and minutes respectively, assuming the timestamps are in seconds - TODO confirm).
data = data.assign(
    time_scheduled_flight_time = (scheduled_arrival - scheduled_departure) / 60 / 60,
    time_estimated_flight_time = (estimated_arrival - real_departure) / 60 / 60,
    time_departure_delay = (real_departure - scheduled_departure) / 60,
    time_arrival_delay = (estimated_arrival - scheduled_arrival) / 60,
)

In [120]:
# The raw timestamps are no longer needed once the derived features exist.
# Use the `columns` keyword: the positional axis (`drop(labels, 1)`) was removed in pandas 2.0.
data = data.drop(columns = ['time.other.eta', 'time.real.departure', 'time.scheduled.departure',
                            'time.estimated.arrival', 'time.scheduled.arrival'])

Rename columns

In [121]:
# Mapping from the dotted column names to short snake_case names, grouped by topic.
rename_dict = {
    # origin airport
    'airport.origin.name' : 'origin_airport_name',
    'airport.origin.code.iata' : 'origin_airport_code',
    'airport.origin.position.country.name' : 'origin_country',
    'airport.origin.position.region.city' : 'origin_city',
    'airport.origin.timezone.abbr' : 'origin_tz',
    'airport.origin.timezone.offsetHours' : 'origin_tz_offset',
    # destination airport
    'airport.destination.name' : 'destination_airport_name',
    'airport.destination.code.iata' : 'destination_airport_code',
    'airport.destination.position.country.name' : 'destination_country',
    'airport.destination.position.region.city' : 'destination_city',
    'airport.destination.timezone.abbr' : 'destination_tz',
    'airport.destination.timezone.offsetHours' : 'destination_tz_offset',
    # flight identification and status
    'identification.id' : 'id',
    'identification.number.default' : 'number',
    'identification.callsign' : 'callsign',
    'status.icon' : 'status_color',
    'status.generic.status.text' : 'status_text',
    # aircraft
    'aircraft.registration' : 'aircraft_registration',
    'aircraft.model.code' : 'aircraft_model_code',
    'aircraft.model.text' : 'aircraft_model_text',
    # historical averages
    'time.historical.flighttime' : 'avg_flight_time',
    'time.historical.delay' : 'avg_flight_delay',
    # airline
    'airline.name' : 'airline_name',
    'airline.code.iata' : 'airline_code',
}

data = data.rename(columns = rename_dict)

Drop rows with many NaNs

In [122]:
# Keep only rows that have at least 6 non-missing values.
# `how` is dropped on purpose: it cannot be combined with `thresh`
# (pandas 2.0 raises a TypeError when both are passed).
data = data.dropna(axis = 0, thresh = 6)

Consolidate airline name

In [123]:
# Wherever a short airline name exists, it replaces the airline name.
has_short_name = data['airline.short'].notnull()
data.loc[has_short_name, 'airline_name'] = data.loc[has_short_name, 'airline.short']

In [124]:
# 'airline.short' has been folded into 'airline_name' and is no longer needed.
# Use the `columns` keyword: the positional axis (`drop(label, 1)`) was removed in pandas 2.0.
data = data.drop(columns = 'airline.short')

Merge with metadata from the first scrape (track, altitude, speed)

In [125]:
# A set makes the membership test constant-time instead of scanning the list per key.
flight_key_set = set(flight_keys)
flight_rows = {k: v for k, v in all_flights.items() if k in flight_key_set}

# Each feed entry becomes one row indexed by its flight key, with integer column
# labels 0, 1, 2, ... Positions 3, 4 and 5 are selected explicitly rather than
# dropping every other position by number, which relied on the positional axis
# removed in pandas 2.0 and on the exact entry length.
meta_data = (
    pd.DataFrame.from_dict(flight_rows, orient = 'index')
    .loc[:, [3, 4, 5]]
    .rename(columns = {3 : 'track', 4 : 'altitude', 5 : 'speed'})
    .rename_axis('id')
    .reset_index()
)

In [126]:
# Inner join on the flight id: only flights present in both frames are kept.
data = pd.merge(data, meta_data, how = 'inner', on = 'id')

Check

In [127]:
# Spot-check one flight after the merge.
is_checked_flight = data['id'] == '298468b2'
data.loc[is_checked_flight]

Unnamed: 0,id,number,callsign,status_color,status_text,aircraft_model_code,aircraft_model_text,aircraft_registration,airline_name,airline_code,...,destination_tz,avg_flight_time,avg_flight_delay,time_scheduled_flight_time,time_estimated_flight_time,time_departure_delay,time_arrival_delay,track,altitude,speed
860,298468b2,VY6211,VLG6211,green,estimated,A20N,Airbus A320-271N,EC-NAY,Vueling,VY,...,CEST,8027,-927,2.75,2.234722,22.316667,-8.6,163,0,22


### Analyze

In [128]:
# Cast both historical-average columns to float in one call.
data = data.astype({'avg_flight_time' : float, 'avg_flight_delay' : float})

In [130]:
# Rescale the historical averages: flight time is divided by 60 twice, delay once.
# Note: this overwrites the columns, so re-running the cell rescales them again.
data['avg_flight_time'] = data['avg_flight_time'].div(60).div(60)
data['avg_flight_delay'] = data['avg_flight_delay'].div(60)

In [132]:
# Summary statistics of the non-object columns, shown with one row per column.
numeric_summary = data.describe(exclude = 'O')
numeric_summary.transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
avg_flight_time,781.0,2.268791,1.410278,0.423333,1.323056,1.988056,2.8525,11.614444
avg_flight_delay,781.0,-3.97006,23.369772,-44.833333,-12.966667,-6.75,-0.266667,392.183333
time_scheduled_flight_time,818.0,2.661318,1.467941,0.666667,1.666667,2.358333,3.25,12.333333
time_estimated_flight_time,813.0,2.272123,1.43916,0.428611,1.314167,1.965833,2.857222,11.5975
time_departure_delay,815.0,28.08499,39.131618,-72.733333,10.508333,18.9,33.541667,487.45
time_arrival_delay,816.0,4.761581,39.446007,-91.733333,-12.9625,-3.3,10.816667,485.383333
track,865.0,190.537572,104.522011,0.0,110.0,176.0,297.0,359.0
altitude,865.0,25864.515607,14273.159402,0.0,13425.0,34000.0,37000.0,47000.0
speed,865.0,354.116763,145.151865,0.0,317.0,421.0,448.0,512.0


In [133]:
# Summary (count / unique / top / freq) of the object-typed columns.
object_summary = data.describe(include = 'O')
object_summary

Unnamed: 0,id,number,callsign,status_color,status_text,aircraft_model_code,aircraft_model_text,aircraft_registration,airline_name,airline_code,...,origin_country,origin_city,origin_tz_offset,origin_tz,destination_airport_name,destination_airport_code,destination_country,destination_city,destination_tz_offset,destination_tz
count,865,793,859,826,865,865,865,849,855,788,...,820,820,820,820,820,820,820,820,820,820
unique,865,793,850,3,5,73,179,849,143,106,...,57,180,9,18,187,187,53,173,7,15
top,29842b2e,0B139,Blocked,green,estimated,B738,Boeing 737-8AS,9H-QEL,Ryanair,FR,...,Germany,Amsterdam,2:00,CEST,Istanbul Airport,IST,Germany,Istanbul,2:00,CEST
freq,1,1,10,664,604,240,134,1,133,133,...,126,40,570,564,64,64,123,69,550,545


In [142]:
# Flights whose reported altitude exceeds 40000.
above_40000 = data['altitude'] > 40000
data.loc[above_40000]

Unnamed: 0,id,number,callsign,status_color,status_text,aircraft_model_code,aircraft_model_text,aircraft_registration,airline_name,airline_code,...,destination_tz,avg_flight_time,avg_flight_delay,time_scheduled_flight_time,time_estimated_flight_time,time_departure_delay,time_arrival_delay,track,altitude,speed
0,2983e26c,,Blocked,green,estimated,GLF5,Gulfstream G550,,,,...,,,,,,,,289,47000,465
1,298417ae,,NJE238Y,green,estimated,GL5T,Bombardier Global 5000,CS-GLY,NetJets Europe,,...,BST,,,3.033333,2.773056,21.333333,5.716667,304,43000,458
2,29832746,,Blocked,green,estimated,GLEX,Bombardier Global 6000,,,,...,,,,,,,,317,43000,476
3,29844f7d,3Z7434,TVP7434,red,delayed,B738,Boeing 737-86Q,OK-TVW,Smartwings,QS,...,EEST,2.345556,66.866667,2.833333,2.349167,305.583333,276.533333,155,41000,446
4,298352de,,TOM953P,yellow,delayed,B788,Boeing 787-8 Dreamliner,G-TUIH,TUI fly,X3,...,BST,,,7.164444,7.709722,0.0,32.716667,316,43000,460
5,2983d267,,Blocked,green,estimated,GLF4,Gulfstream G450,,,,...,,,,,,,,208,43000,480
6,29842b2e,,Blocked,green,estimated,GLF4,Gulfstream G450,,,,...,,,,,,,,225,41000,454
7,2983e7d7,BY856,TOM856,red,delayed,B788,Boeing 787-8 Dreamliner,G-TUIF,TUI fly,X3,...,+03,4.119444,13.266667,4.333333,3.891111,94.666667,68.133333,135,40975,468
8,29842022,TK6203,THY6203,red,delayed,A332,Airbus A330-243F,TC-JCI,Turkish Airlines,TK,...,+03,2.735,79.166667,4.083333,2.756667,224.933333,145.333333,120,41000,453
9,2983ebdb,BA167,BAW167,green,estimated,B772,Boeing 777-236(ER),G-YMMP,British Airways,BA,...,IDT,4.215,-5.75,5.166667,4.343611,32.383333,-17.0,135,41000,448
