In [3]:
import pandas as pd
import numpy as np
import glob
import pickle
import concurrent.futures
from sklearn.preprocessing import MultiLabelBinarizer
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

import seaborn as sns
import matplotlib.pyplot as plt
import os.path

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

# Seed passed as `random_state` to train_test_split so the per-airport
# train/test partition is reproducible between runs.
RND_STATE = 100412

In [4]:
def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the records of ``sequence`` while driving a notebook progress widget.

    Args:
        sequence: Iterable to consume. If it has no ``len()`` and ``size`` is
            not given, it is handled as an iterator of unknown length.
        every: Refresh the widget every ``every`` records (the first record
            always refreshes). When omitted it is 1 for sizes up to 200 and
            ``int(size / 200)`` otherwise. Must be given for unknown lengths.
        size: Total number of records; taken from ``len(sequence)`` if omitted.
        name: Text shown in front of the counter.

    Yields:
        Every record of ``sequence``, unchanged and in order.

    Raises:
        AssertionError: the length is unknown and ``every`` was not supplied.
    """
    # Imported here so the widget stack is only required when the helper runs.
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    unknown_length = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            unknown_length = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    if unknown_length:
        # No total to count towards: show a full bar in the "info" colour.
        bar = IntProgress(min=0, max=1, value=1)
        bar.bar_style = 'info'
    else:
        bar = IntProgress(min=0, max=size, value=0)
    caption = HTML()
    display(VBox(children=[caption, bar]))

    count = 0
    try:
        for count, item in enumerate(sequence, 1):
            due = count == 1 or count % every == 0
            if due and unknown_length:
                caption.value = '{name}: {index} / ?'.format(name=name, index=count)
            elif due:
                bar.value = count
                caption.value = u'{name}: {index} / {size}'.format(
                    name=name, index=count, size=size
                )
            yield item
    except BaseException:
        # Anything that interrupts the iteration turns the bar red, then
        # continues to propagate to the caller.
        bar.bar_style = 'danger'
        raise
    else:
        bar.bar_style = 'success'
        bar.value = count
        caption.value = "{name}: {index}".format(name=name, index=str(count or '?'))

In [5]:
def load_file(file_name):
    """Read ``file_name`` and return the object unpickled from its contents.

    The file is consumed in reads of at most ``2**31 - 1`` bytes that are
    concatenated before unpickling.
    NOTE(review): the chunking presumably works around a platform limit on
    very large single reads -- confirm.

    Args:
        file_name: Path of a file containing a pickle stream.

    Returns:
        Whatever object ``pickle.loads`` reconstructs from the file bytes.
    """
    chunk_limit = 2**31 - 1
    total_size = os.path.getsize(file_name)
    payload = bytearray(0)
    with open(file_name, 'rb') as source:
        offset = 0
        while offset < total_size:
            payload += source.read(chunk_limit)
            offset += chunk_limit
    # pickle executes arbitrary code while loading: only use on trusted files.
    return pickle.loads(payload)

In [6]:
def save_file(file_name, data_to_save, max_bytes=2**31 - 1):
    """Pickle ``data_to_save`` and write it to ``file_name`` in bounded chunks.

    The number of chunks is derived from the actual length of the pickled
    payload, so every byte is written regardless of how large the payload is.

    Args:
        file_name: Destination path; created or truncated.
        data_to_save: Any picklable object.
        max_bytes: Largest number of bytes handed to a single ``write`` call.
            Defaults to ``2**31 - 1``.

    Raises:
        ValueError: ``max_bytes`` is not positive.
    """
    if max_bytes <= 0:
        raise ValueError('max_bytes must be a positive integer')
    bytes_out = pickle.dumps(data_to_save)
    with open(file_name, 'w+b') as f_out:
        # Step over the real payload length rather than a fixed upper bound.
        for idx in range(0, len(bytes_out), max_bytes):
            f_out.write(bytes_out[idx:idx + max_bytes])

In [7]:
def additional_preprocessing(data_df):
    """Return a copy of ``data_df`` with a derived flag and unused columns removed.

    Steps, all applied to a copy (the input frame is left untouched):
      * ``previous_flight_delay`` is 1 where ``late_aircraft_delay`` > 30, else 0.
      * Delay-breakdown, cancellation, city-name and several weather columns
        are dropped.
      * ``crs_dep_time`` is divided by 100 and truncated to an int
        (presumably HHMM -> hour; confirm against the data dictionary).

    Args:
        data_df: Frame containing ``late_aircraft_delay``, ``crs_dep_time`` and
            every column listed in the drop list below.

    Returns:
        The transformed copy.

    Raises:
        KeyError: a required column is missing.
        ValueError: a ``crs_dep_time`` value cannot be converted to int (NaN).
    """
    data_info = data_df.copy()
    data_info['previous_flight_delay'] = (data_info['late_aircraft_delay'] > 30).astype(int)

    data_info = data_info.drop(columns=[
        # outcome / delay-breakdown columns
        'cancellation_code', 'cancelled', 'carrier_delay', 'dep_delay_new',
        'late_aircraft_delay', 'nas_delay', 'security_delay', 'weather_delay',
        'diverted', 'origin_city_name', 'dest_city_name',
        # weather columns
        'snowfall', 'snow_depth', 'thunder', 'dust', 'haze', 'snow', 'fog',
        'hail', 'damaging_wind',
    ])

    # Derive from this function's own frame rather than a notebook-level
    # variable, so the result depends only on the argument.
    data_info['crs_dep_time'] = [int(t) for t in data_info['crs_dep_time'].values / 100]
    return data_info

In [8]:
def add_airline_avg_delay(carrier):
    """Return the ``carrier_delay`` value of the first row matching ``carrier``.

    Looks the carrier up in the notebook-level ``carrier_average_delays`` frame.

    Raises:
        IndexError: no row has this carrier.
    """
    is_match = carrier_average_delays['carrier'] == carrier
    return carrier_average_delays.loc[is_match, 'carrier_delay'].values[0]

In [9]:
def add_airline_delay_index(carrier):
    """Return the ``'Delay index'`` value of the first row matching ``carrier``.

    Looks the carrier up in the notebook-level ``delay_info`` frame.

    Raises:
        IndexError: no row has this carrier.
    """
    is_match = delay_info['Carrier'] == carrier
    return delay_info.loc[is_match, 'Delay index'].values[0]

In [10]:
def process_data(data_df):
    """Build the model feature frame from a preprocessed flights frame.

    Works on a copy of ``data_df`` and:
      * adds ``airline_delay_index`` / ``airline_avg_delay`` per row, using
        ``add_airline_delay_index`` and ``add_airline_avg_delay``;
      * one-hot encodes ``dest``;
      * adds ``day_of_year`` as the number of days between ``fl_date`` and the
        earliest ``fl_date`` present in this frame;
      * adds ``weekend`` (1 when ``day_of_week`` >= 6, else 0);
      * drops ``fl_date``, ``fl_num``, ``origin``, ``tail_num`` and ``carrier``.

    Args:
        data_df: Frame with at least ``carrier``, ``dest``, ``fl_date``,
            ``day_of_week``, ``fl_num``, ``origin`` and ``tail_num``.

    Returns:
        ``(features, columns)`` -- the transformed frame and its column index.

    Raises:
        IndexError: propagated from the lookup helpers for an unknown carrier.
    """
    data_info = data_df.copy()

    # Resolve each distinct carrier once, in-process. The helpers are plain
    # table lookups, so a process pool only adds start-up and pickling cost
    # and depends on workers inheriting the notebook's globals.
    carriers = data_info['carrier']
    distinct_carriers = carriers.unique()
    delay_index_by_carrier = {c: add_airline_delay_index(c) for c in distinct_carriers}
    avg_delay_by_carrier = {c: add_airline_avg_delay(c) for c in distinct_carriers}
    data_info['airline_delay_index'] = [delay_index_by_carrier[c] for c in carriers]
    data_info['airline_avg_delay'] = [avg_delay_by_carrier[c] for c in carriers]

    data_info = pd.get_dummies(data_info, columns=['dest'])
    data_info['day_of_year'] = (data_info['fl_date'] - data_info['fl_date'].min()) / np.timedelta64(1, 'D')

    data_info['weekend'] = np.where(data_info['day_of_week'] >= 6, 1, 0)

    data_info = data_info.drop(['fl_date', 'fl_num', 'origin', 'tail_num', 'carrier'], axis=1)
    return data_info, data_info.columns

## Configuration

In [11]:
# Folder names. NOTE(review): none of these three is referenced by the cells
# visible in this notebook -- confirm whether they are still needed.
DATA_FOLDER = 'historical_data'
DICT_FOLDER = 'dictionaries'
WEATHER_FOLDER = 'weather_data'

In [12]:
# Pickle file from which the estimator `clf` is loaded below.
BEST_CLF = '../data/best_clf.data'

In [13]:
# Pickle file from which the flights frame `data` is loaded below.
DATA_PICKLE = '../data/merged_data.data'

In [14]:
# Unpickle the flights frame. load_file uses pickle, so DATA_PICKLE must be a
# trusted, locally produced file.
data = load_file(DATA_PICKLE)

In [15]:
# Unpickle the estimator that the training loop refits for every origin airport.
# Same trust requirement as above: pickle must only be loaded from trusted files.
clf = load_file(BEST_CLF)

In [16]:
# Flights per carrier, counted over every row of `data`.
airlines_group_num = data[['status', 'carrier']].groupby(['carrier']).size()

# Rows whose status is anything other than 'no_delay'.
airlines_group = data[['status', 'carrier']]
airlines_group = airlines_group[(airlines_group['status'] != 'no_delay')]

# Delayed flights per carrier, aligned on the all-flights index so a carrier
# without any delayed flight gets 0 instead of being absent. Pairing the
# arrays by position would otherwise mismatch or shift rows.
airlines_group_delays_num = (
    airlines_group.groupby(['carrier']).size()
    .reindex(airlines_group_num.index, fill_value=0)
)

delay_info = pd.DataFrame({
    'Carrier': airlines_group_num.index.values,
    'Number of flights': airlines_group_num.values,
    'Number of delays': airlines_group_delays_num.values,
})
# Share of a carrier's flights that were not 'no_delay'.
delay_info['Delay index'] = delay_info['Number of delays'] / delay_info['Number of flights']

# Mean of the carrier_delay column per carrier.
carrier_average_delays = pd.DataFrame(data.groupby(['carrier'])['carrier_delay'].mean()).reset_index()

## Final classifiers (one per origin airport)

In [17]:
# Sorted distinct origin airport codes; the training loop fits one model per entry.
airports = np.unique(data['origin'].values)

In [16]:
# Fit, persist and score one classifier per origin airport.
# `results` collects one dict per airport that trained successfully.
results = []
for airport in log_progress(airports, every=1):
    try:
        working_df = data[data['origin'] == airport]

        working_df = additional_preprocessing(working_df)
        # process_data returns the feature frame and its column index; the
        # columns are saved next to the model under the name `values_dict`.
        working_df, values_dict = process_data(working_df)

        working_df = working_df.dropna()

        X_train, X_test, y_train, y_test = train_test_split(
            working_df.loc[:, working_df.columns != 'status'],
            working_df['status'],
            test_size=0.2,
            random_state=RND_STATE,
        )

        clf.fit(X_train, y_train)

        save_file('../data/clfs/' + airport + '.data', clf)
        save_file('../data/values_dicts/' + airport + '.data', values_dict)

        y_pred = clf.predict(X_test)
        # f1_score expects (y_true, y_pred). The order matters for the
        # weighted average, whose weights are the class supports in y_true.
        f1_macro = f1_score(y_test, y_pred, average='macro')
        f1_weighted = f1_score(y_test, y_pred, average='weighted')
        # Keys name the averaging that was actually computed.
        results.append({'airport': airport, 'f1_macro': f1_macro, 'f1_weighted': f1_weighted})
    except Exception as e:
        # Best effort: keep going with the remaining airports, but report
        # which airport failed so the message can be acted on.
        print(f'{airport}: {e}')

VBox(children=(HTML(value=''), IntProgress(value=0, max=319)))

Found array with 0 sample(s) (shape=(0, 13)) while a minimum of 1 is required.
Found array with 0 sample(s) (shape=(0, 15)) while a minimum of 1 is required.
Found array with 0 sample(s) (shape=(0, 13)) while a minimum of 1 is required.
Found array with 0 sample(s) (shape=(0, 13)) while a minimum of 1 is required.
Found array with 0 sample(s) (shape=(0, 13)) while a minimum of 1 is required.
Found array with 0 sample(s) (shape=(0, 14)) while a minimum of 1 is required.
Found array with 0 sample(s) (shape=(0, 14)) while a minimum of 1 is required.
Found array with 0 sample(s) (shape=(0, 13)) while a minimum of 1 is required.
Found array with 0 sample(s) (shape=(0, 13)) while a minimum of 1 is required.
Found array with 0 sample(s) (shape=(0, 15)) while a minimum of 1 is required.
Found array with 0 sample(s) (shape=(0, 13)) while a minimum of 1 is required.


In [25]:
# One row per airport that trained successfully, built from the dicts
# appended to `results` by the training loop.
hh = pd.DataFrame(results)

## Flight info

In [18]:
# Schedule lookup table: the first row seen for each (carrier, fl_num, origin)
# combination, reduced to the schedule columns.
flight_info = data.copy().drop_duplicates(subset=['carrier', 'fl_num', 'origin'])
flight_info = flight_info[[
    'carrier', 'crs_dep_time', 'crs_elapsed_time', 'origin',
    'origin_city_name', 'dest', 'dest_city_name', 'fl_num',
]]
# Divide by 100 and truncate to int (presumably HHMM -> hour; confirm).
flight_info['crs_dep_time'] = [int(dep_time) for dep_time in flight_info['crs_dep_time'].values / 100]

In [20]:
# Persist the schedule lookup table as a pickle.
save_file('../data/flight_info.data', flight_info)

In [21]:
# Rebuild the per-carrier delay table before saving it. Labels and both count
# columns are aligned on the all-flights index; carriers without any delayed
# flight get 0 delays instead of causing a length mismatch or shifted rows.
delay_info = pd.DataFrame({
    'Carrier': airlines_group_num.index.values,
    'Number of flights': airlines_group_num.values,
    'Number of delays': airlines_group_delays_num.reindex(airlines_group_num.index, fill_value=0).values,
})
delay_info['Delay index'] = delay_info['Number of delays'] / delay_info['Number of flights']

In [22]:
# Persist the per-carrier delay table as a pickle.
save_file('../data/delays.data', delay_info)

In [23]:
# Mean carrier_delay per carrier, as a two-column frame (carrier, carrier_delay).
carrier_average_delays = (
    data.groupby(['carrier'])['carrier_delay']
    .mean()
    .reset_index()
)

In [25]:
# Persist the per-carrier average delay table as a pickle.
save_file('../data/avg_delays.data', carrier_average_delays)