In [1]:
import pandas as pd
import numpy as np
import glob
import pickle
import concurrent.futures
from sklearn.preprocessing import MultiLabelBinarizer
import warnings
import sklearn.exceptions
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

# Seed passed as `random_state` to train_test_split so the splits are reproducible.
RND_STATE = 100412

In [2]:
def log_progress(sequence, every=None, size=None, name='Items'):
    """Yield the items of ``sequence`` while driving a notebook progress widget.

    A text label and an ``IntProgress`` bar are displayed when iteration
    starts; they are refreshed on the first item and then on every
    ``every``-th item. The bar turns to the 'success' style when the
    sequence is exhausted and to 'danger' when iteration is interrupted by
    any exception (which is re-raised).

    Parameters
    ----------
    sequence : iterable
        Items to pass through unchanged.
    every : int, optional
        Refresh interval, in items. When omitted and the size is known it
        is 1 for up to 200 items and ``int(size / 200)`` otherwise. It must
        be given when the size cannot be determined.
    size : int, optional
        Total number of items. Defaults to ``len(sequence)`` if that works.
    name : str
        Prefix of the label text.

    Yields
    ------
    object
        The items of ``sequence``, in their original order.

    Raises
    ------
    AssertionError
        If the size is unknown and ``every`` was not supplied.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # Try len() first; objects without a length are treated as open-ended.
    length_unknown = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            length_unknown = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        # Small inputs refresh on each item, larger ones about every 0.5%.
        every = 1 if size <= 200 else int(size / 200)

    if length_unknown:
        # No meaningful maximum: show a full bar in the 'info' style.
        bar = IntProgress(min=0, max=1, value=1)
        bar.bar_style = 'info'
    else:
        bar = IntProgress(min=0, max=size, value=0)
    caption = HTML()
    display(VBox(children=[caption, bar]))

    seen = 0
    try:
        for seen, item in enumerate(sequence, 1):
            # `or` short-circuits, so the first item always triggers a refresh.
            if seen == 1 or seen % every == 0:
                if length_unknown:
                    caption.value = '{name}: {index} / ?'.format(name=name, index=seen)
                else:
                    bar.value = seen
                    caption.value = u'{name}: {index} / {size}'.format(
                        name=name, index=seen, size=size
                    )
            yield item
    except BaseException:
        # Mark the bar red for any interruption, then propagate it untouched.
        bar.bar_style = 'danger'
        raise
    else:
        bar.bar_style = 'success'
        bar.value = seen
        caption.value = "{name}: {index}".format(name=name, index=str(seen or '?'))

## Configuration

In [6]:
# Folder names, presumably for the raw inputs of the project.
# NOTE(review): none of these three constants is referenced in the visible
# part of the notebook -- confirm they are still needed.
DATA_FOLDER = 'historical_data'
DICT_FOLDER = 'dictionaries'
WEATHER_FOLDER = 'weather_data'

In [None]:
# Path handed to load_file() to restore the `clf` estimator used below.
BEST_CLF = '../data/best_clf.data'

In [None]:
# Path handed to load_file() to restore the `data` frame used below.
DATA_PICKLE = '../data/merged_data.data'

In [None]:
# NOTE(review): load_file is not defined in the visible part of the notebook;
# it has to be defined before this cell can run on a fresh kernel.
data = load_file(DATA_PICKLE)

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim; one of the two
# can likely be removed.
data = load_file(DATA_PICKLE)

In [None]:
# Restore the stored estimator; it is re-fitted for each airport further down.
# NOTE(review): if load_file unpickles its input (the notebook imports pickle,
# so presumably it does), only load files from a trusted source.
clf = load_file(BEST_CLF)

## Final classifiers (one per origin airport)

In [60]:
# Sorted array of the distinct values in the 'origin' column; the training
# loop below fits one model per entry.
airports = np.unique(data['origin'].values)

In [61]:
# Train, persist and evaluate one classifier per origin airport.
#
# For every airport the matching rows are preprocessed, rows with missing
# values are dropped, and the remainder is split 80/20 into train and test
# sets. The fitted classifier and the `values_dict` returned by process_data
# are saved per airport, and the F1 scores on the test set are collected in
# `results`. An airport whose processing raises is reported and skipped so
# that one bad subset does not abort the whole run.
results = []
for airport in log_progress(airports, every=1):
    try:
        working_df = data[data['origin'] == airport]
        working_df = additional_preprocessing(working_df)
        working_df, values_dict = process_data(working_df)

        working_df = working_df.dropna()

        # Every column except the 'status' label is used as a feature.
        features = working_df.loc[:, working_df.columns != 'status']
        target = working_df['status']
        X_train, X_test, y_train, y_test = train_test_split(
            features, target, test_size=0.2, random_state=RND_STATE
        )

        clf.fit(X_train, y_train)

        save_clfs('clfs/' + airport + '.data', clf)
        save_clfs('values_dicts/' + airport + '.data', values_dict)

        y_pred = clf.predict(X_test)
        # f1_score takes (y_true, y_pred). The order matters for
        # average='weighted', which weights each class by its support in
        # y_true.
        f1_micro = f1_score(y_test, y_pred, average='micro')
        f1_weighted = f1_score(y_test, y_pred, average='weighted')
        results.append({'airport': airport, 'f1_micro': f1_micro, 'f1_weighted': f1_weighted})
    except Exception as e:
        # Best-effort run: name the airport so the failure can be traced.
        print('{}: {}'.format(airport, e))

VBox(children=(HTML(value=''), IntProgress(value=0, max=312)))

arrays must all be same length
arrays must all be same length
arrays must all be same length
arrays must all be same length
arrays must all be same length
arrays must all be same length
arrays must all be same length
arrays must all be same length
arrays must all be same length
arrays must all be same length
arrays must all be same length
Found array with 0 sample(s) (shape=(0, 11)) while a minimum of 1 is required.
arrays must all be same length
arrays must all be same length


## Flight info

In [71]:
# Build a lookup table with one row per (carrier, flight number, origin).
#
# Only the needed columns are selected before de-duplicating, so the full
# merged frame is never copied. `assign` returns a new frame, which avoids
# writing into an object derived by indexing (the chained-assignment /
# SettingWithCopy hazard).
flight_info = (
    data[['carrier', 'crs_dep_time', 'crs_elapsed_time', 'origin',
          'origin_city_name', 'dest', 'dest_city_name', 'fl_num']]
    .drop_duplicates(subset=['carrier', 'fl_num', 'origin'])
    # Divide by 100 and truncate to an integer, as int() did before.
    # NOTE(review): presumably crs_dep_time is HHMM, so this keeps the hour.
    .assign(crs_dep_time=lambda df: (df['crs_dep_time'] / 100).astype('int64'))
)

In [72]:
# Persist the flight lookup table.
# NOTE(review): save_clfs is defined outside this view; despite its name it
# is used here for a non-classifier object.
save_clfs('flight_info.data', flight_info)

In [68]:
# Per-carrier delay statistics: flight count, delay count and their ratio.
# NOTE(review): airlines_group, airlines_group_num and
# airlines_group_delays_num are defined outside the visible part of the
# notebook. The columns are combined positionally, so this assumes both
# count series are ordered like the sorted unique carriers -- confirm.
delay_info = pd.DataFrame({
    'Carrier': np.unique(airlines_group.carrier.values),
    'Number of flights': airlines_group_num.values,
    'Number of delays': airlines_group_delays_num.values,
})
# Share of delayed flights per carrier.
delay_info['Delay index'] = delay_info['Number of delays'].div(delay_info['Number of flights'])

In [70]:
# Persist the per-carrier delay table.
# NOTE(review): save_clfs is defined outside this view; despite its name it
# is used here for a non-classifier object.
save_clfs('delays.data', delay_info)

In [74]:
# Show a bounded preview instead of rendering the whole merged frame.
data.head(10)

Unnamed: 0,average_wind_speed,snowfall,snow_depth,thunder,dust,haze,snow,cancellation_code,cancelled,carrier,...,origin,origin_city_name,quarter,security_delay,tail_num,weather_delay,fog,hail,damaging_wind,status
0,32.0,0.0,89.0,0.0,0.0,0.0,0.0,E,0,OO,...,ABR,Aberdeen,1,0.0,N702BR,0.0,0,0,0,no_delay
1,32.0,0.0,89.0,0.0,0.0,0.0,0.0,E,0,OO,...,ABR,Aberdeen,1,0.0,N8968E,0.0,0,0,0,no_delay
2,43.0,0.0,64.0,0.0,0.0,0.0,0.0,E,0,OO,...,ABR,Aberdeen,1,0.0,N702BR,0.0,1,0,0,no_delay
3,43.0,0.0,64.0,0.0,0.0,0.0,0.0,E,0,OO,...,ABR,Aberdeen,1,0.0,N8968E,0.0,1,0,0,no_delay
4,35.0,0.0,0.0,0.0,0.0,0.0,0.0,E,0,EV,...,ABI,Abilene,1,0.0,N667GB,0.0,0,0,0,no_delay
5,25.0,0.0,0.0,0.0,0.0,1.0,0.0,E,0,EV,...,ABI,Abilene,1,0.0,N667GB,0.0,1,0,0,no_delay
6,9.0,0.0,191.0,0.0,0.0,0.0,0.0,E,0,AS,...,ADK,Adak Island,1,0.0,N762AS,6.0,1,0,0,no_delay
7,44.0,0.0,0.0,0.0,0.0,0.0,0.0,E,0,B6,...,BQN,Aguadilla,1,0.0,N527JB,0.0,0,0,0,no_delay
8,44.0,0.0,0.0,0.0,0.0,0.0,0.0,E,0,NK,...,BQN,Aguadilla,1,0.0,N662NK,0.0,0,0,0,no_delay
9,27.0,0.0,0.0,0.0,0.0,0.0,0.0,E,0,B6,...,BQN,Aguadilla,1,0.0,N948JB,0.0,1,0,0,no_delay
