In [1]:
import pandas as pd
import numpy as np
import glob
import pickle
import datetime
import requests
import json

In [2]:
def get_file_list(folder_name, prefix=''):
    """List the ``.data`` files that sit directly inside ``folder_name``.

    Args:
        folder_name: Directory to search (joined to the pattern with ``/``).
        prefix: Text inserted immediately before the ``.data`` extension.
            Despite its name it therefore restricts the match to file names
            that *end* with ``prefix + '.data'``. Empty by default, which
            matches every ``.data`` file.

    Returns:
        The list of matching paths exactly as ``glob.glob`` reports them
        (no ordering is imposed here).
    """
    pattern = ''.join((folder_name, '/*', prefix, '.data'))
    return glob.glob(pattern)

In [3]:
def load_file(file_name):
    """Unpickle and return the object stored in ``file_name``.

    The file is opened in binary mode and closed again before returning,
    whether or not unpickling succeeds.

    NOTE: ``pickle.load`` can execute arbitrary code embedded in the file,
    so this must only be pointed at trusted files.
    """
    handle = open(file_name, 'rb')
    try:
        payload = pickle.load(handle)
    finally:
        handle.close()
    return payload

In [4]:
def init_data(clfs_folder, cols_folder, fl_info_file, delays_file, avg_delays_file):
    """Load every pickled artifact needed to build features and predict.

    Args:
        clfs_folder: Directory holding one pickled classifier per airport,
            named ``<airport>.data``.
        cols_folder: Directory holding, for each classifier, a pickled
            column collection under the same ``<airport>.data`` name.
        fl_info_file: Path of the pickled flight-information object.
        delays_file: Path of the pickled delays object.
        avg_delays_file: Path of the pickled average-delays object.

    Returns:
        A dict with the keys ``'clfs'`` (``{airport: {'clf': ..., 'cols': ...}}``),
        ``'fl_info'``, ``'delays'`` and ``'avg_delays'``.
    """
    # Local import: ``os`` is not among the notebook's top-level imports.
    import os

    flight_info = load_file(fl_info_file)
    delays = load_file(delays_file)
    avg_delays = load_file(avg_delays_file)

    clfs = {}
    for clf_path in get_file_list(clfs_folder):
        # Derive the key from the file name itself rather than by stripping
        # the folder text from the path: string replacement silently leaves
        # the directory in the key when the folder is written with a trailing
        # slash or the platform uses a different separator.
        airport = os.path.splitext(os.path.basename(clf_path))[0]
        cols_path = os.path.join(cols_folder, airport + '.data')
        clfs[airport] = {'clf': load_file(clf_path), 'cols': load_file(cols_path)}

    return {'clfs': clfs, 'fl_info': flight_info, 'delays': delays, 'avg_delays': avg_delays}

In [5]:
def get_series_value(series):
    """Return the first element of ``series``.

    Args:
        series: Any object exposing a ``.values`` sequence (the notebook
            passes single-column selections of filtered DataFrames).

    Returns:
        ``series.values[0]``.

    Raises:
        IndexError: If the selection is empty. The exception type matches
            what plain indexing would raise, but the message states the
            actual problem instead of NumPy's generic out-of-bounds text.
    """
    values = series.values
    if len(values) == 0:
        raise IndexError('cannot take the first value of an empty selection'
                         ' (name={!r})'.format(getattr(series, 'name', None)))
    return values[0]

In [6]:
def get_wind_speed_for_city(city_name, api_key, timeout=10):
    """Fetch the current wind speed for a city from OpenWeatherMap.

    This is a best-effort lookup: any failure (network error, HTTP error
    status, unexpected payload) is reported on stdout and ``0`` is returned
    instead of raising.

    Args:
        city_name: City query string sent as the ``q`` parameter.
        api_key: OpenWeatherMap API key sent as the ``APPID`` parameter.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The reported ``wind.speed`` multiplied by 10, or ``0`` on failure.
    """
    # HTTPS so the API key in the query string is not sent in clear text.
    api_url = 'https://api.openweathermap.org/data/2.5/weather'
    try:
        r = requests.get(url=api_url, params=dict(q=city_name, APPID=api_key), timeout=timeout)
        # Surface 4xx/5xx responses as errors instead of a confusing KeyError
        # on the missing 'wind' field.
        r.raise_for_status()
        # NOTE(review): the factor 10 presumably rescales the API value to the
        # unit used by the 'average_wind_speed' training feature - confirm.
        return r.json()['wind']['speed'] * 10
    except Exception as e:
        # Only the exception type is printed: exception text coming from the
        # HTTP layer can embed the full request URL, API key included.
        print('Wind speed lookup failed for {!r} ({}); falling back to 0'.format(
            city_name, type(e).__name__))
        return 0

In [45]:
def get_features(data, weather_api_key, origin, dest, carrier, flight, date):
    """Build the one-row feature frame for a single flight.

    Args:
        data: Dict as returned by ``init_data`` (keys ``'clfs'``,
            ``'fl_info'``, ``'delays'``, ``'avg_delays'``).
        weather_api_key: API key forwarded to ``get_wind_speed_for_city``.
        origin: Origin airport code; selects both the flight row and the
            per-airport column list in ``data['clfs']``.
        dest: Destination airport code, encoded as the one-hot column
            ``'dest_' + dest``.
        carrier: Carrier code used to select the flight and delay statistics.
        flight: Flight number matched against ``fl_num``.
        date: Object with ``day``, ``month``, ``weekday()`` and ``strftime``
            (e.g. ``datetime.date`` / ``datetime.datetime``).

    Returns:
        A single-row ``DataFrame`` whose columns are exactly the origin
        model's columns minus ``'status'``, in the model's order. Columns
        that receive no value are 0. A ``dest`` the model has no column for
        is therefore encoded as all-zero destination columns rather than
        adding an extra column the classifier would not accept.

    Raises:
        KeyError: If ``origin`` has no entry in ``data['clfs']``.
        IndexError: (from ``get_series_value``) if no flight or carrier row
            matches.
    """
    # 'status' appears in the stored column lists but is dropped before
    # prediction, so it is excluded from the feature frame.
    model_columns = [col for col in data['clfs'][origin]['cols'] if col != 'status']

    flights = data['fl_info']
    flight_mask = ((flights['fl_num'] == flight)
                   & (flights['carrier'] == carrier)
                   & (flights['origin'] == origin))
    fl_info = flights[flight_mask]

    delays = data['delays']
    avg_delays = data['avg_delays']

    row = {
        'average_wind_speed': get_wind_speed_for_city(
            get_series_value(fl_info['origin_city_name']), weather_api_key),
        'crs_dep_time': get_series_value(fl_info['crs_dep_time']),
        'crs_elapsed_time': get_series_value(fl_info['crs_elapsed_time']),
        'day_of_month': date.day,
        # NOTE(review): weekday() is 0 (Monday) .. 6 (Sunday); confirm this
        # matches the encoding of 'day_of_week' in the training data.
        'day_of_week': date.weekday(),
        'month': date.month,
        # Calendar quarter 1..4. The previous `month // 4` mapped e.g.
        # April-July to one bucket and December to its own, which is not a
        # quarter under any convention.
        # NOTE(review): assumes the training data used 1-based quarters.
        'quarter': (date.month - 1) // 3 + 1,
        'previous_flight_delay': 0,
        'airline_delay_index': get_series_value(
            delays[delays['Carrier'] == carrier]['Delay index']),
        'airline_avg_delay': get_series_value(
            avg_delays[avg_delays['carrier'] == carrier]['carrier_delay']),
        'dest_' + dest: 1,
        'day_of_year': int(date.strftime("%j")),
        # The models list a 'weekend' column that was never populated, so it
        # was always 0 - even for Saturdays and Sundays.
        # NOTE(review): assumes weekend was trained as 1 for Sat/Sun, else 0.
        'weekend': int(date.weekday() >= 5),
    }

    # Build the row in one go (DataFrame.append no longer exists in
    # pandas >= 2.0); `columns=` aligns it to the model's feature order and
    # leaves unset columns as NaN, which are then zero-filled.
    return pd.DataFrame([row], columns=model_columns).fillna(0)

In [46]:
# Load the per-airport classifiers, their column lists and the lookup tables.
info_data = init_data(
    clfs_folder='../data/clfs',
    cols_folder='../data/values_dicts',
    fl_info_file='../data/flight_info.data',
    delays_file='../data/delays.data',
    avg_delays_file='../data/avg_delays.data',
)

In [47]:
# Flight date used for the sample prediction below (1999-08-28, a Saturday).
dt = datetime.datetime(year=1999, month=8, day=28)

In [48]:
# Feature row for carrier OO, flight 7363, ABR -> MSP on the sample date.
# An empty weather API key is passed; presumably the wind lookup then fails
# and falls back to 0 - confirm if the wind feature matters for this check.
test_sample = get_features(
    data=info_data,
    weather_api_key='',
    origin='ABR',
    dest='MSP',
    carrier='OO',
    flight=7363,
    date=dt,
)

In [49]:
# Show the feature columns of the sample row, for comparison with the
# column list stored alongside the classifier.
test_sample.columns

Index(['average_wind_speed', 'crs_dep_time', 'crs_elapsed_time',
       'day_of_month', 'day_of_week', 'month', 'quarter',
       'previous_flight_delay', 'airline_delay_index', 'airline_avg_delay',
       'dest_MSP', 'day_of_year', 'weekend'],
      dtype='object')

In [50]:
# Inspect the classifier and stored column list loaded for origin airport ABR.
info_data['clfs']['ABR']

{'clf': RandomForestClassifier(bootstrap=False, class_weight='balanced',
             criterion='entropy', max_depth=64, max_features=7,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=40, n_jobs=1, oob_score=False,
             random_state=100412, verbose=0, warm_start=False),
 'cols': Index(['average_wind_speed', 'crs_dep_time', 'crs_elapsed_time',
        'day_of_month', 'day_of_week', 'month', 'quarter', 'status',
        'previous_flight_delay', 'airline_delay_index', 'airline_avg_delay',
        'dest_MSP', 'day_of_year', 'weekend'],
       dtype='object')}

In [51]:
# Run the ABR classifier on the sample feature row.
info_data['clfs']['ABR']['clf'].predict(test_sample)

array(['no_delay'], dtype=object)