In [1]:
import datetime
import glob
import os
import pickle

import numpy as np
import pandas as pd

In [2]:
def get_file_list(folder_name, prefix=''):
    """Return the paths of the ``.data`` files directly inside ``folder_name``.

    Parameters
    ----------
    folder_name : str
        Directory to search (not recursive).
    prefix : str, optional
        Text inserted between the ``*`` wildcard and the ``.data`` extension.
        Despite its name it therefore filters on how the file stem *ends*.

    Returns
    -------
    list of str
        Matching paths, in whatever order ``glob.glob`` yields them.
    """
    pattern = ''.join([folder_name, '/*', prefix, '.data'])
    return glob.glob(pattern)

In [3]:
def load_file(file_name):
    """Unpickle and return the object stored in ``file_name``.

    The file is opened in binary mode and closed again before returning.

    SECURITY: ``pickle.load`` can execute arbitrary code embedded in the
    file, so only call this on files you produced yourself or otherwise trust.
    """
    with open(file_name, 'rb') as handle:
        payload = pickle.load(handle)
    return payload

In [4]:
def init_data(clfs_folder, cols_folder, fl_info_file, delays_file):
    """Load everything needed to build feature rows and run the classifiers.

    Parameters
    ----------
    clfs_folder : str
        Folder holding one pickled classifier per airport, named
        ``<airport>.data``.
    cols_folder : str
        Folder holding, under the same ``<airport>.data`` name, the pickled
        column collection that belongs to that airport's classifier.
    fl_info_file : str
        Pickle file with the flight information table.
    delays_file : str
        Pickle file with the per-carrier delay table.

    Returns
    -------
    dict
        ``{'clfs': {airport: {'clf': ..., 'cols': ...}}, 'fl_info': ...,
        'delays': ...}``.
    """
    flight_info = load_file(fl_info_file)
    delays = load_file(delays_file)

    clfs = {}
    for clf_path in get_file_list(clfs_folder):
        # Take the airport code from the file name itself instead of
        # string-replacing the folder prefix: that keeps working when
        # ``clfs_folder`` has a trailing slash or when glob returns
        # OS-specific separators, and it only strips the final extension.
        airport = os.path.splitext(os.path.basename(clf_path))[0]
        cols_path = os.path.join(cols_folder, airport + '.data')
        clfs[airport] = {'clf': load_file(clf_path), 'cols': load_file(cols_path)}

    return {'clfs': clfs, 'fl_info': flight_info, 'delays': delays}

In [5]:
def get_series_value(series):
    """Return the first element (by position, not by label) of ``series``.

    Raises ``IndexError`` when the series is empty.
    """
    underlying = series.values
    return underlying[0]

In [18]:
def _lookup_single(frame, column, description):
    """Return the first value of ``column`` in ``frame``.

    Raises ``IndexError`` with a readable message when ``frame`` has no rows.
    """
    if frame.empty:
        raise IndexError('no rows found for ' + description)
    return frame[column].values[0]


def get_features(data, origin, dest, carrier, flight, wind_speed, date):
    """Build the single-row feature frame for one flight.

    Parameters
    ----------
    data : dict
        Structure with the keys ``'clfs'``, ``'fl_info'`` and ``'delays'``
        (the shape returned by ``init_data``).
    origin, dest, carrier : str
        Origin airport, destination airport and carrier codes.
    flight : int
        Flight number, matched against ``fl_info['fl_num']``.
    wind_speed : number
        Stored as ``average_wind_speed``.
    date : datetime.date or datetime.datetime
        Departure date; month, quarter, day of month and day of year are
        derived from it.

    Returns
    -------
    pandas.DataFrame
        One row whose columns are the origin classifier's columns without
        ``'status'``; columns not set here are filled with 0. A ``dest_*`` or
        ``carrier_*`` indicator that is not among those columns is left out
        instead of being added as an extra column.

    Raises
    ------
    KeyError
        If ``origin`` has no entry in ``data['clfs']``.
    IndexError
        If no flight-info row matches (flight, carrier, origin) or the
        carrier has no row in the delays table.
    """
    schedule = data['fl_info']
    matches = schedule[(schedule['fl_num'] == flight)
                       & (schedule['carrier'] == carrier)
                       & (schedule['origin'] == origin)]
    trained_cols = data['clfs'][origin]['cols']

    flight_desc = 'flight {} of carrier {} from {}'.format(flight, carrier, origin)
    delays = data['delays']
    carrier_delays = delays[delays['Carrier'] == carrier]

    record = {
        'average_wind_speed': wind_speed,
        'crs_dep_time': _lookup_single(matches, 'crs_dep_time', flight_desc),
        'crs_elapsed_time': _lookup_single(matches, 'crs_elapsed_time', flight_desc),
        'dest_' + dest: 1,
        'carrier_' + carrier: 1,
        'month': date.month,
        # Calendar quarter 1..4 (Jan-Mar -> 1, ..., Oct-Dec -> 4). A plain
        # ``month // 4`` yields 0..3 with uneven buckets.
        # NOTE(review): assumes the classifiers were trained with quarter in
        # 1..4 -- confirm against the training code.
        'quarter': (date.month - 1) // 3 + 1,
        'day_of_month': date.day,
        'day_of_year': date.timetuple().tm_yday,
        'airline_delay_index': _lookup_single(
            carrier_delays, 'Delay index', 'carrier {} in delays table'.format(carrier)),
    }

    # 'status' is dropped so that only feature columns remain.
    feature_cols = [col for col in trained_cols if col != 'status']
    # DataFrame.append was removed in pandas 2.0; build the row directly and
    # align it to the classifier's columns instead.
    features = pd.DataFrame([record]).reindex(columns=feature_cols)
    return features.fillna(0)

In [19]:
# Load the classifiers, their column lists, the flight-info table and the
# delays table from paths relative to the notebook's working directory.
info_data = init_data(
    clfs_folder='clfs',
    cols_folder='values_dicts',
    fl_info_file='flight_info.data',
    delays_file='delays.data',
)

In [20]:
# Departure date used for the sample feature row below.
dt = datetime.datetime(year=1999, month=8, day=28)

In [21]:
# Build one feature row: carrier OO, flight 7363, ABR -> MSP, zero wind speed.
test_sample = get_features(
    info_data,
    origin='ABR',
    dest='MSP',
    carrier='OO',
    flight=7363,
    wind_speed=0,
    date=dt,
)