In [25]:
#imports
import pandas as pd
import numpy as np
import warnings
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score,mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.decomposition import PCA
import pickle as pkl
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LinearRegression, ElasticNet


#cohen kappa for scoring
def cohen_kappa_scorer(estimator, X, y):
    y_pred = estimator.predict(X)
    return cohen_kappa_score(y, y_pred)


#convert numeric response to categorical
def categorize_delays(delays):
    result = np.where(delays < 30, 'ontime',
             np.where((delays >= 30) & (delays <= 120), 'minordelay',
             np.where(delays > 120, 'majordelay', delays)))
    return result


#match columns of original dataset
def match_cols(data, cols):
    for col in cols:
        if col not in data.columns:
            data[col] = 0

    new_data = data[cols]
    return new_data


#returns x,y
def get_data(path):
    flights = pd.read_csv(path)
    planes = pd.read_csv('planes.csv')
    weather = pd.read_csv('weather.csv')

    #Get columns from model fit from pkl
    with open('model_cols.pkl','rb') as f:
        model_cols = list(pkl.load(f))

    #impute weather
    weather_orig = weather['origin']
    weather = weather.drop(['wind_gust','origin','time_hour','year'],axis=1)
    imputer = IterativeImputer(sample_posterior=True)
    weather = pd.DataFrame(imputer.fit_transform(weather), columns=weather.columns)
    weather.insert(0, 'origin', weather_orig)

    #impute airplanes
    planes = planes.drop('speed',axis=1)
    year_by_model = planes.groupby('model')['year'].first()
    planes['year'] = planes['year'].fillna(planes['model'].map(year_by_model)) #still some missing... use median
    planes['year'] = planes['year'].fillna(planes['year'].median())


    ##### New variables #####
    #delay severity
    flights['delay_severity'] = categorize_delays(flights['dep_delay'])

    #existance of a delay
    flights['is_delayed'] = np.where(flights['delay_severity'] == 'ontime', 0, 1)

    #snowing category
    weather['snowing'] = (weather['precip'] > 0) & (weather['temp'] <= 32).astype(int)

    #day of week + weekend category (F-M)
    flights['date'] = pd.to_datetime(flights[['year', 'month', 'day']])
    flights['day_of_week'] = flights['date'].dt.day_name()
    flights['is_weekend'] = flights['day_of_week'].isin(['Friday', 'Saturday', 'Sunday', 'Monday']).astype(int)

    #peak dates (Thanksgiving (11/28), Christmas, Memorial Day (5/27), July Fourth, and Labor Day(9/2)) pm 5 days
    peak_dates = pd.to_datetime(['2013-11-28', '2013-12-25', '2013-07-04', '2013-05-27', '2013-09-02'])

    peak_weeks = pd.DataFrame() #get 5 days before/after
    for date in peak_dates:
        date_range = pd.date_range(start=date - pd.Timedelta(days=5), 
                                end=date + pd.Timedelta(days=5))
        peak_weeks = pd.concat([peak_weeks, pd.DataFrame({'date': date_range})], ignore_index=True)
        
    flights['peak_week'] = flights['date'].isin(peak_weeks['date']).astype(int)

    #peak times (6PM-9PM)
    flights['peak_time'] = flights['hour'].between(18, 21)
    flights['peak_time'] = flights['peak_time'].astype(int)

    #prior airline, origin, and destination delays (takes 2 min to run)
    print('Getting new variables (1/3)')
    flights['date'] = pd.to_datetime(flights[['year', 'month', 'day', 'hour', 'minute']])

    flights['carrier_delay'] = flights.apply(
        lambda row: flights[(flights['carrier'] == row['carrier']) & 
                            (flights['date'] <= row['date']) & 
                            (flights['date'] > row['date'] - pd.Timedelta(hours=48))]['dep_delay'].mean(), axis=1)

    print('Getting new variables (2/3)')
    flights['origin_delay'] = flights.apply(
        lambda row: flights[(flights['origin'] == row['origin']) & 
                            (flights['date'] <= row['date']) & 
                            (flights['date'] > row['date'] - pd.Timedelta(hours=48))]['dep_delay'].mean(), axis=1)

    print('Getting new variables (3/3)')
    flights['dest_delay'] = flights.apply(
        lambda row: flights[(flights['dest'] == row['dest']) & 
                            (flights['date'] <= row['date']) & 
                            (flights['date'] > row['date'] - pd.Timedelta(hours=48))]['dep_delay'].mean(), axis=1)

    flights['carrier_delay'] = categorize_delays(flights['carrier_delay'])
    flights['carrier_delay'] = np.where(flights['carrier_delay'] == 'ontime', 0, 1)

    flights['origin_delay'] = categorize_delays(flights['origin_delay'])
    flights['origin_delay'] = np.where(flights['origin_delay'] == 'ontime', 0, 1)

    flights['dest_delay'] = categorize_delays(flights['dest_delay'])
    flights['dest_delay'] = np.where(flights['dest_delay'] == 'ontime', 0, 1)

    #number of flights leaving airport same day
    flights['flight_volume'] = flights.apply(
        lambda row: len(flights[(flights['origin'] == row['origin']) & 
                            (flights['year'] == row['year']) & 
                            (flights['month'] == row['month']) & 
                            (flights['day'] == row['day'])]),axis=1)

    #create final dataset
    flights = pd.merge(flights, weather, on=['month', 'day', 'hour', 'origin'])

    planes['year_manufactured'] = planes['year']
    planes = planes.drop('year',axis=1)
    flights = pd.merge(flights, planes, on='tailnum')

    #responses
    ys = flights[['dep_delay', 'delay_severity', 'is_delayed']]

    flights = flights.drop(['arr_time', 'arr_delay', 'flight','date','tailnum','air_time',
                            'year', 'month', 'day', 'dest', 'dep_time',
                            'dep_delay', 'delay_severity', 'is_delayed'],axis=1)
    
    #predictors
    x = pd.get_dummies(flights,dtype=int)

    #match columns to original data
    x = match_cols(x, model_cols)
    
    return x,ys


#Function to input delay data and return predictions
def predict_delays(path, verbose = True):
    x_input, y_input = get_data(path)
    
    with open('model_cols.pkl','rb') as f:
        model = pkl.load(f)

    if verbose: print('Fitting model...')
    preds = model.predict(x_input)
    
    if verbose: print(classification_report(y_input,preds))
    
    return preds

In [22]:
#Do this for data used in model
test_data = pd.read_csv('flight_data_full.csv')
test_data = test_data.drop(['Unnamed: 0','air_time','year', 'month', 'day', 'minute', 'dest', 'dep_time', 'dep_delay', 'delay_severity', 'is_delayed'],axis=1)
mod_cols = test_data.columns

with open('model_cols.pkl','wb') as f:
    pkl.dump(mod_cols,f)
    #can put multiple into 1 file

with open('model_cols.pkl','rb') as f:
    test = list(pkl.load(f))
    #can load in same order

print(test)

['carrier', 'origin', 'distance', 'hour', 'day_of_week', 'is_weekend', 'peak_week', 'peak_time', 'carrier_delay', 'origin_delay', 'dest_delay', 'flight_volume', 'temp', 'dewp', 'humid', 'wind_dir', 'wind_speed', 'precip', 'pressure', 'visib', 'freezing', 'type', 'manufacturer', 'model', 'engines', 'seats', 'engine', 'year_manufactured']


In [26]:
results = predict_delays('flights_set1.csv')

Getting new variables (1/3)
Getting new variables (2/3)
Getting new variables (3/3)
Fitting model...


AttributeError: 'Index' object has no attribute 'predict'