In [1]:
import pandas as pd
import numpy as np
import csv as csv

In [2]:
# link to metadata: https://www.kaggle.com/weil41/flights/data 
airlines2015 = pd.read_csv("airlines.csv")
airports2015 = pd.read_csv("airports.csv")
flights2015 = pd.read_csv("flights.csv", dtype={"ORIGIN_AIRPORT":str, "DESTINATION_AIRPORT":str})
flights2016 = pd.read_csv("2016_cleaned.csv")
flights2017 = pd.read_csv("current_flights.csv")

In [5]:
# figure out which flights are delayed
# make a delayed variable
cols = ['DAY_OF_WEEK', 'MONTH', 'DAY', 'AIRLINE', 'DISTANCE', 'SCHEDULED_DEPARTURE', 'SCHEDULED_TIME', 'DEPARTURE_DELAY']
col_types = {'DAY_OF_WEEK': float, 'MONTH': float, 'DAY': float, 'AIRLINE': str, 'DISTANCE': float, 'SCHEDULED_DEPARTURE': float, 'SCHEDULED_TIME': float, 'DEPARTURE_DELAY': float}

flights2015 = flights2015[cols]
flights2016 = flights2016[cols]
flights2017 = flights2017[cols]

flights2015['DELAYED'] = 0.0
flights2015.loc[flights2015['DEPARTURE_DELAY'] > 10, 'DELAYED'] = 1.0

flights2016['DELAYED'] = 0
flights2016.loc[flights2016['DEPARTURE_DELAY'] > 10, 'DELAYED'] = 1.0

flights2017['DELAYED'] = 0.0
flights2017.loc[flights2017['DEPARTURE_DELAY'] > 10, 'DELAYED'] = 1.0

In [6]:
#Preprocessing Datasets
from sklearn.preprocessing import Imputer, LabelEncoder, OneHotEncoder

#Preprocessing for 2015 data
#Imputer 2015
imputer = Imputer(missing_values ='NaN', strategy = 'mean', axis = 0)
imputer = imputer.fit(flights2015.values[:,0:2])
flights2015.iloc[:, 0:2] = imputer.transform(flights2015.values[:,0:2])

#LabelEncoder 2015
labelencoder_X = LabelEncoder()
flights2015.iloc[:,0] = labelencoder_X.fit_transform(flights2015.values[:, 0])
flights2015.iloc[:,1] = labelencoder_X.fit_transform(flights2015.values[:, 1])
flights2015.iloc[:,2] = labelencoder_X.fit_transform(flights2015.values[:, 2])
flights2015.iloc[:,3] = labelencoder_X.fit_transform(flights2015.values[:, 3])

#OneHotEncoder 2015
flights2015 = flights2015.astype(float)
flights2015 = flights2015.values
flights2015 = flights2015[~np.isnan(flights2015).any(axis=1)]
categ = [cols.index(x) for x in ['DAY_OF_WEEK', 'MONTH', 'DAY', 'AIRLINE']]
enc = OneHotEncoder(categorical_features = categ)
flights2015 = enc.fit_transform(flights2015).toarray()

# #Preprocessing for 2016 data
# #Imputer 2016
# imputer = Imputer(missing_values ='NaN', strategy = 'mean', axis = 0)
# imputer = imputer.fit(flights2016.values[:,0:2])
# flights2016.iloc[:, 0:2] = imputer.transform(flights2016.values[:,0:2])

# #LabelEncoder 2016
# labelencoder_X = LabelEncoder()
# flights2016.iloc[:,3] = labelencoder_X.fit_transform(flights2016.values[:, 3])

# #OneHotEncoder 2016
# flights2016 = flights2016.astype(float)
# flights2016 = flights2016.values
# flights2016 = flights2016[~np.isnan(flights2016).any(axis=1)]
# categ = [cols.index(x) for x in ['DAY_OF_WEEK', 'MONTH', 'DAY', 'AIRLINE']]
# enc = OneHotEncoder(categorical_features = categ)
# flights2016 = enc.fit_transform(flights2016).toarray()

# #Preprocessing for 2017 data
# #Imputer 2017
# imputer = Imputer(missing_values ='NaN', strategy = 'mean', axis = 0)
# imputer = imputer.fit(flights2017.values[:,0:2])
# flights2017.iloc[:, 0:2] = imputer.transform(flights2017.values[:,0:2])

# #LabelEncoder 2017
# labelencoder_X = LabelEncoder()
# flights2017.iloc[:,3] = labelencoder_X.fit_transform(flights2017.values[:, 3])

# #OneHotEncoder 2017
# flights2017 = flights2017.astype(float)
# flights2017 = flights2017.values
# flights2017 = flights2017[~np.isnan(flights2017).any(axis=1)]
# categ = [cols.index(x) for x in ['DAY_OF_WEEK', 'MONTH', 'DAY', 'AIRLINE']]
# enc = OneHotEncoder(categorical_features = categ)
# flights2017 = enc.fit_transform(flights2015).toarray()

In [12]:
len(flights2015[0])

69

In [None]:
flights2016.shape

In [None]:
flights2017.shape

In [141]:
# Required Python Packages
import pandas as pd
import numpy as np
import pdb
import plotly.plotly as py
import plotly.graph_objs as go

from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder


def dataset_headers(dataset):
    """
    To get the dataset header names
    :param dataset: loaded dataset into pandas DataFrame
    :return: list of header names
    """
    return list(dataset.columns.values)


def unique_observations(dataset, header, method=1):
    """
    To get unique observations in the loaded pandas DataFrame column
    :param dataset:
    :param header:
    :param method: Method to perform the unique (default method=1 for pandas and method=0 for numpy )
    :return:
    """
    try:
        if method == 0:
            # With Numpy
            observations = np.unique(dataset[[header]])
        elif method == 1:
            # With Pandas
            observations = pd.unique(dataset[header].values.ravel())
        else:
            observations = None
            print("Wrong method type, Use 1 for pandas and 0 for numpy")
    except Exception as e:
        observations = None
        print("Error: {error_msg} /n Please check the inputs once..!".format(error_msg=e.message))
    return observations


def feature_target_frequency_relation(dataset, f_t_headers):

    """
    To get the frequency relation between targets and the unique feature observations
    :param dataset:
    :param f_t_headers: feature and target header
    :return: feature unique observations dictionary of frequency count dictionary
    """

    feature_unique_observations = unique_observations(dataset, f_t_headers[0])
    unique_targets = unique_observations(dataset, f_t_headers[1])

    frequencies = {}
    for feature in feature_unique_observations:
        frequencies[feature] = {unique_targets[0]: len(
            dataset[(dataset[f_t_headers[0]] == feature) & (dataset[f_t_headers[1]] == unique_targets[0])]),
            unique_targets[1]: len(
                dataset[(dataset[f_t_headers[0]] == feature) & (dataset[f_t_headers[1]] == unique_targets[1])])}
    return frequencies


def feature_target_histogram(feature_target_frequencies, feature_header):
    """
    :param feature_target_frequencies:
    :param feature_header:
    :return:
    """
    keys = feature_target_frequencies.keys()
    y0 = [feature_target_frequencies[key][0] for key in keys]
    y1 = [feature_target_frequencies[key][1] for key in keys]

    trace1 = go.Bar(
        x=keys,
        y=y0,
        name='No Delay'
    )
    trace2 = go.Bar(
        x=keys,
        y=y1,
        name='Delay'
    )
    data = [trace1, trace2]
    layout = go.Layout(
        barmode='group',
        title='Feature :: ' + feature_header + ' No Delay Vs Delay  Frequency',
        xaxis=dict(title="Feature :: " + feature_header + " classes"),
        yaxis=dict(title="Delay Frequency")
    )
    fig = go.Figure(data=data, layout=layout)
    plot_url = py.plot(fig, filename=feature_header + ' - Target - Histogram')
    py.image.save_as(fig, filename=feature_header + '_Target_Histogram.png')


def train_logistic_regression(train_x, train_y):
    """
    Training logistic regression model with train dataset features(train_x) and target(train_y)
    :param train_x:
    :param train_y:
    :return:
    """

    logistic_regression_model = LogisticRegression()
    logistic_regression_model.fit(train_x, train_y)
    return logistic_regression_model


def model_accuracy(trained_model, features, targets):
    """
    Get the accuracy score of the model
    :param trained_model:
    :param features:
    :param targets:
    :return:
    """
    accuracy_score = trained_model.score(features, targets)
    return accuracy_score


def main():
    """
    Logistic Regression classifier main
    :return:
    """
    # Load the datasets for training and testing the logistic regression classifier
    dataset = flights2015
    dataset2 = flights2016
    dataset3 = flights2017
    
    print("Flights 2015 - Number of Observations :: ", len(dataset))
    print("Flights 2016 - Number of Observations :: ", len(dataset2))
    print("Flights 2017 - Number of Observations :: ", len(dataset3))

    training_features = ['MONTH', 'DAY', 'DAY_OF_WEEK', 'DISTANCE', 'SCHEDULED_DEPARTURE', 'AIRLINE_NUMBER', 'SCHEDULED_TIME']
    target = 'DELAYED'
    
    train_x2015 = dataset[:, :58]
    train_y2015 = dataset[:, 58]
    
    print("train_x2015 size :: ", train_x2015.shape)
    print("train_y2015 size :: ", train_y2015.shape)
    
    test_x2016 = dataset2[:, :58]
    test_y2016 = dataset2[:, 58]
    
    print("test_x2016 size :: ", test_x2016.shape)
    print("test_y2016 size :: ", test_y2016.shape)
    
    #test_x2017 = dataset3[:, :58]
    #test_y2017 = dataset3[:, 58]
    
    #print("test_x2017 size :: ", test_x2017.shape)
    #print("test_y2017 size :: ", test_y2017.shape)  

    # Training Logistic regression model
    print("training regression model")
    trained_logistic_regression_model = train_logistic_regression(train_x2015, train_y2015)
    print("regression model trained")

    train_accuracy = model_accuracy(trained_logistic_regression_model, train_x2015, train_y2015)
    print("train accuracy completed")

    # Testing the logistic regression model
    test_accuracy = model_accuracy(trained_logistic_regression_model, test_x2016, test_y2016)
    print("testing accuracy completed")

    print("Train Accuracy :: ", train_accuracy)
    print("Test Accuracy :: ", test_accuracy)
    
    #print(trained_logistic_regression_model.predict(flights2017[training_features]))
    #print(trained_logistic_regression_model.predict_proba(flights2017[training_features]))
    
    
if __name__ == "__main__":
    main()

Flights 2015 - Number of Observations ::  5732920
Flights 2016 - Number of Observations ::  1829534
Flights 2017 - Number of Observations ::  3
train_x2015 size ::  (5732920, 58)
train_y2015 size ::  (5732920,)
test_x2016 size ::  (1829534, 58)
test_y2016 size ::  (1829534,)
training regression model
regression model trained
train accuracy completed
testing accuracy completed
Train Accuracy ::  0.97986226914
Test Accuracy ::  0.787166021511
