In [None]:
!curl https://topcs.blob.core.windows.net/public/FlightData.csv -o flightdata.csv

In [None]:
import pandas as pd
# Load the downloaded CSV into a DataFrame and preview its first rows.
df = pd.read_csv("flightdata.csv")
df.head()

In [None]:
# True if any cell anywhere in the DataFrame is missing.
df.isnull().values.any()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Keep only the date/time and airport columns plus ARR_DEL15, which is the
# column later used as the prediction target.
df = df[["MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK", "ORIGIN", "DEST", "CRS_DEP_TIME", "ARR_DEL15"]]

In [None]:
# Show the first rows that contain at least one missing value.
df[df.isnull().values.any(axis=1)].head()

In [None]:
# Replace missing ARR_DEL15 values with 1; other columns are not filled.
# NOTE(review): presumably 1 marks a delayed arrival, so rows without an
# arrival record are treated as delayed — confirm against the dataset docs.
df = df.fillna({"ARR_DEL15" : 1})
# Display positional rows 175 through 185 of the filled frame.
df.iloc[175:186]

In [None]:
# Preview the first rows of the current frame.
df.head()

In [None]:
import math
# Reduce the scheduled departure time to its hour component by dropping the
# last two digits (e.g. 1430 -> 14). A single vectorized floor division
# replaces the per-row iterrows()/df.loc loop and gives the same values.
# NOTE: CRS_DEP_TIME is overwritten in place, so re-running this cell divides
# by 100 a second time; re-run from the loading cell instead.
df["CRS_DEP_TIME"] = df["CRS_DEP_TIME"] // 100

df.head()

In [None]:
# One-hot encode ORIGIN and DEST: each is replaced by indicator columns named
# ORIGIN_<code> / DEST_<code>, the names predict_delay builds its input with.
df = pd.get_dummies(df, columns = ["ORIGIN", "DEST"])
df.head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing. ARR_DEL15 is the target; every other
# column is a feature. random_state fixes the shuffle so the split repeats.
train_x, test_x, train_y, test_y = train_test_split(df.drop('ARR_DEL15', axis = 1), df['ARR_DEL15'], test_size = 0.2, random_state = 6)


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest classifier with a fixed seed for repeatable fits.
model = RandomForestClassifier(random_state = 42)

# Fit on the training split only; the test split stays unseen.
model.fit(train_x, train_y)

In [None]:
# Class predictions for the held-out rows (reused by the confusion matrix cell).
predicted = model.predict(test_x)

# Classifier score (mean accuracy) on the test split.
model.score(test_x, test_y)

In [None]:
from sklearn.metrics import roc_auc_score
# Per-class probabilities for the test rows, one column per class.
probabilities = model.predict_proba(test_x)

In [None]:
# ROC AUC on the test split, scored with the second probability column (index 1).
roc_auc_score(test_y, probabilities[:, 1])

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true vs. predicted labels on the test split.
confusion_matrix(test_y, predicted)


In [None]:
from sklearn.metrics import precision_score
# NOTE(review): precision is computed on the training rows, not on the
# held-out test split the accuracy/AUC/confusion-matrix cells use — confirm
# this is intended, since training-set scores are typically optimistic.
training_predictions = model.predict(train_x)
precision_score(train_y, training_predictions)

In [None]:
from sklearn.metrics import recall_score
# Recall on the training rows, reusing training_predictions from the
# precision cell.
# NOTE(review): this is a training-set score, not a test-set one — confirm intended.
recall_score(train_y, training_predictions)

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default styling to subsequent matplotlib figures.
sns.set()

In [None]:
from sklearn.metrics import roc_curve
# ROC curve on the test split from the index-1 probability column; the
# thresholds returned by roc_curve are discarded.
fpr, tpr, _ = roc_curve(test_y, probabilities[:, 1])
plt.plot(fpr, tpr)
# Dashed grey diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot([0, 1], [0, 1], color='grey', lw=1, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

In [None]:
def predict_delay(departure_date_time, origin, destination):
    """Estimate the probability that a flight arrives on time.

    Builds a single-row feature frame with the same column names and order
    the model was trained on (MONTH, DAY_OF_MONTH, DAY_OF_WEEK, CRS_DEP_TIME,
    then the ORIGIN_* and DEST_* indicator columns) and asks the notebook's
    fitted ``model`` for class probabilities.

    Args:
        departure_date_time: Scheduled departure as a string parsed with the
            format '%d/%m/%Y %H:%M:%S' (day/month/year hour:minute:second).
        origin: Origin airport code; upper-cased before matching.
        destination: Destination airport code; upper-cased before matching.

    Returns:
        The first-column probability from ``model.predict_proba`` (printed as
        the on-time probability), or an error message string when the
        date/time cannot be parsed.
    """
    from datetime import datetime

    try:
        departure = datetime.strptime(departure_date_time, '%d/%m/%Y %H:%M:%S')
    except ValueError as e:
        # Include the parser's message; the original format string had no
        # placeholder, so the detail was silently dropped.
        return "Error parsing the date and/or the time: {}".format(e)

    origin = origin.upper()
    destination = destination.upper()

    # Airport codes that have indicator columns; any other code yields all
    # zeros for that group.
    airport_codes = ('ATL', 'DTW', 'JFK', 'MSP', 'SEA')

    # The key must be DAY_OF_MONTH (not DAY) to match the training column.
    features = {
        'MONTH': departure.month,
        'DAY_OF_MONTH': departure.day,
        'DAY_OF_WEEK': departure.isoweekday(),
        'CRS_DEP_TIME': departure.hour,
    }
    for code in airport_codes:
        features[f'ORIGIN_{code}'] = 1 if origin == code else 0
    for code in airport_codes:
        features[f'DEST_{code}'] = 1 if destination == code else 0

    # Row 0 is the only row; column 0 is the first class reported by the model.
    prediction = model.predict_proba(pd.DataFrame([features]))[0][0]
    print(f"The probability your flight will arrive at {destination} on time is: {prediction * 100}%")
    return prediction

In [None]:
def main():
    """Prompt for a flight's details and print its on-time prediction.

    Reads the departure date/time and the origin and destination airport
    codes from ``input()``, strips surrounding whitespace (a stray space would
    otherwise break date parsing or airport matching), upper-cases the codes,
    and passes them to ``predict_delay``. If ``predict_delay`` returns an
    error message string instead of a probability, that message is printed so
    the failure is visible to the user.
    """
    print("This is a program which uses machine learning to determine the likelihood that your flight will be late. ")
    print("Please note this model only accepts the following airport code: ATL, DTW, JFK, MSP, SEA. ")
    date = input("Enter your flight takeoff details in the following format: 'Day/Month/Year Hour:Minute:Second'").strip()
    origin = input("What is the airport code of your takeoff? ").strip().upper()
    dest = input("What is the airport code of your arrival? ").strip().upper()

    result = predict_delay(date, origin, dest)
    # predict_delay reports a parse failure by returning a string; surface it
    # rather than discarding it.
    if isinstance(result, str):
        print(result)

In [None]:
# Start the interactive prompt. main() reads its answers with input(), so
# execution waits here until the user responds.
main()