In [1]:
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
import os

In [2]:
# Code to read csv file into Colaboratory:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [3]:
# URL to data file saved in google drive (output_data.csv)
link = 'https://drive.google.com/open?id=1r3FcsKcEPJHeHE2YoR82I82zvbibiK0l'

In [4]:
# Extract ID from google drive link
fluff, id = link.split('=')
print (id) # Verify that you have everything after '='

1r3FcsKcEPJHeHE2YoR82I82zvbibiK0l


In [4]:
# Load data

df3 = pd.read_csv('output_data.csv')
# Dataset is now stored in a Pandas Dataframe

df3.head()

Unnamed: 0,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,CANCELLED,CANCELLATION_CODE,...,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,country_code,airport_name,dest_longitude,dest_latitude,org_longitude,org_latitude
0,2017-01-01,AA,25,ORD,LAS,1705,1707.0,2.0,0.0,,...,,,,,US,Mc Carran Intl,-115.15225,36.080056,-87.904842,41.978603
1,2017-01-01,AA,300,ORD,LAS,1235,1232.0,-3.0,0.0,,...,,,,,US,Mc Carran Intl,-115.15225,36.080056,-87.904842,41.978603
2,2017-01-01,NK,245,ORD,LAS,1518,1507.0,-11.0,0.0,,...,,,,,US,Mc Carran Intl,-115.15225,36.080056,-87.904842,41.978603
3,2017-01-01,NK,357,ORD,LAS,927,1204.0,157.0,0.0,,...,0.0,130.0,0.0,0.0,US,Mc Carran Intl,-115.15225,36.080056,-87.904842,41.978603
4,2017-01-01,UA,599,ORD,LAS,1940,1945.0,5.0,0.0,,...,,,,,US,Mc Carran Intl,-115.15225,36.080056,-87.904842,41.978603


In [5]:
# Categorize each flight in binary terms
df3.loc[df3['DEP_DELAY'] > 0, 'DELAYED'] = 'Delayed'
df3.loc[df3['DEP_DELAY'] < 0, 'DELAYED'] = 'Not Delayed'

df3.head()

Unnamed: 0,FL_DATE,OP_CARRIER,OP_CARRIER_FL_NUM,ORIGIN,DEST,CRS_DEP_TIME,DEP_TIME,DEP_DELAY,CANCELLED,CANCELLATION_CODE,...,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,country_code,airport_name,dest_longitude,dest_latitude,org_longitude,org_latitude,DELAYED
0,2017-01-01,AA,25,ORD,LAS,1705,1707.0,2.0,0.0,,...,,,,US,Mc Carran Intl,-115.15225,36.080056,-87.904842,41.978603,Delayed
1,2017-01-01,AA,300,ORD,LAS,1235,1232.0,-3.0,0.0,,...,,,,US,Mc Carran Intl,-115.15225,36.080056,-87.904842,41.978603,Not Delayed
2,2017-01-01,NK,245,ORD,LAS,1518,1507.0,-11.0,0.0,,...,,,,US,Mc Carran Intl,-115.15225,36.080056,-87.904842,41.978603,Not Delayed
3,2017-01-01,NK,357,ORD,LAS,927,1204.0,157.0,0.0,,...,130.0,0.0,0.0,US,Mc Carran Intl,-115.15225,36.080056,-87.904842,41.978603,Delayed
4,2017-01-01,UA,599,ORD,LAS,1940,1945.0,5.0,0.0,,...,,,,US,Mc Carran Intl,-115.15225,36.080056,-87.904842,41.978603,Delayed


In [6]:
# Drop unecessary series data
data = df3.drop(["CANCELLED","OP_CARRIER","ORIGIN","DEST","OP_CARRIER_FL_NUM","DEP_TIME","DEP_DELAY","CANCELLATION_CODE","DIVERTED","CRS_ELAPSED_TIME","ACTUAL_ELAPSED_TIME","AIR_TIME","CARRIER_DELAY","WEATHER_DELAY", "NAS_DELAY", "SECURITY_DELAY", "LATE_AIRCRAFT_DELAY", "country_code", "airport_name", "org_longitude", "org_latitude"], axis=1)

# Drop rows with blank values
data2 = data.dropna()

data2

Unnamed: 0,FL_DATE,CRS_DEP_TIME,DISTANCE,dest_longitude,dest_latitude,DELAYED
0,2017-01-01,1705,1514.0,-115.152250,36.080056,Delayed
1,2017-01-01,1235,1514.0,-115.152250,36.080056,Not Delayed
2,2017-01-01,1518,1514.0,-115.152250,36.080056,Not Delayed
3,2017-01-01,927,1514.0,-115.152250,36.080056,Delayed
4,2017-01-01,1940,1514.0,-115.152250,36.080056,Delayed
...,...,...,...,...,...,...
585733,2018-12-15,1351,1219.0,-97.654389,26.228500,Not Delayed
585734,2018-12-19,1351,1219.0,-97.654389,26.228500,Not Delayed
585735,2018-12-22,1351,1219.0,-97.654389,26.228500,Not Delayed
585736,2018-12-26,1351,1219.0,-97.654389,26.228500,Not Delayed


In [7]:
# Extract month from flight date and save as new series
data2['FL_DATE']= pd.to_datetime(data2['FL_DATE'])
data2['MONTH'] = pd.DatetimeIndex(data2['FL_DATE']).month

# Drop original date column
data3 = data2.drop(["FL_DATE"],axis=1)

# Clean latitude and longitude values
decimals = 3   
data3['dest_longitude'] = data3['dest_longitude'].apply(lambda x: round(x, decimals))
data3['dest_latitude'] = data3['dest_latitude'].apply(lambda x: round(x, decimals))

feature_names = data3.columns

data3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['FL_DATE']= pd.to_datetime(data2['FL_DATE'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['MONTH'] = pd.DatetimeIndex(data2['FL_DATE']).month


Unnamed: 0,CRS_DEP_TIME,DISTANCE,dest_longitude,dest_latitude,DELAYED,MONTH
0,1705,1514.0,-115.152,36.08,Delayed,1
1,1235,1514.0,-115.152,36.08,Not Delayed,1
2,1518,1514.0,-115.152,36.08,Not Delayed,1
3,927,1514.0,-115.152,36.08,Delayed,1
4,1940,1514.0,-115.152,36.08,Delayed,1


In [8]:
# Define classification target
target = data3["DELAYED"]
target_names = ["Not Delayed", "Delayed"]

In [9]:
# Drop non-integer column before passing through model
data4 = data3.drop(['DELAYED'], axis =1)

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data4, target, random_state=42)

print(target)

0             Delayed
1         Not Delayed
2         Not Delayed
3             Delayed
4             Delayed
             ...     
585733    Not Delayed
585734    Not Delayed
585735    Not Delayed
585736    Not Delayed
585737    Not Delayed
Name: DELAYED, Length: 550740, dtype: object


In [11]:
# Support vector machine linear classifier
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model.fit(X_train[:1000], y_train[:1000])


SVC(kernel='linear')

In [12]:
print('Test Acc: %.3f' % model.score(X_test, y_test))

Test Acc: 0.641


In [13]:
# Calculate classification report
from sklearn.metrics import classification_report
predictions = model.predict(X_test)
# print(classification_report(y_test, predictions,
#                             target_names=target_names))


df = pd.DataFrame(predictions)


In [14]:
print(X_test)

        CRS_DEP_TIME  DISTANCE  dest_longitude  dest_latitude  MONTH
498868           730     137.0         -85.523         42.881     10
325664          1825     594.0         -76.763         40.193      6
488535           840     122.0         -85.552         42.235      9
262041          1915     273.0         -87.532         38.037     11
114961           633    1197.0         -80.291         25.793     12
...              ...       ...             ...            ...    ...
345872           825     286.0         -85.736         38.174     10
82095           2055     733.0         -73.873         40.777      3
248754          1525     588.0         -97.433         37.650      8
363962          1125     343.0         -81.442         40.916     10
72569           1340     612.0         -77.038         38.852      1

[137685 rows x 5 columns]


In [15]:
test1 = [[600, 925,95.341,29.984,1]]

predictionex = model.predict(test1)

print(predictionex)

['Not Delayed']


In [16]:
x_test_df = pd.DataFrame(X_test)
predictions_df = pd.DataFrame(predictions)

In [17]:
x_test_df.to_csv("x_test_file.csv")
df.to_csv("predictions.csv")

# from google.colab import files
# files.download('x_test_file.csv')
# files.download('output_data.csv')
# files.download("predictions.csv")

In [18]:
import pickle

# Saving model to current directory
# Pickle serializes objects so they can be saved to a file, and loaded in a program again later on.
pickle.dump(model, open('model.pkl','wb'))