In [1]:
# Use the Azure Machine Learning data preparation package
from azureml.dataprep import package

# Use the Azure Machine Learning data collector to log various metrics
from azureml.logging import get_azureml_logger
logger = get_azureml_logger()


In [2]:
# Use Azure Machine Learning history magic to control history collection
# History is off by default, options are "on", "off", or "show"
# %azureml history on


In [20]:
# This call will load the referenced package and return a DataFrame.
# If run in a PySpark environment, this call returns a
# Spark DataFrame. If not, it returns a Pandas DataFrame.
df = package.run('flight_data_prepare.dprep', dataflow_idx=0)

# Remove this line and add code that uses the DataFrame
df.head(10)


Unnamed: 0,ARR_DELAY,ARR_HOUR,CANCELLED,CARRIER,DAY_OF_MONTH,DAY_OF_WEEK,DEP_DELAY,DEP_HOUR,DEST,DISTANCE_GROUP,DIVERTED,ORIGIN
0,-19.0,21,0.0,OO,4.0,3.0,-5.0,20,CID,1.0,0.0,MSP
1,46.0,16,0.0,OO,9.0,1.0,45.0,15,RKS,2.0,0.0,DEN
2,0.0,14,0.0,WN,25.0,3.0,-3.0,13,ALB,2.0,0.0,BWI
3,-40.0,21,0.0,AA,2.0,1.0,-5.0,18,DFW,6.0,0.0,PHL
4,22.0,12,0.0,UA,30.0,1.0,30.0,10,IAH,2.0,0.0,MSY
5,-11.0,14,0.0,UA,8.0,7.0,-2.0,13,SAT,1.0,0.0,IAH
6,-11.0,8,0.0,DL,16.0,1.0,3.0,5,ATL,4.0,0.0,MSP
7,-13.0,20,0.0,WN,24.0,2.0,0.0,19,HOU,4.0,0.0,MCO
8,-20.0,9,0.0,B6,18.0,3.0,-12.0,8,EWR,1.0,0.0,BOS
9,7.0,10,0.0,UA,19.0,4.0,-7.0,7,MIA,4.0,0.0,IAH


In [21]:
import numpy as np
import pandas as pd
import itertools

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras import utils


In [22]:
df = df.drop(['CANCELLED', 'DIVERTED'], axis=1)

In [23]:
df['DAY_OF_WEEK'] = df['DAY_OF_WEEK'].astype(int)
df['DAY_OF_MONTH'] = df['DAY_OF_MONTH'].astype(int)

In [24]:
#lets create delayed label
#if flights are delayed more than 10minutes
df['DELAYED'] = np.where(df['ARR_DELAY']>=10, 1, 0)
df.head()

Unnamed: 0,ARR_DELAY,ARR_HOUR,CARRIER,DAY_OF_MONTH,DAY_OF_WEEK,DEP_DELAY,DEP_HOUR,DEST,DISTANCE_GROUP,ORIGIN,DELAYED
0,-19.0,21,OO,4,3,-5.0,20,CID,1.0,MSP,0
1,46.0,16,OO,9,1,45.0,15,RKS,2.0,DEN,1
2,0.0,14,WN,25,3,-3.0,13,ALB,2.0,BWI,0
3,-40.0,21,AA,2,1,-5.0,18,DFW,6.0,PHL,0
4,22.0,12,UA,30,1,30.0,10,IAH,2.0,MSY,1


In [25]:
df= df.drop(['ARR_DELAY', 'DEP_DELAY'], axis=1)

In [26]:
dummy_fields = ['ARR_HOUR', 'DEP_HOUR', 'DAY_OF_WEEK', 'DEST', 'ORIGIN', 'DISTANCE_GROUP', 'CARRIER']
for each in dummy_fields:
    dummies = pd.get_dummies(df[each], prefix=each, drop_first=False)
    df=pd.concat([df, dummies], axis=1)

df.head(10)

Unnamed: 0,ARR_HOUR,CARRIER,DAY_OF_MONTH,DAY_OF_WEEK,DEP_HOUR,DEST,DISTANCE_GROUP,ORIGIN,DELAYED,ARR_HOUR_0,...,CARRIER_B6,CARRIER_DL,CARRIER_EV,CARRIER_F9,CARRIER_HA,CARRIER_NK,CARRIER_OO,CARRIER_UA,CARRIER_VX,CARRIER_WN
0,21,OO,4,3,20,CID,1.0,MSP,0,0,...,0,0,0,0,0,0,1,0,0,0
1,16,OO,9,1,15,RKS,2.0,DEN,1,0,...,0,0,0,0,0,0,1,0,0,0
2,14,WN,25,3,13,ALB,2.0,BWI,0,0,...,0,0,0,0,0,0,0,0,0,1
3,21,AA,2,1,18,DFW,6.0,PHL,0,0,...,0,0,0,0,0,0,0,0,0,0
4,12,UA,30,1,10,IAH,2.0,MSY,1,0,...,0,0,0,0,0,0,0,1,0,0
5,14,UA,8,7,13,SAT,1.0,IAH,0,0,...,0,0,0,0,0,0,0,1,0,0
6,8,DL,16,1,5,ATL,4.0,MSP,0,0,...,0,1,0,0,0,0,0,0,0,0
7,20,WN,24,2,19,HOU,4.0,MCO,0,0,...,0,0,0,0,0,0,0,0,0,1
8,9,B6,18,3,8,EWR,1.0,BOS,0,0,...,1,0,0,0,0,0,0,0,0,0
9,10,UA,19,4,7,MIA,4.0,IAH,0,0,...,0,0,0,0,0,0,0,1,0,0


In [27]:
df = df.drop(dummy_fields, axis=1)

In [28]:
#lets setup test and training data
train_size = int(len(df) * .8)
print(train_size)

7806


In [29]:
data = df[df.columns[:-1]]
labels = df['DELAYED']

In [30]:
labels = utils.to_categorical(labels, 2)

In [31]:
train_data = data[:train_size]
train_labels = labels[:train_size]
test_data = data[train_size:]
test_labels = labels[train_size:]

In [32]:
#lets build the model
batch_size = 100
epochs=3
train_data.shape

(7806, 603)

In [39]:
model=Sequential()
model.add(Dense(512,input_shape=(603,)))
model.add(Activation('sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(2, ))
model.add(Activation('softmax'))

In [40]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [41]:
history = model.fit(train_data, train_labels,
                   batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1
                   )

Train on 7025 samples, validate on 781 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [42]:
score = model.evaluate(test_data, test_labels, batch_size=batch_size, verbose=1)



In [44]:
print('Test score: ', score[0])
print('Test accuracy: ', score[1])

Test score:  0.28902789242
Test accuracy:  0.847848357358


In [45]:
predictions = model.predict(test_data)

In [47]:
results = np.argmax(predictions, 1)
actual = np.argmax(test_labels, 1)

In [48]:
# True Positive (TP): we predict a label of 1 (positive), and the true label is 1.
TP = np.sum(np.logical_and(results == 1, actual == 1))
 
# True Negative (TN): we predict a label of 0 (negative), and the true label is 0.
TN = np.sum(np.logical_and(results == 0, actual == 0))
 
# False Positive (FP): we predict a label of 1 (positive), but the true label is 0.
FP = np.sum(np.logical_and(results == 1, actual == 0))
 
# False Negative (FN): we predict a label of 0 (negative), but the true label is 1.
FN = np.sum(np.logical_and(results == 0, actual == 1))
 
print('TP: %i, FP: %i, TN: %i, FN: %i' % (TP,FP,TN,FN))

TP: 238, FP: 0, TN: 1417, FN: 297
