In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

import matplotlib.pyplot as plt
from matplotlib import rcParams

import sklearn
from sklearn import decomposition
from sklearn.decomposition import PCA
from sklearn import linear_model, cross_validation, metrics, svm, ensemble
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support, accuracy_score
from sklearn.cross_validation  import train_test_split, cross_val_score, ShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder

%matplotlib inline
rcParams['figure.figsize'] = 9,5
sb.set_style('whitegrid')

In [4]:
# Load the three nycflights13 tables and join them into one modelling frame.
#
# The directory is a raw string: in an ordinary literal the "\U" of
# "C:\Users" starts a unicode escape, which is a SyntaxError on Python 3.
# Change DATA_DIR to point at wherever the CSV files live on your machine.
DATA_DIR = r"C:\Users\Sim Keng Ying\Desktop"

flights = pd.read_csv(DATA_DIR + "/nycflights13_flights.csv")
weather = pd.read_csv(DATA_DIR + "/nycflights13_weather.csv")
airports = pd.read_csv(DATA_DIR + "/nycflights13_airport.csv")

# Left-join the weather observation for each flight's date and hour.
# NOTE(review): if the weather file holds one row per origin airport and hour,
# 'origin' presumably belongs in the join key as well, otherwise each flight is
# matched to several weather rows -- confirm against the CSV columns.
df_with_weather = pd.merge(flights, weather, how='left', on=['year','month', 'day', 'hour'])

# Left-join airport attributes by matching the flight's 'dest' to the airport 'faa' code.
df_overall = pd.merge(df_with_weather, airports, how='left', left_on='dest', right_on='faa')

# Keep only rows that have no missing value in any column.
df_overall = df_overall.dropna()

Using logistic regression to classify flights as delayed or not delayed, where a flight counts as delayed if its departure delay is at least 15 minutes.

In [9]:
# Build the feature matrix and target, then create a 70/30 train/test split.
pred = 'dep_delay'
# NOTE(review): dep_time, arr_time and air_time are presumably only known once
# the flight has operated, so they may leak the target -- confirm they are intended.
features =  ['month','day','dep_time','arr_time','carrier','dest','air_time','distance', 
             'lat', 'lon', 'alt',  'dewp', 'humid', 'wind_speed', 'wind_gust', 
             'precip', 'pressure', 'visib' ]

# .copy() makes features_v an independent frame, so the column assignments
# below do not write into a slice of df_overall and no warning has to be
# silenced globally.
features_v = df_overall[features].copy()
pred_v = df_overall[pred]

# Replace the two string columns with integer codes (one code per distinct value).
features_v['carrier'] = pd.factorize(features_v['carrier'])[0]
features_v['dest'] = pd.factorize(features_v['dest'])[0]

# Threshold in minutes: a departure delay of at least this much counts as "delayed".
how_late_is_late = 15.0

# Split BEFORE scaling so that the scaler's mean/variance are learned from the
# training rows only; fitting it on all rows would leak test-set statistics
# into training. The row partition depends only on the number of rows and
# random_state, so it is the same as when splitting the scaled matrix.
raw_train, raw_test, pred_train, pred_test = train_test_split(
    features_v, pred_v, test_size=0.30, random_state=0)

scaler = StandardScaler()
features_train = scaler.fit_transform(raw_train)
features_test = scaler.transform(raw_test)

# Full feature matrix scaled with the training-set statistics.
scaled_features_v = scaler.transform(features_v)

In [16]:
# Binary training target: 1 where the departure delay reaches the threshold, else 0.
late_train = np.where(pred_train >= how_late_is_late, 1, 0)

# Fit a logistic-regression classifier with default settings on the training split.
clf_lr = sklearn.linear_model.LogisticRegression()
logistic_fit = clf_lr.fit(features_train, late_train)

# Predicted 0/1 labels for the held-out test split.
predictions = clf_lr.predict(features_test)

Report

In [19]:
# True 0/1 labels for the test split, computed once and reused below
# (1 = departure delay at or above the threshold).
late_test = np.where(pred_test >= how_late_is_late, 1, 0)

# Confusion matrix: true labels first, predicted labels second.
cm_lr = confusion_matrix(late_test, predictions)
print("Confusion matrix")
print(pd.DataFrame(cm_lr))

# Precision, recall and F1 for the positive ("delayed") class.
report_lr = precision_recall_fscore_support(late_test, predictions, average='binary')
precision_lr, recall_lr, f1_lr = report_lr[0], report_lr[1], report_lr[2]

# Overall fraction of correctly classified flights.
accuracy_lr = accuracy_score(late_test, predictions)

# Print all four metrics on one line.
summary_line = ("\nprecision = %0.2f, recall = %0.2f, F1 = %0.2f, accuracy = %0.2f"
                % (precision_lr, recall_lr, f1_lr, accuracy_lr))
print(summary_line)

Confusion matrix
       0     1
0  48403  1243
1  12569  2171

precision = 0.64, recall = 0.15, F1 = 0.24, accuracy = 0.79


In [None]:
#Note to self:
#cross_validation -> http://scikit-learn.org/stable/modules/cross_validation.html
#svm -> Support Vector Classification
#ensemble ->  combine the predictions of several base estimators built with a given learning algorithm
#in order to improve generalizability / robustness over a single estimator.
#confusion_matrix->  evaluate the accuracy of a classification - which were correctly predicted and incorrectly predicted
#precision_recall_fscore_support -> Compute precision, recall, F-measure and support for each class
#F-measure -> harmonic mean of precision and recall (F1 weights them equally); an F1 score reaches its best value at 1 and its worst at 0.
#ShuffleSplit -> Random permutation cross-validation
#RandomForestClassifier -> http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
#train_test_split -> splits arrays or matrices into random train and test subsets
#StandardScaler -> standardizes features by removing the mean and scaling to unit variance
#OneHotEncoder -> encodes each categorical feature as a set of binary (0/1) indicator columns, one per category

In [None]:
#The “balanced” mode uses the values of y to automatically adjust weights inversely proportional
#to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)).