## Scikit-learn Classification

In [20]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn as sklearn
from sklearn import linear_model, cross_validation, metrics, svm, ensemble
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support, accuracy_score
from sklearn.cross_validation  import train_test_split, cross_val_score, ShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [21]:


flights = pd.read_csv('../../data/nycflights13/flights.csv.gz')
weather = pd.read_csv('../../data/nycflights13/weather.csv.gz')
airports = pd.read_csv('../../data/nycflights13/airports.csv.gz')

df_withweather = pd.merge(flights, weather, how='left', on=['year','month', 'day', 'hour'])
df = pd.merge(df_withweather, airports, how='left', left_on='dest', right_on='faa')

df = df.dropna()


In [22]:
df

Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,carrier,...,visib,time_hour_y,faa,name,lat,lon,alt,tz,dst,tzone
4,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,...,10.0,2013-01-01 01:00:00,ATL,Hartsfield Jackson Atlanta Intl,33.636719,-84.428067,1026.0,-5.0,A,America/New_York
5,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,...,10.0,2013-01-01 01:00:00,ATL,Hartsfield Jackson Atlanta Intl,33.636719,-84.428067,1026.0,-5.0,A,America/New_York
6,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,DL,...,10.0,2013-01-01 01:00:00,ATL,Hartsfield Jackson Atlanta Intl,33.636719,-84.428067,1026.0,-5.0,A,America/New_York
8,2013,1,1,555.0,600,-5.0,913.0,854,19.0,B6,...,10.0,2013-01-01 01:00:00,FLL,Fort Lauderdale Hollywood Intl,26.072583,-80.152750,9.0,-5.0,A,America/New_York
9,2013,1,1,555.0,600,-5.0,913.0,854,19.0,B6,...,10.0,2013-01-01 01:00:00,FLL,Fort Lauderdale Hollywood Intl,26.072583,-80.152750,9.0,-5.0,A,America/New_York
10,2013,1,1,555.0,600,-5.0,913.0,854,19.0,B6,...,10.0,2013-01-01 01:00:00,FLL,Fort Lauderdale Hollywood Intl,26.072583,-80.152750,9.0,-5.0,A,America/New_York
11,2013,1,1,557.0,600,-3.0,709.0,723,-14.0,EV,...,10.0,2013-01-01 01:00:00,IAD,Washington Dulles Intl,38.944533,-77.455811,313.0,-5.0,A,America/New_York
12,2013,1,1,557.0,600,-3.0,709.0,723,-14.0,EV,...,10.0,2013-01-01 01:00:00,IAD,Washington Dulles Intl,38.944533,-77.455811,313.0,-5.0,A,America/New_York
13,2013,1,1,557.0,600,-3.0,709.0,723,-14.0,EV,...,10.0,2013-01-01 01:00:00,IAD,Washington Dulles Intl,38.944533,-77.455811,313.0,-5.0,A,America/New_York
14,2013,1,1,557.0,600,-3.0,838.0,846,-8.0,B6,...,10.0,2013-01-01 01:00:00,MCO,Orlando Intl,28.429394,-81.308994,96.0,-5.0,A,America/New_York


In [23]:

pred = 'dep_delay'
features =  ['month','day','dep_time','arr_time','carrier','dest','air_time','distance', 
             'lat', 'lon', 'alt',  'dewp', 'humid', 'wind_speed', 'wind_gust', 
             'precip', 'pressure', 'visib' ]

features_v = df[features]
pred_v = df[pred]

how_late_is_late = 15.0;

pd.options.mode.chained_assignment = None  # default='warn'


# carrier is not a number, so transform it into an number
features_v['carrier'] = pd.factorize(features_v['carrier'])[0]

# dest is not a number, so transform it into a number
features_v['dest'] = pd.factorize(features_v['dest'])[0]

scaler = StandardScaler()
scaled_features_v = scaler.fit_transform(features_v)

features_train, features_test, pred_train, pred_test = train_test_split(
    scaled_features_v, pred_v, test_size=0.30, random_state=0)


In [24]:
# Perform logistic regression for classification

clf_lr = sklearn.linear_model.LogisticRegression(penalty='l2', 
                                                 class_weight='balanced')
logistic_fit=clf_lr.fit(scaled_features_train, 
                        np.where(pred_train >= how_late_is_late,1,0))

predictions = clf_lr.predict(scaled_features_test)

In [25]:
# Summary Report

# Confusion Matrix
cm_lr = confusion_matrix(np.where(pred_test >= how_late_is_late,1,0), 
                         predictions)
print("Confusion matrix")
print(pd.DataFrame(cm_lr))

# Get accuracy
report_lr = precision_recall_fscore_support(
    list(np.where(pred_test >= how_late_is_late,1,0)), 
    list(predictions), average='binary')

#Print Accuracy
print ("\nprecision = %0.2f, recall = %0.2f, F1 = %0.2f, accuracy = %0.2f"
       % (report_lr[0], report_lr[1], report_lr[2],                                                                         
          accuracy_score(list(np.where(pred_test >= how_late_is_late,1,0)), 
                                                                                             list(predictions))))



Confusion matrix
        0      1
0  135181  64481
1   17910  33883

precision = 0.34, recall = 0.65, F1 = 0.45, accuracy = 0.67


An accuracy of 67% is not particularly good. However, predicting flight delays from the data we have is not easy. 