In [1]:
# Python library imports: numpy, random, sklearn, pandas, etc

import warnings
# Suppress every warning raised after this point, so library warnings
# (deprecations, pandas copy warnings, ...) will not appear in cell outputs.
warnings.filterwarnings('ignore')

import sys
import random
import numpy as np

# cross_validation is deprecated since version 0.18. This module will be removed in 0.20.
# Use sklearn.model_selection.train_test_split instead.
# from sklearn import linear_model, cross_validation, metrics, svm
from sklearn.model_selection import train_test_split
from sklearn import linear_model, metrics, svm
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [45]:
# function to read HDFS file into dataframe using PyDoop
import pydoop.hdfs as hdfs
def read_csv_from_hdfs(path, cols, col_types=None):
    """Read every entry listed under an HDFS path into one DataFrame.

    Each entry returned by ``hdfs.ls(path)`` is opened and parsed with
    ``pd.read_csv``. The first line of every file is skipped and ``cols``
    supplies the column names instead.

    Parameters
    ----------
    path : str
        HDFS path handed to ``hdfs.ls``.
    cols : list of str
        Column names applied to every file.
    col_types : optional
        Forwarded unchanged to ``pd.read_csv`` as ``dtype``.

    Returns
    -------
    pandas.DataFrame
        All pieces concatenated, with the index renumbered from 0.

    Raises
    ------
    ValueError
        If ``hdfs.ls(path)`` returns no entries.
    """
    files = hdfs.ls(path)
    if not files:
        # pd.concat([]) would raise a ValueError that does not mention the path;
        # keep the exception type but say what was actually missing.
        raise ValueError(f"no files found under HDFS path {path!r}")

    pieces = []
    for f in files:
        fhandle = hdfs.open(f)
        try:
            pieces.append(pd.read_csv(fhandle, names=cols, dtype=col_types, skiprows=1))
        finally:
            # Release the handle even when parsing fails.
            fhandle.close()
    return pd.concat(pieces, ignore_index=True)

## Analyze the flight data

In [46]:
# Load data and get the shape of the dataframe.
# `cols` holds the column names that the later cells index by
# (e.g. 'Cancelled', 'Diverted', 'DepDelay', 'Origin'); it must be defined here,
# otherwise the load below fails with a NameError on a fresh kernel.
cols = ['year', 'month', 'day', 'dow', 'DepTime', 'CRSDepTime', 'ArrTime', 'CRSArrTime', 'Carrier', 'FlightNum',
        'TailNum', 'ActualElapsedTime', 'CRSElapsedTime', 'AirTime', 'ArrDelay', 'DepDelay', 'Origin', 'Dest',
        'Distance', 'TaxiIn', 'TaxiOut', 'Cancelled', 'CancellationCode', 'Diverted', 'CarrierDelay',
        'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']
# snake_case naming of the same 29 fields. Not passed to the loader: the cells
# below still refer to the names in `cols`.
columns = ['year', 'month', 'day_of_month', 'day_of_week', 'dep_time', 'crs_dep_time', 'arr_time', 'crs_arr_time', 'unique_carrier', 'flight_num', 'tail_num',
           'actual_elapsed_time', 'crs_elapsed_time', 'air_time', 'arr_delay', 'dep_delay', 'origin', 'dest', 'distance', 'taxi_in', 'taxi_out','cancelled',
           'cancellation_code', 'diverted', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']

PROCESSING_FILE_NAME = '1996.csv'
processing_file_name = f"./result/{PROCESSING_FILE_NAME}"
flight_df = read_csv_from_hdfs(processing_file_name, cols)

flight_df.shape

(5034804, 29)

In [47]:
flight_df.head()

Unnamed: 0,year,month,day,dow,DepTime,CRSDepTime,ArrTime,CRSArrTime,Carrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,1996,1,29,1,1996-01-29 20:39:00,1996-01-29 19:30:00,1996-01-29 22:45:00,1996-01-29 21:39:00,DL,345,...,6,10,0,0,,,,,,
1,1996,1,30,2,1996-01-30 19:31:00,1996-01-30 19:30:00,1996-01-30 21:42:00,1996-01-30 21:39:00,DL,345,...,5,22,0,0,,,,,,
2,1996,1,31,3,1996-01-31 19:56:00,1996-01-31 19:30:00,1996-01-31 22:31:00,1996-01-31 21:39:00,DL,345,...,7,27,0,0,,,,,,
3,1996,1,1,1,1996-01-01 17:30:00,1996-01-01 15:50:00,1996-01-01 19:09:00,1996-01-01 17:45:00,DL,411,...,4,14,0,0,,,,,,
4,1996,1,2,2,1996-01-02 17:14:00,1996-01-02 15:50:00,1996-01-02 18:41:00,1996-01-02 17:45:00,DL,411,...,4,8,0,0,,,,,,


In [50]:
# Convert dataframe columns to the correct types.
# NOTE: this bare expression is not the last statement of the cell, so its value is not displayed.
flight_df.columns

def convert_df_types(dataframe):
    """Cast the flag and departure-delay columns of ``dataframe`` in place.

    ``'Cancelled'`` and ``'Diverted'`` are converted to ``bool`` and
    ``'DepDelay'`` to ``int``. Each column is overwritten under its own name,
    so no new columns are added.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the ``'Cancelled'``, ``'Diverted'`` and ``'DepDelay'``
        columns; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, returned for convenience.
    """
    dataframe['Cancelled'] = dataframe['Cancelled'].astype(bool)
    dataframe['Diverted'] = dataframe['Diverted'].astype(bool)
    # astype(int) raises if the column still contains missing values.
    dataframe['DepDelay'] = dataframe['DepDelay'].astype(int)
    return dataframe
    
# Apply the conversions in place, then show the converted delay column.
convert_df_types(flight_df)
flight_df.loc[:, 'DepDelay']

0           69
1            1
2           26
3          100
4           84
          ... 
5034799      1
5034800      9
5034801     -1
5034802     59
5034803     82
Name: DepDelay, Length: 5034804, dtype: int64

In [51]:
# Keep the flights whose Origin is 'ORD' and flag those whose departure delay
# reaches the threshold.
DEP_DELAY_THRESHOLD = 15  # presumably minutes; confirm against the data dictionary

# assign() builds a new frame, so nothing is written into a slice of flight_df,
# and the comparison is vectorized instead of a per-row lambda.
df = (
    flight_df[flight_df['Origin'] == 'ORD']
    .dropna(subset=['DepDelay'])
    .assign(DepDelayed=lambda ord_flights: ord_flights['DepDelay'] >= DEP_DELAY_THRESHOLD)
)
print(f"total flights: {df.shape[0]}")
print(f"total delays: {df['DepDelayed'].sum()}")

total flights: 264436
total delays: 57183
