In [70]:
import datetime, warnings, scipy 
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from datetime import date, timedelta

from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Raise the pandas display limits so wide/long frames are not elided when shown.
pd.options.display.max_rows = 999
pd.options.display.max_columns = 50

In [80]:
def format_heure(chaine):
    """Convert an ``HHMM`` clock reading into a ``datetime.time``.

    Parameters
    ----------
    chaine : str, int, float or null
        Clock reading such as ``5``, ``1234.0`` or ``"0930"``. The value
        ``2400`` (in any of those forms) is read as midnight, ``00:00``.

    Returns
    -------
    datetime.time or float
        The parsed time, or ``np.nan`` when ``chaine`` is null.

    Raises
    ------
    ValueError
        If the reading is not a valid clock time (hour above 23 or minute
        above 59, apart from the special value 2400) or is not numeric text.
    """
    if pd.isnull(chaine):
        return np.nan

    # Normalise to an integer *before* testing for 2400, so that the string
    # "2400" is handled the same way as the numbers 2400 and 2400.0.
    valeur = int(chaine)
    if valeur == 2400:
        valeur = 0

    # divmod (rather than slicing a zero-padded string) makes out-of-range
    # readings fail loudly in datetime.time instead of being truncated.
    heures, minutes = divmod(valeur, 100)
    return datetime.time(heures, minutes)

def combine_date_heure(x):
    """Combine a (date, time) pair into a single ``datetime.datetime``.

    Parameters
    ----------
    x : sequence or pandas.Series
        Two-element container whose first item is a date and whose second
        item is a ``datetime.time``.

    Returns
    -------
    datetime.datetime or float
        The combined timestamp, or ``np.nan`` when either item is null.
    """
    # A Series row usually carries column labels as its index; integer keys on
    # such a Series rely on a deprecated positional fallback, so read it with
    # .iloc. Plain sequences (tuple, list) keep ordinary indexing.
    if isinstance(x, pd.Series):
        jour, heure = x.iloc[0], x.iloc[1]
    else:
        jour, heure = x[0], x[1]

    if pd.isnull(jour) or pd.isnull(heure):
        return np.nan
    return datetime.datetime.combine(jour, heure)

def create_flight_time(df, col):
    """Build full timestamps from ``df['DATE']`` and an ``HHMM`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding a ``'DATE'`` column and the column named by ``col``.
        It is not modified.
    col : str
        Name of the column with ``HHMM`` clock readings (numbers or digit
        strings). A reading of 2400 means midnight of the *following* day.

    Returns
    -------
    pandas.Series
        Unnamed datetime Series carrying ``df.index`` (so it aligns when
        assigned back to ``df``); rows with a missing date or reading are NaT.

    Raises
    ------
    ValueError
        If a reading is negative, above 2400, has minutes above 59, or is
        not numeric.
    """
    # float64 keeps NaN for missing readings; trunc drops any fractional part.
    hhmm = np.trunc(pd.to_numeric(df[col]).astype('float64'))
    heures = hhmm // 100
    minutes = hhmm % 100

    # NaN compares False everywhere, so missing readings are never flagged.
    invalide = (hhmm < 0) | (hhmm > 2400) | (minutes > 59)
    if invalide.any():
        raise ValueError(
            "invalid HHMM value(s) in column {0!r}: {1}".format(
                col, df.loc[invalide, col].unique()[:5].tolist()))

    # Adding 24h + 0min to the date rolls a 2400 reading over to the next
    # day at 00:00 without any special-casing.
    horodatage = (pd.to_datetime(df['DATE'])
                  + pd.to_timedelta(heures, unit='h')
                  + pd.to_timedelta(minutes, unit='m'))
    horodatage.name = None
    return horodatage


def calculate_delayed(x):
    """Turn a ``scheduled - actual`` clock difference into a delay in minutes.

    Both clock readings carry no date, so their difference is only known
    modulo 24 hours; ``x.seconds`` (always in ``[0, 86400)``) is that
    remainder. A remainder above 50000 s is read as a late departure that
    wrapped around (delay = 86400 - remainder); anything else is read as an
    early departure.

    Parameters
    ----------
    x : datetime.timedelta
        Scheduled time minus actual time.

    Returns
    -------
    float
        Delay in minutes: positive when late, negative when early.
    """
    secondes_par_jour = 86400
    seuil_retard = 50000

    # The wrap-around test must apply whatever the sign of the raw difference:
    # scheduled 23:50 / actual 00:10 gives days == 0 and seconds == 85200,
    # which is a 20-minute delay, not a 1420-minute early departure.
    if x.seconds > seuil_retard:
        return (secondes_par_jour - x.seconds) / 60
    return (x.seconds * -1) / 60

In [3]:
# Load the training flights table; parse_dates=False leaves every column unparsed as a date.
df = pd.read_csv('flights_train.csv', parse_dates=False)

In [101]:
# Work on a copy so the loaded frame `df` is left unmodified by the steps below.
df2 = df.copy()

In [102]:
# NOTE: this cell overwrites the HHMM columns of df2 in place, so re-running it
# on an already-converted df2 would feed parsed values back into the parsers.
df2['DATE'] = pd.to_datetime(df2[['YEAR','MONTH', 'DAY']])
df2['SCHEDULED_DEPARTURE'] = create_flight_time(df2, 'SCHEDULED_DEPARTURE')
for colonne_hhmm in ('DEPARTURE_TIME', 'SCHEDULED_ARRIVAL'):
    df2[colonne_hhmm] = df2[colonne_hhmm].apply(format_heure)
df2['SCHEDULED_DEPARTURE_TIME'] = df2['SCHEDULED_DEPARTURE'].dt.time

# Re-parse both clock readings with the same format so they land on the same
# placeholder day and can be subtracted: t1 = scheduled, t2 = actual.
for cible, source in (('t1', 'SCHEDULED_DEPARTURE_TIME'), ('t2', 'DEPARTURE_TIME')):
    df2[cible] = df2[source].apply(lambda x: datetime.datetime.strptime(str(x),'%H:%M:%S'))
df2['DELAYED_DEPARTURE'] = (df2['t1'] - df2['t2']).apply(calculate_delayed)

# Cast the remaining numeric columns to integers.
for colonne_entiere in ('SCHEDULED_TIME', 'ARRIVAL_DELAY', 'TAXI_OUT', 'WHEELS_OFF'):
    df2[colonne_entiere] = df2[colonne_entiere].astype('int')

In [103]:
# Sanity check: the expected delay, in minutes, is noted next to each row label.
# NOTE(review): the outputs stored with this cell (-660, -480, 840, ...) are 60x
# these values -- presumably produced before the "/60" was added to
# calculate_delayed; re-run the notebook to confirm.
retards = df2['DELAYED_DEPARTURE']

retards[0] # -11
retards[1] # -8
retards[7] # 14
retards[9] # 3
retards[20] # 25
retards[36] # 3

-660

-480

840

180

1500

180

In [104]:
# Columns removed from df2 when building df3.
columns_to_drop = [
    'YEAR', 'id', 'MONTH', 'DAY', 'FLIGHT_NUMBER',
    'DESTINATION_AIRPORT', 'TAIL_NUMBER', 'SCHEDULED_DEPARTURE',
    'DEPARTURE_TIME', 'DATE', 't1', 't2', 'SCHEDULED_ARRIVAL',
]

# drop() returns a new frame here; df2 itself keeps all of its columns.
df3 = df2.drop(columns=columns_to_drop)

In [105]:
# Hour of the scheduled departure (0-23).
hours = pd.to_datetime(df3['SCHEDULED_DEPARTURE_TIME'], format='%H:%M:%S').dt.hour

# Left-closed bins: [0, 6) Night, [6, 12) Morning, [12, 18) Afternoon,
# [18, 24) Evening. With pandas' default right-closed bins the boundary hours
# 6, 12 and 18 fall into the *earlier* period (e.g. 12:30 became 'Morning')
# and the four buckets cover 7/6/6/5 hours instead of 6 each.
df3['TIME_OF_DAY'] = pd.cut(hours,
                    bins=[0,6,12,18,24],
                    right=False,
                    labels=['Night','Morning','Afternoon','Evening'])

In [106]:
# Preview the first rows of df3 after the column drop and TIME_OF_DAY binning.
df3.head()

Unnamed: 0,DAY_OF_WEEK,AIRLINE,ORIGIN_AIRPORT,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,DISTANCE,ARRIVAL_DELAY,SCHEDULED_DEPARTURE_TIME,DELAYED_DEPARTURE,TIME_OF_DAY
0,4,AS,ANC,21,15,205,1448,-22,00:05:00,-660,Night
1,4,AA,LAX,12,14,280,2330,-9,00:10:00,-480,Night
2,4,US,SFO,16,34,286,2296,5,00:20:00,-120,Night
3,4,AA,LAX,15,30,285,2342,-9,00:20:00,-300,Night
4,4,AS,SEA,11,35,235,1448,-21,00:25:00,-60,Night


In [107]:
df4 = df3.copy()
label_encoder = LabelEncoder()
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name; try the new spelling first and fall back so the
# cell runs on both older and newer releases.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse=False)

# Label-encode, then one-hot encode, each categorical column in turn.
# Both encoders are refitted on every pass, so after this cell they (and
# `integer_encoded`) hold the state of the last column, 'TIME_OF_DAY'.
one_hot_by_column = {}
for categorical_column in ('ORIGIN_AIRPORT', 'AIRLINE', 'TIME_OF_DAY'):
    integer_encoded = label_encoder.fit_transform(df4[categorical_column])
    # OneHotEncoder expects a 2-D (n_samples, 1) array.
    integer_encoded = integer_encoded.reshape(-1, 1)
    one_hot_by_column[categorical_column] = onehot_encoder.fit_transform(integer_encoded)

origin_airport_encoded = one_hot_by_column['ORIGIN_AIRPORT']
airline_encoded = one_hot_by_column['AIRLINE']
timeOfday_encoded = one_hot_by_column['TIME_OF_DAY']