In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the dataset directory and print the full path of every file found.
for root, _subdirs, file_names in os.walk('../datasets/flight-delays'):
    for file_name in file_names:
        full_path = os.path.join(root, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# NOTE: on its own this only binds a variable named `low_memory`; it is not passed to
# pd.read_csv in the next cell, so it does not change how the CSV is parsed.
low_memory=False

In [None]:
# Load the raw flights table. low_memory=False makes pandas infer each column's dtype
# from the whole file instead of chunk by chunk, which avoids mixed-type columns.
df = pd.read_csv('../datasets/flight-delays/flights.csv', low_memory=False)

In [None]:
# (rows, columns) of the freshly loaded flights table.
df.shape

In [None]:
# Column dtypes as inferred by read_csv.
df.dtypes

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Distribution of WHEELS_OFF over 15 bins.
# dropna() keeps the histogram robust if the column contains missing values.
fig, ax = plt.subplots()
ax.hist(df['WHEELS_OFF'].dropna(), bins=15)
ax.set(title='Distribution of WHEELS_OFF', xlabel='WHEELS_OFF', ylabel='Number of flights')
plt.show()

In [None]:
# Distribution of flights over the DAY column, using 31 bins.
fig, ax = plt.subplots()
ax.hist(df['DAY'], bins=31)
ax.set(title='Flights per DAY', xlabel='DAY', ylabel='Number of flights')
plt.show()

In [None]:
# Distribution of flights over the MONTH column, using 12 bins.
fig, ax = plt.subplots()
ax.hist(df['MONTH'], bins=12)
ax.set(title='Flights per MONTH', xlabel='MONTH', ylabel='Number of flights')
plt.show()

**Cleaning**: dropping the columns that are unnecessary for the delay calculation

In [None]:
# List every column name so we can choose which ones to drop.
df.columns

In [None]:
# Columns the notebook treats as unnecessary for the delay calculation.
# NOTE(review): these look like values only known after the flight has operated - confirm.
columns_to_drop = [
    'ARRIVAL_TIME', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
    'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY', 'LATE_AIRCRAFT_DELAY',
    'WEATHER_DELAY', 'DEPARTURE_TIME', 'WHEELS_ON', 'TAXI_IN',
    'DEPARTURE_DELAY', 'TAXI_OUT', 'WHEELS_OFF', 'ELAPSED_TIME', 'AIR_TIME',
]
# Removes the columns from `df` itself (in place).
df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Remaining columns and their dtypes after the drop.
df.dtypes

In [None]:
# Number of rows (flights) per distinct AIRLINE value.
df['AIRLINE'].value_counts()

In [None]:
# Number of rows (flights) per distinct ORIGIN_AIRPORT value.
df['ORIGIN_AIRPORT'].value_counts()

We see that some airports appear only once. So we could filter out the airports with fewer than 50 flights.

In [None]:
# Disabled first attempt (count-based filter on DESTINATION_AIRPORT, threshold 50).
# Kept for reference only; the notebook filters against airports.csv instead.
#x = df.groupby('DESTINATION_AIRPORT').count()
#y = x[x.iloc[:,0]<50].index
#y

We found a better approach: the IATA codes of the airports are already listed in the airports.csv file, so we keep only the flights whose origin and destination airports both appear in that list.

In [None]:
# Load the airport reference table; its IATA_CODE column is used below to filter flights.
airports = pd.read_csv('../datasets/flight-delays/airports.csv')
airports.head()

In [None]:
# Keep only flights whose origin AND destination codes are listed in airports.csv.
known_codes = airports['IATA_CODE']
origin_known = df['ORIGIN_AIRPORT'].isin(known_codes)
destination_known = df['DESTINATION_AIRPORT'].isin(known_codes)
df = df[origin_known & destination_known]
df.shape

Airlines in a single column -> One Hot Encoding -> Multiple Columns

In [None]:
# Preview of the one-hot encoding: one indicator column per distinct AIRLINE value.
pd.get_dummies(df['AIRLINE'])

In [None]:
# Append the one-hot airline indicator columns, then remove the original text column.
airline_dummies = pd.get_dummies(df['AIRLINE'])
df = pd.concat([df, airline_dummies], axis=1)
df.drop(columns='AIRLINE', inplace=True)
df.head()

In [None]:
# FLIGHT_NUMBER is not used as a feature; remove it from `df` in place.
df.drop(columns='FLIGHT_NUMBER', inplace=True)

In [None]:
# Column dtypes after one-hot encoding the airline and dropping FLIGHT_NUMBER.
df.dtypes

# **Calculating Ground Time** : 
Ground time is the time a plane spends on the ground before each flight. If the ground time is longer, the expected delay will be smaller (because the staff have more time for their routines).
Ground Time = (Scheduled Departure time) - (Scheduled Arrival time of previous flight)

So, how can we find the scheduled arrival time of previous flight?
Solution: 
1. Sort flights for each tail number.
2. Add (previous flight arrival time) to the next flight 
3. Subtract two values : (Scheduled Departure time) - (Scheduled Arrival time of previous flight)


In [None]:
# Number of rows (flights) per aircraft tail number.
df['TAIL_NUMBER'].value_counts()

We discover that some planes (tail numbers) have fewer than 50 flights (some only 1), so we filter them out.

In [None]:
# Per-tail-number non-null counts for every column; the first column's count is used
# as the number of flights of that aircraft.
x = df.groupby('TAIL_NUMBER').count()
flights_per_plane = x.iloc[:, 0]
# Tail numbers with fewer than 50 flights.
y = flights_per_plane[flights_per_plane < 50].index
y

In [None]:
# Remove every flight operated by one of the rarely seen tail numbers collected in `y`.
is_rare_plane = df['TAIL_NUMBER'].isin(y)
df = df[~is_rare_plane]
df.shape

In [None]:
# Raw SCHEDULED_DEPARTURE values, before they are converted to minutes below.
fig, ax = plt.subplots()
ax.hist(df['SCHEDULED_DEPARTURE'], bins=1000)
ax.set(title='SCHEDULED_DEPARTURE (raw values)', xlabel='SCHEDULED_DEPARTURE', ylabel='Number of flights')
plt.show()

The SCHEDULED_DEPARTURE column stores the departure time as a number in HHMM format; for example, 3:40 is stored as 340. We now convert it (and SCHEDULED_ARRIVAL) to minutes elapsed since 00:00.

In [None]:
# Convert HHMM-encoded clock times into minutes after midnight.
# The hour must come from floor division: round(x / 100) pushed any time later than
# HH:50 into the next hour (e.g. 1555 -> 16*60 + 55 = 1015 instead of 15*60 + 55 = 955).
for col in ('SCHEDULED_DEPARTURE', 'SCHEDULED_ARRIVAL'):
    df[col] = (df[col] // 100) * 60 + df[col] % 100
plt.hist(df['SCHEDULED_DEPARTURE'], bins=1000)
plt.show()

Now the values are between 0 and 1440 (24:00), and we can subtract them from each other.

# Calculating the Ground Time and Delay from Previous Flight

We believe that if the previous flight was delayed and the ground time is short, a delay for this flight is inevitable.

In [None]:
# Work on a reproducible 1% subsample to keep the following steps fast.
# Sample WITHOUT replacement: with replace=True the same flight could be drawn more than
# once, and such duplicate rows can end up in both the training and the test split.
# NOTE(review): sampling before the previous-flight lookup means "previous flight" is the
# previous *sampled* flight of that aircraft - consider computing it on the full data first.
df = df.sample(frac=0.01, replace=False, random_state=1)

In [None]:
# Size of the sampled data set.
df.shape

In [None]:
# Attach to every flight the scheduled arrival and the arrival delay of the same
# aircraft's preceding flight (NaN for the first flight of each aircraft).
# One sort plus a grouped shift replaces the per-aircraft loop, which re-copied `result`
# on every pd.concat and started from an empty, all-object DataFrame.
# Rows without a tail number are dropped explicitly; the old loop's equality test never
# matched NaN, so it discarded them implicitly.
flights = (
    df.dropna(subset=['TAIL_NUMBER'])
      .sort_values(['TAIL_NUMBER', 'YEAR', 'MONTH', 'DAY', 'SCHEDULED_DEPARTURE'])
)
by_plane = flights.groupby('TAIL_NUMBER', sort=False)
flights['Previous_Arrival'] = by_plane['SCHEDULED_ARRIVAL'].shift(1)
flights['Previous_Delay'] = by_plane['ARRIVAL_DELAY'].shift(1)
# Keep the original column layout: the two new columns first, then the existing ones.
result = flights.reindex(columns=['Previous_Arrival', 'Previous_Delay'] + list(df.columns))
result.head()


In [None]:
def f(x):
    """Shift a negative value up by 1440 (the minutes in a day); return others unchanged."""
    return 1440 + x if x < 0 else x

# Ground time = scheduled departure minus the previous flight's scheduled arrival
# (both in minutes after midnight); negative differences are shifted by one day via f.
ground_time = result['SCHEDULED_DEPARTURE'] - result['Previous_Arrival']
result['Ground_Time'] = ground_time.apply(f)
# From here on `df` refers to the same object as `result`.
df = result
df.head()

In [None]:

# The tail number was only needed to order flights per aircraft; remove it in place.
df.drop(columns=['TAIL_NUMBER'], inplace=True)

In [None]:
# Route = origin code concatenated with destination code; the two source columns are
# then removed from `df` in place.
route = df['ORIGIN_AIRPORT'] + df['DESTINATION_AIRPORT']
df['Route'] = route
df.drop(columns=['ORIGIN_AIRPORT', 'DESTINATION_AIRPORT'], inplace=True)

In [None]:
# Preview the frame with the new Route column.
df.head()

In [None]:
# Number of distinct Route values (i.e. how many columns one-hot encoding would add).
len(df['Route'].unique())

In [None]:
# Route should be part of the training data, most probably one-hot encoded,
# but because of the memory limit the column is dropped instead.
df.drop(columns='Route', inplace=True)

# Missing Data
For simplicity we replace the missing data with 0 
- ARRIVAL_DELAY = 0 , means no delay
- Previous_Delay = 0, no previous delay
- Ground_Time = 0 , no ground time before flight (usually for the first flight of the tail_number)

In [None]:
# Replace every missing value in every column with 0 (see the assumptions listed above).
df = df.fillna(0)

# Machine Learning -> Modeling Phase
First, we create the input and target data sets, X and y;
then we create the training and test sets with train_test_split from sklearn.

In [None]:
# Pairwise correlation between the columns of the prepared data set.
df.corr()

In [None]:
# Target: the arrival delay; features: every other column.
target_column = 'ARRIVAL_DELAY'
y = df[target_column]
X = df.drop(columns=target_column)


We convert the problem to "delay" or "no delay" classes. 


**Assumption** : 
- if the delay is greater than 15 minutes, we treat it as a delay (delay = 1),
- if there is no delay, or the delay is 15 minutes or less, then delay = 0

In [None]:
# Binary label: 1 if the arrival delay exceeds 15 minutes, otherwise 0.
# astype(int) converts the boolean mask directly, instead of relying on replace() to
# change the dtype of a boolean Series.
y = (y > 15).astype(int)
y

In [None]:
# Class balance: how many flights fall into each label (0 = no delay, 1 = delay).
y.value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
split = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split
X_train.head()

Our first attempt uses the random forest algorithm.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline model. A fixed random_state makes the fitted forest, and therefore the
# reported accuracy, identical on every Restart & Run All.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
# Share of test flights whose delay / no-delay label was predicted correctly.
acc = accuracy_score(y_test, y_pred)
acc

In [None]:
# Importance score of each feature in the fitted forest, in the column order of X.
rfc.feature_importances_

In [None]:
# Feature names, to be read alongside the importances shown above.
X.columns