# **Airline Flight Delays**


In [1]:
# file imports

import pandas as pd
import numpy as np
import os


# Load the two small lookup tables first, then the main flights table.
airlines, cancellation_codes = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/airlines.csv', './data/cancellation_codes.csv')
)

# low_memory=False makes pandas parse the file in one pass rather than in
# chunks, so each column gets a single inferred dtype.
flights = pd.read_csv('./data/flights.csv', low_memory=False)

In [2]:
# Inspect the column names available in the raw flights table
flights.columns

Index(['YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'AIRLINE', 'FLIGHT_NUMBER',
       'TAIL_NUMBER', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
       'SCHEDULED_DEPARTURE', 'DEPARTURE_TIME', 'DEPARTURE_DELAY', 'TAXI_OUT',
       'WHEELS_OFF', 'SCHEDULED_TIME', 'ELAPSED_TIME', 'AIR_TIME', 'DISTANCE',
       'WHEELS_ON', 'TAXI_IN', 'SCHEDULED_ARRIVAL', 'ARRIVAL_TIME',
       'ARRIVAL_DELAY', 'DIVERTED', 'CANCELLED', 'CANCELLATION_REASON',
       'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
       'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY'],
      dtype='object')

### **Data Manipulation**


In [3]:
# Keep the original row position as an explicit ID column so that rows can still
# be matched up later if the dataframe is ever split into pieces.
#
# NOTE: this cell overwrites columns of `flights` in place. Re-running it without
# reloading the data first would integer-divide the hour columns a second time.

flights['ID'] = flights.index

# Build a real datetime column from the YEAR / MONTH / DAY components.
# pd.to_datetime can assemble dates directly from year/month/day columns, which
# avoids building and parsing one string per row.

flights['flight_date'] = pd.to_datetime(
    flights[['YEAR', 'MONTH', 'DAY']].rename(columns=str.lower)
)

# Only the hour is needed: integer-dividing by 100 drops the last two digits and
# leaves the hour in 24-hour format (assumes the raw values are HHMM numbers).

flights['SCHEDULED_DEPARTURE'] = flights['SCHEDULED_DEPARTURE'] // 100
flights['SCHEDULED_ARRIVAL'] = flights['SCHEDULED_ARRIVAL'] // 100

# Lastly, blank out every value that is not strictly positive, since those are
# not needed later on. Note that zeros are turned into NaN as well as negative
# values, so e.g. a 0 in CANCELLED becomes NaN.

cols_list = [
    'CANCELLED',
    'DEPARTURE_DELAY',
    'ARRIVAL_DELAY',
    'AIR_SYSTEM_DELAY',
    'SECURITY_DELAY',
    'AIRLINE_DELAY',
    'LATE_AIRCRAFT_DELAY',
    'WEATHER_DELAY'
]

# One vectorised pass over all listed columns; the float64 cast keeps the
# resulting dtype independent of whether a column actually needed a NaN.
flights[cols_list] = (
    flights[cols_list]
    .where(flights[cols_list] > 0)
    .astype('float64')
)

In [4]:
# Let's see what our dataframe looks like now (first 100 rows, including the new ID and flight_date columns)

flights.head(100)

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,ID,flight_date
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,0,...,0,,,,,,,,0,2015-01-01
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,0,...,0,,,,,,,,1,2015-01-01
2,2015,1,1,4,US,840,N171US,SFO,CLT,0,...,0,,,,,,,,2,2015-01-01
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,0,...,0,,,,,,,,3,2015-01-01
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,0,...,0,,,,,,,,4,2015-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2015,1,1,4,B6,2001,N358JB,BUF,JFK,5,...,0,,,,,,,,95,2015-01-01
96,2015,1,1,4,B6,2807,N190JB,PWM,JFK,5,...,0,,,,,,,,96,2015-01-01
97,2015,1,1,4,B6,2023,N324JB,JFK,SJU,5,...,0,,,,,19.0,,,97,2015-01-01
98,2015,1,1,4,EV,4557,N12967,CRP,IAH,5,...,0,,,,,,,,98,2015-01-01


In [5]:
# Keep only the columns needed downstream, listed in the order they should appear.

wanted_columns = [
    # identifiers and route
    'ID', 'flight_date', 'DAY_OF_WEEK', 'AIRLINE',
    'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT',
    # schedule, delays and cancellation
    'SCHEDULED_DEPARTURE', 'DEPARTURE_DELAY',
    'SCHEDULED_ARRIVAL', 'ARRIVAL_DELAY',
    'CANCELLED', 'CANCELLATION_REASON',
    # delay breakdown by cause
    'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
    'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY',
]

flights_cleaned = flights[wanted_columns]

In [6]:
# Attach the descriptive columns from the cancellation-codes and airlines tables.
# Each lookup table is indexed by its code so it can be joined onto the matching
# code column of the flights data.

reason_lookup = cancellation_codes.set_index('CANCELLATION_REASON')
airline_lookup = airlines.set_index('IATA_CODE')

flights_with_reasons = flights_cleaned.join(reason_lookup, on='CANCELLATION_REASON')

# lsuffix='_' is appended to any left-hand column whose name also exists in the
# airlines table. NOTE(review): presumably airlines has its own AIRLINE column
# (the carrier name), in which case the code column becomes 'AIRLINE_' and
# 'AIRLINE' then holds the value from the airlines table - confirm.
flights_cleaned = flights_with_reasons.join(airline_lookup, on='AIRLINE', lsuffix='_')

In [8]:
# Final column layout: the keys give the columns to keep, in output order,
# and the values give the display name each column is renamed to.

column_labels = {
    'ID': 'ID',
    'flight_date': 'Date',
    'DAY_OF_WEEK': 'Week Day',
    'AIRLINE': 'Airline',
    'ORIGIN_AIRPORT': 'Departure Airport',
    'DESTINATION_AIRPORT': 'Arrival Airport',
    'CANCELLED': 'Cancelled',
    'CANCELLATION_DESCRIPTION': 'Cancellation Reason',
    'SCHEDULED_DEPARTURE': 'Departure Hour',
    'DEPARTURE_DELAY': 'Departure Delay',
    'SCHEDULED_ARRIVAL': 'Arrival Hour',
    'ARRIVAL_DELAY': 'Arrival Delay',
    'AIR_SYSTEM_DELAY': 'Air System Delay',
    'SECURITY_DELAY': 'Security Delay',
    'AIRLINE_DELAY': 'Airline Delay',
    'LATE_AIRCRAFT_DELAY': 'Late Aircraft Delay',
    'WEATHER_DELAY': 'Weather Delay',
}

# Select (and thereby reorder) first, then rename in a second step.
ordered_columns = list(column_labels)
flights_cleaned = flights_cleaned[ordered_columns].rename(columns=column_labels)

### **Exporting files**

In [9]:
# Make sure the output folder exists, then write the cleaned dataframe into it.
# exist_ok=True turns an already-existing folder into a no-op, while any other
# failure (e.g. missing permissions) still raises instead of being silently
# swallowed and only surfacing later as a confusing to_csv error.

os.makedirs('./clean-data', exist_ok=True)

flights_cleaned.to_csv('./clean-data/flights_cleaned.csv', index=False)