In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply matplotlib's 'dark_background' style sheet to the figures created in this notebook.
plt.style.use('dark_background')

import scipy.stats as st
from sklearn import preprocessing as pre

In [2]:
# Read the flights CSV into a DataFrame, then display its column labels.
flights = pd.read_csv(
    filepath_or_buffer='../Data/files/flights_no_missing.csv',
)
flights.columns

Index(['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
       'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'dep_time',
       'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in',
       'crs_arr_time', 'arr_time', 'arr_delay', 'cancelled',
       'cancellation_code', 'diverted', 'dup', 'crs_elapsed_time',
       'actual_elapsed_time', 'air_time', 'flights', 'distance',
       'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay'],
      dtype='object')

#### **Task 5**: What is the average percentage of delays that are already created before departure (i.e., are arrival delays caused by departure delays)? Are airlines able to reduce the delay during the flight?

In [3]:
# Preview the first five rows of the flights table.
flights.head(n=5)

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,crs_elapsed_time,actual_elapsed_time,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2019-05-19,UA,UA_CODESHARE,UA,4264,EV,N48901,4264,12266,IAH,...,52.0,51.0,26.0,1,127,0.0,0.0,0.0,0.0,0.0
1,2019-05-19,UA,UA_CODESHARE,UA,4266,EV,N12540,4266,13244,MEM,...,112.0,102.0,81.0,1,468,0.0,0.0,0.0,0.0,0.0
2,2019-05-19,UA,UA_CODESHARE,UA,4272,EV,N11164,4272,12266,IAH,...,176.0,184.0,143.0,1,1091,0.0,0.0,0.0,0.0,0.0
3,2019-05-19,UA,UA_CODESHARE,UA,4281,EV,N13995,4281,11042,CLE,...,80.0,68.0,49.0,1,310,0.0,0.0,0.0,0.0,0.0
4,2019-05-19,UA,UA_CODESHARE,UA,4286,EV,N13903,4286,13061,LRD,...,76.0,80.0,57.0,1,301,0.0,0.0,0.0,0.0,0.0


In [8]:
# NOTE(review): commented-out exploratory selection of the five delay-cause columns; this cell does nothing when run.
#flights[['carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay']]

In [4]:
# NOTE(review): commented-out preview of the departure/arrival timing and delay columns (first 20 rows, no filter); this cell does nothing when run.
#flights[['crs_dep_time', 'dep_time', 'wheels_off', 'taxi_out', 'dep_delay', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay', 'crs_arr_time', 'wheels_on','arr_time', 'taxi_in', 'arr_delay']].head(20)

We consider only the columns that are relevant to departures and arrivals. We also keep only the flights that had an arrival delay (arr_delay > 0).

In [4]:
# Timing, taxi and delay-cause columns for flights with arr_delay > 0 (first 20 rows).
(
    flights
    .loc[:, [
        'crs_dep_time',
        'dep_time',
        'wheels_off',
        'taxi_out',
        'dep_delay',
        'carrier_delay',
        'weather_delay',
        'nas_delay',
        'security_delay',
        'late_aircraft_delay',
        'crs_arr_time',
        'wheels_on',
        'arr_time',
        'taxi_in',
        'arr_delay',
    ]]
    .query('arr_delay > 0')
    .head(n=20)
)

Unnamed: 0,crs_dep_time,dep_time,wheels_off,taxi_out,dep_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,crs_arr_time,wheels_on,arr_time,taxi_in,arr_delay
2,1155,1151.0,1224.0,33.0,-4.0,0.0,0.0,0.0,0.0,0.0,1551,1547.0,1555.0,8.0,4.0
7,1730,1729.0,1830.0,61.0,-1.0,0.0,0.0,38.0,0.0,0.0,1917,1945.0,1955.0,10.0,38.0
8,1830,1835.0,1854.0,19.0,5.0,5.0,0.0,11.0,0.0,0.0,2050,2058.0,2106.0,8.0,16.0
10,740,736.0,749.0,13.0,-4.0,0.0,0.0,0.0,0.0,0.0,923,930.0,935.0,5.0,12.0
14,1400,1400.0,1436.0,36.0,0.0,0.0,0.0,0.0,0.0,0.0,1724,1731.0,1735.0,4.0,11.0
15,610,849.0,902.0,13.0,159.0,140.0,0.0,0.0,0.0,0.0,704,914.0,924.0,10.0,140.0
17,715,733.0,748.0,15.0,18.0,0.0,0.0,0.0,0.0,0.0,855,855.0,901.0,6.0,6.0
19,2130,2120.0,2134.0,14.0,-10.0,0.0,0.0,0.0,0.0,0.0,2236,2241.0,2248.0,7.0,12.0
23,1940,2104.0,2127.0,23.0,84.0,0.0,0.0,0.0,0.0,82.0,2037,2154.0,2159.0,5.0,82.0
26,1833,1934.0,2011.0,37.0,61.0,0.0,0.0,61.0,0.0,19.0,2151,2306.0,2311.0,5.0,80.0


In [5]:
# Departure delay, the five delay-cause columns and arrival delay
# for flights with arr_delay > 0 (first 20 rows).
(
    flights
    .loc[:, [
        'dep_delay',
        'carrier_delay',
        'weather_delay',
        'nas_delay',
        'security_delay',
        'late_aircraft_delay',
        'arr_delay',
    ]]
    .query('arr_delay > 0')
    .head(n=20)
)

Unnamed: 0,dep_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,arr_delay
2,-4.0,0.0,0.0,0.0,0.0,0.0,4.0
7,-1.0,0.0,0.0,38.0,0.0,0.0,38.0
8,5.0,5.0,0.0,11.0,0.0,0.0,16.0
10,-4.0,0.0,0.0,0.0,0.0,0.0,12.0
14,0.0,0.0,0.0,0.0,0.0,0.0,11.0
15,159.0,140.0,0.0,0.0,0.0,0.0,140.0
17,18.0,0.0,0.0,0.0,0.0,0.0,6.0
19,-10.0,0.0,0.0,0.0,0.0,0.0,12.0
23,84.0,0.0,0.0,0.0,0.0,82.0,82.0
26,61.0,0.0,0.0,61.0,0.0,19.0,80.0


In [11]:
# NOTE(review): commented-out selection of timing columns filtered to arr_delay > 0; this cell does nothing when run.
#flights[['crs_dep_time', 'dep_time', 'dep_delay','taxi_out', 'wheels_off', 'wheels_on', 'taxi_in', 'crs_arr_time', 'arr_time', 'arr_delay']].query('arr_delay > 0')

In [6]:
# Display the arrival-delay column of the flights table.
flights['arr_delay']

0           -2.0
1          -14.0
2            4.0
3          -20.0
4           -1.0
           ...  
2339957    131.0
2339958      0.0
2339959    -15.0
2339960     -5.0
2339961     21.0
Name: arr_delay, Length: 2339962, dtype: float64

**Delays created before departure**

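**TODO:** this question is not answered yet. The cells below compute the percentage of flights whose reported delay-cause minutes sum to more than zero, and the complementary percentage.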

In [9]:
# Percentage of all flights whose five delay-cause columns sum to more than zero.
(
    flights[[
        'carrier_delay',
        'weather_delay',
        'nas_delay',
        'security_delay',
        'late_aircraft_delay',
    ]]
    .sum(axis=1)
    .gt(0)
    .sum()
    / len(flights)
    * 100
)

19.252534870224387

In [7]:
# Percentage of all flights whose five delay-cause columns sum to zero or less.
(
    flights[[
        'carrier_delay',
        'weather_delay',
        'nas_delay',
        'security_delay',
        'late_aircraft_delay',
    ]]
    .sum(axis=1)
    .le(0)
    .sum()
    / len(flights)
    * 100
)

80.7474651297756

In [10]:
# Total number of rows in the flights table.
len(flights)

2339962