#### **Task 8**: When (which hour) do most 'LONG', 'SHORT', 'MEDIUM' haul flights take off?

In [27]:
import pandas as pd
import numpy as np
import datetime
import seaborn as sns
pd.set_option('display.max_rows', None)

In [28]:
df_flights = pd.read_csv(r'..\..\csvs\flights_random_sample.csv', sep=',')
df_flights.head()

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,first_dep_time,total_add_gtime,longest_add_gtime,no_name
0,2018-01-01,WN,WN,WN,5431,WN,N291WN,5431,12889,LAS,...,986,1.0,0.0,0.0,0.0,18.0,,,,
1,2018-01-01,WN,WN,WN,989,WN,N287WN,989,12889,LAS,...,1099,0.0,0.0,11.0,0.0,35.0,,,,
2,2018-01-01,WN,WN,WN,1664,WN,N751SW,1664,12889,LAS,...,1099,,,,,,,,,
3,2018-01-01,WN,WN,WN,1106,WN,N704SW,1106,12889,LAS,...,197,,,,,,,,,
4,2018-01-01,WN,WN,WN,1559,WN,N423WN,1559,12889,LAS,...,197,37.0,0.0,0.0,0.0,0.0,,,,


In [29]:
# short, medium, long haul definitions from: https://en.wikipedia.org/wiki/Flight_length

In [30]:
df_flights['distance'].describe()

count    100998.000000
mean        774.058486
std         585.277397
min          31.000000
25%         345.000000
50%         612.000000
75%        1012.000000
max        4983.000000
Name: distance, dtype: float64

In [31]:
df_flights['haul'] = pd.cut(df_flights['distance'], bins = [0,800,2200,5000],labels=['short','medium','long'])

In [32]:
filter = df_flights[['haul','flights','dep_time']]
filter.isnull().sum()

haul           0
flights        0
dep_time    1573
dtype: int64

In [33]:
filter = filter.dropna()

In [34]:
filter.groupby(['haul'],as_index=False).count()

Unnamed: 0,haul,flights,dep_time
0,short,61937,61937
1,medium,33109,33109
2,long,4379,4379


In [36]:
filter['dep_time'].describe()

count    99425.000000
mean      1334.333437
std        502.777726
min          1.000000
25%        919.000000
50%       1327.000000
75%       1743.000000
max       2400.000000
Name: dep_time, dtype: float64

In [37]:
filter['dep_time'] = filter['dep_time'].astype(int)
filter['dep_time'] = pd.to_datetime(filter['dep_time'], format = '%H%M', errors = 'coerce')
filter['dep_time_hour'] = filter['dep_time'].dt.hour
filter.drop('dep_time', axis=1, inplace=True)
filter.head()

Unnamed: 0,haul,flights,dep_time_hour
0,medium,1,20.0
1,medium,1,22.0
2,medium,1,6.0
3,short,1,7.0
4,short,1,18.0


In [39]:
most_common_hour_by_type = filter.groupby(['haul','dep_time_hour'],as_index=False).count().sort_values(by=['haul','flights'],ascending=[True,False])
most_common_hour_by_type

Unnamed: 0,haul,dep_time_hour,flights
16,short,17.0,3958
11,short,12.0,3920
9,short,10.0,3781
14,short,15.0,3777
8,short,9.0,3760
10,short,11.0,3698
13,short,14.0,3690
15,short,16.0,3651
7,short,8.0,3616
5,short,6.0,3586


In [41]:
most_common_hour_by_type.groupby('haul').head(5)

Unnamed: 0,haul,dep_time_hour,flights
16,short,17.0,3958
11,short,12.0,3920
9,short,10.0,3781
14,short,15.0,3777
8,short,9.0,3760
28,medium,6.0,2369
29,medium,7.0,2317
33,medium,11.0,2160
34,medium,12.0,2137
30,medium,8.0,2134


- Most common hour for short-haul flights is 5 PM
- Most common hour for medium-haul flights is 6 AM
- Most common hour for long-haul flights is 8 AM

#### Implications for Feature Engineering:
- see if flight type (short, medium, long haul) is a predictor for arr_delay
- if so, try this as a feature in model