In [None]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Load the flight records. pd.read_csv already represents missing values
# as NaN, so no separate fill step is needed after loading.
data = pd.read_csv('./data/airtrans_2020_Jan.csv')
data.head()

#### Python lambda

- See these examples of Python lambda (anonymous) functions:

https://www.w3schools.com/python/python_lambda.asp

In [None]:
# Flag every flight: 'delayed' is True when the arrival delay is positive,
# otherwise False. A NaN delay compares as False, so flights without a
# recorded arrival delay are flagged as not delayed.
data['delayed'] = data['arr_delay'].apply(lambda delay: delay > 0)

data.head()

In [None]:
# Keep only the rows flagged as delayed; the boolean 'delayed' column
# is used directly as the row mask.
delayed_flights = data.loc[data['delayed']]

delayed_flights.head()

In [None]:
# Number of delayed flights per carrier, largest count first.
# (The length of this result is the number of distinct carriers with delays.)
delayed_flights.loc[:, 'unique_carrier'].value_counts()

In [None]:
# Count flights per (carrier, delayed flag) pair, then pivot the delayed
# flag into columns so each carrier row has a False count and a True count.
# fill_value=0 matters: a carrier with no flights in one of the two groups
# would otherwise get NaN there, and any sum or ratio built from these
# columns would become NaN for that carrier.
delayed_by_carrier = (
    data.groupby(['unique_carrier', 'delayed'])
    .size()
    .unstack(fill_value=0)
)
delayed_by_carrier

In [None]:
# Move 'unique_carrier' out of the index into a regular column, leaving a
# default index (0, 1, 2, 3, ...).
# The guard keeps this cell idempotent: running it a second time would
# otherwise reset the already-default index and add a stray 'index' column.
if delayed_by_carrier.index.name == 'unique_carrier':
    delayed_by_carrier = delayed_by_carrier.reset_index()
delayed_by_carrier.head()

In [None]:
# Total flights per carrier: not-delayed count (False column) plus
# delayed count (True column).
delayed_by_carrier['flight_count'] = delayed_by_carrier[False].add(delayed_by_carrier[True])

# Preview the first five rows.
delayed_by_carrier.head()

In [None]:
# Share of each carrier's flights that were delayed:
# delayed count (True column) divided by the carrier's total flights.
delayed_by_carrier['proportion_delayed'] = delayed_by_carrier[True].div(delayed_by_carrier['flight_count'])

# Preview the first five rows.
delayed_by_carrier.head()

In [None]:
# Show carriers ordered from the highest to the lowest proportion of
# delayed flights. The sorted frame is only displayed, not stored.
delayed_by_carrier.sort_values(by='proportion_delayed', ascending=False)

### pivot_table()

- See the examples in the pandas `pivot_table()` documentation:

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.pivot_table.html

In [None]:
# One column per carrier holding the mean arrival delay
# ('mean' is pivot_table's default aggregation; spelled out for clarity).
data.pivot_table(values='arr_delay', columns='unique_carrier', aggfunc='mean')

In [None]:
# Arrival delays for Southwest flights only (carrier code 'WN'),
# selected in a single .loc call: row mask plus column label.
southwest = data.loc[data['unique_carrier'] == 'WN', 'arr_delay']
southwest.head()

In [None]:
# Summary statistics for the Southwest arrival delays: count, mean, std,
# min, quartiles (the default 25% / 50% / 75% percentiles) and max.
southwest.describe(percentiles=[0.25, 0.5, 0.75])

count: the number of non-missing arrival-delay values — 42,020 in the dataset after filtering to only Southwest (WN). Rows with a missing `arr_delay` are not included in this count.

mean: the average delay.

std: the standard deviation. More on this below.

min: the shortest delay in the dataset. In this case, the flight was very early.

25%: the 25th percentile. 25% of delays were lower than -9.00.

50%: the 50th percentile, or the median. 50% of delays were lower than 1.00.

75%: the 75th percentile. 75% of delays were lower than 19.00.

max: the longest delay in the dataset: 535.00.

In [None]:
# All columns for Southwest ('WN') flights; the plots below read
# the 'arr_delay' column from this frame.
wn_carrier = data.loc[data['unique_carrier'] == 'WN']
wn_carrier['arr_delay'].head()

### Another way to plot box plots and histograms

In [None]:
# Box plot of Southwest arrival delays, drawn through the pandas
# .plot accessor on a 4x4 figure.
wn_carrier['arr_delay'].plot.box(figsize=(4, 4))

In [None]:
# Histogram of Southwest arrival delays using bins of width 10 that span
# -50 to 200. np.arange excludes `stop`, so it is pushed one step past 200;
# with stop=200 the last bin edge would be 190 and every delay in
# (190, 200] would be dropped from the plot without warning.
# Delays outside the [-50, 200] range are still not shown.
bin_values = np.arange(start=-50, stop=200 + 10, step=10)
wn_carrier['arr_delay'].hist(bins=bin_values, figsize=[14, 6])

In [None]:
# Flight counts for the 20 most frequent 'origin' values
# (value_counts returns counts sorted in descending order).
hi_volume = data['origin'].value_counts().head(20)
hi_volume.head()

In [None]:
# The 'origin' labels of the high-volume entries as a plain Python list,
# in the same (descending count) order as hi_volume.
hi_volume_airports_names = list(hi_volume.index)
hi_volume_airports_names