In [1]:
# Third-party dependencies, grouped in one place.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
from sklearn import preprocessing as pre

# Plotting configuration: switch matplotlib to its dark-background style.
plt.style.use('dark_background')

In [2]:
# Read the three input tables in one pass (flights, then passengers, then fuel).
flights, passengers, fuel = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Data/files/flights_no_missing.csv',
        '../Data/files/passengers_no_missing.csv',
        '../Data/files/fuel_no_missing.csv',
    )
)

# Only the final expression of a cell is echoed, so show the fuel columns here.
fuel.columns

Index(['month', 'airline_id', 'unique_carrier', 'carrier', 'carrier_name',
       'carrier_group_new', 'sdomt_gallons', 'satl_gallons', 'spac_gallons',
       'slat_gallons', 'sint_gallons', 'ts_gallons', 'tdomt_gallons',
       'tint_gallons', 'total_gallons', 'sdomt_cost', 'satl_cost', 'spac_cost',
       'slat_cost', 'sint_cost', 'ts_cost', 'tdomt_cost', 'tint_cost',
       'total_cost', 'year'],
      dtype='object')

#### **Task 10**: Do bigger delays lead to bigger fuel consumption per passenger?
We need to do four things to answer this as accurately as possible:
- Find out average monthly delay per air carrier (monthly delay is sum of all delays in 1 month)
- Find out distance covered monthly by different air carriers
- Find out number of passengers that were carried by different air carriers
- Find out total fuel consumption per air carrier.

Use this information to get the average fuel consumption per passenger per km. Is this higher for the airlines with bigger average delays?

- Find out average monthly delay per air carrier (monthly delay is sum of all delays in 1 month)

In [3]:
# Peek at the raw flight-date column before converting it.
flights['fl_date']

0          2019-05-19
1          2019-05-19
2          2019-05-19
3          2019-05-19
4          2019-05-19
              ...    
2339957    2019-05-19
2339958    2019-05-19
2339959    2019-05-19
2339960    2019-05-19
2339961    2019-05-19
Name: fl_date, Length: 2339962, dtype: object

In [4]:
# Preview the month number of every flight date.
# pd.to_datetime is applied to the whole column in one vectorized call;
# Series.apply(pd.to_datetime) would invoke the parser once per row instead.
pd.DatetimeIndex(pd.to_datetime(flights['fl_date'])).month

Int64Index([5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
            ...
            5, 5, 5, 5, 5, 5, 5, 5, 5, 5],
           dtype='int64', name='fl_date', length=2339962)

In [21]:
# Replace the string dates in 'fl_date' with parsed datetimes.
flights['fl_date'] = pd.to_datetime(flights['fl_date'])

In [23]:
# Store each flight's calendar month (1-12) in a new 'month' column.
flights['month'] = pd.to_datetime(flights['fl_date']).dt.month

In [24]:
# Show the first five flights, now including the derived 'month' column.
flights.head(5)

Unnamed: 0,fl_date,mkt_unique_carrier,branded_code_share,mkt_carrier,mkt_carrier_fl_num,op_unique_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin,...,actual_elapsed_time,air_time,flights,distance,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,month
0,2019-05-19,UA,UA_CODESHARE,UA,4264,EV,N48901,4264,12266,IAH,...,51.0,26.0,1,127,0.0,0.0,0.0,0.0,0.0,5
1,2019-05-19,UA,UA_CODESHARE,UA,4266,EV,N12540,4266,13244,MEM,...,102.0,81.0,1,468,0.0,0.0,0.0,0.0,0.0,5
2,2019-05-19,UA,UA_CODESHARE,UA,4272,EV,N11164,4272,12266,IAH,...,184.0,143.0,1,1091,0.0,0.0,0.0,0.0,0.0,5
3,2019-05-19,UA,UA_CODESHARE,UA,4281,EV,N13995,4281,11042,CLE,...,68.0,49.0,1,310,0.0,0.0,0.0,0.0,0.0,5
4,2019-05-19,UA,UA_CODESHARE,UA,4286,EV,N13903,4286,13061,LRD,...,80.0,57.0,1,301,0.0,0.0,0.0,0.0,0.0,5


In [27]:
# Show the first five rows of the fuel table.
fuel.head(5)

Unnamed: 0,month,airline_id,unique_carrier,carrier,carrier_name,carrier_group_new,sdomt_gallons,satl_gallons,spac_gallons,slat_gallons,...,sdomt_cost,satl_cost,spac_cost,slat_cost,sint_cost,ts_cost,tdomt_cost,tint_cost,total_cost,year
0,1,21352.0,0WQ,0WQ,Avjet Corporation,1,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0,396216,140239.0,536455,2016
1,1,21645.0,23Q,23Q,Songbird Airways Inc.,1,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0,0,0.0,0,2016
2,1,21652.0,27Q,27Q,"Jet Aviation Flight Services, Inc.",1,0.0,0.0,0.0,0.0,...,0,0.0,0.0,0.0,0.0,0,0,0.0,0,2016
3,1,20408.0,5V,5V,Tatonduk Outfitters Limited d/b/a Everts Air A...,1,260848.0,0.0,0.0,0.0,...,522405,0.0,0.0,0.0,0.0,522405,569497,0.0,569497,2016
4,1,19917.0,5X,5X,United Parcel Service,3,32138000.0,9743000.0,16116000.0,2972000.0,...,34098000,9752000.0,17965000.0,3524000.0,31241000.0,65339000,34098000,31241000.0,65339000,2016


In [None]:
# Join flights to fuel on carrier AND month, with the keys spelled out.
# Without explicit keys pd.merge joins on every shared column name, which for
# these two tables is only 'month', so each flight would be paired with every
# fuel row of that month regardless of carrier.
# NOTE(review): 'mkt_unique_carrier' is used to match the other cells in this
# notebook; confirm whether 'op_unique_carrier' is the better match for the
# fuel table's 'unique_carrier'.
# NOTE(review): fuel also has a 'year' column, so one carrier/month can match
# several fuel rows; confirm whether year should be part of the key.
flights_fuel_join = pd.merge(
    flights,
    fuel,
    left_on=['mkt_unique_carrier', 'month'],
    right_on=['unique_carrier', 'month'],
)

- Find out distance covered monthly by different air carriers

In [34]:
# Total distance flown per marketing carrier per month.
# 'distance' is selected before aggregating so that only this column is summed;
# summing the whole frame first is wasteful and can fail on non-numeric columns
# (e.g. the datetime 'fl_date') in recent pandas releases.
flights.groupby(['mkt_unique_carrier', 'month'])['distance'].sum()

mkt_unique_carrier  month
AA                  1        33445645
                    2        30951169
                    3        35191884
                    4        33628664
                    5        36084358
                               ...   
WN                  8        25145359
                    9        23554376
                    10       25400190
                    11       24732653
                    12       25617860
Name: distance, Length: 123, dtype: int64

In [44]:
# Same carrier/month distance totals as a flat table: the group keys become
# ordinary columns and the total sits under the two-level ('distance', 'sum') header.
(
    flights
    .groupby(['mkt_unique_carrier', 'month'])[['distance']]
    .agg(['sum'])
    .reset_index()
)

Unnamed: 0_level_0,mkt_unique_carrier,month,distance
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,sum
0,AA,1,33445645
1,AA,2,30951169
2,AA,3,35191884
3,AA,4,33628664
4,AA,5,36084358
...,...,...,...
118,WN,8,25145359
119,WN,9,23554376
120,WN,10,25400190
121,WN,11,24732653


In [36]:
# Show the column labels of the flights table (now including 'month').
flights.columns

Index(['fl_date', 'mkt_unique_carrier', 'branded_code_share', 'mkt_carrier',
       'mkt_carrier_fl_num', 'op_unique_carrier', 'tail_num',
       'op_carrier_fl_num', 'origin_airport_id', 'origin', 'origin_city_name',
       'dest_airport_id', 'dest', 'dest_city_name', 'crs_dep_time', 'dep_time',
       'dep_delay', 'taxi_out', 'wheels_off', 'wheels_on', 'taxi_in',
       'crs_arr_time', 'arr_time', 'arr_delay', 'cancelled',
       'cancellation_code', 'diverted', 'dup', 'crs_elapsed_time',
       'actual_elapsed_time', 'air_time', 'flights', 'distance',
       'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay',
       'late_aircraft_delay', 'month'],
      dtype='object')

In [35]:
# Show the column labels of the fuel table.
fuel.columns

Index(['month', 'airline_id', 'unique_carrier', 'carrier', 'carrier_name',
       'carrier_group_new', 'sdomt_gallons', 'satl_gallons', 'spac_gallons',
       'slat_gallons', 'sint_gallons', 'ts_gallons', 'tdomt_gallons',
       'tint_gallons', 'total_gallons', 'sdomt_cost', 'satl_cost', 'spac_cost',
       'slat_cost', 'sint_cost', 'ts_cost', 'tdomt_cost', 'tint_cost',
       'total_cost', 'year'],
      dtype='object')

- Find out number of passengers that were carried by different air carriers

- Find out total fuel consumption per air carrier.

In [40]:
# Total gallons of fuel reported per carrier, summed over all rows in the table.
# 'total_gallons' is selected before aggregating so that only this column is
# summed, instead of aggregating every column of the fuel table first.
fuel.groupby('unique_carrier')['total_gallons'].sum()

unique_carrier
09Q     41034111.0
0JQ       838615.0
0WQ      3253401.0
1BQ      6672792.0
23Q       856284.0
          ...     
X9     176878857.0
XP      19013295.0
YV             0.0
YX     287098389.0
ZW     130683426.0
Name: total_gallons, Length: 62, dtype: float64