**Predicting Airline Delays**<br>

Notebook: Feature Reduction
  
Team: Jimmy Nguyen, Maha Jayapal, Roberto Cancel<br>

In [1]:
!pip install --upgrade numpy #ensure numpy and pandas are upgraded to same versions for easier exploration (avoiding errors)
!pip install --upgrade pandas #ensure numpy and pandas are upgraded to same versions for easier exploration (avoiding errors)

# IMPORT LIBRARIES REQUIRED THROUGHOUT THE NOTEBOOK
import boto3 # AWS SDK for Python
import pandas as pd # for importing and manipulating data
import numpy as np
import io # for encoding issues with raw data sets
from io import StringIO # converting dataframe to csv and uploading to s3 bucket /tranformed folder

  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m
  from cryptography.utils import int_from_bytes
  from cryptography.utils import int_from_bytes
You should consider upgrading via the '/opt/conda/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
# IDENTIFY FILES IN S3 BUCKET
session = boto3.Session()

#Then use the session to get the resource
s3 = session.resource('s3')

my_bucket = s3.Bucket('ads-508-airline')

for my_bucket_object in my_bucket.objects.all():
    print(my_bucket_object.key)


raw/
raw/B43_AIRCRAFT_INVENTORY.csv
raw/CARRIER_DECODE.csv
raw/ONTIME_REPORTING_12.csv
raw/P10_EMPLOYEES.csv
raw/airport_weather_dec_2019.csv
raw/airports_list.csv
transformed/
transformed/B43_AIRCRAFT_INVENTORY.csv
transformed/CARRIER_DECODE.csv
transformed/ON_TIME_REPORTING_12.csv
transformed/P10_EMPLOYEES.csv
transformed/airport_weather_dec_2019.csv
transformed/airports_list.csv


xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx<br>
INGEST DATA SETS<br>
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx<br>

In [3]:
# INGEST FLIGHT DATA

s3_client = boto3.client("s3")

BUCKET='ads-508-airline'
KEY='raw/ONTIME_REPORTING_12.csv'

response = s3_client.get_object(Bucket=BUCKET, Key=KEY)
dec_flight = pd.read_csv(response.get("Body"))
dec_flight.head()

Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,...,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 32
0,12,8,7,WN,N8651A,3689,15016,STL,"St. Louis, MO",14679,...,245.0,266.0,1557.0,7,0.0,0.0,18.0,0.0,0.0,
1,12,8,7,WN,N939WN,2600,15016,STL,"St. Louis, MO",14683,...,145.0,125.0,786.0,4,,,,,,
2,12,8,7,WN,N7741C,2770,15016,STL,"St. Louis, MO",14683,...,140.0,131.0,786.0,4,,,,,,
3,12,8,7,WN,N550WN,6654,15016,STL,"St. Louis, MO",14747,...,275.0,256.0,1709.0,7,,,,,,
4,12,8,7,WN,N8319F,3402,15016,STL,"St. Louis, MO",14771,...,270.0,256.0,1735.0,7,,,,,,


In [4]:
# INGEST AIRCRAFT DATA - raw data that requires encoding='latin1'

KEY='raw/B43_AIRCRAFT_INVENTORY.csv'

response = s3_client.get_object(Bucket=BUCKET, Key=KEY)
s3_data = io.BytesIO(response.get('Body').read())
aircraft = pd.read_csv(s3_data, encoding='latin1')
aircraft.head()

Unnamed: 0,MANUFACTURE_YEAR,TAIL_NUM,NUMBER_OF_SEATS
0,1944,N54514,0.0
1,1945,N1651M,0.0
2,1953,N100CE,0.0
3,1953,N141FL,0.0
4,1953,N151FL,0.0


In [5]:
# INGEST CARRIER NAMES DICTIONARY

KEY='raw/CARRIER_DECODE.csv'

response = s3_client.get_object(Bucket=BUCKET, Key=KEY)
names = pd.read_csv(response.get("Body"))
names.head()


Unnamed: 0,AIRLINE_ID,OP_UNIQUE_CARRIER,CARRIER_NAME
0,21754,2PQ,21 Air LLC
1,21754,2PQ,21 Air LLC
2,21754,2PQ,21 Air LLC
3,20342,Q5,40-Mile Air
4,20342,WRB,40-Mile Air


In [6]:
# INGEST CARRIER EMPLOYEE / STAFFING DATA

KEY='raw/P10_EMPLOYEES.csv'

response = s3_client.get_object(Bucket=BUCKET, Key=KEY)
employees = pd.read_csv(response.get("Body"))
employees.head()

Unnamed: 0,YEAR,AIRLINE_ID,OP_UNIQUE_CARRIER,UNIQUE_CARRIER_NAME,CARRIER,CARRIER_NAME,ENTITY,GENERAL_MANAGE,PILOTS_COPILOTS,OTHER_FLT_PERS,...,GEN_ARCFT_TRAF_HANDLING,AIRCRAFT_CONTROL,PASSENGER_HANDLING,CARGO_HANDLING,TRAINEES_INTRUCTOR,STATISTICAL,TRAFFIC_SOLICITERS,OTHER,TRANSPORT_RELATED,TOTAL
0,2019,21352,0WQ,Avjet Corporation,0WQ,Avjet Corporation,D,4,53,6,...,0,0,0,3,1,18,0,7,0,161
1,2019,21492,1BQ,"Eastern Airlines f/k/a Dynamic Airways, LLC",1BQ,"Eastern Airlines f/k/a Dynamic Airways, LLC",I,14,50,0,...,0,0,0,0,1,13,0,3,0,161
2,2019,21712,2HQ,Elite Airways LLC,2HQ,Elite Airways LLC,D,9,32,0,...,0,0,0,0,0,7,0,0,0,123
3,2019,21974,3EQ,"Scott Aviation, LLC d/b/a Silver Air",3EQ,"Scott Aviation, LLC d/b/a Silver Air",D,0,29,0,...,0,0,0,0,0,0,0,0,0,69
4,2019,20408,5V,Tatonduk Outfitters Limited d/b/a Everts Air A...,5V,Tatonduk Outfitters Limited d/b/a Everts Air A...,D,14,54,11,...,0,0,0,0,4,45,5,20,0,347


In [7]:
# INGEST DECEMBER 2019 DAILY WEATHER OBSERVATIONS

KEY='raw/airport_weather_dec_2019.csv'

response = s3_client.get_object(Bucket=BUCKET, Key=KEY)
weather_report = pd.read_csv(response.get("Body"))
weather_report.head()

Unnamed: 0,STATION,NAME,DATE,AWND,PGTM,PRCP,PSUN,SN32,SNOW,SNWD,...,WT02,WT03,WT04,WT05,WT06,WT07,WT08,WT09,WT10,WT18
0,USW00013874,ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...,12/1/2019,16.11,,0.04,0.0,0.0,64.0,67.0,...,,,,,,,,,,
1,USW00013904,"AUSTIN BERGSTROM INTERNATIONAL AIRPORT, TX US",12/1/2019,10.29,,0.0,0.0,0.0,62.0,66.0,...,,,,,,,,,,
2,USW00093721,"BALTIMORE WASHINGTON INTERNATIONAL AIRPORT, MD US",12/1/2019,8.05,,0.62,0.0,0.0,41.0,45.0,...,,1.0,,,,,,,,
3,USW00013881,"CHARLOTTE DOUGLAS AIRPORT, NC US",12/1/2019,10.29,,0.6,0.0,0.0,56.0,68.0,...,,,,,,,,,,
4,USW00093812,"CINCINNATI MUNICIPAL AIRPORT LUNKEN FIELD, OH US",12/1/2019,11.41,,0.09,,,,60.0,...,,,,,,,,,,


In [8]:
# INGEST CITY AND AIRPORT NAME DICTIONARY

KEY='raw/airports_list.csv'

response = s3_client.get_object(Bucket=BUCKET, Key=KEY)
cities = pd.read_csv(response.get("Body"))
cities.head()

Unnamed: 0,ORIGIN_AIRPORT_ID,DISPLAY_AIRPORT_NAME,ORIGIN_CITY_NAME,NAME
0,12992,Adams Field,"Little Rock, AR","NORTH LITTLE ROCK AIRPORT, AR US"
1,10257,Albany International,"Albany, NY","ALBANY INTERNATIONAL AIRPORT, NY US"
2,10140,Albuquerque International Sunport,"Albuquerque, NM","ALBUQUERQUE INTERNATIONAL AIRPORT, NM US"
3,10299,Anchorage International,"Anchorage, AK","ANCHORAGE TED STEVENS INTERNATIONAL AIRPORT, A..."
4,10397,Atlanta Municipal,"Atlanta, GA",ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...


xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx<br>
REMOVE REDUNDANT/IRRELEVANT DEC_FLIGHT FEATURES DATA<br>
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx<br>

Rationalization for dropped features:

MONTH - MONTH OF THE YEAR  - ** DROP SINCE THIS STUDY IS ONLY FOR DECEMBER **  <br>
OP_CARRIER_FL_NUM - CARRIER FLIGHT NUMBER - DROP - IRRELEVANT FOR PROJECT SCOPE<br>
ORIGIN_AIRPORT_ID - DEPARTURE AIRPORT ID - DROP - REDUNDANT AIRPORT ID <br>
ORIGIN_CITY_NAME - DEPARTURE CITY NAME, STATE - DROP - REDUNDANT AIRPORT ID  <br>
DEST_AIRPORT_ID - ARRIVAL AIRPORT ID - DROP - REDUNDANT AIRPORT ID  <br>
DEST_CITY_NAME - ARRIVAL CITY NAME, STATE - DROP - REDUNDANT AIRPORT ID<br>
CRS_DEP_TIME - SCHEDULED DEPARTURE TIME (local time: hhmm) - DROP -  REDUNDANT - DEP_TIME_BLK  <br>
DEP_TIME - ACTUAL DEPARTURE TIME (local time: hhmm)  - DROP REDUNDANT - DEP-DELAY15    <br>
DEP_DELAY_NEW - NUMBER OF MINUTES DELAYED  (EARLY=0) - DROP REDUNDANT - DEP-DELAY15 <br>
CRS_ARR_TIME - SCHEDULED ARRIVAL TIME (local time: hhmm) - DROP REDUNDANT - ARR_TIME_BLK  <br>
ARR_TIME - ACTUAL ARRIVAL TIME (local time: hhmm) - DROP -  IRRELEVANT FOR PROJECT SCOPE  <br>
ARR_DELAY_NEW - NUMBER OF MINUTES ARRIVAL DELAYED - DROP - IRRELEVANT    <br>
CANCELLATION_CODE - CANCELLED FLIGHT CODE - DROP -  REDUNDANT - CANCELLED  <br>
ACTUAL_ELAPSED_TIME - ACTUAL ELAPSED TIME - DROP -  IRRELEVANT FOR PROJECT SCOPE  <br>
Unnamed: 32 - BLANK ERROR CELL FROM SOURCE ** DROP ERROR ** <br>


In [9]:
# Dropping Redundant and Irrelevant features (flight specific, redundant airport ids, all actual departure data, all actual arrival data)
flight_no = ['MONTH', 'OP_CARRIER_FL_NUM', 'ORIGIN_AIRPORT_ID', 'ORIGIN_CITY_NAME', 'DEST_AIRPORT_ID', 'DEST_CITY_NAME', 'CRS_DEP_TIME', 
           'DEP_TIME', 'DEP_DELAY_NEW', 'CRS_ARR_TIME', 'ARR_TIME', 'ARR_DELAY_NEW', 'CANCELLATION_CODE', 'ACTUAL_ELAPSED_TIME', 'Unnamed: 32']
dec_flight.drop(flight_no, inplace=True, axis=1)

In [10]:
# Save updated flight info to transformed folder in bucket
csv_buffer=StringIO()
dec_flight.to_csv(csv_buffer, index=False)

BUCKET_NAME = 'ads-508-airline'
FileName= 'transformed/ON_TIME_REPORTING_12.csv'

s3csv = boto3.client('s3')

response=s3csv.put_object(Body=csv_buffer.getvalue(),
                           Bucket=BUCKET_NAME,
                           Key=FileName)

xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx<br>
MASSAGE NAMES DATA<br>
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx<br>

In [11]:
names

Unnamed: 0,AIRLINE_ID,OP_UNIQUE_CARRIER,CARRIER_NAME
0,21754,2PQ,21 Air LLC
1,21754,2PQ,21 Air LLC
2,21754,2PQ,21 Air LLC
3,20342,Q5,40-Mile Air
4,20342,WRB,40-Mile Air
...,...,...,...
2705,20379,ZKQ,Zantop International
2706,19771,ZAQ,Zas Airline Of Egypt
2707,21118,37,Zeal 320
2708,22069,ZG,ZIPAIR Tokyo Inc.


In [12]:
# Drop Duplicates to retain a dictionary
names.drop_duplicates(subset='OP_UNIQUE_CARRIER', inplace=True)
names = names.reset_index(drop=True)
names

Unnamed: 0,AIRLINE_ID,OP_UNIQUE_CARRIER,CARRIER_NAME
0,21754,2PQ,21 Air LLC
1,20342,Q5,40-Mile Air
2,20342,WRB,40-Mile Air
3,19627,CIQ,A/S Conair
4,19072,AAE,AAA Airlines
...,...,...,...
1739,20379,ZKQ,Zantop International
1740,19771,ZAQ,Zas Airline Of Egypt
1741,21118,37,Zeal 320
1742,22069,ZG,ZIPAIR Tokyo Inc.


In [13]:
# Save updated flight info to transformed folder in bucket
csv_buffer=StringIO()
names.to_csv(csv_buffer, index=False)

BUCKET_NAME = 'ads-508-airline'
FileName= 'transformed/CARRIER_DECODE.csv'

s3csv = boto3.client('s3')

response=s3csv.put_object(Body=csv_buffer.getvalue(),
                           Bucket=BUCKET_NAME,
                           Key=FileName)

xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx<br>
MASSAGE EMPLOYEES DATA<br>
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx<br>

In [14]:
employees

Unnamed: 0,YEAR,AIRLINE_ID,OP_UNIQUE_CARRIER,UNIQUE_CARRIER_NAME,CARRIER,CARRIER_NAME,ENTITY,GENERAL_MANAGE,PILOTS_COPILOTS,OTHER_FLT_PERS,...,GEN_ARCFT_TRAF_HANDLING,AIRCRAFT_CONTROL,PASSENGER_HANDLING,CARGO_HANDLING,TRAINEES_INTRUCTOR,STATISTICAL,TRAFFIC_SOLICITERS,OTHER,TRANSPORT_RELATED,TOTAL
0,2019,21352,0WQ,Avjet Corporation,0WQ,Avjet Corporation,D,4,53,6,...,0,0,0,3,1,18,0,7,0,161
1,2019,21492,1BQ,"Eastern Airlines f/k/a Dynamic Airways, LLC",1BQ,"Eastern Airlines f/k/a Dynamic Airways, LLC",I,14,50,0,...,0,0,0,0,1,13,0,3,0,161
2,2019,21712,2HQ,Elite Airways LLC,2HQ,Elite Airways LLC,D,9,32,0,...,0,0,0,0,0,7,0,0,0,123
3,2019,21974,3EQ,"Scott Aviation, LLC d/b/a Silver Air",3EQ,"Scott Aviation, LLC d/b/a Silver Air",D,0,29,0,...,0,0,0,0,0,0,0,0,0,69
4,2019,20408,5V,Tatonduk Outfitters Limited d/b/a Everts Air A...,5V,Tatonduk Outfitters Limited d/b/a Everts Air A...,D,14,54,11,...,0,0,0,0,4,45,5,20,0,347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,2019,20207,XP,TEM Enterprises dba Avelo Airlines,XP,XTRA Airways,D,10,3,4,...,0,0,0,0,0,0,0,0,0,21
85,2019,20378,YV,Mesa Airlines Inc.,YV,Mesa Airlines Inc.,D,93,1312,0,...,55,57,0,0,4,93,0,109,0,2962
86,2019,20378,YV,Mesa Airlines Inc.,YV,Mesa Airlines Inc.,L,8,110,0,...,5,5,0,0,0,8,0,9,0,249
87,2019,20452,YX,Republic Airline,YX,Republic Airline,D,37,2444,19,...,0,183,23,0,67,267,0,260,0,6360


In [15]:
# Combine Carrier information for different entities and retain entitiy (only domestic), passenger handling (flight attendant), pass_gen_svc_admin (ground service), pilot_copilots and maintanence
employees = employees[['OP_UNIQUE_CARRIER', 'ENTITY', 'PILOTS_COPILOTS', 'PASSENGER_HANDLING', 'PASS_GEN_SVC_ADMIN', 'MAINTENANCE']]
# Drop on domestic entities
employees.drop(employees[employees['ENTITY'] != 'D'].index, inplace = True)
# Combine any remaining duplicates
employees = employees.groupby('OP_UNIQUE_CARRIER').sum().reset_index()
# Drop Parcel carriers (airlines with no flight attendants)
employees.drop(employees[employees['PASSENGER_HANDLING'] == 0].index, inplace = True)
employees = employees.reset_index(drop=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [16]:
employees

Unnamed: 0,OP_UNIQUE_CARRIER,PILOTS_COPILOTS,PASSENGER_HANDLING,PASS_GEN_SVC_ADMIN,MAINTENANCE
0,AA,8586,8586,15502,9677
1,AS,2893,1062,5737,898
2,B6,2840,4905,3888,726
3,DL,9293,15331,15809,6122
4,F9,1473,2496,154,237
5,G4,953,200,1626,420
6,HA,586,893,1466,419
7,MQ,2109,4923,1510,1565
8,NK,2126,264,3592,395
9,OO,5175,1407,4076,2145


In [17]:
# Save updated flight info to transformed folder in bucket
csv_buffer=StringIO()
employees.to_csv(csv_buffer, index=False)

BUCKET_NAME = 'ads-508-airline'
FileName= 'transformed/P10_EMPLOYEES.csv'

s3csv = boto3.client('s3')

response=s3csv.put_object(Body=csv_buffer.getvalue(),
                           Bucket=BUCKET_NAME,
                           Key=FileName)

xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx<br>
MASSAGE WEATHER DATA<br>
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx<br>

In [18]:
# limit scope of weather metrics (date, precipitation, snow, max temp, and wind)
weather = weather_report[['DATE', 'NAME', 'PRCP', 'SNOW', 'SNWD','TMAX', 'AWND']]

In [19]:
# Save weather flight info to transformed folder in bucket
csv_buffer=StringIO()
weather.to_csv(csv_buffer, index=False)

BUCKET_NAME = 'ads-508-airline'
FileName= 'transformed/airport_weather_dec_2019.csv'

s3csv = boto3.client('s3')

response=s3csv.put_object(Body=csv_buffer.getvalue(),
                           Bucket=BUCKET_NAME,
                           Key=FileName)