# Packages 

In [1]:
import pandas as pd 
import numpy as np
import boto3
import sagemaker
import time

# Load Data from S3 Public Bucket

## Import December Flights Information

In [2]:
# Using boto for s3
s3_client = boto3.client("s3")

# Bucket name
BUCKET='ads-508'

# file name
KEY='ONTIME_REPORTING_12.csv'

# Get object in bucket
dec_response = s3_client.get_object(Bucket=BUCKET, Key=KEY)

# read in data as pandas data frame
Dec_flight = pd.read_csv(dec_response.get("Body"))
Dec_flight.head()


Unnamed: 0,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN,ORIGIN_CITY_NAME,DEST_AIRPORT_ID,...,CRS_ELAPSED_TIME,ACTUAL_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 32
0,12,8,7,WN,N8651A,3689,15016,STL,"St. Louis, MO",14679,...,245.0,266.0,1557.0,7,0.0,0.0,18.0,0.0,0.0,
1,12,8,7,WN,N939WN,2600,15016,STL,"St. Louis, MO",14683,...,145.0,125.0,786.0,4,,,,,,
2,12,8,7,WN,N7741C,2770,15016,STL,"St. Louis, MO",14683,...,140.0,131.0,786.0,4,,,,,,
3,12,8,7,WN,N550WN,6654,15016,STL,"St. Louis, MO",14747,...,275.0,256.0,1709.0,7,,,,,,
4,12,8,7,WN,N8319F,3402,15016,STL,"St. Louis, MO",14771,...,270.0,256.0,1735.0,7,,,,,,


In [3]:
Dec_flight.isna().sum()

MONTH                       0
DAY_OF_MONTH                0
DAY_OF_WEEK                 0
OP_UNIQUE_CARRIER           0
TAIL_NUM                  457
OP_CARRIER_FL_NUM           0
ORIGIN_AIRPORT_ID           0
ORIGIN                      0
ORIGIN_CITY_NAME            0
DEST_AIRPORT_ID             0
DEST                        0
DEST_CITY_NAME              0
CRS_DEP_TIME                0
DEP_TIME                 5510
DEP_DELAY_NEW            5510
DEP_DEL15                5510
DEP_TIME_BLK                0
CRS_ARR_TIME                0
ARR_TIME                 6045
ARR_DELAY_NEW            7151
ARR_TIME_BLK                0
CANCELLED                   0
CANCELLATION_CODE      619970
CRS_ELAPSED_TIME            0
ACTUAL_ELAPSED_TIME      7151
DISTANCE                    0
DISTANCE_GROUP              0
CARRIER_DELAY          498818
WEATHER_DELAY          498818
NAS_DELAY              498818
SECURITY_DELAY         498818
LATE_AIRCRAFT_DELAY    498818
Unnamed: 32            625763
dtype: int

In [4]:
# Drop observations with missing departure time and tail number.  
Dec_flight.drop(Dec_flight.loc[Dec_flight['DEP_TIME'].isna()].index, axis=0, inplace=True) 
Dec_flight.drop(Dec_flight.loc[Dec_flight['ARR_TIME'].isna()].index, axis=0, inplace=True) 
 
# Drop cancelled flights 
Dec_flight.drop(Dec_flight[Dec_flight['CANCELLED'] == 1].index, inplace = True)

In [5]:
Dec_flight.isna().sum()

MONTH                       0
DAY_OF_MONTH                0
DAY_OF_WEEK                 0
OP_UNIQUE_CARRIER           0
TAIL_NUM                    0
OP_CARRIER_FL_NUM           0
ORIGIN_AIRPORT_ID           0
ORIGIN                      0
ORIGIN_CITY_NAME            0
DEST_AIRPORT_ID             0
DEST                        0
DEST_CITY_NAME              0
CRS_DEP_TIME                0
DEP_TIME                    0
DEP_DELAY_NEW               0
DEP_DEL15                   0
DEP_TIME_BLK                0
CRS_ARR_TIME                0
ARR_TIME                    0
ARR_DELAY_NEW            1106
ARR_TIME_BLK                0
CANCELLED                   0
CANCELLATION_CODE      619718
CRS_ELAPSED_TIME            0
ACTUAL_ELAPSED_TIME      1106
DISTANCE                    0
DISTANCE_GROUP              0
CARRIER_DELAY          492773
WEATHER_DELAY          492773
NAS_DELAY              492773
SECURITY_DELAY         492773
LATE_AIRCRAFT_DELAY    492773
Unnamed: 32            619718
dtype: int

In [6]:
# Investigate the observations with missing Arrival delay information 
MISSING_ARR_DELAY = Dec_flight[Dec_flight['ARR_DELAY_NEW'].isna()] 
 
# Manually calculate the arrival delay value and check if that solves the missing arrival delay information 
MISSING_ARR_DELAY['ARR_DELAY_NEW'] = MISSING_ARR_DELAY['ARR_TIME'] - MISSING_ARR_DELAY['CRS_ARR_TIME'] 
 
# Review the descriptive statistics for the new calculated values to ensure no negatives or errors 
MISSING_ARR_DELAY['ARR_DELAY_NEW'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


count    1106.000000
mean      -22.830922
std       786.554676
min     -2305.000000
25%       136.000000
50%       258.000000
75%       375.000000
max      1758.000000
Name: ARR_DELAY_NEW, dtype: float64

## Import Aircraft Information 

In [20]:
# file name
KEY= 'B43_AIRCRAFT_INVENTORY.csv'

# Get object in bucket
air_response = s3_client.get_object(Bucket=BUCKET, Key=KEY)

# Import aircraft information - tail numbers, manufacture year, and capacity information
aircraft = pd.read_csv(air_response.get("Body"), encoding='latin1', engine = 'python')
aircraft

# Drop Duplicates to retain a dictionary 
aircraft.drop_duplicates(subset='TAIL_NUM', inplace=True) 
aircraft.head()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 6: invalid start byte

In [8]:
pd.read_csv("../src/data/raw_data/B43_AIRCRAFT_INVENTORY.csv", engine = 'python')

Unnamed: 0,MANUFACTURE_YEAR,TAIL_NUM,NUMBER_OF_SEATS
0,1944,N54514,0.0
1,1945,N1651M,0.0
2,1953,N100CE,0.0
3,1953,N141FL,0.0
4,1953,N151FL,0.0
...,...,...,...
7378,2019,N14011,337.0
7379,2019,N16008,337.0
7380,2019,N16009,337.0
7381,2019,N2250U,276.0


## Import Decode Carrier Names Information

In [9]:
# file name
KEY= 'CARRIER_DECODE.csv'

# Get object in bucket
decode_response = s3_client.get_object(Bucket=BUCKET, Key=KEY)

# Create dictionary for consistent carrier names using OP_Unique_carrier and Airline_ID 
names = pd.read_csv(decode_response.get("Body"))
 
# Drop Duplicates to retain a dictionary 
names.drop_duplicates(subset='OP_UNIQUE_CARRIER', inplace=True) 
names = names.reset_index(drop=True)
names.head()

Unnamed: 0,AIRLINE_ID,OP_UNIQUE_CARRIER,CARRIER_NAME
0,21754,2PQ,21 Air LLC
1,20342,Q5,40-Mile Air
2,20342,WRB,40-Mile Air
3,19627,CIQ,A/S Conair
4,19072,AAE,AAA Airlines


## Import Employee Information

In [10]:
# file name
KEY= 'P10_EMPLOYEES.csv'

# Get object in bucket
emp_response = s3_client.get_object(Bucket=BUCKET, Key=KEY)

#Import Employee information 
employees = pd.read_csv(emp_response.get("Body"))

# Combine Carrier information for different entities and retain entitiy (only domestic), passenger handling (flight attendant), pass_gen_svc_admin (ground service), pilot_copilots and maintanence 
employees = employees[['OP_UNIQUE_CARRIER', 'ENTITY', 'PILOTS_COPILOTS', 'PASSENGER_HANDLING', 'PASS_GEN_SVC_ADMIN', 'MAINTENANCE']] 

# Drop on domestic entities 
employees.drop(employees[employees['ENTITY'] != 'D'].index, inplace = True) 
 
# Combine any remaining duplicates 
employees = employees.groupby('OP_UNIQUE_CARRIER').sum().reset_index() 
 
# Drop Parcel carriers (airlines with no flight attendants) 
employees.drop(employees[employees['PASSENGER_HANDLING'] == 0].index, inplace = True) 
employees = employees.reset_index(drop=True)
employees

Unnamed: 0,OP_UNIQUE_CARRIER,PILOTS_COPILOTS,PASSENGER_HANDLING,PASS_GEN_SVC_ADMIN,MAINTENANCE
0,AA,8586,8586,15502,9677
1,AS,2893,1062,5737,898
2,B6,2840,4905,3888,726
3,DL,9293,15331,15809,6122
4,F9,1473,2496,154,237
5,G4,953,200,1626,420
6,HA,586,893,1466,419
7,MQ,2109,4923,1510,1565
8,NK,2126,264,3592,395
9,OO,5175,1407,4076,2145


## Import Airport Weather Information

In [11]:
# file name
KEY= 'airport_weather_dec_2019.csv'

# Get object in bucket
weather_response = s3_client.get_object(Bucket=BUCKET, Key=KEY)

#Import weather airport information 
weather_report = pd.read_csv(weather_response.get("Body"))
weather_report.head()

Unnamed: 0,STATION,NAME,DATE,AWND,PGTM,PRCP,PSUN,SN32,SNOW,SNWD,...,WT02,WT03,WT04,WT05,WT06,WT07,WT08,WT09,WT10,WT18
0,USW00013874,ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...,12/1/2019,16.11,,0.04,0.0,0.0,64.0,67.0,...,,,,,,,,,,
1,USW00013904,"AUSTIN BERGSTROM INTERNATIONAL AIRPORT, TX US",12/1/2019,10.29,,0.0,0.0,0.0,62.0,66.0,...,,,,,,,,,,
2,USW00093721,"BALTIMORE WASHINGTON INTERNATIONAL AIRPORT, MD US",12/1/2019,8.05,,0.62,0.0,0.0,41.0,45.0,...,,1.0,,,,,,,,
3,USW00013881,"CHARLOTTE DOUGLAS AIRPORT, NC US",12/1/2019,10.29,,0.6,0.0,0.0,56.0,68.0,...,,,,,,,,,,
4,USW00093812,"CINCINNATI MUNICIPAL AIRPORT LUNKEN FIELD, OH US",12/1/2019,11.41,,0.09,,,,60.0,...,,,,,,,,,,


## Import Airport City Information

In [12]:
# file name
KEY= 'airports_list.csv'

# Get object in bucket
cities_response = s3_client.get_object(Bucket=BUCKET, Key=KEY)

# Import city and airport name dictionary
cities = pd.read_csv(cities_response.get("Body"))
cities.head()

Unnamed: 0,ORIGIN_AIRPORT_ID,DISPLAY_AIRPORT_NAME,ORIGIN_CITY_NAME,NAME
0,12992,Adams Field,"Little Rock, AR","NORTH LITTLE ROCK AIRPORT, AR US"
1,10257,Albany International,"Albany, NY","ALBANY INTERNATIONAL AIRPORT, NY US"
2,10140,Albuquerque International Sunport,"Albuquerque, NM","ALBUQUERQUE INTERNATIONAL AIRPORT, NM US"
3,10299,Anchorage International,"Anchorage, AK","ANCHORAGE TED STEVENS INTERNATIONAL AIRPORT, A..."
4,10397,Atlanta Municipal,"Atlanta, GA",ATLANTA HARTSFIELD JACKSON INTERNATIONAL AIRPO...


# Merge Data sets

In [13]:
# Merge weather report and cities for future merging 
weather_merge = pd.merge(cities, weather_report, how='left', on='NAME') 
weather_merge.head()

Unnamed: 0,ORIGIN_AIRPORT_ID,DISPLAY_AIRPORT_NAME,ORIGIN_CITY_NAME,NAME,STATION,DATE,AWND,PGTM,PRCP,PSUN,...,WT02,WT03,WT04,WT05,WT06,WT07,WT08,WT09,WT10,WT18
0,12992,Adams Field,"Little Rock, AR","NORTH LITTLE ROCK AIRPORT, AR US",USW00003952,12/1/2019,10.07,,0.0,0.0,...,,,,,,,,,,
1,12992,Adams Field,"Little Rock, AR","NORTH LITTLE ROCK AIRPORT, AR US",USW00003952,12/2/2019,4.03,,0.0,0.0,...,,,,,,,,,,
2,12992,Adams Field,"Little Rock, AR","NORTH LITTLE ROCK AIRPORT, AR US",USW00003952,12/3/2019,4.03,,0.0,0.0,...,,,,,,,,,,
3,12992,Adams Field,"Little Rock, AR","NORTH LITTLE ROCK AIRPORT, AR US",USW00003952,12/4/2019,2.91,,0.0,0.0,...,,,,,,,,,,
4,12992,Adams Field,"Little Rock, AR","NORTH LITTLE ROCK AIRPORT, AR US",USW00003952,12/5/2019,4.92,,0.01,0.0,...,,,,,,,,,,


In [14]:
# limit scope of weather metrics (date, precipitation, snow, max temp, and wind) 
weather = weather_merge[['DATE', 'PRCP', 'SNOW', 'SNWD','TMAX', 'AWND', 'ORIGIN_AIRPORT_ID']]
weather.isna().sum()

DATE                   1
PRCP                   1
SNOW                 621
SNWD                   2
TMAX                   9
AWND                   1
ORIGIN_AIRPORT_ID      0
dtype: int64

In [15]:
# Replace missing snow values with 0 for days with 0 preciptation 
weather.SNOW.where((~weather.SNOW.isna()) & (weather.PRCP != '0.00'), 
                     '0.00', inplace=True) 
 
# Replace missing snow depth (SNWD) values with 0 for days with no snow 
weather.SNWD.where((~weather.SNWD.isna()) & (weather.SNOW != '0.00'), 
                     '0.00', inplace=True)

  res_values = method(rvalues)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [16]:
weather.isna().sum()

DATE                 1
PRCP                 1
SNOW                 0
SNWD                 0
TMAX                 9
AWND                 1
ORIGIN_AIRPORT_ID    0
dtype: int64

In [17]:
# Change date format to match flight data (Month & Day_of_Month) 
weather['DATE'] = pd.to_datetime(weather['DATE']) 
weather['MONTH'] = pd.DatetimeIndex(weather['DATE']).month 
weather['DAY_OF_MONTH'] = pd.DatetimeIndex(weather['DATE']).day 
weather

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,DATE,PRCP,SNOW,SNWD,TMAX,AWND,ORIGIN_AIRPORT_ID,MONTH,DAY_OF_MONTH
0,2019-12-01,0.00,0.00,0.00,270.0,10.07,12992,12.0,1.0
1,2019-12-02,0.00,0.00,0.00,40.0,4.03,12992,12.0,2.0
2,2019-12-03,0.00,0.00,0.00,240.0,4.03,12992,12.0,3.0
3,2019-12-04,0.00,0.00,0.00,320.0,2.91,12992,12.0,4.0
4,2019-12-05,0.01,0.00,0.00,150.0,4.92,12992,12.0,5.0
...,...,...,...,...,...,...,...,...,...
2972,2019-12-27,0.00,32,35,320.0,5.82,10713,12.0,27.0
2973,2019-12-28,0.00,29,39,300.0,2.24,10713,12.0,28.0
2974,2019-12-29,0.04,31,32,140.0,6.26,10713,12.0,29.0
2975,2019-12-30,0.00,31,34,130.0,2.46,10713,12.0,30.0


In [18]:
# Merge aircraft information into flight data file 
Dec_flight = pd.merge(Dec_flight, aircraft, how="left", on='TAIL_NUM')

KeyError: 'TAIL_NUM'

In [None]:
#Merge names and flight data for consistent airline names 
Dec_flight = pd.merge(Dec_flight, names, how='left', on=['OP_UNIQUE_CARRIER'])

In [None]:
#Merge Employee
Dec_flight = pd.merge(Dec_flight, employees, how='left', on=['OP_UNIQUE_CARRIER'])

In [None]:
Dec_flight.isna().sum()

In [None]:
#rop airlines with missing employee data 
Dec_flight.drop(Dec_flight.loc[Dec_flight['PASSENGER_HANDLING'].isna()].index, axis=0, inplace=True)

In [None]:
Semi_final = Dec_flight.reset_index(drop=True)
Semi_final.isna().sum()

In [None]:
# Dropping Redundant feature 
Final = Semi_final.drop(columns=['ORIGIN_AIRPORT_ID', 'ORIGIN_CITY_NAME', 'DEST_AIRPORT_ID', 'DEST_CITY_NAME'])
 
# Dropping extensive missing features 
Final = Final.drop(columns=['CARRIER_DELAY', 'WEATHER_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'NAS_DELAY', 'CANCELLATION_CODE', 'Unnamed: 32', 'CANCELLED']) 

In [None]:
Final.isna().sum()

# Release Resources

In [None]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}