In [1]:
import pandas as pd
import numpy as np
import datetime
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Information about the data
The U.S. Department of Transportation's (DOT) Bureau of Transportation Statistics tracks the on-time performance of domestic flights operated by large air carriers. Summary information on the number of on-time, delayed, canceled, and diverted flights is published in DOT's monthly Air Travel Consumer Report and in this dataset of 2015 flight delays and cancellations.

#### Acknowledgements
The flight delay and cancellation data was collected and published by the DOT's Bureau of Transportation Statistics.

In [2]:
#ingest data
#airlines= pd.read_csv('airlines.csv')
#airports= pd.read_csv('airports.csv')
flights = pd.read_csv('flights.csv')

pd.set_option("display.max_columns", 100)

### Data Processing

#### Cleaning the data and augmenting


###### For Flights
- convert to columns to a single date 
- convert integer columns to a time or minutes from data
- add if its a holiday, weekend as a binary term

In [3]:
flights.head(5)

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,2354.0,-11.0,21.0,15.0,205.0,194.0,169.0,1448,404.0,4.0,430,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,2.0,-8.0,12.0,14.0,280.0,279.0,263.0,2330,737.0,4.0,750,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,18.0,-2.0,16.0,34.0,286.0,293.0,266.0,2296,800.0,11.0,806,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,15.0,-5.0,15.0,30.0,285.0,281.0,258.0,2342,748.0,8.0,805,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,24.0,-1.0,11.0,35.0,235.0,215.0,199.0,1448,254.0,5.0,320,259.0,-21.0,0,0,,,,,,


In [4]:
#Take all the date columns and combine into a single column
#Currently there is a separate column for month, day, year
flights['DATE'] = pd.to_datetime(flights[['YEAR','MONTH', 'DAY']])

#remove redundant columns
flights.drop(['YEAR','MONTH', 'DAY'], axis=1,inplace=True)

In [5]:
#################################################################
#Realized that there are a bunch of 5 digit integers as the origin_airport instead of the 3 digit code
#display(successful_flights['ORIGIN_AIRPORT'].unique())
#display(successful_flights['DESTINATION_AIRPORT'].unique())
#################################################################


unique_codes=list(flights['ORIGIN_AIRPORT'].unique())
unique_codes_str = []
#we want only the number codes in the data
for x in unique_codes:
    if isinstance(x,int):
        unique_codes_str.append(str(x))
    elif x.isnumeric():
        unique_codes_str.append(x)

print(unique_codes_str)
#build our dictionary to translate the codes to the letter codes        
airport_numeric_codes = pd.read_csv('airport_num_codes.csv')
airport_numeric_codes['Description']= airport_numeric_codes['Description'].astype(str)


airport_letter_codes = pd.read_csv('airports_letter_codes.csv')
airport_letter_codes['Description']= airport_letter_codes['Description'].astype(str)

airport_codes = airport_numeric_codes.merge(airport_letter_codes, on=['Description'])
airport_codes.drop(['Description'], axis=1, inplace=True)
    
code_dict = dict(zip(airport_codes.Codes,airport_codes.Code))
#since this is a huge dictionary, need to trim it (exponentially improves the speed)
new_data = {k: v for k, v in code_dict.items() if str(k) in unique_codes_str}
#replace the number codes with letter ones
flights['ORIGIN_AIRPORT'].replace(new_data,inplace=True)
flights['DESTINATION_AIRPORT'].replace(new_data,inplace=True)


['14747', '14771', '12889', '12892', '14869', '10299', '11292', '14107', '11630', '10732', '14254', '10141', '10627', '11982', '12173', '13930', '14683', '12266', '11618', '10721', '13487', '11884', '15919', '13851', '11111', '10693', '12191', '14783', '15016', '14487', '10423', '15370', '11953', '13891', '15376', '11778', '11278', '14100', '13204', '15304', '11637', '14842', '10155', '11775', '11298', '11057', '13931', '10821', '14122', '11049', '10990', '10631', '13158', '14108', '13198', '11447', '12206', '13495', '14057', '15624', '10747', '15411', '12891', '10994', '13256', '10792', '14492', '12451', '13127', '10781', '14960', '12278', '14685', '11995', '13485', '11977', '10257', '13796', '13232', '13296', '14570', '14893', '14524', '12217', '10713', '10208', '10136', '11603', '14689', '11471', '11315', '13264', '12478', '14814', '11308', '11066', '12896', '10397', '14307', '11721', '11140', '10185', '13277', '11203', '13342', '11433', '11697', '12953', '10599', '12156', '14952', 

In [6]:
#American holidays for 2015
american_holidays = ['2015-01-01', '2015-01-19','2015-02-16','2015-05-25','2015-07-03','2015-09-07','2015-10-12','2015-11-11','2015-11-26','2015-12-25']

# - Change time from integer to actual time/date or minutes from a time
def format_time(value):
    if np.isnan(value):
        return np.nan
    if value == 2400: value = 0
    value = "{0:04d}".format(int(value))
    formatted_time = datetime.time(int(value[0:2]), int(value[2:4]))
    return formatted_time

#Add a binary term if it is a holiday or weekend
flights['weekend'] = np.where(flights['DAY_OF_WEEK']> 5, True, False)
flights['holiday'] = np.where(flights['DATE'].isin(american_holidays), True, False)

#Add a binary term if it is a long flight > 1400 miles
flights['long_flight'] = np.where(flights['DISTANCE']>1400, True, False)


#########################################################
# subset of data where flight is either cancelled or diverted
#x = data[(data['Cancelled']==1) 
########################################################
successful_flights = flights[np.isnan(flights['ARRIVAL_TIME'])==False]
cancelled_flights = flights[np.isnan(flights['ARRIVAL_TIME'])]

#Cancelled flights, 
# - delete all empty columns
cancelled_flights.drop(['DEPARTURE_TIME','DEPARTURE_DELAY','TAXI_OUT','WHEELS_OFF','ELAPSED_TIME','AIR_TIME',
                        'TAXI_IN','WHEELS_ON','ARRIVAL_TIME','ARRIVAL_DELAY','AIR_SYSTEM_DELAY','SECURITY_DELAY','AIRLINE_DELAY'
                        ,'LATE_AIRCRAFT_DELAY','WEATHER_DELAY'], axis=1,inplace=True)
# - change time format
cancelled_flights['SCHEDULED_DEPARTURE'] = flights['SCHEDULED_DEPARTURE'].apply(format_time)
cancelled_flights['SCHEDULED_ARRIVAL'] = flights['SCHEDULED_ARRIVAL'].apply(format_time)

#Successful flights,
# - fill NaN values with 0 for AIR_SYSTEM_DELAY, SECURITY_DELAY , AIRLINE_DELAY, LATE_AIRCRAFT_DELAY, WEATHER_DELAY
successful_flights=successful_flights.fillna(0)
# - delete diverted, cancelled and cancelled reason
successful_flights.drop(['DIVERTED','CANCELLED', 'CANCELLATION_REASON'], axis=1,inplace=True)

# - change time format
successful_flights['SCHEDULED_DEPARTURE'] = flights['SCHEDULED_DEPARTURE'].apply(format_time)
successful_flights['DEPARTURE_TIME'] = flights['DEPARTURE_TIME'].apply(format_time)
successful_flights['SCHEDULED_ARRIVAL'] = flights['SCHEDULED_ARRIVAL'].apply(format_time)
successful_flights['ARRIVAL_TIME'] = flights['ARRIVAL_TIME'].apply(format_time)
successful_flights['WHEELS_OFF'] = flights['WHEELS_OFF'].apply(format_time)
successful_flights['WHEELS_ON'] = flights['WHEELS_ON'].apply(format_time)

In [7]:
#make sure everything looks right
successful_flights.head(5)

Unnamed: 0,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,DATE,weekend,holiday,long_flight
0,4,AS,98,N407AS,ANC,SEA,00:05:00,23:54:00,-11.0,21.0,00:15:00,205.0,194.0,169.0,1448,04:04:00,4.0,04:30:00,04:08:00,-22.0,0.0,0.0,0.0,0.0,0.0,2015-01-01,False,True,True
1,4,AA,2336,N3KUAA,LAX,PBI,00:10:00,00:02:00,-8.0,12.0,00:14:00,280.0,279.0,263.0,2330,07:37:00,4.0,07:50:00,07:41:00,-9.0,0.0,0.0,0.0,0.0,0.0,2015-01-01,False,True,True
2,4,US,840,N171US,SFO,CLT,00:20:00,00:18:00,-2.0,16.0,00:34:00,286.0,293.0,266.0,2296,08:00:00,11.0,08:06:00,08:11:00,5.0,0.0,0.0,0.0,0.0,0.0,2015-01-01,False,True,True
3,4,AA,258,N3HYAA,LAX,MIA,00:20:00,00:15:00,-5.0,15.0,00:30:00,285.0,281.0,258.0,2342,07:48:00,8.0,08:05:00,07:56:00,-9.0,0.0,0.0,0.0,0.0,0.0,2015-01-01,False,True,True
4,4,AS,135,N527AS,SEA,ANC,00:25:00,00:24:00,-1.0,11.0,00:35:00,235.0,215.0,199.0,1448,02:54:00,5.0,03:20:00,02:59:00,-21.0,0.0,0.0,0.0,0.0,0.0,2015-01-01,False,True,True


###### For Airport:
- get average delay for each source airport 
- average taxi time for each source airport
- most common delay reason

In [8]:
#Building the new Airports dataframe
airport_data = successful_flights.groupby(by='ORIGIN_AIRPORT').agg({'ARRIVAL_DELAY':'mean'})
airport_data['destination_airport_avg_delay'] = successful_flights.groupby(by='DESTINATION_AIRPORT').agg({'ARRIVAL_DELAY':'mean'})
airport_data['airport_avg_taxi_out_time'] = successful_flights.groupby(by='DESTINATION_AIRPORT').agg({'TAXI_OUT':'mean'})
airport_data['airport_avg_taxi_in_time'] =  successful_flights.groupby(by='ORIGIN_AIRPORT').agg({'TAXI_IN':'mean'})
airport_data['total_airline_delay'] =  successful_flights.groupby(by='ORIGIN_AIRPORT').agg({'AIRLINE_DELAY':'sum'})
airport_data['total_security_delay'] =  successful_flights.groupby(by='ORIGIN_AIRPORT').agg({'SECURITY_DELAY':'sum'})
airport_data['total_air_system_delay'] =  successful_flights.groupby(by='ORIGIN_AIRPORT').agg({'AIR_SYSTEM_DELAY':'sum'})
airport_data['total_late_aircraft_delay'] =  successful_flights.groupby(by='ORIGIN_AIRPORT').agg({'LATE_AIRCRAFT_DELAY':'sum'})
airport_data['total_weather_delay'] =  successful_flights.groupby(by='ORIGIN_AIRPORT').agg({'WEATHER_DELAY':'sum'})
airport_data.rename(columns={'ARRIVAL_DELAY': 'source_airport_avg_delay'},inplace=True)
airport_data

Unnamed: 0_level_0,source_airport_avg_delay,destination_airport_avg_delay,airport_avg_taxi_out_time,airport_avg_taxi_in_time,total_airline_delay,total_security_delay,total_air_system_delay,total_late_aircraft_delay,total_weather_delay
ORIGIN_AIRPORT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10135,-11.461538,-12.090909,17.545455,6.923077,0.0,0.0,0.0,23.0,0.0
10136,2.166667,26.909091,14.363636,9.250000,0.0,0.0,0.0,0.0,146.0
10140,7.048544,6.361905,13.885714,7.669903,463.0,0.0,185.0,458.0,152.0
10141,1.750000,-6.250000,16.250000,10.000000,0.0,4.0,14.0,0.0,0.0
10146,-7.200000,-10.800000,14.400000,6.400000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
WRG,4.937411,5.616361,11.212976,4.165007,646.0,30.0,578.0,6054.0,112.0
WYS,-1.865385,4.173077,20.192308,5.975962,258.0,0.0,98.0,472.0,22.0
XNA,10.269605,8.096960,19.449753,11.421682,39780.0,11.0,44038.0,60428.0,12013.0
YAK,-5.483051,0.369014,11.691549,3.940678,401.0,0.0,345.0,3223.0,76.0


###### For Airline
- average delay for each carrier
- average difference between schedule and elapsed time
- percentage of flights delayed by carrier 
- average wheels up 
- most common delay reason

In [9]:
airline_avg_arr_delay = successful_flights.groupby(by='AIRLINE').agg({'ARRIVAL_DELAY':'mean'})
airline_avg_dep_delay = successful_flights.groupby(by='AIRLINE').agg({'DEPARTURE_DELAY':'mean'})
sched_elapsed_diff = successful_flights['SCHEDULED_TIME']- successful_flights['ELAPSED_TIME']
successful_flights["sched_elapsed_diff"] = sched_elapsed_diff
airline_scheduled_elapsed_diff = successful_flights.groupby(by='AIRLINE').agg({'sched_elapsed_diff':'mean'})
successful_flights['DELAYED'] = np.where(successful_flights['ARRIVAL_DELAY']>10, 1, 0)
num_delayed_flight = successful_flights.groupby(by='AIRLINE').agg({'DELAYED':'sum'}) 
total_num_flight = successful_flights.groupby(by='AIRLINE').agg({'SCHEDULED_DEPARTURE':'count'})  

air_delay = successful_flights.groupby(by='AIRLINE').agg({'AIR_SYSTEM_DELAY':'sum'})
weather_delay = successful_flights.groupby(by='AIRLINE').agg({'WEATHER_DELAY':'sum'})
security_delay = successful_flights.groupby(by='AIRLINE').agg({'SECURITY_DELAY':'sum'}) 
airline_delay = successful_flights.groupby(by='AIRLINE').agg({'AIRLINE_DELAY':'sum'})
aircraft_delay = successful_flights.groupby(by='AIRLINE').agg({'LATE_AIRCRAFT_DELAY':'sum'})
#percent_delayed_flight.columns=['AIRLINE','percent_delayed_flight']
#print(percent_delayed_flight.head(5))
delay_reason_totals = air_delay.join(weather_delay, on='AIRLINE') 
delay_reason_totals = delay_reason_totals.join(security_delay, on='AIRLINE')
delay_reason_totals = delay_reason_totals.join(airline_delay, on='AIRLINE')
delay_reason_totals = delay_reason_totals.join(aircraft_delay, on='AIRLINE')

airlines= delay_reason_totals.join(airline_avg_arr_delay, on='AIRLINE')
airlines= airlines.join(airline_avg_dep_delay,on='AIRLINE')
airlines= airlines.join(airline_scheduled_elapsed_diff,on='AIRLINE')

airlines['percent_delayed_flight']=num_delayed_flight["DELAYED"]/total_num_flight["SCHEDULED_DEPARTURE"]
#airlines= airlines.join(percent_delayed_flight,on='AIRLINE')
airlines

Unnamed: 0_level_0,AIR_SYSTEM_DELAY,WEATHER_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,ARRIVAL_DELAY,DEPARTURE_DELAY,sched_elapsed_diff,percent_delayed_flight
AIRLINE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AA,1760561.0,467420.0,16158.0,2753994.0,2833302.0,3.442102,8.848322,5.880078,0.215588
AS,301478.0,38832.0,5825.0,347425.0,381417.0,-0.975124,1.730191,3.041505,0.164102
B6,991461.0,115770.0,11417.0,1074056.0,1417496.0,6.661464,11.481192,5.361057,0.258616
DL,1779383.0,602901.0,3910.0,2707569.0,2136128.0,0.186396,7.345122,7.429001,0.162727
EV,1687894.0,169313.0,0.0,2363973.0,2628976.0,6.565578,8.64284,2.375179,0.231596
F9,581234.0,21616.0,0.0,346950.0,634039.0,12.485166,13.305449,1.064202,0.300772
HA,6241.0,11429.0,401.0,196422.0,126699.0,2.02171,0.475208,-1.39938,0.155271
MQ,930774.0,402305.0,7049.0,1055033.0,1417073.0,6.443083,9.995351,3.763251,0.247542
NK,941423.0,44088.0,5147.0,471115.0,701218.0,14.452979,15.899604,1.635149,0.338373
OO,1333972.0,250325.0,9896.0,2043703.0,2868684.0,5.834233,7.749692,2.113791,0.222096


###### New Daily Data (each day is an entry):
- average delay on all flights
- number of flights 
- number of planes flying each day
- number of unique flight plans

In [10]:
#building daily data df
average_arr_delay_daily = successful_flights.groupby(by='DATE').agg({'ARRIVAL_DELAY':'mean'})
average_dep_delay_daily = successful_flights.groupby(by='DATE').agg({'DEPARTURE_DELAY':'mean'})
num_flight_daily = successful_flights.groupby(by='DATE').agg({'SCHEDULED_DEPARTURE':'count'})
daily_data = average_arr_delay_daily.join(average_dep_delay_daily,on='DATE')
daily_data = daily_data.join(num_flight_daily,on='DATE')
daily_data.rename(columns = {'ARRIVAL_DELAY':'average_arr_delay_daily','DEPARTURE_DELAY':'average_dep_delay_daily','SCHEDULED_DEPARTURE':'num_flight_daily'}, inplace = True) 
daily_data


Unnamed: 0_level_0,average_arr_delay_daily,average_dep_delay_daily,num_flight_daily
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-01-01,5.346142,9.573887,13480
2015-01-02,9.825171,12.635415,16479
2015-01-03,25.394367,25.127966,15090
2015-01-04,31.920762,31.519417,15914
2015-01-05,18.780932,20.881491,16100
...,...,...,...
2015-12-27,27.976374,30.801919,15322
2015-12-28,24.406002,28.815338,14096
2015-12-29,26.255322,32.235970,15502
2015-12-30,26.095689,30.410139,15958


###### For Predictions

- keep the cancelled and successful flight data together
- remove the useless data
- add average airline delay
- add average source airport delay
- add total number of flights that day from the source
- Prediction task one: predicts the delay in minutes for each flight
- Prediction task two: predicts if it is delayed more than 10mins or not

In [11]:
#Remove the useless data
flight_predictions = flights.drop(['FLIGHT_NUMBER','TAIL_NUMBER'], axis=1)

flight_predictions['airline_avg_arrival_delay'] = flight_predictions.groupby('AIRLINE')['ARRIVAL_DELAY'].transform(np.mean)
flight_predictions['airline_avg_departure_delay'] = flight_predictions.groupby('AIRLINE')['DEPARTURE_DELAY'].transform(np.mean)

##gather aggregrate data for airports
#get mean arrival delay for each airport, rename the column
flight_predictions['source_airport_avg_departure_delay'] = flight_predictions.groupby('ORIGIN_AIRPORT')['DEPARTURE_DELAY'].transform(np.mean)

#get mean departure delay for each airport, rename the column
flight_predictions['destination_airport_avg_delay'] = flight_predictions.groupby('DESTINATION_AIRPORT')['ARRIVAL_DELAY'].transform(np.mean)

#get total flights by source, rename the column
flight_predictions['total_flights_source'] = flight_predictions.groupby('ORIGIN_AIRPORT')['SCHEDULED_DEPARTURE'].transform(np.size)

#get total flights by destination, rename the column
flight_predictions['total_flights_destination'] = flight_predictions.groupby('DESTINATION_AIRPORT')['SCHEDULED_DEPARTURE'].transform(np.size)

flight_predictions

Unnamed: 0,DAY_OF_WEEK,AIRLINE,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY,DATE,weekend,holiday,long_flight,airline_avg_arrival_delay,airline_avg_departure_delay,source_airport_avg_departure_delay,destination_airport_avg_delay,total_flights_source,total_flights_destination
0,4,AS,ANC,SEA,5,2354.0,-11.0,21.0,15.0,205.0,194.0,169.0,1448,404.0,4.0,430,408.0,-22.0,0,0,,,,,,,2015-01-01,False,True,True,-0.976563,1.785801,3.277481,1.739120,17162,120672
1,4,AA,LAX,PBI,10,2.0,-8.0,12.0,14.0,280.0,279.0,263.0,2330,737.0,4.0,750,741.0,-9.0,0,0,,,,,,,2015-01-01,False,True,True,3.451372,8.900856,10.233195,6.363335,211388,24216
2,4,US,SFO,CLT,20,18.0,-2.0,16.0,34.0,286.0,293.0,266.0,2296,800.0,11.0,806,811.0,5.0,0,0,,,,,,,2015-01-01,False,True,True,3.706209,6.141137,10.795402,1.758501,161361,109366
3,4,AA,LAX,MIA,20,15.0,-5.0,15.0,30.0,285.0,281.0,258.0,2342,748.0,8.0,805,756.0,-9.0,0,0,,,,,,,2015-01-01,False,True,True,3.451372,8.900856,10.233195,5.325487,211388,74908
4,4,AS,SEA,ANC,25,24.0,-1.0,11.0,35.0,235.0,215.0,199.0,1448,254.0,5.0,320,259.0,-21.0,0,0,,,,,,,2015-01-01,False,True,True,-0.976563,1.785801,6.529802,1.623563,120665,17159
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5819074,4,B6,LAX,BOS,2359,2355.0,-4.0,22.0,17.0,320.0,298.0,272.0,2611,749.0,4.0,819,753.0,-26.0,0,0,,,,,,,2015-12-31,False,False,True,6.677861,11.514353,10.233195,5.784157,211388,117454
5819075,4,B6,JFK,PSE,2359,2355.0,-4.0,17.0,12.0,227.0,215.0,195.0,1617,427.0,3.0,446,430.0,-16.0,0,0,,,,,,,2015-12-31,False,False,True,6.677861,11.514353,11.791644,9.569987,101637,809
5819076,4,B6,JFK,SJU,2359,2350.0,-9.0,17.0,7.0,221.0,222.0,197.0,1598,424.0,8.0,440,432.0,-8.0,0,0,,,,,,,2015-12-31,False,False,True,6.677861,11.514353,11.791644,6.339494,101637,26399
5819077,4,B6,MCO,SJU,2359,2353.0,-6.0,10.0,3.0,161.0,157.0,144.0,1189,327.0,3.0,340,330.0,-10.0,0,0,,,,,,,2015-12-31,False,False,False,6.677861,11.514353,11.541799,6.339494,119450,26399


#### Put data into new csv files for Streamlit dashboard

- For flights -> flights_clean.csv
- For cancelled_flights -> cancelled_flights_clean.csv
- For airports -> airports_clean.csv - done
- For airlines -> airlines_clean.csv
- For daily info -> daily_clean.csv

In [13]:
successful_flights.to_csv('flights_clean.csv', index = False, header=True)
cancelled_flights.to_csv('cancelled_flights_clean.csv', index = False, header=True)
airport_data.to_csv('airports_clean.csv', index = True, header=True)
airlines.to_csv('airlines_clean.csv', index = True, header=True)
daily_data.to_csv('daily_clean.csv', index = True, header=True)
flight_predictions.to_csv('flight_predictions.csv', index = False, header=True)