**Load Files**

There are [40 csv files](https://drive.google.com/drive/folders/1pJuy-0JjUC8BUY6m_5CEjVCczB_mWm0F?usp=sharing) required to run all 15 notebooks.

For all code to run properly, the files should be structured as follows:
 - current directory (where the notebooks are saved)
 - data
     - model
     - weather 
       - 2017
           - 2017 csv files (12)
       - 2018
           - 2018 csv files (12)
       - 2019
           - 2019 csv files (12)
     - weather 
        - weather csv files (4)
 
Note: the same structure is followed on the Google Drive to make the process easier

This notebook loads 36 csv data files with flight information into 3 data frames (one for each year) and then exports each of the 3 as a csv.

In [1]:
import numpy as np
import pandas as pd

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
# create a variable containing the file names of all 2017 data

import os
file_paths = os.listdir('data/model/2017')
file_paths

['201707.csv',
 '201706.csv',
 '201712.csv',
 '201704.csv',
 '201710.csv',
 '201711.csv',
 '201705.csv',
 '201701.csv',
 '201702.csv',
 '201703.csv',
 '201708.csv',
 '201709.csv']

In [4]:
# read in each of the 2017 .csv files, combine into one dataframe and ensure everything loads correctly

flights2017ORIGINAL_df = pd.DataFrame()

for file in file_paths:
    df_temp = pd.read_csv('data/model/2017/' + file)
    flights2017ORIGINAL_df = flights2017ORIGINAL_df.append(df_temp)

flights2017ORIGINAL_df = flights2017ORIGINAL_df.reset_index(drop = True)
flights2017ORIGINAL_df

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,...,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay
0,6,2017-07-01,AS,N559AS,DCA,"Washington, DC",VA,SEA,"Seattle, WA",WA,...,2329,10,,,,,,1053,1021.0,-32.0
1,6,2017-07-01,AS,N513AS,SEA,"Seattle, WA",WA,DCA,"Washington, DC",VA,...,2329,10,,,,,,2153,2129.0,-24.0
2,6,2017-07-01,AS,N588AS,DCA,"Washington, DC",VA,SEA,"Seattle, WA",WA,...,2329,10,,,,,,2157,2107.0,-50.0
3,6,2017-07-01,AS,N538AS,SEA,"Seattle, WA",WA,DCA,"Washington, DC",VA,...,2329,10,36.0,0.0,0.0,0.0,0.0,1617,1653.0,36.0
4,6,2017-07-01,AS,N536AS,DCA,"Washington, DC",VA,LAX,"Los Angeles, CA",CA,...,2311,10,,,,,,1138,1115.0,-23.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5674616,2,2017-09-05,UA,N456UA,EWR,"Newark, NJ",NJ,LAS,"Las Vegas, NV",NV,...,2227,9,20.0,0.0,0.0,0.0,27.0,1110,1157.0,47.0
5674617,2,2017-09-05,UA,N809UA,RDU,"Raleigh/Durham, NC",NC,DEN,"Denver, CO",CO,...,1436,6,,,,,,1817,1828.0,11.0
5674618,2,2017-09-05,UA,N69816,DEN,"Denver, CO",CO,SAT,"San Antonio, TX",TX,...,794,4,,,,,,1835,1817.0,-18.0
5674619,2,2017-09-05,UA,N17752,EWR,"Newark, NJ",NJ,DFW,"Dallas/Fort Worth, TX",TX,...,1372,6,,,,,,845,827.0,-18.0


In [5]:
# export the combined 2017 files to a single csv

flights2017ORIGINAL_df.to_csv('flights2017.csv')

In [6]:
# create a variable containing the file names of all 2018 data

import os
file_paths = os.listdir('data/model/2018')
file_paths

['201808.csv',
 '201809.csv',
 '201802.csv',
 '201803.csv',
 '201801.csv',
 '201810.csv',
 '201804.csv',
 '201805.csv',
 '201811.csv',
 '201807.csv',
 '201812.csv',
 '201806.csv']

In [7]:
# read in each of the 2018 .csv files, combine into one dataframe and ensure everything loads correctly

flights2018ORIGINAL_df = pd.DataFrame()

for file in file_paths:
    df_temp = pd.read_csv('data/model/2018/' + file)
    flights2018ORIGINAL_df = flights2018ORIGINAL_df.append(df_temp)

flights2018ORIGINAL_df = flights2018ORIGINAL_df.reset_index(drop = True)
flights2018ORIGINAL_df

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,...,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay
0,5,2018-08-31,UA,N75432,EWR,"Newark, NJ",NJ,DEN,"Denver, CO",CO,...,1605,7,,,,,,1818,1802.0,-16.0
1,5,2018-08-31,UA,N461UA,ORD,"Chicago, IL",IL,GEG,"Spokane, WA",WA,...,1498,6,,,,,,2128,2137.0,9.0
2,5,2018-08-31,UA,N76514,SNA,"Santa Ana, CA",CA,DEN,"Denver, CO",CO,...,846,4,0.0,54.0,5.0,0.0,135.0,1642,1956.0,194.0
3,5,2018-08-31,UA,N76533,LAX,"Los Angeles, CA",CA,SFO,"San Francisco, CA",CA,...,337,2,,,,,,730,710.0,-20.0
4,5,2018-08-31,UA,N35204,SFO,"San Francisco, CA",CA,SAN,"San Diego, CA",CA,...,447,2,,,,,,2311,2317.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7213441,4,2018-06-21,DL,N970AT,ORD,"Chicago, IL",IL,ATL,"Atlanta, GA",GA,...,606,3,0.0,19.0,18.0,0.0,0.0,1640,1717.0,37.0
7213442,4,2018-06-21,DL,N927AT,BNA,"Nashville, TN",TN,MSP,"Minneapolis, MN",MN,...,695,3,,,,,,1642,1614.0,-28.0
7213443,4,2018-06-21,DL,N927AT,MSP,"Minneapolis, MN",MN,BNA,"Nashville, TN",TN,...,695,3,,,,,,1333,1317.0,-16.0
7213444,4,2018-06-21,DL,N320NB,AUS,"Austin, TX",TX,SLC,"Salt Lake City, UT",UT,...,1086,5,,,,,,2058,2111.0,13.0


In [8]:
# export the combined 2018 files to a single csv

flights2018ORIGINAL_df.to_csv('flights2018.csv')

In [9]:
# create a variable containing the file names of all 2019 data

import os
file_paths = os.listdir('data/model/2019')
file_paths

['201908.csv',
 '201909.csv',
 '201901.csv',
 '201902.csv',
 '201903.csv',
 '201907.csv',
 '201912.csv',
 '201906.csv',
 '201910.csv',
 '201904.csv',
 '201905.csv',
 '201911.csv']

In [10]:
# read in each of the 2018 .csv files, combine into one dataframe and ensure everything loads correctly

flights2019ORIGINAL_df = pd.DataFrame()

for file in file_paths:
    df_temp = pd.read_csv('data/model/2019/' + file)
    flights2019ORIGINAL_df = flights2019ORIGINAL_df.append(df_temp)

flights2019ORIGINAL_df = flights2019ORIGINAL_df.reset_index(drop = True)
flights2019ORIGINAL_df

Unnamed: 0,DayOfWeek,FlightDate,Reporting_Airline,Tail_Number,Origin,OriginCityName,OriginState,Dest,DestCityName,DestState,...,Distance,DistanceGroup,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay,CRSArrTime,ArrTime,ArrDelay
0,4,2019-08-01,DL,N354NW,ATL,"Atlanta, GA",GA,DFW,"Dallas/Fort Worth, TX",TX,...,731,3,1.0,0.0,0.0,0.0,16.0,2114,2131.0,17.0
1,4,2019-08-01,DL,N320US,DFW,"Dallas/Fort Worth, TX",TX,ATL,"Atlanta, GA",GA,...,731,3,,,,,,2024,2012.0,-12.0
2,4,2019-08-01,DL,N931DN,IAH,"Houston, TX",TX,ATL,"Atlanta, GA",GA,...,689,3,17.0,0.0,0.0,0.0,20.0,2102,2139.0,37.0
3,4,2019-08-01,DL,N851DN,PDX,"Portland, OR",OR,SLC,"Salt Lake City, UT",UT,...,630,3,,,,,,1601,1558.0,-3.0
4,4,2019-08-01,DL,N775DE,SLC,"Salt Lake City, UT",UT,PDX,"Portland, OR",OR,...,630,3,,,,,,926,911.0,-15.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7422032,5,2019-11-08,DL,N978AT,HOU,"Houston, TX",TX,ATL,"Atlanta, GA",GA,...,696,3,,,,,,1624,1605.0,-19.0
7422033,5,2019-11-08,DL,N6716C,CHS,"Charleston, SC",SC,ATL,"Atlanta, GA",GA,...,259,2,,,,,,817,820.0,3.0
7422034,5,2019-11-08,DL,N851DN,MSP,"Minneapolis, MN",MN,BWI,"Baltimore, MD",MD,...,936,4,,,,,,2130,2116.0,-14.0
7422035,5,2019-11-08,DL,N352NB,GEG,"Spokane, WA",WA,SLC,"Salt Lake City, UT",UT,...,546,3,,,,,,1258,1237.0,-21.0


In [11]:
# export the combined 2019 files to a single csv

flights2019ORIGINAL_df.to_csv('flights2019.csv')