In [1]:
"""
data filter:
1. Remove attributes in which more than 50% of the values are NaN.
2. Only select flights whose origin and destination airports are both among the 9 main airports: ATL, ORD, DFW, DEN, CLT, LAX, IAH, PHX, SFO.
3. Only select flights from the top 5 airlines: UA, OO, WN, AA, DL.
"""

def data_filter(df, nan_threshold=0.5, airports=None, airlines=None):
    """Drop sparse columns, then keep flights between selected airports on selected airlines.

    Parameters
    ----------
    df : pandas.DataFrame
        Flight records. After the sparse-column drop it must still contain the
        'Origin', 'Dest' and 'Reporting_Airline' columns.
    nan_threshold : float, default 0.5
        A column is dropped when its fraction of NaN values is strictly
        greater than this value. The fraction is computed over all rows of
        ``df``, before any row filtering.
    airports : iterable of str, optional
        Airport codes allowed for BOTH 'Origin' and 'Dest'. Defaults to the
        nine main airports: ATL, ORD, DFW, DEN, CLT, LAX, IAH, PHX, SFO.
    airlines : iterable of str, optional
        Allowed 'Reporting_Airline' codes. Defaults to UA, OO, WN, AA, DL.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving columns and rows. Row index labels of
        the kept rows are unchanged; ``df`` itself is not modified.
    """
    if airports is None:
        airports = ['ATL', 'ORD', 'DFW', 'DEN', 'CLT', 'LAX', 'IAH', 'PHX', 'SFO']
    if airlines is None:
        airlines = ['UA', 'OO', 'WN', 'AA', 'DL']
    # isin() needs a list-like; materialise so generators/sets of codes also work
    airports = list(airports)
    airlines = list(airlines)

    # remove attributes whose NaN fraction exceeds the threshold
    nan_fraction = df.isnull().mean()
    sparse_cols = nan_fraction[nan_fraction > nan_threshold].index.tolist()
    df_filtered = df.drop(columns=sparse_cols)

    # one combined mask instead of two successive boolean selections:
    # same rows are kept, but only one intermediate copy is made
    keep = (
        df_filtered['Origin'].isin(airports)
        & df_filtered['Dest'].isin(airports)
        & df_filtered['Reporting_Airline'].isin(airlines)
    )
    return df_filtered[keep]

In [2]:
# Read every report_*.csv file in the csv_flight folder, filter each one with
# data_filter, and combine the filtered frames into a single DataFrame.
import glob
import pandas as pd

# glob.glob returns paths in arbitrary order; sorting makes the row order of
# the combined frame reproducible from run to run
csv_files = sorted(glob.glob('csv_flight/report_*.csv'))

# Collect the cleaned frames and concatenate ONCE after the loop. Calling
# pd.concat inside the loop re-copies the ever-growing frame for every file.
cleaned_frames = []

# track the number of files processed, plus the running shape for progress output
files_processed = 0
rows_so_far = 0
columns_so_far = set()

for file in csv_files:
    df = pd.read_csv(file, low_memory=False)
    # filter the data before combining them
    df_cleaned = data_filter(df)
    cleaned_frames.append(df_cleaned)

    # pd.concat (default outer join) stacks rows and takes the union of the
    # columns, so the combined shape can be tracked without concatenating yet
    rows_so_far += len(df_cleaned)
    columns_so_far.update(df_cleaned.columns)

    files_processed += 1
    if files_processed % 5 == 0:
        current_shape = (rows_so_far, len(columns_so_far))
        print(f"Processed {files_processed}/{len(csv_files)} files. Current shape: {current_shape}")

# pd.concat raises on an empty list, so fall back to an empty frame when no file matched
combined_df = pd.concat(cleaned_frames, ignore_index=True) if cleaned_frames else pd.DataFrame()

print("Final shape of combined dataframe:", combined_df.shape)

Processed 5/60 files. Current shape: (114913, 56)
Processed 10/60 files. Current shape: (232864, 56)
Processed 15/60 files. Current shape: (364001, 56)
Processed 20/60 files. Current shape: (487663, 56)
Processed 25/60 files. Current shape: (633337, 56)
Processed 30/60 files. Current shape: (779914, 56)
Processed 35/60 files. Current shape: (937123, 56)
Processed 40/60 files. Current shape: (1081226, 56)
Processed 45/60 files. Current shape: (1225583, 56)
Processed 50/60 files. Current shape: (1372129, 56)
Processed 55/60 files. Current shape: (1509971, 56)
Processed 60/60 files. Current shape: (1658130, 56)
Final shape of combined dataframe: (1658130, 56)


In [3]:
# List the column labels of the combined, filtered frame before the redundancy review below.
print(combined_df.columns)

Index(['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'FlightDate',
       'Reporting_Airline', 'DOT_ID_Reporting_Airline',
       'IATA_CODE_Reporting_Airline', 'Tail_Number',
       'Flight_Number_Reporting_Airline', 'OriginAirportID',
       'OriginAirportSeqID', 'OriginCityMarketID', 'Origin', 'OriginCityName',
       'OriginState', 'OriginStateFips', 'OriginStateName', 'OriginWac',
       'DestAirportID', 'DestAirportSeqID', 'DestCityMarketID', 'Dest',
       'DestCityName', 'DestState', 'DestStateFips', 'DestStateName',
       'DestWac', 'CRSDepTime', 'DepTime', 'DepDelay', 'DepDelayMinutes',
       'DepDel15', 'DepartureDelayGroups', 'DepTimeBlk', 'TaxiOut',
       'WheelsOff', 'WheelsOn', 'TaxiIn', 'CRSArrTime', 'ArrTime', 'ArrDelay',
       'ArrDelayMinutes', 'ArrDel15', 'ArrivalDelayGroups', 'ArrTimeBlk',
       'Cancelled', 'Diverted', 'CRSElapsedTime', 'ActualElapsedTime',
       'AirTime', 'Flights', 'Distance', 'DistanceGroup',
       'DivAirportLandings'],
      dtype='object')

In [4]:
"""
Reporting_Airline, DOT_ID_Reporting_Airline and DOT_ID_Reporting_Airline all 
represent the airline of a flight, DOT_ID_Reporting_Airline and DOT_ID_Reporting_Airline will be removed.
"""

print(combined_df[['Reporting_Airline', 'DOT_ID_Reporting_Airline', 'DOT_ID_Reporting_Airline']].head(5))
print("Unique Reporting Airline:", combined_df['Reporting_Airline'].unique())
print("Unique DOT ID Reporting Airline:", combined_df['DOT_ID_Reporting_Airline'].unique())
print("Unique IATA CODE Reporting Airline:", combined_df['IATA_CODE_Reporting_Airline'].unique())

  Reporting_Airline  DOT_ID_Reporting_Airline  DOT_ID_Reporting_Airline
0                DL                     19790                     19790
1                DL                     19790                     19790
2                DL                     19790                     19790
3                DL                     19790                     19790
4                DL                     19790                     19790
Unique Reporting Airline: ['DL' 'OO' 'WN' 'UA' 'AA']
Unique DOT ID Reporting Airline: [19790 20304 19393 19977 19805]
Unique IATA CODE Reporting Airline: ['DL' 'OO' 'WN' 'UA' 'AA']


In [5]:
"""
OriginAirportID and Origin both represent the origin airport of a flight, OriginAirportID will be removed.
"""

print(combined_df[['OriginAirportID', 'Origin']].head(5))
print("Length of Unique OriginAirportID:", len(combined_df['OriginAirportID'].unique()))
print("Length of Unique Origin:", len(combined_df['Origin'].unique()))

   OriginAirportID Origin
0            10397    ATL
1            11298    DFW
2            10397    ATL
3            10397    ATL
4            14107    PHX
Length of Unique OriginAirportID: 9
Length of Unique Origin: 9


In [6]:
"""
OriginCityMarketID and OriginCityName both represent the origin city of a flight, OriginCityMarketID will be removed.
"""

print(combined_df[['OriginCityMarketID', 'OriginCityName']].head(5))
print("Length of Unique OriginCityMarketID:", len(combined_df['OriginCityMarketID'].unique()))
print("Length of Unique OriginCityName:", len(combined_df['OriginCityName'].unique()))

   OriginCityMarketID         OriginCityName
0               30397            Atlanta, GA
1               30194  Dallas/Fort Worth, TX
2               30397            Atlanta, GA
3               30397            Atlanta, GA
4               30466            Phoenix, AZ
Length of Unique OriginCityMarketID: 9
Length of Unique OriginCityName: 9


In [7]:
"""
OriginState, OriginStateFips, OriginStateName and OriginWac all 
represent the origin state of a flight; OriginStateFips, OriginStateName and OriginWac will be removed.
"""

print(combined_df[['OriginState', 'OriginStateFips', 'OriginStateName', 'OriginWac']].head(5))
print("Length of Unique OriginState:", len(combined_df['OriginState'].unique()))
print("Length of Unique OriginStateFips:", len(combined_df['OriginStateFips'].unique()))
print("Length of Unique OriginStateName:", len(combined_df['OriginStateName'].unique()))
print("Length of Unique OriginWac:", len(combined_df['OriginWac'].unique()))

  OriginState  OriginStateFips OriginStateName  OriginWac
0          GA               13         Georgia         34
1          TX               48           Texas         74
2          GA               13         Georgia         34
3          GA               13         Georgia         34
4          AZ                4         Arizona         81
Length of Unique OriginState: 7
Length of Unique OriginStateFips: 7
Length of Unique OriginStateName: 7
Length of Unique OriginWac: 7


In [8]:
"""
DestAirportID and Dest both represent the destination airport of a flight, DestAirportID will be removed.
"""

print(combined_df[['DestAirportID', 'Dest']].head(5))
print("Length of Unique DestAirportID:", len(combined_df['DestAirportID'].unique()))
print("Length of Unique Dest:", len(combined_df['Dest'].unique()))

   DestAirportID Dest
0          12266  IAH
1          10397  ATL
2          11292  DEN
3          14107  PHX
4          10397  ATL
Length of Unique DestAirportID: 9
Length of Unique Dest: 9


In [9]:
"""
DestCityMarketID and DestCityName both represent the destination city of a flight, DestCityMarketID will be removed.
"""

print(combined_df[['DestCityMarketID', 'DestCityName']].head(5))
print("Length of Unique DestCityMarketID:", len(combined_df['DestCityMarketID'].unique()))
print("Length of Unique DestCityName:", len(combined_df['DestCityName'].unique()))

   DestCityMarketID DestCityName
0             31453  Houston, TX
1             30397  Atlanta, GA
2             30325   Denver, CO
3             30466  Phoenix, AZ
4             30397  Atlanta, GA
Length of Unique DestCityMarketID: 9
Length of Unique DestCityName: 9


In [10]:
"""
DestState, DestStateFips, DestStateName and DestWac all 
represent the destination state of a flight; DestStateFips, DestStateName and DestWac will be removed.
"""

print(combined_df[['DestState', 'DestStateFips', 'DestStateName', 'DestWac']].head(5))
print("Length of Unique DestState:", len(combined_df['DestState'].unique()))
print("Length of Unique DestStateFips:", len(combined_df['DestStateFips'].unique()))
print("Length of Unique DestStateName:", len(combined_df['DestStateName'].unique()))
print("Length of Unique DestWac:", len(combined_df['DestWac'].unique()))

  DestState  DestStateFips DestStateName  DestWac
0        TX             48         Texas       74
1        GA             13       Georgia       34
2        CO              8      Colorado       82
3        AZ              4       Arizona       81
4        GA             13       Georgia       34
Length of Unique DestState: 7
Length of Unique DestStateFips: 7
Length of Unique DestStateName: 7
Length of Unique DestWac: 7


In [11]:
"""
DepDelay is simply (DepTime - CRSDeptime). Since we will use DepDelay to generate our targets, we will remove DepTime.
"""

print(combined_df[['CRSDepTime', 'DepTime', 'DepDelay']].head(10))

   CRSDepTime  DepTime  DepDelay
0        2145   2143.0      -2.0
1         945    937.0      -8.0
2        1855   1853.0      -2.0
3        1634   1633.0      -1.0
4         700    653.0      -7.0
5        1855   1858.0       3.0
6        1445   1442.0      -3.0
7         715    709.0      -6.0
8        1205   1205.0       0.0
9        1400   1355.0      -5.0


In [12]:
"""
DepDelay: Difference in minutes between scheduled and actual departure time. Early departures show negative numbers.
DepDelayMinutes: Difference in minutes between scheduled and actual departure time. Early departures set to 0.
DepDel15: Departure Delay Indicator, 15 Minutes or More (1=Yes).
DepartureDelayGroups: Departure Delay intervals, every (15 minutes from <-15 to >180).

These four variables all represents delay time. We only need to keep DepDelay and we can derive any columns from it.
"""

print(combined_df[['DepDelay', 'DepDelayMinutes', 'DepDel15', 'DepartureDelayGroups']].head(10))

   DepDelay  DepDelayMinutes  DepDel15  DepartureDelayGroups
0      -2.0              0.0       0.0                  -1.0
1      -8.0              0.0       0.0                  -1.0
2      -2.0              0.0       0.0                  -1.0
3      -1.0              0.0       0.0                  -1.0
4      -7.0              0.0       0.0                  -1.0
5       3.0              3.0       0.0                   0.0
6      -3.0              0.0       0.0                  -1.0
7      -6.0              0.0       0.0                  -1.0
8       0.0              0.0       0.0                   0.0
9      -5.0              0.0       0.0                  -1.0


In [13]:
"""
CRSDepTime: CRS Departure Time (local time: hhmm).
DepTimeBlk: CRS Departure Time Block, Hourly Intervals.

These two variables both represents CRS departure time. We only need to keep CRSDepTime and we can derive any columns from it.
"""

print(combined_df[['CRSDepTime', 'DepTimeBlk']].head(10))

   CRSDepTime DepTimeBlk
0        2145  2100-2159
1         945  0900-0959
2        1855  1800-1859
3        1634  1600-1659
4         700  0700-0759
5        1855  1800-1859
6        1445  1400-1459
7         715  0700-0759
8        1205  1200-1259
9        1400  1400-1459


In [14]:
"""
TaxiOut: Taxi Out Time, in Minutes.
WheelsOff: Wheels Off Time (local time: hhmm).
WheelsOn: Wheels On Time (local time: hhmm).
TaxiIn: Taxi In Time, in Minutes.
ArrTime: Actual Arrival Time (local time: hhmm).
ArrDelay: Difference in minutes between scheduled and actual arrival time. Early arrivals show negative numbers.
ArrDelayMinutes: Difference in minutes between scheduled and actual arrival time. Early arrivals set to 0.
ArrDel15: Arrival Delay Indicator, 15 Minutes or More (1=Yes).
ArrivalDelayGroups: Arrival Delay intervals, every (15-minutes from <-15 to >180).
ActualElapsedTime: Elapsed Time of Flight, in Minutes.
AirTime: Flight Time, in Minutes.
DivAirportLandings: Number of Diverted Airport Landings.

The goal for this project is to predict if a flight will be delayed before it leaves. These arrival statistics will be removed.
"""
print(combined_df[['TaxiOut', 'ArrTime', 'ArrDelay', 'ActualElapsedTime']].head(10))

   TaxiOut  ArrTime  ArrDelay  ActualElapsedTime
0     19.0   2245.0     -20.0              122.0
1     14.0   1241.0      -3.0              124.0
2     23.0   2019.0      -7.0              206.0
3     17.0   1847.0      -4.0              254.0
4     13.0   1214.0     -13.0              201.0
5     22.0   2010.0       8.0               72.0
6     15.0   1915.0     -19.0              153.0
7     12.0   1002.0      -8.0              113.0
8     17.0   1312.0      -3.0               67.0
9     15.0   1502.0     -17.0               67.0


In [15]:
"""
CRSArrTime: CRS Arrival Time (local time: hhmm).
ArrTimeBlk: CRS Arrival Time Block, Hourly Intervals

These two variables both represents arrive time. We only need to keep CRSArrTime and we can derive any columns from it.
"""

print(combined_df[['CRSArrTime', 'ArrTimeBlk']].head(10))

   CRSArrTime ArrTimeBlk
0        2305  2300-2359
1        1244  1200-1259
2        2026  2000-2059
3        1851  1800-1859
4        1227  1200-1259
5        2002  2000-2059
6        1934  1900-1959
7        1010  1000-1059
8        1315  1300-1359
9        1519  1500-1559


In [16]:
"""
Flights: Number of Flights.

The only value of this variable is 1. It will be removed.
"""

print("Unique Flights:", combined_df['Flights'].unique())

Unique Flights: [1.]


In [17]:
"""
Distance: Distance between airports (miles).
DistanceGroup: Distance Intervals, every 250 Miles, for Flight Segment.

These two variables both represents distance. We only need to keep Distance and we can derive any columns from it.
"""

print(combined_df[['Distance', 'DistanceGroup']].head(10))

   Distance  DistanceGroup
0     689.0              3
1     731.0              3
2    1199.0              5
3    1587.0              7
4    1587.0              7
5     226.0              1
6    1199.0              5
7     689.0              3
8     226.0              1
9     226.0              1


In [18]:
# Columns judged redundant or out of scope in the inspection cells above.
columns_to_remove = [
    # airline: duplicates of Reporting_Airline
    'DOT_ID_Reporting_Airline', 'IATA_CODE_Reporting_Airline',
    # origin: duplicates of Origin / OriginCityName / OriginState
    'OriginAirportID', 'OriginCityMarketID', 'OriginStateFips', 'OriginStateName', 'OriginWac',
    # destination: duplicates of Dest / DestCityName / DestState
    'DestAirportID', 'DestCityMarketID', 'DestStateFips', 'DestStateName', 'DestWac',
    # departure: covered by CRSDepTime and DepDelay
    'DepTime', 'DepDelayMinutes', 'DepDel15', 'DepartureDelayGroups', 'DepTimeBlk',
    # taxi / arrival statistics, not used for a before-departure prediction
    'TaxiOut', 'WheelsOff', 'WheelsOn', 'TaxiIn', 'ArrTime', 'ArrDelay', 'ArrDelayMinutes',
    'ArrDel15', 'ArrivalDelayGroups',
    # hourly block derivable from CRSArrTime
    'ArrTimeBlk',
    # elapsed / air time statistics
    'ActualElapsedTime', 'AirTime',
    # constant column
    'Flights',
    # derivable from Distance
    'DistanceGroup',
    # diverted-landing count
    'DivAirportLandings',
]

# Make the cell safe to re-run: when none of the listed columns is left, the
# drop has already been applied and is skipped. When only SOME are missing
# (e.g. a misspelled name) drop() still raises KeyError, as before.
# `columns=` alone selects the axis, so the former redundant `axis=1` is gone.
if any(col in combined_df.columns for col in columns_to_remove):
    combined_df = combined_df.drop(columns=columns_to_remove)
print("Shape of combined dataframe after removing extra columns:", combined_df.shape)

Shape of combined dataframe after removing extra columns: (1658130, 24)


In [19]:
# List the column labels that remain in combined_df at this point.
print(combined_df.columns)

Index(['Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek', 'FlightDate',
       'Reporting_Airline', 'Tail_Number', 'Flight_Number_Reporting_Airline',
       'OriginAirportSeqID', 'Origin', 'OriginCityName', 'OriginState',
       'DestAirportSeqID', 'Dest', 'DestCityName', 'DestState', 'CRSDepTime',
       'DepDelay', 'CRSArrTime', 'Cancelled', 'Diverted', 'CRSElapsedTime',
       'Distance'],
      dtype='object')


In [20]:
# Write combined_df to CSV; index=False keeps the row index out of the file.
combined_df.to_csv('csv_flight/combined_df.csv', index=False)  