In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns',None)

data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sub = pd.read_csv('sample_submission.csv')

In [10]:
data.info()

data[(data['Airline'] == 'SkyWest Airlines Inc.') & (data['Carrier_ID(DOT)'] == 20304)]
data[(data['Airline'] == 'Alaska Airlines Inc.') & (data['Carrier_ID(DOT)'] == 19930.0)]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 19 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   ID                        1000000 non-null  object 
 1   Month                     1000000 non-null  int64  
 2   Day_of_Month              1000000 non-null  int64  
 3   Estimated_Departure_Time  890981 non-null   float64
 4   Estimated_Arrival_Time    890960 non-null   float64
 5   Cancelled                 1000000 non-null  int64  
 6   Diverted                  1000000 non-null  int64  
 7   Origin_Airport            1000000 non-null  object 
 8   Origin_Airport_ID         1000000 non-null  int64  
 9   Origin_State              890985 non-null   object 
 10  Destination_Airport       1000000 non-null  object 
 11  Destination_Airport_ID    1000000 non-null  int64  
 12  Destination_State         890921 non-null   object 
 13  Distance                  10

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
29,TRAIN_000029,10,20,800.0,,0,0,SEA,14747,Washington,ANC,10299,Alaska,1448.0,Alaska Airlines Inc.,AS,19930.0,N457AS,
33,TRAIN_000033,4,12,2158.0,545.0,0,0,LAX,12892,California,FLL,11697,Florida,2343.0,Alaska Airlines Inc.,AS,19930.0,N621VA,
57,TRAIN_000057,11,26,2310.0,704.0,0,0,KOA,12758,,SEA,14747,Washington,2688.0,Alaska Airlines Inc.,AS,19930.0,N588AS,
67,TRAIN_000067,6,24,50.0,520.0,0,0,ANC,10299,Alaska,SEA,14747,Washington,1448.0,Alaska Airlines Inc.,AS,19930.0,N280AK,Not_Delayed
76,TRAIN_000076,11,16,1745.0,2130.0,0,0,EWR,11618,New Jersey,SFO,14771,California,2565.0,Alaska Airlines Inc.,AS,19930.0,N517AS,Not_Delayed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999617,TRAIN_999617,10,12,1135.0,1215.0,0,0,PSG,14256,Alaska,JNU,12523,Alaska,123.0,Alaska Airlines Inc.,AS,19930.0,N615AS,
999770,TRAIN_999770,5,28,1025.0,1345.0,0,0,SEA,14747,,OGG,13830,Hawaii,2640.0,Alaska Airlines Inc.,AS,19930.0,N594AS,
999797,TRAIN_999797,4,10,1145.0,1315.0,0,0,LAX,12892,,SFO,14771,California,337.0,Alaska Airlines Inc.,AS,19930.0,N523AS,
999798,TRAIN_999798,7,25,1748.0,2033.0,0,0,RDU,14492,North Carolina,SEA,14747,Washington,2354.0,Alaska Airlines Inc.,AS,19930.0,N440AS,


# 데이터 전처리 파이프라인

In [153]:
from tqdm import tqdm
from datetime import datetime, timedelta
pd.set_option('mode.chained_assignment',  None)
class Processing:
    def __init__(self, x, test): # 이후 테스트 데이터도 넣는 버전 만들어야함
        self.x = x.set_index('ID')
        self.test = test
        
    
    def dummy(self):
        # 디버깅용 함수
        # airline, carrier id, state(출발,도착) 채워줌 
        # 캐리어 코드, 시간 채우는거 해야함
        data = self.fill_airline_and_id(self.x)
        data = self.fill_state(data)
        
        return data
    
    def time_padding(self, data):
        # 시간 패딩
        data[['Estimated_Departure_Time', 'Estimated_Arrival_Time']] = data[['Estimated_Departure_Time', 'Estimated_Arrival_Time']].fillna(0)
        data[['Estimated_Departure_Time', 'Estimated_Arrival_Time']] = data[['Estimated_Departure_Time', 'Estimated_Arrival_Time']].astype(int)
        data[['Estimated_Departure_Time', 'Estimated_Arrival_Time']] = data[['Estimated_Departure_Time', 'Estimated_Arrival_Time']].astype(str)
        data['Estimated_Departure_Time'] = data['Estimated_Departure_Time'].str.pad(width=4, side='left', fillchar='0')
        data['Estimated_Arrival_Time'] = data['Estimated_Arrival_Time'].str.pad(width=4, side='left', fillchar='0')
        
        # 시간 둘 다 NULL값인 경우 제거함.
        # 제거하면 안될듯. 테스트데이터에도 있어서 어떻게든 해야함
        # 방안 1. 빈도수 높은거나 규칙에 따라 채운다.
        # 방안 2. 그냥 0으로 채우고 돌린다.
        data = data[~((data['Estimated_Departure_Time'] == '0000') & (data['Estimated_Arrival_Time'] == '0000'))]
        
        return data
    
    def fill_airline_and_id(self, data):
        '''
        Carrier ID - Airline 한 세트
        Carrier Code는 항공사, ID 겹침..
        '''
        for airline in data['Airline'].dropna().unique():

            id = data.loc[data['Airline'] == airline, 'Carrier_ID(DOT)'].dropna().unique()
        

            data.loc[data['Airline'] == airline, 'Carrier_ID(DOT)'] = id[0]
                

        for id in data['Carrier_ID(DOT)'].dropna().unique():

            
            airline = data.loc[data['Carrier_ID(DOT)'] == id, 'Airline'].dropna().unique()

            data.loc[data['Carrier_ID(DOT)'] == id, 'Airline'] = airline[0]
                        
    
        return data
    
    def fill_state(self, data):
        for id in data['Origin_Airport_ID'].dropna().unique():
            data.loc[data['Origin_Airport_ID'] == id, 'Origin_State'] = data.loc[data['Origin_Airport_ID'] == id, 'Origin_State'].unique()[0]
        
        for id in data['Destination_Airport_ID'].dropna().unique():
            data.loc[data['Destination_Airport_ID'] == id, 'Destination_State'] = data.loc[data['Destination_Airport_ID'] == id, 'Destination_State'].unique()[0]
        
        return data
        

    def diff_time(self, df):
    
        time = timedelta(hours=df['H'], minutes=df['M']) - timedelta(hours=df['h'], minutes=df['m'])
        # print(time)
        time = round(time.total_seconds())
        
        hours = time // 3600
        minutes = (time % 3600) // 60
        
        if hours < 0:
            hours += 24
        
        # if minutes < 0:
        #     minutes += 60
        return hours, minutes 
        
    
    
    def make_noise_zero_data(self):
        # 시간 NULL값 없는 경우만 뽑음
        # nzdata = self.x[~(self.x(['Estimated_Departure_Time'] == '0000') | (self.x['Estimated_Arrival_Time'] == '0000'))]
        nzdata = self.x[~((self.x['Estimated_Departure_Time'] == '0000') | (self.x['Estimated_Arrival_Time'] == '0000'))]
        nzdata = self.make_time_feature(nzdata)
        time = nzdata.apply(self.diff_time, axis=1, result_type='expand')
        nzdata['diff_hour'] = time[0]
        nzdata['diff_minute'] = time[1]
        
        return nzdata
        

    
    def make_time_feature(self, data):
        data['h'] = [data['Estimated_Departure_Time'][i][:2] for i in range(len(data))]
        data['m'] = [data['Estimated_Departure_Time'][i][2:] for i in range(len(data))]
        data['H'] = [data['Estimated_Arrival_Time'][i][:2] for i in range(len(data))]
        data['M'] = [data['Estimated_Arrival_Time'][i][2:] for i in range(len(data))]

        data['h'] = data['h'].astype(int)
        data['m'] = data['m'].astype(int)
        data['H'] = data['H'].astype(int)
        data['M'] = data['M'].astype(int)

        return data
    
    def fill_time(self):
        print('시간 전처리')
        x = self.make_time_feature(self.x)
        check = self.make_noise_zero_data()
        
        # 출발시간
        print('출발시간')
        idx = x[x['Estimated_Departure_Time'] == '0000'].index
        for i in tqdm(idx):
            at = x.loc[i, 'Estimated_Arrival_Time']
            dt = x.loc[i, 'Estimated_Departure_Time']
            oa = x.loc[i, 'Origin_Airport']
            da = x.loc[i, 'Destination_Airport']
            al = x.loc[i, 'Airline']
            
            check_data = check[(check['Origin_Airport'] == oa) & (check['Destination_Airport'] == da) & (check['Airline'] == al)]
            # print(check_data)
            
            # 동일 시간 존재 시 동일한 수로 대체 (복수 존재 시 제일 많은걸로)
            insert_time = check_data.loc[check_data['Estimated_Arrival_Time'] == at, 'Estimated_Departure_Time']
            if len(insert_time) != 0:
                try:
                    x.loc[i, 'Estimated_Departure_Time'] = insert_time.value_counts().index.drop('0000')[0]
                
                except:

                    
                    x.loc[i, 'Estimated_Departure_Time'] = insert_time.value_counts().index[0]
                    
                x.loc[i, 'h'] = int(x.loc[i, 'Estimated_Departure_Time'][:2])
                x.loc[i, 'm'] = int(x.loc[i, 'Estimated_Departure_Time'][2:])
                
            # 가장 빈도수 높은 경과시간으로 계산하여 대체
            elif len(insert_time) == 0:
                

                try:
                    x.loc[i, 'Estimated_Departure_Time'] = check_data['Estimated_Departure_Time'].value_counts().index.drop('0000')[0]
                
                except:
                    print(at, dt, oa, da, al)
                    print(insert_time.value_counts())
                    print(insert_time.value_counts().index[0])
                    x.loc[i, 'Estimated_Departure_Time'] = check_data['Estimated_Departure_Time'].value_counts().index[0]
                    
                x.loc[i, 'H'] = int(x.loc[i, 'Estimated_Departure_Time'][:2])
                x.loc[i, 'M'] = int(x.loc[i, 'Estimated_Departure_Time'][2:])
            
            # 동일 시간 없을 시 평균 경과시간으로 대체
            # elif len(insert_time) == 0:
            #     ht = round(check_data['diff_hour'].mean())
            #     mt = round(check_data['diff_minute'].mean())
                
            #     insert_time =  timedelta(hours=x.loc[i, 'H'], minutes=x.loc[i, 'M']) - timedelta(hours=ht, minutes=mt)
            #     insert_time = round(insert_time.total_seconds())
            #     hours = insert_time // 3600
            #     minutes = (insert_time % 3600) // 60
                
            #     if hours < 0:
            #         hours += 24

            #     x.loc[i, 'h'] = hours
            #     x.loc[i, 'm'] = minutes
                
                
        # 도착시간
        print('도착시간')
        idx = x[x['Estimated_Arrival_Time'] == '0000'].index
        for i in tqdm(idx):
            at = x.loc[i, 'Estimated_Arrival_Time']
            dt = x.loc[i, 'Estimated_Departure_Time']
            oa = x.loc[i, 'Origin_Airport']
            da = x.loc[i, 'Destination_Airport']
            al = x.loc[i, 'Airline']
            
            check_data = check[(check['Origin_Airport'] == oa) & (check['Destination_Airport'] == da) & (check['Airline'] == al)]
            # 동일 시간 존재 시 동일한 수로 대체 (복수 존재 시 제일 많은걸로)
            insert_time = check_data.loc[check_data['Estimated_Departure_Time'] == dt, 'Estimated_Arrival_Time']
            if len(insert_time) != 0:
                try:
                    x.loc[i, 'Estimated_Arrival_Time'] = insert_time.value_counts().index.drop('0000')[0]
                
                except:
                    x.loc[i, 'Estimated_Arrival_Time'] = insert_time.value_counts().index[0]
                    
                x.loc[i, 'H'] = int(x.loc[i, 'Estimated_Arrival_Time'][:2])
                x.loc[i, 'M'] = int(x.loc[i, 'Estimated_Arrival_Time'][2:])
            
            # 가장 빈도수 높은 경과시간으로 계산하여 대체
            elif len(insert_time) == 0:
                try:
                    x.loc[i, 'Estimated_Arrival_Time'] = check_data['Estimated_Arrival_Time'].value_counts().index.drop('0000')[0]
                
                except:
                    x.loc[i, 'Estimated_Arrival_Time'] = check_data['Estimated_Arrival_Time'].value_counts().index[0]
                    
                x.loc[i, 'H'] = int(x.loc[i, 'Estimated_Arrival_Time'][:2])
                x.loc[i, 'M'] = int(x.loc[i, 'Estimated_Arrival_Time'][2:])
            
            
            
            
            # 동일 시간 없을 시 평균 경과시간으로 대체
            # elif len(insert_time) == 0:
            #     ht = round(check_data['diff_hour'].mean())
            #     mt = round(check_data['diff_minute'].mean())
                
            #     # 23 50 / 1 40 -> 1 50
                
            #     insert_time =  timedelta(hours=x.loc[i, 'h'], minutes=x.loc[i, 'm']) + timedelta(hours=ht, minutes=mt)
            #     insert_time = round(insert_time.total_seconds())
                    
            #     hours = insert_time // 3600
            #     minutes = (insert_time % 3600) // 60
                
            #     if hours > 24:
            #         hours -= 24

            #     x.loc[i, 'h'] = hours
            #     x.loc[i, 'm'] = minutes
                
        return x
    
    
    def testset_processing(self):
        pass
    
    
                
                
            
        

In [154]:
pc = Processing(data, test)

ddata = pc.dummy()

In [155]:
ddata

Unnamed: 0_level_0,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
TRAIN_000000,4,15,,,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N7858A,
TRAIN_000001,8,15,740.0,1024.0,0,0,ORD,13930,Illinois,SLC,14869,Utah,1250.0,SkyWest Airlines Inc.,UA,20304.0,N125SY,
TRAIN_000002,9,6,1610.0,1805.0,0,0,CLT,11057,North Carolina,LGA,12953,New York,544.0,American Airlines Inc.,AA,19805.0,N103US,
TRAIN_000003,7,10,905.0,1735.0,0,0,LAX,12892,California,EWR,11618,New Jersey,2454.0,United Air Lines Inc.,UA,19977.0,N595UA,
TRAIN_000004,1,11,900.0,1019.0,0,0,SFO,14771,California,ACV,10157,California,250.0,SkyWest Airlines Inc.,UA,20304.0,N161SY,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TRAIN_999995,9,18,936.0,1243.0,0,0,ORD,13930,Illinois,PHL,14100,Pennsylvania,678.0,United Air Lines Inc.,UA,19977.0,N477UA,
TRAIN_999996,5,30,920.0,1028.0,0,0,FAR,11637,North Dakota,MSP,13487,Minnesota,223.0,SkyWest Airlines Inc.,DL,20304.0,N439SW,
TRAIN_999997,6,28,800.0,1340.0,0,0,OAK,13796,California,HOU,12191,Texas,1642.0,Southwest Airlines Co.,WN,19393.0,N230WN,
TRAIN_999998,9,27,1613.0,1824.0,0,0,BNA,10693,,ATL,10397,Georgia,214.0,Delta Air Lines Inc.,DL,19790.0,N968DL,


# dummy

In [7]:
'''
케리어 ID, 케리어 코드, 항공사 - 한 세트
도착 공항, 도착 항공 아이디, 지역 - 한 세트(출발 동일)
'''

# def fill_null_carrier(x):
    
    
data[['Airline', 'Carrier_Code(IATA)', 'Carrier_ID(DOT)']].value_counts().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0
Airline,Carrier_Code(IATA),Carrier_ID(DOT),Unnamed: 3_level_1
Southwest Airlines Co.,WN,19393.0,144837
Delta Air Lines Inc.,DL,19790.0,71282
United Air Lines Inc.,UA,19977.0,66749
American Airlines Inc.,AA,19805.0,65726
JetBlue Airways,B6,20409.0,32993
SkyWest Airlines Inc.,UA,20304.0,31285
SkyWest Airlines Inc.,DL,20304.0,24934
Alaska Airlines Inc.,AS,19930.0,23213
Envoy Air,AA,20398.0,21835
Spirit Air Lines,NK,20416.0,19824


In [12]:
for airline in data['Airline'].unique():
    print(airline)
    print(data[data['Airline'] == airline][['Carrier_Code(IATA)', 'Carrier_ID(DOT)']].value_counts())
    print('-'*50)

Southwest Airlines Co.
Carrier_Code(IATA)  Carrier_ID(DOT)
WN                  19393.0            144837
dtype: int64
--------------------------------------------------
SkyWest Airlines Inc.
Carrier_Code(IATA)  Carrier_ID(DOT)
UA                  20304.0            31285
DL                  20304.0            24934
AA                  20304.0             8068
AS                  20304.0             4614
dtype: int64
--------------------------------------------------
American Airlines Inc.
Carrier_Code(IATA)  Carrier_ID(DOT)
AA                  19805.0            65726
dtype: int64
--------------------------------------------------
United Air Lines Inc.
Carrier_Code(IATA)  Carrier_ID(DOT)
UA                  19977.0            66749
dtype: int64
--------------------------------------------------
Republic Airlines
Carrier_Code(IATA)  Carrier_ID(DOT)
UA                  20452.0            11972
AA                  20452.0             9935
DL                  20452.0             4697
dtype

In [13]:
for airline in data['Carrier_ID(DOT)'].unique():
    print(airline)
    print(data[data['Carrier_ID(DOT)'] == airline][['Airline']].value_counts())
    print('-'*50)

19393.0
Airline               
Southwest Airlines Co.    162329
dtype: int64
--------------------------------------------------
20304.0
Airline              
SkyWest Airlines Inc.    77422
dtype: int64
--------------------------------------------------
19805.0
Airline               
American Airlines Inc.    73835
dtype: int64
--------------------------------------------------
nan
Series([], dtype: int64)
--------------------------------------------------
20452.0
Airline          
Republic Airlines    29806
dtype: int64
--------------------------------------------------
20366.0
Airline                 
ExpressJet Airlines Inc.    17965
dtype: int64
--------------------------------------------------
19790.0
Airline             
Delta Air Lines Inc.    80026
dtype: int64
--------------------------------------------------
20046.0
Airline                    
Air Wisconsin Airlines Corp    11894
dtype: int64
--------------------------------------------------
19687.0
Airline    
Horizon Air 

In [14]:
for airline in data['Airline'].unique():
    print(airline)
    print(data[data['Airline'] == airline][['Carrier_ID(DOT)']].value_counts())
    print('-'*50)

Southwest Airlines Co.
Carrier_ID(DOT)
19393.0            162329
dtype: int64
--------------------------------------------------
SkyWest Airlines Inc.
Carrier_ID(DOT)
20304.0            77422
dtype: int64
--------------------------------------------------
American Airlines Inc.
Carrier_ID(DOT)
19805.0            73835
dtype: int64
--------------------------------------------------
United Air Lines Inc.
Carrier_ID(DOT)
19977.0            74823
dtype: int64
--------------------------------------------------
Republic Airlines
Carrier_ID(DOT)
20452.0            29806
dtype: int64
--------------------------------------------------
nan
Series([], dtype: int64)
--------------------------------------------------
ExpressJet Airlines Inc.
Carrier_ID(DOT)
20366.0            17965
dtype: int64
--------------------------------------------------
Delta Air Lines Inc.
Carrier_ID(DOT)
19790.0            80026
dtype: int64
--------------------------------------------------
Air Wisconsin Airlines Corp
Ca

In [19]:
for id in data['Destination_Airport_ID'].unique():
    print(data[data['Destination_Airport_ID'] == id]['Destination_State'].unique())
    print(data[data['Destination_Airport_ID'] == id]['Destination_Airport'].unique())
    print('-'*50)
    

['Texas' nan]
['HOU']
--------------------------------------------------
['Utah' nan]
['SLC']
--------------------------------------------------
['New York' nan]
['LGA']
--------------------------------------------------
['New Jersey' nan]
['EWR']
--------------------------------------------------
['California' nan]
['ACV']
--------------------------------------------------
['Virginia' nan]
['DCA']
--------------------------------------------------
['Massachusetts' nan]
['BOS']
--------------------------------------------------
['Missouri' nan]
['MCI']
--------------------------------------------------
['North Carolina' nan]
['CLT']
--------------------------------------------------
['Texas' nan]
['IAH']
--------------------------------------------------
['Pennsylvania' nan]
['PIT']
--------------------------------------------------
['Michigan' nan]
['DTW']
--------------------------------------------------
['Colorado' nan]
['DEN']
--------------------------------------------------
['N

In [65]:
dum = data[data['Destination_Airport_ID'] == id]['Destination_State'].unique()
dum = dum.tolist()
dum.remove(np.nan)
dum

['Kentucky']

In [152]:
df['Airline'].unique()

array(['Southwest Airlines Co.', 'SkyWest Airlines Inc.',
       'American Airlines Inc.', 'United Air Lines Inc.',
       'Republic Airlines', 'ExpressJet Airlines Inc.',
       'Delta Air Lines Inc.', 'Air Wisconsin Airlines Corp',
       'Horizon Air', 'JetBlue Airways', 'Spirit Air Lines',
       'Alaska Airlines Inc.', 'Mesa Airlines Inc.',
       'Frontier Airlines Inc.', 'Trans States Airlines',
       'Endeavor Air Inc.', 'Comair Inc.', 'Capital Cargo International',
       'Envoy Air', 'Hawaiian Airlines Inc.', 'Allegiant Air', nan,
       'Virgin America', 'Peninsula Airways Inc.', 'Compass Airlines',
       'GoJet Airlines, LLC d/b/a United Express',
       'Commutair Aka Champlain Enterprises, Inc.',
       'Empire Airlines Inc.', 'Cape Air'], dtype=object)

In [72]:
for id in data['Carrier_ID(DOT)'].unique():
    print(data[data['Carrier_ID(DOT)'] == id][['Carrier_Code(IATA)', 'Distance']].value_counts())
    print('-'*50)
    

Carrier_Code(IATA)  Distance
WN                  337.0       2199
                    239.0       1565
                    325.0       1531
                    255.0       1364
                    369.0       1253
                                ... 
                    1306.0         1
                    1128.0         1
                    352.0          1
                    1091.0         1
                    1368.0         1
Length: 625, dtype: int64
--------------------------------------------------
Carrier_Code(IATA)  Distance
UA                  109.0       614
                    391.0       597
                    73.0        582
                    125.0       533
                    250.0       511
                               ... 
                    1077.0        1
                    778.0         1
DL                  1066.0        1
AS                  1050.0        1
UA                  371.0         1
Length: 969, dtype: int64
------------------------------------

In [66]:
data


Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
0,TRAIN_000000,4,15,,,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N7858A,
1,TRAIN_000001,8,15,740.0,1024.0,0,0,ORD,13930,Illinois,SLC,14869,Utah,1250.0,SkyWest Airlines Inc.,UA,20304.0,N125SY,
2,TRAIN_000002,9,6,1610.0,1805.0,0,0,CLT,11057,North Carolina,LGA,12953,New York,544.0,American Airlines Inc.,AA,19805.0,N103US,
3,TRAIN_000003,7,10,905.0,1735.0,0,0,LAX,12892,California,EWR,11618,New Jersey,2454.0,United Air Lines Inc.,UA,,N595UA,
4,TRAIN_000004,1,11,900.0,1019.0,0,0,SFO,14771,California,ACV,10157,California,250.0,SkyWest Airlines Inc.,UA,20304.0,N161SY,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,TRAIN_999995,9,18,936.0,1243.0,0,0,ORD,13930,,PHL,14100,,678.0,United Air Lines Inc.,UA,19977.0,N477UA,
999996,TRAIN_999996,5,30,920.0,1028.0,0,0,FAR,11637,,MSP,13487,Minnesota,223.0,SkyWest Airlines Inc.,DL,,N439SW,
999997,TRAIN_999997,6,28,800.0,1340.0,0,0,OAK,13796,,HOU,12191,Texas,1642.0,Southwest Airlines Co.,WN,19393.0,N230WN,
999998,TRAIN_999998,9,27,1613.0,1824.0,0,0,BNA,10693,Tennessee,ATL,10397,,214.0,Delta Air Lines Inc.,DL,19790.0,N968DL,
