In [1]:
import pandas as pd
import numpy as np
import random
import os
from sklearn.preprocessing import LabelEncoder

def seed_everything(seed):
    """Seed the random number sources this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both the
    standard-library ``random`` module and NumPy's legacy global generator.

    Args:
        seed: Value forwarded to ``random.seed`` and ``np.random.seed``; its
            ``str()`` form is stored in ``PYTHONHASHSEED``.
    """
    # NOTE: the interpreter reads PYTHONHASHSEED at start-up, so assigning it
    # here is only visible to processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)

In [22]:
# Read the train / test splits from parquet and discard the "ID" column,
# then read the sample submission CSV.
train = pd.read_parquet('./train.parquet').drop(columns=["ID"])
test = pd.read_parquet('./test.parquet').drop(columns=["ID"])
sample_submission = pd.read_csv('./dts/sample_submission.csv')

In [11]:
# Remove the Cancelled / Diverted columns from both frames.
# Rebinding with errors='ignore' (rather than an in-place drop) keeps this
# cell idempotent: re-running it after the columns are already gone is a
# no-op instead of a KeyError.
train = train.drop(columns=['Cancelled', 'Diverted'], errors='ignore')
test = test.drop(columns=['Cancelled', 'Diverted'], errors='ignore')

In [25]:
# Print the test frame's summary: column names, non-null counts and dtypes.
# NOTE(review): the saved output still lists Cancelled and Diverted, which the
# preceding cell drops from `test` — it appears to come from a different
# execution order; re-run from a fresh kernel to confirm.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 17 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   Month                     1000000 non-null  int64  
 1   Day_of_Month              1000000 non-null  int64  
 2   Estimated_Departure_Time  891016 non-null   float64
 3   Estimated_Arrival_Time    890952 non-null   float64
 4   Cancelled                 1000000 non-null  int64  
 5   Diverted                  1000000 non-null  int64  
 6   Origin_Airport            1000000 non-null  object 
 7   Origin_Airport_ID         1000000 non-null  int64  
 8   Origin_State              893495 non-null   object 
 9   Destination_Airport       1000000 non-null  object 
 10  Destination_Airport_ID    1000000 non-null  int64  
 11  Destination_State         893477 non-null   object 
 12  Distance                  1000000 non-null  float64
 13  Airline                   89

In [21]:
# Peek at the first two rows of the training frame.
train.iloc[:2]

Unnamed: 0,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
0,4,15,,,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N7858A,
1,8,15,740.0,1024.0,ORD,13930,Illinois,SLC,14869,Utah,1250.0,SkyWest Airlines Inc.,UA,20304.0,N125SY,


In [14]:
# Count how often each Delay label occurs (missing labels are not counted).
train.loc[:, 'Delay'].value_counts()

Not_Delayed    210001
Delayed         45000
Name: Delay, dtype: int64

In [17]:
# Summary of rows where the departure time is missing but the arrival time is present.
train.loc[
    train['Estimated_Departure_Time'].isna()
    & train['Estimated_Arrival_Time'].notna()
].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97331 entries, 25 to 999994
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Month                     97331 non-null  int64  
 1   Day_of_Month              97331 non-null  int64  
 2   Estimated_Departure_Time  0 non-null      float64
 3   Estimated_Arrival_Time    97331 non-null  float64
 4   Origin_Airport            97331 non-null  object 
 5   Origin_Airport_ID         97331 non-null  int64  
 6   Origin_State              86755 non-null  object 
 7   Destination_Airport       97331 non-null  object 
 8   Destination_Airport_ID    97331 non-null  int64  
 9   Destination_State         86744 non-null  object 
 10  Distance                  97331 non-null  float64
 11  Airline                   86877 non-null  object 
 12  Carrier_Code(IATA)        86769 non-null  object 
 13  Carrier_ID(DOT)           86758 non-null  float64
 14  Tail

In [18]:
# Summary of rows where the departure time is present but the arrival time is missing.
train.loc[
    train['Estimated_Departure_Time'].notna()
    & train['Estimated_Arrival_Time'].isna()
].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 97352 entries, 5 to 999970
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Month                     97352 non-null  int64  
 1   Day_of_Month              97352 non-null  int64  
 2   Estimated_Departure_Time  97352 non-null  float64
 3   Estimated_Arrival_Time    0 non-null      float64
 4   Origin_Airport            97352 non-null  object 
 5   Origin_Airport_ID         97352 non-null  int64  
 6   Origin_State              86518 non-null  object 
 7   Destination_Airport       97352 non-null  object 
 8   Destination_Airport_ID    97352 non-null  int64  
 9   Destination_State         86843 non-null  object 
 10  Distance                  97352 non-null  float64
 11  Airline                   86736 non-null  object 
 12  Carrier_Code(IATA)        86788 non-null  object 
 13  Carrier_ID(DOT)           86510 non-null  float64
 14  Tail_

In [19]:
# Summary of rows where both the departure and the arrival time are present:
# dropping every row with a missing value in either column keeps exactly those.
train.dropna(
    subset=['Estimated_Departure_Time', 'Estimated_Arrival_Time']
).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 793629 entries, 1 to 999999
Data columns (total 16 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Month                     793629 non-null  int64  
 1   Day_of_Month              793629 non-null  int64  
 2   Estimated_Departure_Time  793629 non-null  float64
 3   Estimated_Arrival_Time    793629 non-null  float64
 4   Origin_Airport            793629 non-null  object 
 5   Origin_Airport_ID         793629 non-null  int64  
 6   Origin_State              707266 non-null  object 
 7   Destination_Airport       793629 non-null  object 
 8   Destination_Airport_ID    793629 non-null  int64  
 9   Destination_State         706931 non-null  object 
 10  Distance                  793629 non-null  float64
 11  Airline                   707059 non-null  object 
 12  Carrier_Code(IATA)        707030 non-null  object 
 13  Carrier_ID(DOT)           707317 non-null  f

In [16]:
# Summary of rows where both the departure and the arrival time are missing.
train.loc[
    train[['Estimated_Departure_Time', 'Estimated_Arrival_Time']]
    .isna()
    .all(axis=1)
].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11688 entries, 0 to 999967
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Month                     11688 non-null  int64  
 1   Day_of_Month              11688 non-null  int64  
 2   Estimated_Departure_Time  0 non-null      float64
 3   Estimated_Arrival_Time    0 non-null      float64
 4   Origin_Airport            11688 non-null  object 
 5   Origin_Airport_ID         11688 non-null  int64  
 6   Origin_State              10446 non-null  object 
 7   Destination_Airport       11688 non-null  object 
 8   Destination_Airport_ID    11688 non-null  int64  
 9   Destination_State         10403 non-null  object 
 10  Distance                  11688 non-null  float64
 11  Airline                   10408 non-null  object 
 12  Carrier_Code(IATA)        10423 non-null  object 
 13  Carrier_ID(DOT)           10418 non-null  float64
 14  Tail_

In [9]:
# List the distinct values of the Diverted flag in the training data.
# An earlier cell drops `Diverted` from `train`, so on a top-to-bottom run
# train['Diverted'] would raise KeyError. Read just that column from the
# source file instead, so the check works whatever the execution order.
pd.read_parquet('./train.parquet', columns=['Diverted'])['Diverted'].unique()

array([0], dtype=int64)

In [5]:
# Display the training rows that have no estimated arrival time.
train.loc[train['Estimated_Arrival_Time'].isna()]

Unnamed: 0,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
0,4,15,,,0,0,OKC,13851,Oklahoma,HOU,12191,Texas,419.0,Southwest Airlines Co.,WN,19393.0,N7858A,
5,4,13,1545.0,,0,0,EWR,11618,,DCA,11278,Virginia,199.0,Republic Airlines,UA,20452.0,N657RW,Not_Delayed
20,7,2,620.0,,0,0,BWI,10821,Maryland,BOS,10721,Massachusetts,369.0,Southwest Airlines Co.,WN,19393.0,N292WN,
24,11,28,1030.0,,0,0,TPA,15304,Florida,BHM,10599,Alabama,459.0,Southwest Airlines Co.,WN,19393.0,N921WN,
28,8,7,930.0,,0,0,SAN,14679,California,LAS,12889,Nevada,258.0,Spirit Air Lines,NK,20416.0,N628NK,Not_Delayed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999932,1,15,2110.0,,0,0,DFW,11298,,BNA,10693,Tennessee,631.0,American Airlines Inc.,AA,19805.0,N809NN,
999946,3,6,1925.0,,0,0,MDW,13232,Illinois,TPA,15304,Florida,997.0,Southwest Airlines Co.,WN,19393.0,N8715Q,
999965,5,22,1555.0,,0,0,BOS,10721,,BNA,10693,Tennessee,942.0,Republic Airlines,DL,20452.0,N215JQ,
999967,12,30,,,0,0,BNA,10693,Tennessee,ORD,13930,Illinois,409.0,,UA,20304.0,N127SY,
