In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

# Resolve the data directory relative to the notebook's working directory.
cur_dir = os.getcwd()
data_dir = os.path.join(cur_dir, '..', 'data')


def load_split(file_name):
    """Read one CSV split from the data directory into a DataFrame."""
    return pd.read_csv(os.path.join(data_dir, file_name), low_memory=False)


def split_features_target(frame, target='delay_class'):
    """Return ``(features, labels)`` where features is every column except ``target``."""
    return frame.drop(columns=[target]), frame[target]


train_set = load_split('train_set_artificial.csv')
test_set = load_split('test_set.csv')
# NOTE(review): the validation split is read from the same file as the test
# split, so val_set and test_set hold identical rows. This looks like a
# copy-paste slip -- confirm the intended validation file name and update it.
val_set = load_split('test_set.csv')

# Stack every split into one frame; the one-hot encoder is fitted on it so
# that all splits end up with the same encoded columns.
df = pd.concat([train_set, test_set, val_set], axis=0)

X_train, y_train = split_features_target(train_set)
X_val, y_val = split_features_target(val_set)
X_test, y_test = split_features_target(test_set)

In [2]:
# Preview the stacked train/test/val frame (pandas truncates the display).
df

Unnamed: 0,carrier,origin,dest,air_time,distance,Maximum,Minimum,Average,Departure,Precipitation,New Snow,Snow Depth,days_in_365,sched_time_in_min,Precipitation Binary,New Snow Binary,Snow Depth Binary,delay_class
0,DL,JFK,MCO,0.170370,0.176219,0.358025,0.346667,0.350649,0.418301,0.000000,0.0,0.0,0.917582,0.316067,yes,no,no,no
1,EV,EWR,MCI,0.229630,0.206404,0.407407,0.333333,0.370130,0.254902,0.000000,0.0,0.0,0.840659,0.374012,yes,no,no,no
2,B6,JFK,MSY,0.232593,0.224760,0.308642,0.293333,0.298701,0.241830,0.000000,0.0,0.0,0.250000,0.087796,no,no,no,no
3,B6,EWR,FLL,0.195556,0.200897,0.506173,0.493333,0.500000,0.459695,0.000000,0.0,0.0,0.274725,0.579456,yes,no,no,yes
4,WN,LGA,BNA,0.130370,0.139506,0.691358,0.773333,0.733766,0.250545,0.004988,0.0,0.0,0.530220,0.267779,yes,no,no,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65465,EV,EWR,MEM,0.182222,0.176627,0.135802,0.160000,0.142857,0.023965,0.000000,0.0,0.0,0.898352,0.636523,no,no,no,no
65466,EV,EWR,MCI,0.201481,0.206404,0.938272,0.893333,0.922078,0.583878,0.022444,0.0,0.0,0.549451,0.352063,yes,no,no,no
65467,EV,EWR,CHS,0.102222,0.111768,0.740741,0.613333,0.681818,0.422658,0.134663,0.0,0.0,0.722527,0.140474,yes,no,no,no
65468,EV,LGA,PIT,0.048889,0.052009,0.654321,0.573333,0.616883,0.230937,0.000000,0.0,0.0,0.706044,0.340650,no,no,no,no


In [3]:
# Transform categorical features into binary (one-hot) features
categorical_columns = list(X_train.select_dtypes(include=['object']).columns)

# Numerical features are passed through unchanged.
# NOTE(review): only float64 columns are selected, so any integer-typed column
# would be silently left out of the encoded frames -- confirm that is intended.
numerical_columns = list(X_train.select_dtypes(include=['float64']).columns)

# Use df to fit the encoder to prevent scenarios where the binary features in
# the train and test sets differ. For example, the test set has dest_LAX but
# the train set does not.
encoder = OneHotEncoder()
encoder.fit(df[categorical_columns])


def encode_split(features, encoder, categorical_columns, numerical_columns):
    """One-hot encode the categorical columns of a split and join the numerical ones.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature frame of one split (train, test or validation).
    encoder : OneHotEncoder
        Already-fitted encoder; it is only used to transform here.
    categorical_columns : list of str
        Columns handed to the encoder.
    numerical_columns : list of str
        Columns copied through unchanged.

    Returns
    -------
    tuple
        ``(category_matrix, df_category, df_numerical, encoded)``: the raw
        encoder output, that output as a DataFrame labelled with the encoder's
        feature names, the numerical columns with a fresh 0..n-1 index, and the
        column-wise concatenation of the numerical and categorical frames.
    """
    category_matrix = encoder.transform(features[categorical_columns])
    df_category = pd.DataFrame(
        category_matrix.toarray(),
        columns=encoder.get_feature_names_out(),
    )
    # Reset the index so the rows line up positionally with df_category,
    # which always carries a fresh RangeIndex.
    df_numerical = features[numerical_columns].reset_index(drop=True)
    encoded = pd.concat([df_numerical, df_category], axis=1)
    return category_matrix, df_category, df_numerical, encoded


(X_train_category, X_train_df_category,
 X_train_df_numerical, X_train_encoded) = encode_split(
    X_train, encoder, categorical_columns, numerical_columns)

(X_test_category, X_test_df_category,
 X_test_df_numerical, X_test_encoded) = encode_split(
    X_test, encoder, categorical_columns, numerical_columns)

(X_val_category, X_val_df_category,
 X_val_df_numerical, X_val_encoded) = encode_split(
    X_val, encoder, categorical_columns, numerical_columns)

# All splits go through the same fitted encoder, so their columns must match.
assert (list(X_train_encoded.columns)
        == list(X_test_encoded.columns)
        == list(X_val_encoded.columns)), "encoded splits have different columns"

In [4]:
# Preview the encoded training features: numerical columns first, then the one-hot columns.
X_train_encoded

Unnamed: 0,air_time,distance,Maximum,Minimum,Average,Departure,Precipitation,New Snow,Snow Depth,days_in_365,...,dest_TUL,dest_TVC,dest_TYS,dest_XNA,Precipitation Binary_no,Precipitation Binary_yes,New Snow Binary_no,New Snow Binary_yes,Snow Depth Binary_no,Snow Depth Binary_yes
0,0.170370,0.176219,0.358025,0.346667,0.350649,0.418301,0.000000,0.0,0.0,0.917582,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
1,0.229630,0.206404,0.407407,0.333333,0.370130,0.254902,0.000000,0.0,0.0,0.840659,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
2,0.232593,0.224760,0.308642,0.293333,0.298701,0.241830,0.000000,0.0,0.0,0.250000,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
3,0.195556,0.200897,0.506173,0.493333,0.500000,0.459695,0.000000,0.0,0.0,0.274725,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
4,0.130370,0.139506,0.691358,0.773333,0.733766,0.250545,0.004988,0.0,0.0,0.530220,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
293895,0.059259,0.069141,0.545426,0.573333,0.559607,0.334357,0.000000,0.0,0.0,0.773058,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
293896,0.057778,0.037927,0.259259,0.213333,0.233766,0.385621,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
293897,0.278015,0.309606,0.672717,0.641133,0.659579,0.574152,0.000000,0.0,0.0,0.779986,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
293898,0.238519,0.265588,0.619803,0.696054,0.658494,0.295969,0.285698,0.0,0.0,0.439560,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0


In [5]:
# List every one-hot column name produced by the encoder, one per line.
for column_name in X_train_df_category.columns:
    print(column_name)

carrier_9E
carrier_AA
carrier_AS
carrier_B6
carrier_DL
carrier_EV
carrier_F9
carrier_FL
carrier_HA
carrier_MQ
carrier_OO
carrier_UA
carrier_US
carrier_VX
carrier_WN
carrier_YV
origin_EWR
origin_JFK
origin_LGA
dest_ABQ
dest_ACK
dest_ALB
dest_ANC
dest_ATL
dest_AUS
dest_AVL
dest_BDL
dest_BGR
dest_BHM
dest_BNA
dest_BOS
dest_BQN
dest_BTV
dest_BUF
dest_BUR
dest_BWI
dest_BZN
dest_CAE
dest_CAK
dest_CHO
dest_CHS
dest_CLE
dest_CLT
dest_CMH
dest_CRW
dest_CVG
dest_DAY
dest_DCA
dest_DEN
dest_DFW
dest_DSM
dest_DTW
dest_EGE
dest_EYW
dest_FLL
dest_GRR
dest_GSO
dest_GSP
dest_HDN
dest_HNL
dest_HOU
dest_IAD
dest_IAH
dest_ILM
dest_IND
dest_JAC
dest_JAX
dest_LAS
dest_LAX
dest_LEX
dest_LGB
dest_MCI
dest_MCO
dest_MDW
dest_MEM
dest_MHT
dest_MIA
dest_MKE
dest_MSN
dest_MSP
dest_MSY
dest_MTJ
dest_MVY
dest_MYR
dest_OAK
dest_OKC
dest_OMA
dest_ORD
dest_ORF
dest_PBI
dest_PDX
dest_PHL
dest_PHX
dest_PIT
dest_PSE
dest_PSP
dest_PVD
dest_PWM
dest_RDU
dest_RIC
dest_ROC
dest_RSW
dest_SAN
dest_SAT
dest_SAV
dest_SBN
dest_SDF

In [6]:
# Tally, for each one-hot carrier / destination column, the sum of its values
# over the training rows.
total = 0
encoded_names = X_train_df_category.columns

carrier_dict = {
    name: X_train_df_category[name].sum()
    for name in encoded_names
    if name.startswith('carrier')
}
dest_dict = {
    name: X_train_df_category[name].sum()
    for name in encoded_names
    if not name.startswith('carrier') and name.startswith('dest')
}

# Rank each group from the largest column total to the smallest.
sorted_carrier = sorted(carrier_dict.items(), key=lambda pair: pair[1], reverse=True)
print(sorted_carrier)

sorted_dest = sorted(dest_dict.items(), key=lambda pair: pair[1], reverse=True)
print(sorted_dest)

[('carrier_UA', 53320.0), ('carrier_EV', 53270.0), ('carrier_B6', 49774.0), ('carrier_DL', 39317.0), ('carrier_AA', 25706.0), ('carrier_MQ', 22318.0), ('carrier_9E', 15929.0), ('carrier_US', 14806.0), ('carrier_WN', 10964.0), ('carrier_VX', 4070.0), ('carrier_FL', 2681.0), ('carrier_F9', 633.0), ('carrier_YV', 445.0), ('carrier_AS', 442.0), ('carrier_HA', 208.0), ('carrier_OO', 17.0)]
[('dest_ATL', 15776.0), ('dest_ORD', 15283.0), ('dest_LAX', 14527.0), ('dest_BOS', 13067.0), ('dest_MCO', 12896.0), ('dest_SFO', 12417.0), ('dest_CLT', 11841.0), ('dest_FLL', 11343.0), ('dest_MIA', 9774.0), ('dest_DCA', 8116.0), ('dest_DTW', 8101.0), ('dest_RDU', 7303.0), ('dest_DEN', 6937.0), ('dest_DFW', 6909.0), ('dest_IAH', 6686.0), ('dest_TPA', 6179.0), ('dest_MSP', 6089.0), ('dest_PBI', 5831.0), ('dest_BNA', 5663.0), ('dest_IAD', 5189.0), ('dest_SJU', 5144.0), ('dest_LAS', 4881.0), ('dest_BUF', 4282.0), ('dest_CLE', 3879.0), ('dest_CVG', 3863.0), ('dest_PHX', 3861.0), ('dest_MDW', 3848.0), ('dest_ST

In [7]:
# Each yes/no indicator was one-hot encoded into a "_no" and a "_yes" column;
# the "_no" columns are dropped here and the "_yes" columns are kept.
redundant_binary_columns = [
    'Precipitation Binary_no',
    'New Snow Binary_no',
    'Snow Depth Binary_no',
]

# Keep only the destination columns with the largest totals (sorted_dest is
# ordered from largest to smallest) and drop the rest.
num_of_column_to_keep = 20
rare_dest_columns = [name for name, _ in sorted_dest[num_of_column_to_keep:]]

# Drop everything in a single call rather than one DataFrame copy per column.
# errors='ignore' makes the cell safe to re-run after the columns are already
# gone; the trade-off is that a misspelled column name is skipped silently.
X_train_encoded = X_train_encoded.drop(
    columns=redundant_binary_columns + rare_dest_columns,
    errors='ignore',
)

# NOTE(review): only the training frame is trimmed in this cell. X_test_encoded
# and X_val_encoded need the same columns removed before they are fed to a
# model fitted on X_train_encoded -- confirm this happens in a later cell.

print(X_train_encoded)

        air_time  distance   Maximum   Minimum   Average  Departure  \
0       0.170370  0.176219  0.358025  0.346667  0.350649   0.418301   
1       0.229630  0.206404  0.407407  0.333333  0.370130   0.254902   
2       0.232593  0.224760  0.308642  0.293333  0.298701   0.241830   
3       0.195556  0.200897  0.506173  0.493333  0.500000   0.459695   
4       0.130370  0.139506  0.691358  0.773333  0.733766   0.250545   
...          ...       ...       ...       ...       ...        ...   
293895  0.059259  0.069141  0.545426  0.573333  0.559607   0.334357   
293896  0.057778  0.037927  0.259259  0.213333  0.233766   0.385621   
293897  0.278015  0.309606  0.672717  0.641133  0.659579   0.574152   
293898  0.238519  0.265588  0.619803  0.696054  0.658494   0.295969   
293899  0.029156  0.024475  0.762241  0.746667  0.758062   0.462571   

        Precipitation  New Snow  Snow Depth  days_in_365  ...  dest_MIA  \
0            0.000000       0.0         0.0     0.917582  ...       0.0 