In [1]:
import os
import pandas as pd
import kagglehub
import humanize

from sklearn.model_selection import train_test_split
from IPython.display import display, HTML

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_dir = os.path.join(".", "..", "data")
datasets_dir = os.path.join(data_dir, "datasets")
train_dir = os.path.join(data_dir, "train")
test_dir = os.path.join(data_dir, "test")

os.makedirs(datasets_dir, exist_ok=True)
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Load Data

In [3]:
raw_datasets_paths = (
    kagglehub.dataset_download(
        "gabrielluizone/us-domestic-flights-delay-prediction-2013-2018"
    ),
    kagglehub.dataset_download("threnjen/2019-airline-delays-and-cancellations"),
    kagglehub.dataset_download("ulrikthygepedersen/airlines-delay"),
    kagglehub.dataset_download("robikscube/flight-delay-dataset-20182022"),
    kagglehub.dataset_download("mikhail1681/airline-quality-ratings"),
)



In [4]:
for dataset_path in raw_datasets_paths:
    print(
        "Dataset " + dataset_path[dataset_path.index("datasets/") + len("datasets/") :]
    )
    print("\t" + "\n\t".join(sorted(os.listdir(dataset_path))))

Dataset gabrielluizone/us-domestic-flights-delay-prediction-2013-2018/versions/3
	csv_flight
	flight_delay_predict.csv
Dataset threnjen/2019-airline-delays-and-cancellations/versions/9
	full_data_flightdelay.csv
	raw_data
	raw_data_documentation.txt
	test.csv
	train.csv
	train_sets_documentation.txt
Dataset ulrikthygepedersen/airlines-delay/versions/1
	airlines_delay.csv
Dataset robikscube/flight-delay-dataset-20182022/versions/4
	Airlines.csv
	Combined_Flights_2018.csv
	Combined_Flights_2018.parquet
	Combined_Flights_2019.csv
	Combined_Flights_2019.parquet
	Combined_Flights_2020.csv
	Combined_Flights_2020.parquet
	Combined_Flights_2021.csv
	Combined_Flights_2021.parquet
	Combined_Flights_2022.csv
	Combined_Flights_2022.parquet
	raw
	readme.html
	readme.md
Dataset mikhail1681/airline-quality-ratings/versions/1
	Airline Quality Ratings.csv


# Dataset creation

In [5]:
datasets_paths = (
    os.path.join(raw_datasets_paths[0], "flight_delay_predict.csv"),
    os.path.join(raw_datasets_paths[1], "full_data_flightdelay.csv"),
    os.path.join(raw_datasets_paths[2], "airlines_delay.csv"),
    os.path.join(raw_datasets_paths[3], "Combined_Flights_2022.csv"),
    os.path.join(raw_datasets_paths[4], "Airline Quality Ratings.csv"),
)

In [6]:
for i, dataset_path in enumerate(datasets_paths):
    df = pd.read_csv(dataset_path)
    original_shape = df.shape
    df = df.head(10**4)
    memory_usage = df.memory_usage(deep=True).sum()
    memory_usage_humanized = humanize.naturalsize(memory_usage)

    print(
        f"Dataset {i} - {df.shape[0]} x {df.shape[1]}. Space usage {memory_usage_humanized}. Original length is {original_shape[0]}."
    )
    df.to_csv(os.path.join(datasets_dir, f"dataset_{i}.csv"), index=False)

Dataset 0 - 10000 x 20. Space usage 4.8 MB. Original length is 1635590.
Dataset 1 - 10000 x 26. Space usage 4.8 MB. Original length is 6489062.
Dataset 2 - 10000 x 8. Space usage 2.2 MB. Original length is 539382.
Dataset 3 - 10000 x 61. Space usage 14.8 MB. Original length is 4078318.
Dataset 4 - 10000 x 24. Space usage 4.8 MB. Original length is 129880.


# Data loading and Analysis

For this task we will use 6 dataframes that include Airlines/Flight information. Each of them has a binary target variable. In the first part we will analyze the data, check column types, missing values, identify targets etc.

In [7]:
datasets = []

for i in range(len(datasets_paths)):
    df = pd.read_csv(os.path.join(datasets_dir, f"dataset_{i}.csv"))

    print("-" * 100)
    print(f"----> Dataset {i}")
    print("-" * 100)
    df.info()
    display(HTML(df.describe().to_html()))
    display(HTML(df.head(5).to_html()))

    datasets.append(df)

----------------------------------------------------------------------------------------------------
----> Dataset 0
----------------------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   is_delay           10000 non-null  float64
 1   Year               10000 non-null  int64  
 2   Quarter            10000 non-null  int64  
 3   Month              10000 non-null  int64  
 4   DayofMonth         10000 non-null  int64  
 5   DayOfWeek          10000 non-null  int64  
 6   FlightDate         10000 non-null  object 
 7   Reporting_Airline  10000 non-null  object 
 8   Origin             10000 non-null  object 
 9   OriginState        10000 non-null  object 
 10  Dest               10000 non-null  object 
 11  DestState          10000 non-null  object 
 1

Unnamed: 0,is_delay,Year,Quarter,Month,DayofMonth,DayOfWeek,CRSDepTime,Cancelled,Diverted,Distance,DistanceGroup,ArrDelay,ArrDelayMinutes,AirTime
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,0.3364,2014.0,1.0,1.0,7.6011,3.9395,1313.9225,0.0,0.0,1001.5002,4.3925,17.5016,22.2945,134.3003
std,0.472501,0.0,0.0,0.0,4.142354,1.970946,475.313405,0.0,0.0,514.227925,2.020013,48.150718,45.288576,61.603448
min,0.0,2014.0,1.0,1.0,1.0,1.0,15.0,0.0,0.0,224.0,1.0,-56.0,0.0,32.0
25%,0.0,2014.0,1.0,1.0,4.0,2.0,920.0,0.0,0.0,641.0,3.0,-8.0,0.0,89.0
50%,0.0,2014.0,1.0,1.0,8.0,4.0,1315.0,0.0,0.0,868.0,4.0,3.0,3.0,122.0
75%,1.0,2014.0,1.0,1.0,11.0,6.0,1700.0,0.0,0.0,1440.0,6.0,25.0,25.0,179.0
max,1.0,2014.0,1.0,1.0,15.0,7.0,2359.0,0.0,0.0,2139.0,9.0,1175.0,1175.0,320.0


Unnamed: 0,is_delay,Year,Quarter,Month,DayofMonth,DayOfWeek,FlightDate,Reporting_Airline,Origin,OriginState,Dest,DestState,CRSDepTime,Cancelled,Diverted,Distance,DistanceGroup,ArrDelay,ArrDelayMinutes,AirTime
0,1.0,2014,1,1,1,3,2014-01-01,UA,LAX,CA,ORD,IL,900,0.0,0.0,1744.0,7,43.0,43.0,218.0
1,0.0,2014,1,1,1,3,2014-01-01,AA,IAH,TX,DFW,TX,1750,0.0,0.0,224.0,1,2.0,2.0,50.0
2,1.0,2014,1,1,1,3,2014-01-01,AA,LAX,CA,ORD,IL,1240,0.0,0.0,1744.0,7,26.0,26.0,220.0
3,1.0,2014,1,1,1,3,2014-01-01,AA,DFW,TX,LAX,CA,1905,0.0,0.0,1235.0,5,159.0,159.0,169.0
4,0.0,2014,1,1,1,3,2014-01-01,AA,DFW,TX,CLT,NC,1115,0.0,0.0,936.0,4,-13.0,0.0,108.0


----------------------------------------------------------------------------------------------------
----> Dataset 1
----------------------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 26 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   MONTH                          10000 non-null  int64  
 1   DAY_OF_WEEK                    10000 non-null  int64  
 2   DEP_DEL15                      10000 non-null  int64  
 3   DEP_TIME_BLK                   10000 non-null  object 
 4   DISTANCE_GROUP                 10000 non-null  int64  
 5   SEGMENT_NUMBER                 10000 non-null  int64  
 6   CONCURRENT_FLIGHTS             10000 non-null  int64  
 7   NUMBER_OF_SEATS                10000 non-null  int64  
 8   CARRIER_NAME                   10000 non-null  object 
 9   AIRPORT_F

Unnamed: 0,MONTH,DAY_OF_WEEK,DEP_DEL15,DISTANCE_GROUP,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,NUMBER_OF_SEATS,AIRPORT_FLIGHTS_MONTH,AIRLINE_FLIGHTS_MONTH,AIRLINE_AIRPORT_FLIGHTS_MONTH,AVG_MONTHLY_PASS_AIRPORT,AVG_MONTHLY_PASS_AIRLINE,FLT_ATTENDANTS_PER_PASS,GROUND_SERV_PER_PASS,PLANE_AGE,LATITUDE,LONGITUDE,PRCP,SNOW,SNWD,TMAX,AWND
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,1.0,4.528,0.1667,4.0413,2.9382,31.4309,138.6977,14434.9101,60859.2562,3662.7855,2029687.0,8190901.0,0.000109,0.000141,11.5894,36.672197,-99.60171,0.038763,0.14823,0.41391,53.9965,7.132131
std,0.0,1.034854,0.372726,2.542015,1.686317,20.880283,46.962224,8388.079846,30598.482486,4395.428037,1169187.0,4779479.0,8.6e-05,4.6e-05,6.996764,5.872931,18.028152,0.105936,0.522305,1.215276,15.310341,3.479158
min,1.0,3.0,0.0,1.0,1.0,1.0,44.0,1282.0,6713.0,7.0,90611.0,473794.0,0.0,7e-06,1.0,21.319,-157.922,0.0,0.0,0.0,28.0,2.01
25%,1.0,3.0,0.0,2.0,2.0,15.0,110.0,8569.0,24623.0,989.0,1065782.0,3190369.0,3.4e-05,9.9e-05,5.0,33.641,-117.187,0.0,0.0,0.0,34.0,4.25
50%,1.0,5.0,0.0,3.0,3.0,27.0,143.0,13056.0,62105.0,2896.0,1903352.0,8501631.0,9.8e-05,0.000149,12.0,36.127,-93.218,0.0,0.0,0.0,60.0,6.93
75%,1.0,5.0,0.0,5.0,4.0,41.0,173.0,23400.0,75506.0,4691.0,3103410.0,12460180.0,0.000144,0.000177,17.0,41.978,-84.427,0.0,0.0,0.0,65.0,9.17
max,1.0,7.0,1.0,11.0,13.0,93.0,337.0,30842.0,107363.0,18809.0,4365661.0,13383000.0,0.000348,0.000229,32.0,47.447,-71.006,0.62,4.4,5.9,77.0,18.12


Unnamed: 0,MONTH,DAY_OF_WEEK,DEP_DEL15,DEP_TIME_BLK,DISTANCE_GROUP,SEGMENT_NUMBER,CONCURRENT_FLIGHTS,NUMBER_OF_SEATS,CARRIER_NAME,AIRPORT_FLIGHTS_MONTH,AIRLINE_FLIGHTS_MONTH,AIRLINE_AIRPORT_FLIGHTS_MONTH,AVG_MONTHLY_PASS_AIRPORT,AVG_MONTHLY_PASS_AIRLINE,FLT_ATTENDANTS_PER_PASS,GROUND_SERV_PER_PASS,PLANE_AGE,DEPARTING_AIRPORT,LATITUDE,LONGITUDE,PREVIOUS_AIRPORT,PRCP,SNOW,SNWD,TMAX,AWND
0,1,7,0,0800-0859,2,1,25,143,Southwest Airlines Co.,13056,107363,5873,1903352,13382999,6.2e-05,9.9e-05,8,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
1,1,7,0,0700-0759,7,1,29,191,Delta Air Lines Inc.,13056,73508,1174,1903352,12460183,0.000144,0.000149,3,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
2,1,7,0,0600-0659,7,1,27,199,Delta Air Lines Inc.,13056,73508,1174,1903352,12460183,0.000144,0.000149,18,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
3,1,7,0,0600-0659,9,1,27,180,Delta Air Lines Inc.,13056,73508,1174,1903352,12460183,0.000144,0.000149,2,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91
4,1,7,0,0001-0559,7,1,10,182,Spirit Air Lines,13056,15023,1257,1903352,2688839,9e-06,0.000125,1,McCarran International,36.08,-115.152,NONE,0.0,0.0,0.0,65.0,2.91


----------------------------------------------------------------------------------------------------
----> Dataset 2
----------------------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Flight       10000 non-null  float64
 1   Time         10000 non-null  float64
 2   Length       10000 non-null  float64
 3   Airline      10000 non-null  object 
 4   AirportFrom  10000 non-null  object 
 5   AirportTo    10000 non-null  object 
 6   DayOfWeek    10000 non-null  int64  
 7   Class        10000 non-null  int64  
dtypes: float64(3), int64(2), object(3)
memory usage: 625.1+ KB


Unnamed: 0,Flight,Time,Length,DayOfWeek,Class
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,2506.4669,767.6737,128.5272,3.9527,0.0
std,2090.469228,284.688367,67.749275,1.90584,0.0
min,1.0,10.0,26.0,1.0,0.0
25%,730.75,516.0,80.0,2.0,0.0
50%,1902.0,740.0,112.0,4.0,0.0
75%,3926.75,1000.0,157.0,5.0,0.0
max,7810.0,1439.0,655.0,7.0,0.0


Unnamed: 0,Flight,Time,Length,Airline,AirportFrom,AirportTo,DayOfWeek,Class
0,2313.0,1296.0,141.0,DL,ATL,HOU,1,0
1,6948.0,360.0,146.0,OO,COS,ORD,4,0
2,1247.0,1170.0,143.0,B6,BOS,CLT,3,0
3,31.0,1410.0,344.0,US,OGG,PHX,6,0
4,563.0,692.0,98.0,FL,BMI,ATL,4,0


----------------------------------------------------------------------------------------------------
----> Dataset 3
----------------------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 61 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   FlightDate                               10000 non-null  object 
 1   Airline                                  10000 non-null  object 
 2   Origin                                   10000 non-null  object 
 3   Dest                                     10000 non-null  object 
 4   Cancelled                                10000 non-null  bool   
 5   Diverted                                 10000 non-null  bool   
 6   CRSDepTime                               10000 non-null  int64  
 7   DepTime                                

Unnamed: 0,CRSDepTime,DepTime,DepDelayMinutes,DepDelay,ArrTime,ArrDelayMinutes,AirTime,CRSElapsedTime,ActualElapsedTime,Distance,Year,Quarter,Month,DayofMonth,DayOfWeek,DOT_ID_Marketing_Airline,Flight_Number_Marketing_Airline,DOT_ID_Operating_Airline,Flight_Number_Operating_Airline,OriginAirportID,OriginAirportSeqID,OriginCityMarketID,OriginStateFips,OriginWac,DestAirportID,DestAirportSeqID,DestCityMarketID,DestStateFips,DestWac,DepDel15,DepartureDelayGroups,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,DistanceGroup,DivAirportLandings
count,10000.0,9154.0,9154.0,9154.0,9116.0,9104.0,9104.0,10000.0,9104.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,9154.0,9154.0,9135.0,9135.0,9116.0,9116.0,10000.0,9104.0,9104.0,9104.0,10000.0,10000.0
mean,1325.7969,1350.206358,34.307188,32.944177,1426.690983,31.914433,96.371485,125.0833,117.971441,695.6572,2022.0,2.0,4.0,2.2006,5.6266,19499.3464,2478.7919,19555.726,2478.4604,12573.1254,1257316.0,31598.7805,26.3446,58.2654,12749.4269,1274946.0,31691.5148,25.6123,58.3851,0.448547,1.569041,14.523043,1370.388068,1424.213142,7.076349,1468.2842,26.932008,0.404547,1.203976,3.2709,0.0047
std,485.313028,509.643626,56.685967,57.568865,564.068123,56.926456,55.963964,55.980784,58.243264,461.070076,0.0,0.0,0.0,1.916853,1.070461,225.392593,1405.609041,356.185267,1405.276081,1587.991814,158799.1,1241.647913,16.889416,24.463307,1495.492975,149549.3,1237.994662,16.747401,24.62935,0.497373,3.25803,11.20303,510.405311,559.856655,8.554385,523.700835,60.16399,0.490831,3.398247,1.852689,0.13813
min,500.0,2.0,0.0,-20.0,1.0,0.0,16.0,35.0,31.0,67.0,2022.0,2.0,4.0,1.0,1.0,19393.0,1.0,19393.0,1.0,10135.0,1013506.0,30135.0,1.0,2.0,10135.0,1013506.0,30135.0,1.0,2.0,0.0,-2.0,3.0,1.0,1.0,1.0,5.0,-49.0,0.0,-2.0,1.0,0.0
25%,915.0,924.0,0.0,-1.0,1032.0,0.0,55.0,85.0,75.0,349.0,2022.0,2.0,4.0,1.0,5.0,19393.0,1228.0,19393.0,1228.0,11193.0,1119302.0,30599.0,12.0,35.0,11292.0,1129202.0,30693.0,12.0,36.0,0.0,-1.0,9.0,937.0,1029.0,4.0,1055.0,-9.0,0.0,-1.0,2.0,0.0
50%,1325.0,1338.0,10.0,10.0,1443.0,6.0,83.0,115.0,104.0,595.0,2022.0,2.0,4.0,2.0,6.0,19393.0,2527.5,19393.0,2527.5,12339.0,1233904.0,31123.0,24.0,64.0,12892.0,1289208.0,31453.0,22.0,64.0,0.0,0.0,12.0,1352.0,1441.0,5.0,1505.0,6.0,0.0,0.0,3.0,0.0
75%,1730.0,1753.0,44.0,44.0,1858.0,40.0,123.0,150.0,145.0,925.0,2022.0,2.0,4.0,2.0,6.0,19393.0,3777.0,19393.0,3776.25,13930.0,1393007.0,32467.0,47.0,81.0,13930.0,1393007.0,32467.0,44.0,82.0,1.0,2.0,16.0,1807.5,1855.0,7.0,1905.0,40.0,1.0,2.0,4.0,0.0
max,2255.0,2358.0,659.0,659.0,2400.0,697.0,388.0,425.0,474.0,2979.0,2022.0,2.0,4.0,15.0,7.0,19977.0,6665.0,20500.0,6665.0,15919.0,1591904.0,35412.0,72.0,93.0,15919.0,1591904.0,35412.0,72.0,93.0,1.0,12.0,168.0,2400.0,2400.0,186.0,2355.0,697.0,1.0,12.0,11.0,9.0


Unnamed: 0,FlightDate,Airline,Origin,Dest,Cancelled,Diverted,CRSDepTime,DepTime,DepDelayMinutes,DepDelay,ArrTime,ArrDelayMinutes,AirTime,CRSElapsedTime,ActualElapsedTime,Distance,Year,Quarter,Month,DayofMonth,DayOfWeek,Marketing_Airline_Network,Operated_or_Branded_Code_Share_Partners,DOT_ID_Marketing_Airline,IATA_Code_Marketing_Airline,Flight_Number_Marketing_Airline,Operating_Airline,DOT_ID_Operating_Airline,IATA_Code_Operating_Airline,Tail_Number,Flight_Number_Operating_Airline,OriginAirportID,OriginAirportSeqID,OriginCityMarketID,OriginCityName,OriginState,OriginStateFips,OriginStateName,OriginWac,DestAirportID,DestAirportSeqID,DestCityMarketID,DestCityName,DestState,DestStateFips,DestStateName,DestWac,DepDel15,DepartureDelayGroups,DepTimeBlk,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,DistanceGroup,DivAirportLandings
0,2022-04-04,"Commutair Aka Champlain Enterprises, Inc.",GJT,DEN,False,False,1133,1123.0,0.0,-10.0,1228.0,0.0,40.0,72.0,65.0,212.0,2022,2,4,4,1,UA,UA_CODESHARE,19977,UA,4301,C5,20445,C5,N21144,4301,11921,1192102,31921,"Grand Junction, CO",CO,8,Colorado,82,11292,1129202,30325,"Denver, CO",CO,8,Colorado,82,0.0,-1.0,1100-1159,17.0,1140.0,1220.0,8.0,1245,-17.0,0.0,-2.0,1200-1259,1,0
1,2022-04-04,"Commutair Aka Champlain Enterprises, Inc.",HRL,IAH,False,False,732,728.0,0.0,-4.0,848.0,0.0,55.0,77.0,80.0,295.0,2022,2,4,4,1,UA,UA_CODESHARE,19977,UA,4299,C5,20445,C5,N16170,4299,12206,1220605,32206,"Harlingen/San Benito, TX",TX,48,Texas,74,12266,1226603,31453,"Houston, TX",TX,48,Texas,74,0.0,-1.0,0700-0759,16.0,744.0,839.0,9.0,849,-1.0,0.0,-1.0,0800-0859,2,0
2,2022-04-04,"Commutair Aka Champlain Enterprises, Inc.",DRO,DEN,False,False,1529,1514.0,0.0,-15.0,1636.0,0.0,47.0,70.0,82.0,251.0,2022,2,4,4,1,UA,UA_CODESHARE,19977,UA,4298,C5,20445,C5,N21144,4298,11413,1141307,30285,"Durango, CO",CO,8,Colorado,82,11292,1129202,30325,"Denver, CO",CO,8,Colorado,82,0.0,-1.0,1500-1559,21.0,1535.0,1622.0,14.0,1639,-3.0,0.0,-1.0,1600-1659,2,0
3,2022-04-04,"Commutair Aka Champlain Enterprises, Inc.",IAH,GPT,False,False,1435,1430.0,0.0,-5.0,1547.0,0.0,57.0,90.0,77.0,376.0,2022,2,4,4,1,UA,UA_CODESHARE,19977,UA,4296,C5,20445,C5,N11184,4296,12266,1226603,31453,"Houston, TX",TX,48,Texas,74,11973,1197302,31973,"Gulfport/Biloxi, MS",MS,28,Mississippi,53,0.0,-1.0,1400-1459,16.0,1446.0,1543.0,4.0,1605,-18.0,0.0,-2.0,1600-1659,2,0
4,2022-04-04,"Commutair Aka Champlain Enterprises, Inc.",DRO,DEN,False,False,1135,1135.0,0.0,0.0,1251.0,6.0,49.0,70.0,76.0,251.0,2022,2,4,4,1,UA,UA_CODESHARE,19977,UA,4295,C5,20445,C5,N17146,4295,11413,1141307,30285,"Durango, CO",CO,8,Colorado,82,11292,1129202,30325,"Denver, CO",CO,8,Colorado,82,0.0,0.0,1100-1159,19.0,1154.0,1243.0,8.0,1245,6.0,0.0,0.0,1200-1259,2,0


----------------------------------------------------------------------------------------------------
----> Dataset 4
----------------------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 24 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   ID                                      10000 non-null  int64  
 1   Gender                                  10000 non-null  object 
 2   Age                                     10000 non-null  int64  
 3   Customer Type                           10000 non-null  object 
 4   Type of Travel                          10000 non-null  object 
 5   Class                                   10000 non-null  object 
 6   Flight Distance                         10000 non-null  int64  
 7   Departure Delay                         10000 no

Unnamed: 0,ID,Age,Flight Distance,Departure Delay,Arrival Delay,Departure and Arrival Time Convenience,Ease of Online Booking,Check-in Service,Online Boarding,Gate Location,On-board Service,Seat Comfort,Leg Room Service,Cleanliness,Food and Drink,In-flight Service,In-flight Wifi Service,In-flight Entertainment,Baggage Handling
count,10000.0,10000.0,10000.0,10000.0,9962.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,40.3657,1078.2809,15.5999,18.383256,3.0484,2.728,3.413,3.3513,2.9718,3.4601,3.5262,3.4182,3.3314,3.2021,3.7368,2.6439,3.3719,3.7442
std,2886.89568,15.127539,1112.49462,43.570727,46.10893,1.542302,1.445346,1.216051,1.35354,1.283888,1.268057,1.300339,1.310145,1.293268,1.308903,1.141077,1.338608,1.340735,1.143283
min,1.0,7.0,67.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
25%,2500.75,28.0,299.0,0.0,0.0,2.0,2.0,3.0,2.0,2.0,3.0,3.0,2.0,2.0,2.0,3.0,2.0,2.0,3.0
50%,5000.5,41.0,562.0,0.0,0.0,3.0,3.0,4.0,4.0,3.0,4.0,4.0,4.0,3.0,3.0,4.0,3.0,4.0,4.0
75%,7500.25,52.0,1638.0,12.0,17.0,4.0,4.0,4.0,4.0,4.0,4.0,5.0,5.0,4.0,4.0,5.0,4.0,5.0,5.0
max,10000.0,85.0,3997.0,1017.0,1011.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


Unnamed: 0,ID,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay,Departure and Arrival Time Convenience,Ease of Online Booking,Check-in Service,Online Boarding,Gate Location,On-board Service,Seat Comfort,Leg Room Service,Cleanliness,Food and Drink,In-flight Service,In-flight Wifi Service,In-flight Entertainment,Baggage Handling,Satisfaction
0,1,Male,48,First-time,Business,Business,821,2,5.0,3,3,4,3,3,3,5,2,5,5,5,3,5,5,Neutral or Dissatisfied
1,2,Female,35,Returning,Business,Business,821,26,39.0,2,2,3,5,2,5,4,5,5,3,5,2,5,5,Satisfied
2,3,Male,41,Returning,Business,Business,853,0,0.0,4,4,4,5,4,3,5,3,5,5,3,4,3,3,Satisfied
3,4,Male,50,Returning,Business,Business,1905,0,0.0,2,2,3,4,2,5,5,5,4,4,5,2,5,5,Satisfied
4,5,Female,49,Returning,Business,Business,3470,0,1.0,3,3,3,5,3,3,4,4,5,4,3,3,3,3,Satisfied


# Identifying targets

In [8]:
datasets[0].rename(columns={"is_delay": "target"}, inplace=True)
datasets[1].rename(columns={"DEP_DEL15": "target"}, inplace=True)
datasets[2].rename(columns={"Class": "target"}, inplace=True)
datasets[3].rename(columns={"DepDel15": "target"}, inplace=True)
datasets[4].rename(columns={"Satisfaction": "target"}, inplace=True)

In [9]:
for i, dataset in enumerate(datasets):
    print(f"Dataset {i}")
    print(
        "\t"
        + "\n\t".join(
            [f"{k}\t{v}" for k, v in dataset["target"].value_counts().to_dict().items()]
        )
    )

Dataset 0
	0.0	6636
	1.0	3364
Dataset 1
	0	8333
	1	1667
Dataset 2
	0	10000
Dataset 3
	0.0	5048
	1.0	4106
Dataset 4
	Neutral or Dissatisfied	5689
	Satisfied	4311


In [10]:
datasets[0]["target"] = datasets[0]["target"].astype(bool)
datasets[1]["target"] = datasets[0]["target"].astype(bool)
datasets[2]["target"] = datasets[0]["target"].astype(bool)
datasets[3]["target"] = datasets[0]["target"].astype(bool)
datasets[4]["target"] = datasets[4]["target"].apply(lambda x: x == "Satisfied")

## Train test split

In [11]:
for i, dataset in enumerate(datasets):
    X = dataset.drop(columns=["target"])
    y = dataset["target"]

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, y, test_size=0.25, random_state=42
    )

    X_train_file_path = os.path.join(train_dir, f"X{i}_train.csv")
    y_train_file_path = os.path.join(train_dir, f"y{i}_train.csv")
    X_test_file_path = os.path.join(test_dir, f"X{i}_test.csv")
    y_test_file_path = os.path.join(test_dir, f"y{i}_test.csv")

    X_train.to_csv(X_train_file_path, index=False)
    Y_train.to_csv(y_train_file_path, index=False)
    X_test.to_csv(X_test_file_path, index=False)
    Y_test.to_csv(y_test_file_path, index=False)