## CIS 662 Intro to Machine Learning and Algorithms
### Flight delay prediction project - group 10
### Team Members:
###                            Vikas Papana
###                            Ajay Hemanshu Desai
###                            Hemil Anip Shah
###                            Rangel Anselm Koli

In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

pd.options.display.float_format = '{:,.2f}'.format
from IPython.display import display, HTML

# Flight Data


### Fetching, reading and preprocessing flight data

In [2]:
# Reading Arrival flights data
Arrival_data = pd.read_csv(r"C:\Users\VIKAS\Documents\Intro to ML\Project\datasets\Arrival_data.csv", 
                           parse_dates = ['Date (MM/DD/YYYY)', 'Scheduled Arrival Time'])

Arrival_data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Actual Arrival Time,Scheduled Elapsed Time (Minutes),Actual Elapsed Time (Minutes),Arrival Delay (Minutes),Wheels-on Time,Taxi-In time (Minutes),Delay Carrier (Minutes),Delay Weather (Minutes),Delay National Aviation System (Minutes),Delay Security (Minutes),Delay Late Aircraft Arrival (Minutes)
0,MQ,2023-01-01,3392,N283NN,ORD,2024-05-02 21:16:00,21:08,111,104,-8,21:02,6,0,0,0,0,0
1,MQ,2023-01-01,3518,N213NN,ORD,2024-05-02 16:02:00,15:37,117,96,-25,15:33,4,0,0,0,0,0
2,MQ,2023-01-02,3392,N248NN,ORD,2024-05-02 20:19:00,20:12,106,102,-7,20:08,4,0,0,0,0,0
3,MQ,2023-01-02,3518,N263NN,ORD,2024-05-02 16:02:00,15:56,117,112,-6,15:52,4,0,0,0,0,0
4,MQ,2023-01-03,3392,N276NN,ORD,2024-05-02 20:19:00,20:57,106,113,38,20:52,5,19,0,7,0,12


In [3]:
Arrival_data.columns

Index(['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number', 'Tail Number',
       'Origin Airport', 'Scheduled Arrival Time', 'Actual Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'Actual Elapsed Time (Minutes)',
       'Arrival Delay (Minutes)', 'Wheels-on Time', 'Taxi-In time (Minutes)',
       'Delay Carrier (Minutes)', 'Delay Weather (Minutes)',
       'Delay National Aviation System (Minutes)', 'Delay Security (Minutes)',
       'Delay Late Aircraft Arrival (Minutes)'],
      dtype='object')

In [4]:
# Reading departure flights data
Departure_data = pd.read_csv(r"C:\Users\VIKAS\Documents\Intro to ML\Project\datasets\Departure_data.csv", 
                            parse_dates = ['Date (MM/DD/YYYY)', 'Scheduled departure time'])

Departure_data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Destination Airport,Scheduled departure time,Actual departure time,Scheduled elapsed time (Minutes),Actual elapsed time (Minutes),Departure delay (Minutes),Wheels-off time,Taxi-Out time (Minutes),Delay Carrier (Minutes),Delay Weather (Minutes),Delay National Aviation System (Minutes),Delay Security (Minutes),Delay Late Aircraft Arrival (Minutes)
0,MQ,2023-01-01,3305,N663AR,COU,2024-05-02 19:35:00,19:34,81,97,-1,20:08,34,0,0,15,0,0
1,MQ,2023-01-01,3309,N939AE,MKE,2024-05-02 09:30:00,9:27,68,53,-3,9:53,26,0,0,0,0,0
2,MQ,2023-01-01,3315,N689EC,ALO,2024-05-02 09:00:00,8:50,85,60,-10,9:06,16,0,0,0,0,0
3,MQ,2023-01-01,3322,N902BC,COU,2024-05-02 12:55:00,12:52,85,80,-3,13:11,19,0,0,0,0,0
4,MQ,2023-01-01,3324,N267NN,AUS,2024-05-02 19:15:00,19:08,172,167,-7,19:23,15,0,0,0,0,0


In [5]:
Departure_data.columns

Index(['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number', 'Tail Number',
       'Destination Airport', 'Scheduled departure time',
       'Actual departure time', 'Scheduled elapsed time (Minutes)',
       'Actual elapsed time (Minutes)', 'Departure delay (Minutes)',
       'Wheels-off time', 'Taxi-Out time (Minutes)', 'Delay Carrier (Minutes)',
       'Delay Weather (Minutes)', 'Delay National Aviation System (Minutes)',
       'Delay Security (Minutes)', 'Delay Late Aircraft Arrival (Minutes)'],
      dtype='object')

In [6]:
# Filtering data - flights from Chicago, New York and Orlando airports
Arrival_data = Arrival_data[Arrival_data['Origin Airport'].isin({'ORD', 'JFK', 'MCO'})]
Arrival_data = Arrival_data[['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number', 'Origin Airport', 
                             'Scheduled Arrival Time', 'Arrival Delay (Minutes)']]

Arrival_data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Origin Airport,Scheduled Arrival Time,Arrival Delay (Minutes)
0,MQ,2023-01-01,3392,ORD,2024-05-02 21:16:00,-8
1,MQ,2023-01-01,3518,ORD,2024-05-02 16:02:00,-25
2,MQ,2023-01-02,3392,ORD,2024-05-02 20:19:00,-7
3,MQ,2023-01-02,3518,ORD,2024-05-02 16:02:00,-6
4,MQ,2023-01-03,3392,ORD,2024-05-02 20:19:00,38


In [7]:
# Filtering data - Flights reaching syracuse airport
Departure_data = Departure_data[Departure_data['Destination Airport'].isin({'SYR'})]
Departure_data = Departure_data[['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number',
                                 'Destination Airport', 'Scheduled departure time']]

Departure_data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Destination Airport,Scheduled departure time
15,MQ,2023-01-01,3392,SYR,2024-05-02 18:25:00
29,MQ,2023-01-01,3518,SYR,2024-05-02 13:05:00
134,MQ,2023-01-02,3392,SYR,2024-05-02 17:33:00
150,MQ,2023-01-02,3518,SYR,2024-05-02 13:05:00
272,MQ,2023-01-03,3392,SYR,2024-05-02 17:33:00


In [8]:
# Merging Arrival data and departure data based on carrier code , date and flight number
Flight_data = pd.merge(Arrival_data, Departure_data, on = ['Carrier Code', 'Date (MM/DD/YYYY)', 
                                                           'Flight Number'])

Flight_data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Origin Airport,Scheduled Arrival Time,Arrival Delay (Minutes),Destination Airport,Scheduled departure time
0,MQ,2023-01-01,3392,ORD,2024-05-02 21:16:00,-8,SYR,2024-05-02 18:25:00
1,MQ,2023-01-01,3518,ORD,2024-05-02 16:02:00,-25,SYR,2024-05-02 13:05:00
2,MQ,2023-01-02,3392,ORD,2024-05-02 20:19:00,-7,SYR,2024-05-02 17:33:00
3,MQ,2023-01-02,3518,ORD,2024-05-02 16:02:00,-6,SYR,2024-05-02 13:05:00
4,MQ,2023-01-03,3392,ORD,2024-05-02 20:19:00,38,SYR,2024-05-02 17:33:00


In [9]:
Flight_data = Flight_data[Flight_data['Flight Number'].isin({538, 3402, 116, 5340, 491, 56, 656})]
Flight_data['Flight Number'].replace(56,656,inplace=True)

Flight_data.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Origin Airport,Scheduled Arrival Time,Arrival Delay (Minutes),Destination Airport,Scheduled departure time
71,MQ,2023-06-01,3402,ORD,2024-05-02 22:32:00,57,SYR,2024-05-02 19:42:00
73,MQ,2023-06-02,3402,ORD,2024-05-02 22:32:00,2,SYR,2024-05-02 19:42:00
75,MQ,2023-06-03,3402,ORD,2024-05-02 23:41:00,-15,SYR,2024-05-02 20:46:00
77,MQ,2023-06-04,3402,ORD,2024-05-02 22:28:00,0,SYR,2024-05-02 19:39:00
79,MQ,2023-06-05,3402,ORD,2024-05-02 22:28:00,-7,SYR,2024-05-02 19:39:00


In [10]:
Flight_data['Date'] = Flight_data['Date (MM/DD/YYYY)']
Flight_data['Date_MM'] = Flight_data['Date'].dt.month
Flight_data['Date_DD'] = Flight_data['Date'].dt.day

Flight_data['Carrier'] = Flight_data['Carrier Code']
Flight_data['Flight_num'] = Flight_data['Flight Number']
Flight_data['Origin'] = Flight_data['Origin Airport']
Flight_data['Dest'] = Flight_data['Destination Airport']

Flight_data['Arr_t_h'] = Flight_data['Scheduled Arrival Time'].dt.hour
Flight_data['Arr_t_m'] = Flight_data['Scheduled Arrival Time'].dt.minute
Flight_data['Depart_t_h'] = Flight_data['Scheduled departure time'].dt.hour
Flight_data['Depart_t_m'] = Flight_data['Scheduled departure time'].dt.minute


def categorize_arrival_delay(delay):
    if delay <= -5:
        return 'Early'
    elif delay <= 5:
        return 'On Time'
    else:
        return 'Late'
Flight_data['Arr_Status'] = Flight_data['Arrival Delay (Minutes)'].apply(categorize_arrival_delay)

Flight_data.drop(columns = ['Date (MM/DD/YYYY)', 'Carrier Code', 'Flight Number', 'Origin Airport', 
                            'Destination Airport', 'Scheduled Arrival Time', 'Scheduled departure time', 
                            'Arrival Delay (Minutes)'], inplace = True)

Flight_data.head()

Unnamed: 0,Date,Date_MM,Date_DD,Carrier,Flight_num,Origin,Dest,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m,Arr_Status
71,2023-06-01,6,1,MQ,3402,ORD,SYR,22,32,19,42,Late
73,2023-06-02,6,2,MQ,3402,ORD,SYR,22,32,19,42,On Time
75,2023-06-03,6,3,MQ,3402,ORD,SYR,23,41,20,46,Early
77,2023-06-04,6,4,MQ,3402,ORD,SYR,22,28,19,39,On Time
79,2023-06-05,6,5,MQ,3402,ORD,SYR,22,28,19,39,Early


In [11]:
set(Flight_data['Carrier'])

{'9E', 'B6', 'MQ', 'UA', 'WN'}

In [12]:
set(Flight_data['Flight_num'])

{116, 491, 538, 656, 3402, 5340}

In [13]:
set(Flight_data['Origin'])

{'JFK', 'MCO', 'ORD'}

In [14]:
set(Flight_data['Dest'])

{'SYR'}

In [15]:
set(Flight_data['Arr_Status'])

{'Early', 'Late', 'On Time'}

In [16]:
Flight_data['Arr_Status'].replace("Early",0,inplace=True)
Flight_data['Arr_Status'].replace("On Time",1,inplace=True)
Flight_data['Arr_Status'].replace("Late",2,inplace=True)

Flight_data.head()

Unnamed: 0,Date,Date_MM,Date_DD,Carrier,Flight_num,Origin,Dest,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m,Arr_Status
71,2023-06-01,6,1,MQ,3402,ORD,SYR,22,32,19,42,2
73,2023-06-02,6,2,MQ,3402,ORD,SYR,22,32,19,42,1
75,2023-06-03,6,3,MQ,3402,ORD,SYR,23,41,20,46,0
77,2023-06-04,6,4,MQ,3402,ORD,SYR,22,28,19,39,1
79,2023-06-05,6,5,MQ,3402,ORD,SYR,22,28,19,39,0


In [17]:
Flight_data.shape

(940, 12)

In [18]:
Flight_data.isna().sum()

Date          0
Date_MM       0
Date_DD       0
Carrier       0
Flight_num    0
Origin        0
Dest          0
Arr_t_h       0
Arr_t_m       0
Depart_t_h    0
Depart_t_m    0
Arr_Status    0
dtype: int64

# Weather Data

### Fetching, reading and preproessing Syracuse weather data

In [19]:
# Fetching syracuse weather data for the year 2023
SYR_weather = pd.read_excel(r"C:\Users\VIKAS\Documents\Intro to ML\Project\datasets\SYR_weather_2023.xlsx", 
                            parse_dates = ['datetime'])
SYR_weather.head()

Unnamed: 0,datetime,clouds,temp,max_temp,min_temp,wind_dir,max_wind_dir,wind_spd,max_wind_spd,wind_gust_spd,...,snow_depth,precip,precip_gpm,pres,solar_rad,t_solar_rad,uv,dewpt,rh,slp
0,2023-01-01,100,5.1,10.0,3.0,274,274,4.0,10.3,10.8,...,0.0,1.5,1.5,998,38,900,0.8,1.9,80,1013
1,2023-01-02,100,5.5,7.8,3.9,207,207,2.2,5.4,7.2,...,0.0,0.0,0.0,1004,25,607,0.7,1.7,77,1019
2,2023-01-03,99,4.3,5.0,2.2,86,86,2.3,5.8,5.6,...,0.0,8.5,8.5,996,9,209,0.7,2.4,88,1011
3,2023-01-04,100,6.2,7.2,5.0,140,140,2.4,6.7,12.4,...,0.0,12.3,12.3,993,9,211,0.7,5.3,93,1008
4,2023-01-05,82,6.0,8.9,2.3,217,217,2.4,8.9,10.8,...,0.0,5.0,5.0,998,48,1158,1.1,1.8,75,1013


In [20]:
SYR_weather.shape

(364, 21)

In [21]:
SYR_weather.isna().sum()

datetime         0
clouds           0
temp             0
max_temp         0
min_temp         0
wind_dir         0
max_wind_dir     0
wind_spd         0
max_wind_spd     0
wind_gust_spd    0
snow             0
snow_depth       0
precip           0
precip_gpm       0
pres             0
solar_rad        0
t_solar_rad      0
uv               0
dewpt            0
rh               0
slp              0
dtype: int64

In [22]:
SYR_weather.drop(columns = ['max_wind_dir', 'max_wind_spd', 'precip_gpm', 'solar_rad', 't_solar_rad'], 
                 inplace = True)

SYR_weather.columns

Index(['datetime', 'clouds', 'temp', 'max_temp', 'min_temp', 'wind_dir',
       'wind_spd', 'wind_gust_spd', 'snow', 'snow_depth', 'precip', 'pres',
       'uv', 'dewpt', 'rh', 'slp'],
      dtype='object')

In [23]:
SYR_weather.shape

(364, 16)

## Flight weather data - main dataset

In [24]:
# Merging flight data and weather data
Flight_weather = pd.merge(Flight_data, SYR_weather, left_on='Date', right_on='datetime')

Flight_weather.drop(columns = ['datetime', 'Date'], inplace=True)

Flight_weather.head()

Unnamed: 0,Date_MM,Date_DD,Carrier,Flight_num,Origin,Dest,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m,...,wind_spd,wind_gust_spd,snow,snow_depth,precip,pres,uv,dewpt,rh,slp
0,6,1,MQ,3402,ORD,SYR,22,32,19,42,...,1.6,7.6,0.0,0.0,0.0,1003,10.1,10.8,47,1018
1,6,1,B6,116,JFK,SYR,14,42,13,30,...,1.6,7.6,0.0,0.0,0.0,1003,10.1,10.8,47,1018
2,6,1,B6,656,MCO,SYR,19,2,16,15,...,1.6,7.6,0.0,0.0,0.0,1003,10.1,10.8,47,1018
3,6,1,9E,5340,JFK,SYR,0,4,22,30,...,1.6,7.6,0.0,0.0,0.0,1003,10.1,10.8,47,1018
4,6,1,UA,538,ORD,SYR,21,9,18,18,...,1.6,7.6,0.0,0.0,0.0,1003,10.1,10.8,47,1018


In [25]:
Flight_weather.shape

(938, 26)

In [26]:
Flight_weather.columns

Index(['Date_MM', 'Date_DD', 'Carrier', 'Flight_num', 'Origin', 'Dest',
       'Arr_t_h', 'Arr_t_m', 'Depart_t_h', 'Depart_t_m', 'Arr_Status',
       'clouds', 'temp', 'max_temp', 'min_temp', 'wind_dir', 'wind_spd',
       'wind_gust_spd', 'snow', 'snow_depth', 'precip', 'pres', 'uv', 'dewpt',
       'rh', 'slp'],
      dtype='object')

In [27]:
Flight_weather.isna().sum()

Date_MM          0
Date_DD          0
Carrier          0
Flight_num       0
Origin           0
Dest             0
Arr_t_h          0
Arr_t_m          0
Depart_t_h       0
Depart_t_m       0
Arr_Status       0
clouds           0
temp             0
max_temp         0
min_temp         0
wind_dir         0
wind_spd         0
wind_gust_spd    0
snow             0
snow_depth       0
precip           0
pres             0
uv               0
dewpt            0
rh               0
slp              0
dtype: int64

In [28]:
Flight_weather.dtypes

Date_MM            int64
Date_DD            int64
Carrier           object
Flight_num         int64
Origin            object
Dest              object
Arr_t_h            int64
Arr_t_m            int64
Depart_t_h         int64
Depart_t_m         int64
Arr_Status         int64
clouds             int64
temp             float64
max_temp         float64
min_temp         float64
wind_dir           int64
wind_spd         float64
wind_gust_spd    float64
snow             float64
snow_depth       float64
precip           float64
pres               int64
uv               float64
dewpt            float64
rh                 int64
slp                int64
dtype: object

In [29]:
Flight_weather = pd.get_dummies(Flight_weather, columns = ['Carrier', 'Origin', 'Dest'], 
                                drop_first = True)

Flight_weather.head()

Unnamed: 0,Date_MM,Date_DD,Flight_num,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m,Arr_Status,clouds,temp,...,uv,dewpt,rh,slp,Carrier_B6,Carrier_MQ,Carrier_UA,Carrier_WN,Origin_MCO,Origin_ORD
0,6,1,3402,22,32,19,42,2,26,24.3,...,10.1,10.8,47,1018,0,1,0,0,0,1
1,6,1,116,14,42,13,30,0,26,24.3,...,10.1,10.8,47,1018,1,0,0,0,0,0
2,6,1,656,19,2,16,15,0,26,24.3,...,10.1,10.8,47,1018,1,0,0,0,1,0
3,6,1,5340,0,4,22,30,0,26,24.3,...,10.1,10.8,47,1018,0,0,0,0,0,0
4,6,1,538,21,9,18,18,0,26,24.3,...,10.1,10.8,47,1018,0,0,1,0,0,1


In [30]:
Flight_weather.shape

(938, 29)

In [31]:
Flight_weather.columns

Index(['Date_MM', 'Date_DD', 'Flight_num', 'Arr_t_h', 'Arr_t_m', 'Depart_t_h',
       'Depart_t_m', 'Arr_Status', 'clouds', 'temp', 'max_temp', 'min_temp',
       'wind_dir', 'wind_spd', 'wind_gust_spd', 'snow', 'snow_depth', 'precip',
       'pres', 'uv', 'dewpt', 'rh', 'slp', 'Carrier_B6', 'Carrier_MQ',
       'Carrier_UA', 'Carrier_WN', 'Origin_MCO', 'Origin_ORD'],
      dtype='object')

In [32]:
Flight_weather.dtypes

Date_MM            int64
Date_DD            int64
Flight_num         int64
Arr_t_h            int64
Arr_t_m            int64
Depart_t_h         int64
Depart_t_m         int64
Arr_Status         int64
clouds             int64
temp             float64
max_temp         float64
min_temp         float64
wind_dir           int64
wind_spd         float64
wind_gust_spd    float64
snow             float64
snow_depth       float64
precip           float64
pres               int64
uv               float64
dewpt            float64
rh                 int64
slp                int64
Carrier_B6         uint8
Carrier_MQ         uint8
Carrier_UA         uint8
Carrier_WN         uint8
Origin_MCO         uint8
Origin_ORD         uint8
dtype: object

In [33]:
# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(Flight_weather.drop(columns = ['Arr_Status']), 
                                                    Flight_weather['Arr_Status'], test_size=0.2, 
                                                    random_state=42)

In [34]:
X_train

Unnamed: 0,Date_MM,Date_DD,Flight_num,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m,clouds,temp,max_temp,...,uv,dewpt,rh,slp,Carrier_B6,Carrier_MQ,Carrier_UA,Carrier_WN,Origin_MCO,Origin_ORD
865,12,3,538,21,1,18,10,100,6.60,9.40,...,0.70,4.90,89,1007,0,0,1,0,0,1
616,5,27,116,14,42,13,30,16,16.70,26.70,...,10.00,3.60,46,1025,1,0,0,0,0,0
2,6,1,656,19,2,16,15,26,24.30,32.80,...,10.10,10.80,47,1018,1,0,0,0,1,0
101,7,11,656,19,35,16,49,44,24.60,31.10,...,9.00,15.60,59,1010,1,0,0,0,1,0
332,3,17,116,14,43,13,30,98,5.40,9.40,...,1.80,1.90,79,1002,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,7,14,3402,21,32,18,36,71,22.70,28.90,...,7.50,16.10,68,1011,0,1,0,0,0,1
270,2,24,656,16,51,14,7,96,-5.70,0.00,...,1.30,-8.70,80,1028,1,0,0,0,1,0
860,12,2,116,15,21,14,0,100,7.00,9.40,...,0.70,4.70,86,1014,1,0,0,0,0,0
435,4,11,538,21,34,18,40,74,14.90,22.80,...,2.50,0.00,39,1017,0,0,1,0,0,1


In [35]:
X_test

Unnamed: 0,Date_MM,Date_DD,Flight_num,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m,clouds,temp,max_temp,...,uv,dewpt,rh,slp,Carrier_B6,Carrier_MQ,Carrier_UA,Carrier_WN,Origin_MCO,Origin_ORD
70,6,25,3402,22,28,19,39,72,23.20,30.00,...,6.20,18.90,78,1008,0,1,0,0,0,1
331,3,17,656,16,51,14,7,98,5.40,9.40,...,1.80,1.90,79,1002,1,0,0,0,1,0
858,12,1,656,19,14,16,24,88,8.30,12.20,...,0.70,1.10,63,1015,1,0,0,0,1,0
495,4,26,538,21,34,18,40,77,7.10,11.70,...,4.90,0.20,63,1021,0,0,1,0,0,1
209,1,21,656,18,59,16,18,91,-0.30,0.60,...,0.80,-3.30,80,1021,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
208,1,21,116,14,29,13,16,91,-0.30,0.60,...,0.80,-3.30,80,1021,1,0,0,0,0,0
468,4,20,656,16,0,13,15,81,8.40,15.00,...,7.30,-1.00,54,1022,1,0,0,0,1,0
82,7,1,3402,23,41,20,46,81,25.30,30.00,...,4.40,18.70,67,1012,0,1,0,0,0,1
310,3,11,5340,23,59,22,32,82,-2.10,0.60,...,1.70,-5.10,80,1013,0,0,0,0,0,0


In [36]:
y_train

865    0
616    0
2      0
101    1
332    1
      ..
106    0
270    1
860    0
435    0
102    2
Name: Arr_Status, Length: 750, dtype: int64

In [37]:
y_test

70     1
331    2
858    2
495    2
209    2
      ..
208    2
468    1
82     2
310    1
817    0
Name: Arr_Status, Length: 188, dtype: int64

In [38]:
if False: 
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    X_train = pd.DataFrame(sc.fit_transform(X_train), columns = X_train.columns, index = X_train.index)
    X_test = pd.DataFrame(sc.transform(X_test), columns = X_test.columns, index = X_test.index)
    
    X_train
    X_test
    y_train
    y_test

#### Random Forest Classifier

In [39]:
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

y_pred_rf = rf_classifier.predict(X_test)

rf_classifier.score(X_train, y_train) 

1.0

In [40]:
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf)}")

Accuracy: 0.5053191489361702


In [41]:
test_output = pd.DataFrame(rf_classifier.predict(X_test), index = X_test.index, 
                           columns = ['pred_Type'])
test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()

Unnamed: 0,pred_Type,Arr_Status
70,2,1
331,1,2
858,0,2
495,0,2
209,0,2


In [42]:
print('Percentage of correct predictions is ')
print(rf_classifier.score(X_test, y_test))

Percentage of correct predictions is 
0.5053191489361702


#### Gradient Boosting Classifier

In [43]:
gb = GradientBoostingClassifier(random_state=50, min_samples_split = 12, min_samples_leaf = 6, 
                                max_depth = 4, n_estimators = 100)
gb = gb.fit(X_train, y_train) 

y_pred_gb = gb.predict(X_test)

gb.score(X_train, y_train) 

0.928

In [44]:
print(f"Accuracy: {accuracy_score(y_test, y_pred_gb)}")

Accuracy: 0.5053191489361702


In [45]:
test_output = pd.DataFrame(gb.predict(X_test), index = X_test.index, columns = ['pred_Type'])
test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()

Unnamed: 0,pred_Type,Arr_Status
70,2,1
331,1,2
858,0,2
495,0,2
209,0,2


In [46]:
print('Percentage of correct predictions is ')
print(gb.score(X_test, y_test))

Percentage of correct predictions is 
0.5053191489361702


#### XGBoost Classifier

In [47]:
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=3, random_state=42)
xgb_classifier.fit(X_train, y_train)
 
y_pred_xgb = xgb_classifier.predict(X_test)

xgb_classifier.score(X_train, y_train) 

1.0

In [48]:
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb)}")

Accuracy: 0.43617021276595747


In [49]:
test_output = pd.DataFrame(xgb_classifier.predict(X_test), index = X_test.index, 
                           columns = ['pred_Type'])
test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()

Unnamed: 0,pred_Type,Arr_Status
70,2,1
331,1,2
858,0,2
495,0,2
209,0,2


In [50]:
print('Percentage of correct predictions is ')
print(xgb_classifier.score(X_test, y_test))

Percentage of correct predictions is 
0.43617021276595747


#### Logistic Regression

In [51]:
Log_reg = LogisticRegression(fit_intercept = True, solver='newton-cg', 
                             multi_class = 'multinomial', penalty = 'none', max_iter = 1000)

Log_reg.fit(X_train, y_train) 

# The following gives the mean accuracy on the given data and labels
Log_reg.score(X_train, y_train) 

# This is the coefficient Beta_1, ..., Beta_7
#model.coef_

# This is the coefficient Beta_0
#model.intercept_



0.5426666666666666

In [52]:
test_output = pd.DataFrame(Log_reg.predict(X_test), index = X_test.index, columns = ['pred_Type'])
test_output = test_output.merge(y_test, left_index = True, right_index = True)
test_output.head()

Unnamed: 0,pred_Type,Arr_Status
70,2,1
331,2,2
858,0,2
495,2,2
209,0,2


In [53]:
print('Percentage of correct predictions is ')
print(Log_reg.score(X_test, y_test))

Percentage of correct predictions is 
0.48936170212765956


## Testing weather data

In [54]:
test_weather = pd.read_excel(r"C:\Users\VIKAS\Documents\Intro to ML\Project\datasets\test_weather.xlsx", 
                             parse_dates = ['datetime'])
test_weather

Unnamed: 0,datetime,clouds,temp,max_temp,min_temp,wind_dir,wind_spd,wind_gust_spd,snow,snow_depth,precip,pres,uv,dewpt,rh,slp
0,2024-04-09,30,16.7,24.5,5.8,154,2.4,3.5,0,0,0.0,998.2,3.0,5.4,50,1013.8
1,2024-04-10,79,15.0,17.6,11.9,196,3.5,5.3,0,0,6.8,997.4,2.2,10.2,73,1013.1
2,2024-04-11,88,15.8,20.0,11.1,133,7.2,10.7,0,0,6.15,992.4,2.3,12.0,78,1008.1
3,2024-04-12,79,14.6,18.0,7.7,203,10.8,16.4,0,0,11.5,975.9,3.5,9.7,73,991.4
4,2024-04-13,76,6.9,9.2,4.6,271,9.8,14.6,0,0,4.35,987.4,3.5,2.9,76,1003.5
5,2024-04-14,65,9.8,15.0,5.5,232,5.4,7.9,0,0,11.35,991.2,4.3,4.3,69,1007.3
6,2024-04-15,62,11.6,16.1,7.6,269,4.0,4.0,0,0,33.5,994.1,8.1,3.4,59,1010.1
7,2024-04-16,21,12.7,17.0,8.1,250,2.8,2.8,0,0,29.5,1006.3,8.1,2.5,51,1022.7
8,2024-04-17,38,14.2,18.6,10.6,138,3.5,3.5,0,0,67.0,1010.1,8.2,4.8,54,1026.4
9,2024-04-18,94,13.5,17.4,9.8,223,3.6,3.6,0,0,64.0,1006.9,8.3,4.8,57,1023.0


## Initial predictions

In [55]:
# Reading flight data for initial predictions
initial_flight = pd.read_csv(r"C:\Users\VIKAS\Documents\Intro to ML\Project\datasets\flight_initials.csv", 
                             parse_dates = ['Date', 'Scheduled Arrival Time', 'Scheduled departure time'])

initial_flight

Unnamed: 0,Date,Carrier,Flight_Num,Origin,Dest,Scheduled Arrival Time,Scheduled departure time
0,2024-04-10,UA,538,ORD,SYR,2024-05-02 21:47:00,2024-05-02 18:52:00
1,2024-04-10,MQ,3402,ORD,SYR,2024-05-02 22:52:00,2024-05-02 19:59:00
2,2024-04-10,B6,116,JFK,SYR,2024-05-02 14:50:00,2024-05-02 13:33:00
3,2024-04-10,9E,5340,JFK,SYR,2024-05-02 16:21:00,2024-05-02 14:55:00
4,2024-04-10,WN,491,MCO,SYR,2024-05-02 13:45:00,2024-05-02 11:05:00
5,2024-04-10,B6,656,MCO,SYR,2024-05-02 16:25:00,2024-05-02 13:35:00
6,2024-04-11,UA,538,ORD,SYR,2024-05-02 21:47:00,2024-05-02 18:52:00
7,2024-04-11,MQ,3402,ORD,SYR,2024-05-02 22:52:00,2024-05-02 19:59:00
8,2024-04-11,B6,116,JFK,SYR,2024-05-02 14:50:00,2024-05-02 13:33:00
9,2024-04-11,9E,5340,JFK,SYR,2024-05-02 16:21:00,2024-05-02 14:55:00


In [56]:
initial_flight['Date_MM'] = initial_flight['Date'].dt.month
initial_flight['Date_DD'] = initial_flight['Date'].dt.day

initial_flight['Flight_num'] = initial_flight['Flight_Num']

initial_flight['Arr_t_h'] = initial_flight['Scheduled Arrival Time'].dt.hour
initial_flight['Arr_t_m'] = initial_flight['Scheduled Arrival Time'].dt.minute
initial_flight['Depart_t_h'] = initial_flight['Scheduled departure time'].dt.hour
initial_flight['Depart_t_m'] = initial_flight['Scheduled departure time'].dt.minute

initial_flight.drop(columns = ['Scheduled Arrival Time', 'Scheduled departure time', 'Flight_Num'], 
                    inplace = True)

initial_flight

Unnamed: 0,Date,Carrier,Origin,Dest,Date_MM,Date_DD,Flight_num,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m
0,2024-04-10,UA,ORD,SYR,4,10,538,21,47,18,52
1,2024-04-10,MQ,ORD,SYR,4,10,3402,22,52,19,59
2,2024-04-10,B6,JFK,SYR,4,10,116,14,50,13,33
3,2024-04-10,9E,JFK,SYR,4,10,5340,16,21,14,55
4,2024-04-10,WN,MCO,SYR,4,10,491,13,45,11,5
5,2024-04-10,B6,MCO,SYR,4,10,656,16,25,13,35
6,2024-04-11,UA,ORD,SYR,4,11,538,21,47,18,52
7,2024-04-11,MQ,ORD,SYR,4,11,3402,22,52,19,59
8,2024-04-11,B6,JFK,SYR,4,11,116,14,50,13,33
9,2024-04-11,9E,JFK,SYR,4,11,5340,16,21,14,55


In [57]:
# Merging flight data and tesing weather data for initial predictions based on dates
test_initials = pd.merge(initial_flight, test_weather, left_on='Date', right_on='datetime')
test_initials.drop(columns = ['datetime', 'Date'], inplace=True)

test_initials

Unnamed: 0,Carrier,Origin,Dest,Date_MM,Date_DD,Flight_num,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m,...,wind_spd,wind_gust_spd,snow,snow_depth,precip,pres,uv,dewpt,rh,slp
0,UA,ORD,SYR,4,10,538,21,47,18,52,...,3.5,5.3,0,0,6.8,997.4,2.2,10.2,73,1013.1
1,MQ,ORD,SYR,4,10,3402,22,52,19,59,...,3.5,5.3,0,0,6.8,997.4,2.2,10.2,73,1013.1
2,B6,JFK,SYR,4,10,116,14,50,13,33,...,3.5,5.3,0,0,6.8,997.4,2.2,10.2,73,1013.1
3,9E,JFK,SYR,4,10,5340,16,21,14,55,...,3.5,5.3,0,0,6.8,997.4,2.2,10.2,73,1013.1
4,WN,MCO,SYR,4,10,491,13,45,11,5,...,3.5,5.3,0,0,6.8,997.4,2.2,10.2,73,1013.1
5,B6,MCO,SYR,4,10,656,16,25,13,35,...,3.5,5.3,0,0,6.8,997.4,2.2,10.2,73,1013.1
6,UA,ORD,SYR,4,11,538,21,47,18,52,...,7.2,10.7,0,0,6.15,992.4,2.3,12.0,78,1008.1
7,MQ,ORD,SYR,4,11,3402,22,52,19,59,...,7.2,10.7,0,0,6.15,992.4,2.3,12.0,78,1008.1
8,B6,JFK,SYR,4,11,116,14,50,13,33,...,7.2,10.7,0,0,6.15,992.4,2.3,12.0,78,1008.1
9,9E,JFK,SYR,4,11,5340,16,21,14,55,...,7.2,10.7,0,0,6.15,992.4,2.3,12.0,78,1008.1


In [58]:
test_initials = pd.get_dummies(test_initials, columns = ['Carrier', 'Origin','Dest'], 
                               drop_first = True)

test_initials

Unnamed: 0,Date_MM,Date_DD,Flight_num,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m,clouds,temp,max_temp,...,uv,dewpt,rh,slp,Carrier_B6,Carrier_MQ,Carrier_UA,Carrier_WN,Origin_MCO,Origin_ORD
0,4,10,538,21,47,18,52,79,15.0,17.6,...,2.2,10.2,73,1013.1,0,0,1,0,0,1
1,4,10,3402,22,52,19,59,79,15.0,17.6,...,2.2,10.2,73,1013.1,0,1,0,0,0,1
2,4,10,116,14,50,13,33,79,15.0,17.6,...,2.2,10.2,73,1013.1,1,0,0,0,0,0
3,4,10,5340,16,21,14,55,79,15.0,17.6,...,2.2,10.2,73,1013.1,0,0,0,0,0,0
4,4,10,491,13,45,11,5,79,15.0,17.6,...,2.2,10.2,73,1013.1,0,0,0,1,1,0
5,4,10,656,16,25,13,35,79,15.0,17.6,...,2.2,10.2,73,1013.1,1,0,0,0,1,0
6,4,11,538,21,47,18,52,88,15.8,20.0,...,2.3,12.0,78,1008.1,0,0,1,0,0,1
7,4,11,3402,22,52,19,59,88,15.8,20.0,...,2.3,12.0,78,1008.1,0,1,0,0,0,1
8,4,11,116,14,50,13,33,88,15.8,20.0,...,2.3,12.0,78,1008.1,1,0,0,0,0,0
9,4,11,5340,16,21,14,55,88,15.8,20.0,...,2.3,12.0,78,1008.1,0,0,0,0,0,0


### Predicting initial test data using pre-trained model - Random Forest Classifier

In [59]:
y1_pred = rf_classifier.predict(test_initials)
#y1_pred = gb.predict(test_initials)
#y1_pred = xgb_classifier.predict(test_initials)
#y1_pred = Log_reg.predict(test_initials)
y1_pred

array([0, 0, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 0, 0, 2, 0, 2, 2, 0, 0, 2, 0,
       2], dtype=int64)

In [60]:
label_mapping = {
    0: "Early",
    1: "On Time",
    2: "Late"
}

In [61]:
predictions = np.vectorize(label_mapping.get)(y1_pred)
predictions

array(['Early', 'Early', 'Late', 'Late', 'Early', 'Late', 'Early',
       'Early', 'Late', 'Late', 'Late', 'Late', 'Early', 'Early', 'Late',
       'Early', 'Late', 'Late', 'Early', 'Early', 'Late', 'Early', 'Late'],
      dtype='<U5')

## Final predictions

In [62]:
# Reading flight data for final predictions
final_flight = pd.read_csv(r"C:\Users\VIKAS\Documents\Intro to ML\Project\datasets\flight_finals.csv", 
                          parse_dates = ['Date', 'Scheduled Arrival Time', 
                                         'Scheduled departure time'])

final_flight



Unnamed: 0,Date,Carrier,Flight_Num,Origin,Dest,Scheduled Arrival Time,Scheduled departure time
0,2024-04-19,UA,538,ORD,SYR,2024-05-02 21:47:00,2024-05-02 18:52:00
1,2024-04-19,MQ,3402,ORD,SYR,2024-05-02 22:52:00,2024-05-02 19:59:00
2,2024-04-19,B6,116,JFK,SYR,2024-05-02 14:51:00,2024-05-02 13:34:00
3,2024-04-19,9E,5340,JFK,SYR,2024-05-02 16:21:00,2024-05-02 14:55:00
4,2024-04-19,WN,491,MCO,SYR,2024-05-02 14:20:00,2024-05-02 11:35:00
5,2024-04-19,B6,656,MCO,SYR,2024-05-02 16:25:00,2024-05-02 13:35:00
6,2024-04-20,UA,538,ORD,SYR,2024-05-02 21:47:00,2024-05-02 18:52:00
7,2024-04-20,MQ,3402,ORD,SYR,2024-05-02 22:52:00,2024-05-02 19:59:00
8,2024-04-20,B6,116,JFK,SYR,2024-05-02 14:41:00,2024-05-02 13:25:00
9,2024-04-20,9E,5340,JFK,SYR,2024-05-02 16:21:00,2024-05-02 14:55:00


In [63]:
final_flight['Date_MM'] = final_flight['Date'].dt.month
final_flight['Date_DD'] = final_flight['Date'].dt.day

final_flight['Flight_num'] = final_flight['Flight_Num']


final_flight['Arr_t_h'] = final_flight['Scheduled Arrival Time'].dt.hour
final_flight['Arr_t_m'] = final_flight['Scheduled Arrival Time'].dt.minute
final_flight['Depart_t_h'] = final_flight['Scheduled departure time'].dt.hour
final_flight['Depart_t_m'] = final_flight['Scheduled departure time'].dt.minute

final_flight.drop(columns = ['Scheduled Arrival Time', 'Scheduled departure time', 'Flight_Num'], 
                  inplace = True)

final_flight

Unnamed: 0,Date,Carrier,Origin,Dest,Date_MM,Date_DD,Flight_num,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m
0,2024-04-19,UA,ORD,SYR,4,19,538,21,47,18,52
1,2024-04-19,MQ,ORD,SYR,4,19,3402,22,52,19,59
2,2024-04-19,B6,JFK,SYR,4,19,116,14,51,13,34
3,2024-04-19,9E,JFK,SYR,4,19,5340,16,21,14,55
4,2024-04-19,WN,MCO,SYR,4,19,491,14,20,11,35
5,2024-04-19,B6,MCO,SYR,4,19,656,16,25,13,35
6,2024-04-20,UA,ORD,SYR,4,20,538,21,47,18,52
7,2024-04-20,MQ,ORD,SYR,4,20,3402,22,52,19,59
8,2024-04-20,B6,JFK,SYR,4,20,116,14,41,13,25
9,2024-04-20,9E,JFK,SYR,4,20,5340,16,21,14,55


In [64]:
# Merging flight data and tesing weather data for final predictions based on dates
test_finals = pd.merge(final_flight, test_weather, left_on='Date', right_on='datetime')
test_finals.drop(columns = ['datetime', 'Date'], inplace=True)

test_finals

Unnamed: 0,Carrier,Origin,Dest,Date_MM,Date_DD,Flight_num,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m,...,wind_spd,wind_gust_spd,snow,snow_depth,precip,pres,uv,dewpt,rh,slp
0,UA,ORD,SYR,4,19,538,21,47,18,52,...,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5
1,MQ,ORD,SYR,4,19,3402,22,52,19,59,...,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5
2,B6,JFK,SYR,4,19,116,14,51,13,34,...,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5
3,9E,JFK,SYR,4,19,5340,16,21,14,55,...,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5
4,WN,MCO,SYR,4,19,491,14,20,11,35,...,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5
5,B6,MCO,SYR,4,19,656,16,25,13,35,...,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5
6,UA,ORD,SYR,4,20,538,21,47,18,52,...,3.4,6.5,0,0,25.0,1007.5,8.4,6.3,78,1023.7
7,MQ,ORD,SYR,4,20,3402,22,52,19,59,...,3.4,6.5,0,0,25.0,1007.5,8.4,6.3,78,1023.7
8,B6,JFK,SYR,4,20,116,14,41,13,25,...,3.4,6.5,0,0,25.0,1007.5,8.4,6.3,78,1023.7
9,9E,JFK,SYR,4,20,5340,16,21,14,55,...,3.4,6.5,0,0,25.0,1007.5,8.4,6.3,78,1023.7


In [65]:
test_finals = pd.get_dummies(test_finals, columns = ['Carrier', 'Origin','Dest'], drop_first = True)

test_finals

Unnamed: 0,Date_MM,Date_DD,Flight_num,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m,clouds,temp,max_temp,...,uv,dewpt,rh,slp,Carrier_B6,Carrier_MQ,Carrier_UA,Carrier_WN,Origin_MCO,Origin_ORD
0,4,19,538,21,47,18,52,84,12.4,16.7,...,8.3,3.9,57,1019.5,0,0,1,0,0,1
1,4,19,3402,22,52,19,59,84,12.4,16.7,...,8.3,3.9,57,1019.5,0,1,0,0,0,1
2,4,19,116,14,51,13,34,84,12.4,16.7,...,8.3,3.9,57,1019.5,1,0,0,0,0,0
3,4,19,5340,16,21,14,55,84,12.4,16.7,...,8.3,3.9,57,1019.5,0,0,0,0,0,0
4,4,19,491,14,20,11,35,84,12.4,16.7,...,8.3,3.9,57,1019.5,0,0,0,1,1,0
5,4,19,656,16,25,13,35,84,12.4,16.7,...,8.3,3.9,57,1019.5,1,0,0,0,1,0
6,4,20,538,21,47,18,52,49,10.1,19.8,...,8.4,6.3,78,1023.7,0,0,1,0,0,1
7,4,20,3402,22,52,19,59,49,10.1,19.8,...,8.4,6.3,78,1023.7,0,1,0,0,0,1
8,4,20,116,14,41,13,25,49,10.1,19.8,...,8.4,6.3,78,1023.7,1,0,0,0,0,0
9,4,20,5340,16,21,14,55,49,10.1,19.8,...,8.4,6.3,78,1023.7,0,0,0,0,0,0


### Predicting final test data using pre-trained model - Random Forest Classifier

In [66]:
y2_pred = rf_classifier.predict(test_finals)
#y2_pred = gb.predict(test_finals)
#y2_pred = xgb_classifier.predict(test_finals)
#y2_pred = Log_reg.predict(test_finals)

y2_pred

array([0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2,
       2], dtype=int64)

In [67]:
predictions = np.vectorize(label_mapping.get)(y2_pred)
predictions

array(['Early', 'Early', 'Early', 'Late', 'Early', 'Early', 'Early',
       'Early', 'Early', 'Early', 'Late', 'Early', 'Early', 'Late',
       'Late', 'Early', 'Late', 'Late', 'Late', 'Late', 'Late', 'Late',
       'Late'], dtype='<U5')

# Latter flights

In [68]:
# Splitting dataset into first and second flights according to given conditions
First_flights = Flight_data[Flight_data['Flight_num'].isin({538, 116, 491})]
First_flights['Carrier1'] = First_flights['Carrier']
First_flights['Flight_num1'] = First_flights['Flight_num']
First_flights['Arr_Status1'] = First_flights['Arr_Status']
First_flights = First_flights[['Date', 'Carrier1', 'Flight_num1', 'Origin', 'Arr_Status1']]

First_flights.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  First_flights['Carrier1'] = First_flights['Carrier']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  First_flights['Flight_num1'] = First_flights['Flight_num']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  First_flights['Arr_Status1'] = First_flights['Arr_Status']


Unnamed: 0,Date,Carrier1,Flight_num1,Origin,Arr_Status1
555,2023-01-01,B6,116,JFK,2
558,2023-01-02,B6,116,JFK,2
561,2023-01-03,B6,116,JFK,2
564,2023-01-04,B6,116,JFK,2
567,2023-01-05,B6,116,JFK,0


In [69]:
Second_flights = Flight_data[Flight_data['Flight_num'].isin({3402, 5340, 656})]

Second_flights.head()                          

Unnamed: 0,Date,Date_MM,Date_DD,Carrier,Flight_num,Origin,Dest,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m,Arr_Status
71,2023-06-01,6,1,MQ,3402,ORD,SYR,22,32,19,42,2
73,2023-06-02,6,2,MQ,3402,ORD,SYR,22,32,19,42,1
75,2023-06-03,6,3,MQ,3402,ORD,SYR,23,41,20,46,0
77,2023-06-04,6,4,MQ,3402,ORD,SYR,22,28,19,39,1
79,2023-06-05,6,5,MQ,3402,ORD,SYR,22,28,19,39,0


In [70]:
#merging first and second flights based on origin and date
Latter_flight = pd.merge(Second_flights, First_flights, on = ['Origin', 'Date'])
                              
Latter_flight.head()

Unnamed: 0,Date,Date_MM,Date_DD,Carrier,Flight_num,Origin,Dest,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m,Arr_Status,Carrier1,Flight_num1,Arr_Status1
0,2023-06-01,6,1,MQ,3402,ORD,SYR,22,32,19,42,2,UA,538,0
1,2023-06-05,6,5,MQ,3402,ORD,SYR,22,28,19,39,0,UA,538,0
2,2023-06-06,6,6,MQ,3402,ORD,SYR,22,28,19,39,0,UA,538,2
3,2023-06-07,6,7,MQ,3402,ORD,SYR,22,28,19,39,1,UA,538,1
4,2023-06-08,6,8,MQ,3402,ORD,SYR,22,28,19,39,1,UA,538,2


In [71]:
# merging latter flight data syracuse weather data based on date
Latter_Flight_weather = pd.merge(Latter_flight, SYR_weather, left_on='Date', right_on='datetime')

# arrival status of second flights as Arr_Status2
Latter_Flight_weather['Arr_Status2'] = Latter_Flight_weather['Arr_Status']
Latter_Flight_weather.drop(columns = ['datetime', 'Date', 'Arr_Status'], inplace=True)

Latter_Flight_weather.head()


Unnamed: 0,Date_MM,Date_DD,Carrier,Flight_num,Origin,Dest,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m,...,wind_gust_spd,snow,snow_depth,precip,pres,uv,dewpt,rh,slp,Arr_Status2
0,6,1,MQ,3402,ORD,SYR,22,32,19,42,...,7.6,0.0,0.0,0.0,1003,10.1,10.8,47,1018,2
1,6,1,9E,5340,JFK,SYR,0,4,22,30,...,7.6,0.0,0.0,0.0,1003,10.1,10.8,47,1018,0
2,6,5,MQ,3402,ORD,SYR,22,28,19,39,...,12.4,0.0,0.0,0.0,999,5.6,5.1,50,1014,0
3,6,6,MQ,3402,ORD,SYR,22,28,19,39,...,15.6,0.0,0.0,0.0,992,7.6,6.9,54,1007,0
4,6,7,MQ,3402,ORD,SYR,22,28,19,39,...,12.8,0.0,0.0,1.0,990,4.9,5.2,56,1005,1


In [72]:
Latter_Flight_weather.columns

Index(['Date_MM', 'Date_DD', 'Carrier', 'Flight_num', 'Origin', 'Dest',
       'Arr_t_h', 'Arr_t_m', 'Depart_t_h', 'Depart_t_m', 'Carrier1',
       'Flight_num1', 'Arr_Status1', 'clouds', 'temp', 'max_temp', 'min_temp',
       'wind_dir', 'wind_spd', 'wind_gust_spd', 'snow', 'snow_depth', 'precip',
       'pres', 'uv', 'dewpt', 'rh', 'slp', 'Arr_Status2'],
      dtype='object')

In [73]:
Latter_Flight_weather.isna().sum()

Date_MM          0
Date_DD          0
Carrier          0
Flight_num       0
Origin           0
Dest             0
Arr_t_h          0
Arr_t_m          0
Depart_t_h       0
Depart_t_m       0
Carrier1         0
Flight_num1      0
Arr_Status1      0
clouds           0
temp             0
max_temp         0
min_temp         0
wind_dir         0
wind_spd         0
wind_gust_spd    0
snow             0
snow_depth       0
precip           0
pres             0
uv               0
dewpt            0
rh               0
slp              0
Arr_Status2      0
dtype: int64

In [74]:
Latter_Flight_weather.dtypes

Date_MM            int64
Date_DD            int64
Carrier           object
Flight_num         int64
Origin            object
Dest              object
Arr_t_h            int64
Arr_t_m            int64
Depart_t_h         int64
Depart_t_m         int64
Carrier1          object
Flight_num1        int64
Arr_Status1        int64
clouds             int64
temp             float64
max_temp         float64
min_temp         float64
wind_dir           int64
wind_spd         float64
wind_gust_spd    float64
snow             float64
snow_depth       float64
precip           float64
pres               int64
uv               float64
dewpt            float64
rh                 int64
slp                int64
Arr_Status2        int64
dtype: object

In [75]:
Latter_Flight_weather = pd.get_dummies(Latter_Flight_weather, 
                                       columns = ['Carrier', 'Origin','Carrier1', 'Dest'], 
                                       drop_first = True)

Latter_Flight_weather.head()

Unnamed: 0,Date_MM,Date_DD,Flight_num,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m,Flight_num1,Arr_Status1,clouds,...,dewpt,rh,slp,Arr_Status2,Carrier_B6,Carrier_MQ,Origin_MCO,Origin_ORD,Carrier1_UA,Carrier1_WN
0,6,1,3402,22,32,19,42,538,0,26,...,10.8,47,1018,2,0,1,0,1,1,0
1,6,1,5340,0,4,22,30,116,0,26,...,10.8,47,1018,0,0,0,0,0,0,0
2,6,5,3402,22,28,19,39,538,0,80,...,5.1,50,1014,0,0,1,0,1,1,0
3,6,6,3402,22,28,19,39,538,2,94,...,6.9,54,1007,0,0,1,0,1,1,0
4,6,7,3402,22,28,19,39,538,1,99,...,5.2,56,1005,1,0,1,0,1,1,0


In [76]:
X_train, X_test, y_train, y_test = train_test_split(Latter_Flight_weather.drop(columns = ['Arr_Status2']), 
                                                    Latter_Flight_weather['Arr_Status2'], test_size=0.2, 
                                                    random_state=42)

X_train
X_test
y_train
y_test

100    2
10     0
4      1
81     0
97     2
65     2
30     0
33     0
93     1
11     1
47     2
42     2
0      2
78     2
18     2
64     2
67     0
79     0
55     2
44     0
12     0
80     1
Name: Arr_Status2, dtype: int64

In [77]:
if False: 
    from sklearn.preprocessing import StandardScaler
    sc = StandardScaler()
    X_train = pd.DataFrame(sc.fit_transform(X_train), columns = X_train.columns, index = X_train.index)
    X_test = pd.DataFrame(sc.transform(X_test), columns = X_test.columns, index = X_test.index)
    
    X_train
    X_test
    y_train
    y_test

#### Random Forest Classifier

In [78]:
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

rf_classifier.score(X_train, y_train) 

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

Accuracy: 0.3181818181818182


#### Gradient Boosting classifier

In [79]:
gb = GradientBoostingClassifier(random_state=50, min_samples_split = 12, min_samples_leaf = 6, 
                                max_depth = 4, n_estimators = 100)
gb = gb.fit(X_train, y_train) 

y_pred = gb.predict(X_test)

gb.score(X_train, y_train) 

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

Accuracy: 0.36363636363636365


#### XGBoost classifier

In [80]:
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=3, random_state=42)
xgb_classifier.fit(X_train, y_train)
 
y_pred = xgb_classifier.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

Accuracy: 0.4090909090909091


## Initial predictions for Latter flights

In [81]:
initial_flight = pd.read_csv(r"C:\Users\VIKAS\Documents\Intro to ML\Project\datasets\flight_initials.csv", 
                             parse_dates = ['Date', 'Scheduled Arrival Time', 
                                            'Scheduled departure time'])

initial_flight

Unnamed: 0,Date,Carrier,Flight_Num,Origin,Dest,Scheduled Arrival Time,Scheduled departure time
0,2024-04-10,UA,538,ORD,SYR,2024-05-02 21:47:00,2024-05-02 18:52:00
1,2024-04-10,MQ,3402,ORD,SYR,2024-05-02 22:52:00,2024-05-02 19:59:00
2,2024-04-10,B6,116,JFK,SYR,2024-05-02 14:50:00,2024-05-02 13:33:00
3,2024-04-10,9E,5340,JFK,SYR,2024-05-02 16:21:00,2024-05-02 14:55:00
4,2024-04-10,WN,491,MCO,SYR,2024-05-02 13:45:00,2024-05-02 11:05:00
5,2024-04-10,B6,656,MCO,SYR,2024-05-02 16:25:00,2024-05-02 13:35:00
6,2024-04-11,UA,538,ORD,SYR,2024-05-02 21:47:00,2024-05-02 18:52:00
7,2024-04-11,MQ,3402,ORD,SYR,2024-05-02 22:52:00,2024-05-02 19:59:00
8,2024-04-11,B6,116,JFK,SYR,2024-05-02 14:50:00,2024-05-02 13:33:00
9,2024-04-11,9E,5340,JFK,SYR,2024-05-02 16:21:00,2024-05-02 14:55:00


In [82]:
initial_flight['Date_MM'] = initial_flight['Date'].dt.month
initial_flight['Date_DD'] = initial_flight['Date'].dt.day

initial_flight['Flight_num'] = initial_flight['Flight_Num']

initial_flight['Arr_t_h'] = initial_flight['Scheduled Arrival Time'].dt.hour
initial_flight['Arr_t_m'] = initial_flight['Scheduled Arrival Time'].dt.minute
initial_flight['Depart_t_h'] = initial_flight['Scheduled departure time'].dt.hour
initial_flight['Depart_t_m'] = initial_flight['Scheduled departure time'].dt.minute

initial_flight.drop(columns = ['Scheduled Arrival Time', 'Scheduled departure time', 'Flight_Num'], 
                    inplace = True)

initial_flight

Unnamed: 0,Date,Carrier,Origin,Dest,Date_MM,Date_DD,Flight_num,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m
0,2024-04-10,UA,ORD,SYR,4,10,538,21,47,18,52
1,2024-04-10,MQ,ORD,SYR,4,10,3402,22,52,19,59
2,2024-04-10,B6,JFK,SYR,4,10,116,14,50,13,33
3,2024-04-10,9E,JFK,SYR,4,10,5340,16,21,14,55
4,2024-04-10,WN,MCO,SYR,4,10,491,13,45,11,5
5,2024-04-10,B6,MCO,SYR,4,10,656,16,25,13,35
6,2024-04-11,UA,ORD,SYR,4,11,538,21,47,18,52
7,2024-04-11,MQ,ORD,SYR,4,11,3402,22,52,19,59
8,2024-04-11,B6,JFK,SYR,4,11,116,14,50,13,33
9,2024-04-11,9E,JFK,SYR,4,11,5340,16,21,14,55


In [83]:
First_flights = initial_flight[initial_flight['Flight_num'].isin({538, 116, 491})]
First_flights['Carrier1'] = First_flights['Carrier']
First_flights['Flight_num1'] = First_flights['Flight_num']
First_flights = First_flights[['Date', 'Carrier1', 'Flight_num1', 'Origin']]

First_flights

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  First_flights['Carrier1'] = First_flights['Carrier']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  First_flights['Flight_num1'] = First_flights['Flight_num']


Unnamed: 0,Date,Carrier1,Flight_num1,Origin
0,2024-04-10,UA,538,ORD
2,2024-04-10,B6,116,JFK
4,2024-04-10,WN,491,MCO
6,2024-04-11,UA,538,ORD
8,2024-04-11,B6,116,JFK
10,2024-04-11,WN,491,MCO
12,2024-04-12,UA,538,ORD
14,2024-04-12,B6,116,JFK
16,2024-04-12,WN,491,MCO
18,2024-04-13,UA,538,ORD


In [84]:
Second_flights = initial_flight[initial_flight['Flight_num'].isin({3402, 5340, 656})]

Second_flights

Unnamed: 0,Date,Carrier,Origin,Dest,Date_MM,Date_DD,Flight_num,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m
1,2024-04-10,MQ,ORD,SYR,4,10,3402,22,52,19,59
3,2024-04-10,9E,JFK,SYR,4,10,5340,16,21,14,55
5,2024-04-10,B6,MCO,SYR,4,10,656,16,25,13,35
7,2024-04-11,MQ,ORD,SYR,4,11,3402,22,52,19,59
9,2024-04-11,9E,JFK,SYR,4,11,5340,16,21,14,55
11,2024-04-11,B6,MCO,SYR,4,11,656,16,25,13,35
13,2024-04-12,MQ,ORD,SYR,4,12,3402,22,52,19,59
15,2024-04-12,9E,JFK,SYR,4,12,5340,16,21,14,55
17,2024-04-12,B6,MCO,SYR,4,12,656,16,25,13,35
19,2024-04-13,MQ,ORD,SYR,4,13,3402,22,52,19,59


In [85]:
Latter_flight = pd.merge(Second_flights, First_flights, on = ['Origin', 'Date'])
Latter_flight['Arr_Status1'] = 0

Latter_flight

Unnamed: 0,Date,Carrier,Origin,Dest,Date_MM,Date_DD,Flight_num,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m,Carrier1,Flight_num1,Arr_Status1
0,2024-04-10,MQ,ORD,SYR,4,10,3402,22,52,19,59,UA,538,0
1,2024-04-10,9E,JFK,SYR,4,10,5340,16,21,14,55,B6,116,0
2,2024-04-10,B6,MCO,SYR,4,10,656,16,25,13,35,WN,491,0
3,2024-04-11,MQ,ORD,SYR,4,11,3402,22,52,19,59,UA,538,0
4,2024-04-11,9E,JFK,SYR,4,11,5340,16,21,14,55,B6,116,0
5,2024-04-11,B6,MCO,SYR,4,11,656,16,25,13,35,WN,491,0
6,2024-04-12,MQ,ORD,SYR,4,12,3402,22,52,19,59,UA,538,0
7,2024-04-12,9E,JFK,SYR,4,12,5340,16,21,14,55,B6,116,0
8,2024-04-12,B6,MCO,SYR,4,12,656,16,25,13,35,WN,491,0
9,2024-04-13,MQ,ORD,SYR,4,13,3402,22,52,19,59,UA,538,0


In [86]:
Latter_Flight_weather = pd.merge(Latter_flight, test_weather, left_on='Date', right_on='datetime')
Latter_Flight_weather.drop(columns = ['datetime', 'Date'], inplace=True)

Latter_Flight_weather

Unnamed: 0,Carrier,Origin,Dest,Date_MM,Date_DD,Flight_num,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m,...,wind_spd,wind_gust_spd,snow,snow_depth,precip,pres,uv,dewpt,rh,slp
0,MQ,ORD,SYR,4,10,3402,22,52,19,59,...,3.5,5.3,0,0,6.8,997.4,2.2,10.2,73,1013.1
1,9E,JFK,SYR,4,10,5340,16,21,14,55,...,3.5,5.3,0,0,6.8,997.4,2.2,10.2,73,1013.1
2,B6,MCO,SYR,4,10,656,16,25,13,35,...,3.5,5.3,0,0,6.8,997.4,2.2,10.2,73,1013.1
3,MQ,ORD,SYR,4,11,3402,22,52,19,59,...,7.2,10.7,0,0,6.15,992.4,2.3,12.0,78,1008.1
4,9E,JFK,SYR,4,11,5340,16,21,14,55,...,7.2,10.7,0,0,6.15,992.4,2.3,12.0,78,1008.1
5,B6,MCO,SYR,4,11,656,16,25,13,35,...,7.2,10.7,0,0,6.15,992.4,2.3,12.0,78,1008.1
6,MQ,ORD,SYR,4,12,3402,22,52,19,59,...,10.8,16.4,0,0,11.5,975.9,3.5,9.7,73,991.4
7,9E,JFK,SYR,4,12,5340,16,21,14,55,...,10.8,16.4,0,0,11.5,975.9,3.5,9.7,73,991.4
8,B6,MCO,SYR,4,12,656,16,25,13,35,...,10.8,16.4,0,0,11.5,975.9,3.5,9.7,73,991.4
9,MQ,ORD,SYR,4,13,3402,22,52,19,59,...,9.8,14.6,0,0,4.35,987.4,3.5,2.9,76,1003.5


In [87]:
Latter_Flight_weather = pd.get_dummies(Latter_Flight_weather, 
                                       columns = ['Carrier', 'Origin','Carrier1', 'Dest'], 
                                       drop_first = True)
Latter_Flight_weather

Unnamed: 0,Date_MM,Date_DD,Flight_num,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m,Flight_num1,Arr_Status1,clouds,...,uv,dewpt,rh,slp,Carrier_B6,Carrier_MQ,Origin_MCO,Origin_ORD,Carrier1_UA,Carrier1_WN
0,4,10,3402,22,52,19,59,538,0,79,...,2.2,10.2,73,1013.1,0,1,0,1,1,0
1,4,10,5340,16,21,14,55,116,0,79,...,2.2,10.2,73,1013.1,0,0,0,0,0,0
2,4,10,656,16,25,13,35,491,0,79,...,2.2,10.2,73,1013.1,1,0,1,0,0,1
3,4,11,3402,22,52,19,59,538,0,88,...,2.3,12.0,78,1008.1,0,1,0,1,1,0
4,4,11,5340,16,21,14,55,116,0,88,...,2.3,12.0,78,1008.1,0,0,0,0,0,0
5,4,11,656,16,25,13,35,491,0,88,...,2.3,12.0,78,1008.1,1,0,1,0,0,1
6,4,12,3402,22,52,19,59,538,0,79,...,3.5,9.7,73,991.4,0,1,0,1,1,0
7,4,12,5340,16,21,14,55,116,0,79,...,3.5,9.7,73,991.4,0,0,0,0,0,0
8,4,12,656,16,25,13,35,491,0,79,...,3.5,9.7,73,991.4,1,0,1,0,0,1
9,4,13,3402,22,52,19,59,538,0,76,...,3.5,2.9,76,1003.5,0,1,0,1,1,0


### Predicting  initial test data of latter flights using pre-trained model - XGBoost classifier

In [88]:
Latter_Flight_weather['Arr_Status1'] = 0
y1_pred0 = xgb_classifier.predict(Latter_Flight_weather)

Latter_Flight_weather['Arr_Status1'] = 1
y1_pred1 = xgb_classifier.predict(Latter_Flight_weather)

Latter_Flight_weather['Arr_Status1'] = 2
y1_pred2 = xgb_classifier.predict(Latter_Flight_weather)

y1_pred = np.column_stack((y1_pred0, y1_pred1, y1_pred2))
y1_pred

array([[1, 1, 1],
       [1, 1, 1],
       [1, 1, 2],
       [2, 2, 2],
       [2, 2, 2],
       [2, 2, 2],
       [1, 1, 1],
       [1, 1, 1],
       [2, 2, 2],
       [0, 0, 0],
       [0, 0, 0]])

In [89]:
predictions = np.vectorize(label_mapping.get)(y1_pred)
predictions

array([['On Time', 'On Time', 'On Time'],
       ['On Time', 'On Time', 'On Time'],
       ['On Time', 'On Time', 'Late'],
       ['Late', 'Late', 'Late'],
       ['Late', 'Late', 'Late'],
       ['Late', 'Late', 'Late'],
       ['On Time', 'On Time', 'On Time'],
       ['On Time', 'On Time', 'On Time'],
       ['Late', 'Late', 'Late'],
       ['Early', 'Early', 'Early'],
       ['Early', 'Early', 'Early']], dtype='<U7')

## Final predictions for Latter flights

In [90]:
final_flight = pd.read_csv(r"C:\Users\VIKAS\Documents\Intro to ML\Project\datasets\flight_finals.csv", 
                          parse_dates = ['Date', 'Scheduled Arrival Time', 'Scheduled departure time'])

final_flight


Unnamed: 0,Date,Carrier,Flight_Num,Origin,Dest,Scheduled Arrival Time,Scheduled departure time
0,2024-04-19,UA,538,ORD,SYR,2024-05-02 21:47:00,2024-05-02 18:52:00
1,2024-04-19,MQ,3402,ORD,SYR,2024-05-02 22:52:00,2024-05-02 19:59:00
2,2024-04-19,B6,116,JFK,SYR,2024-05-02 14:51:00,2024-05-02 13:34:00
3,2024-04-19,9E,5340,JFK,SYR,2024-05-02 16:21:00,2024-05-02 14:55:00
4,2024-04-19,WN,491,MCO,SYR,2024-05-02 14:20:00,2024-05-02 11:35:00
5,2024-04-19,B6,656,MCO,SYR,2024-05-02 16:25:00,2024-05-02 13:35:00
6,2024-04-20,UA,538,ORD,SYR,2024-05-02 21:47:00,2024-05-02 18:52:00
7,2024-04-20,MQ,3402,ORD,SYR,2024-05-02 22:52:00,2024-05-02 19:59:00
8,2024-04-20,B6,116,JFK,SYR,2024-05-02 14:41:00,2024-05-02 13:25:00
9,2024-04-20,9E,5340,JFK,SYR,2024-05-02 16:21:00,2024-05-02 14:55:00


In [91]:
final_flight['Date_MM'] = final_flight['Date'].dt.month
final_flight['Date_DD'] = final_flight['Date'].dt.day

final_flight['Flight_num'] = final_flight['Flight_Num']

final_flight['Arr_t_h'] = final_flight['Scheduled Arrival Time'].dt.hour
final_flight['Arr_t_m'] = final_flight['Scheduled Arrival Time'].dt.minute
final_flight['Depart_t_h'] = final_flight['Scheduled departure time'].dt.hour
final_flight['Depart_t_m'] = final_flight['Scheduled departure time'].dt.minute

final_flight.drop(columns = ['Scheduled Arrival Time', 'Scheduled departure time', 'Flight_Num'], 
                  inplace = True)

final_flight

Unnamed: 0,Date,Carrier,Origin,Dest,Date_MM,Date_DD,Flight_num,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m
0,2024-04-19,UA,ORD,SYR,4,19,538,21,47,18,52
1,2024-04-19,MQ,ORD,SYR,4,19,3402,22,52,19,59
2,2024-04-19,B6,JFK,SYR,4,19,116,14,51,13,34
3,2024-04-19,9E,JFK,SYR,4,19,5340,16,21,14,55
4,2024-04-19,WN,MCO,SYR,4,19,491,14,20,11,35
5,2024-04-19,B6,MCO,SYR,4,19,656,16,25,13,35
6,2024-04-20,UA,ORD,SYR,4,20,538,21,47,18,52
7,2024-04-20,MQ,ORD,SYR,4,20,3402,22,52,19,59
8,2024-04-20,B6,JFK,SYR,4,20,116,14,41,13,25
9,2024-04-20,9E,JFK,SYR,4,20,5340,16,21,14,55


In [92]:
First_flights = final_flight[final_flight['Flight_num'].isin({538, 116, 491})]
First_flights['Carrier1'] = First_flights['Carrier']
First_flights['Flight_num1'] = First_flights['Flight_num']
First_flights = First_flights[['Date', 'Carrier1', 'Flight_num1', 'Origin']]

First_flights

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  First_flights['Carrier1'] = First_flights['Carrier']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  First_flights['Flight_num1'] = First_flights['Flight_num']


Unnamed: 0,Date,Carrier1,Flight_num1,Origin
0,2024-04-19,UA,538,ORD
2,2024-04-19,B6,116,JFK
4,2024-04-19,WN,491,MCO
6,2024-04-20,UA,538,ORD
8,2024-04-20,B6,116,JFK
11,2024-04-21,UA,538,ORD
13,2024-04-21,B6,116,JFK
15,2024-04-21,WN,491,MCO
17,2024-04-22,UA,538,ORD
19,2024-04-22,B6,116,JFK


In [93]:
Second_flights = final_flight[final_flight['Flight_num'].isin({3402, 5340, 656})]

Second_flights                              

Unnamed: 0,Date,Carrier,Origin,Dest,Date_MM,Date_DD,Flight_num,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m
1,2024-04-19,MQ,ORD,SYR,4,19,3402,22,52,19,59
3,2024-04-19,9E,JFK,SYR,4,19,5340,16,21,14,55
5,2024-04-19,B6,MCO,SYR,4,19,656,16,25,13,35
7,2024-04-20,MQ,ORD,SYR,4,20,3402,22,52,19,59
9,2024-04-20,9E,JFK,SYR,4,20,5340,16,21,14,55
10,2024-04-20,B6,MCO,SYR,4,20,656,16,25,13,35
12,2024-04-21,MQ,ORD,SYR,4,21,3402,22,52,19,59
14,2024-04-21,9E,JFK,SYR,4,21,5340,16,21,14,55
16,2024-04-21,B6,MCO,SYR,4,21,656,16,25,13,35
18,2024-04-22,MQ,ORD,SYR,4,22,3402,22,52,19,59


In [94]:
Latter_flight = pd.merge(Second_flights, First_flights, on = ['Origin', 'Date'])
Latter_flight['Arr_Status1'] = 0

Latter_flight

Unnamed: 0,Date,Carrier,Origin,Dest,Date_MM,Date_DD,Flight_num,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m,Carrier1,Flight_num1,Arr_Status1
0,2024-04-19,MQ,ORD,SYR,4,19,3402,22,52,19,59,UA,538,0
1,2024-04-19,9E,JFK,SYR,4,19,5340,16,21,14,55,B6,116,0
2,2024-04-19,B6,MCO,SYR,4,19,656,16,25,13,35,WN,491,0
3,2024-04-20,MQ,ORD,SYR,4,20,3402,22,52,19,59,UA,538,0
4,2024-04-20,9E,JFK,SYR,4,20,5340,16,21,14,55,B6,116,0
5,2024-04-21,MQ,ORD,SYR,4,21,3402,22,52,19,59,UA,538,0
6,2024-04-21,9E,JFK,SYR,4,21,5340,16,21,14,55,B6,116,0
7,2024-04-21,B6,MCO,SYR,4,21,656,16,25,13,35,WN,491,0
8,2024-04-22,MQ,ORD,SYR,4,22,3402,22,52,19,59,UA,538,0
9,2024-04-22,9E,JFK,SYR,4,22,5340,16,21,14,55,B6,116,0


In [95]:
Latter_Flight_weather = pd.merge(Latter_flight, test_weather, left_on='Date', right_on='datetime')
Latter_Flight_weather.drop(columns = ['datetime', 'Date'], inplace=True)

Latter_Flight_weather

Unnamed: 0,Carrier,Origin,Dest,Date_MM,Date_DD,Flight_num,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m,...,wind_spd,wind_gust_spd,snow,snow_depth,precip,pres,uv,dewpt,rh,slp
0,MQ,ORD,SYR,4,19,3402,22,52,19,59,...,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5
1,9E,JFK,SYR,4,19,5340,16,21,14,55,...,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5
2,B6,MCO,SYR,4,19,656,16,25,13,35,...,4.2,4.6,0,0,42.56,1003.5,8.3,3.9,57,1019.5
3,MQ,ORD,SYR,4,20,3402,22,52,19,59,...,3.4,6.5,0,0,25.0,1007.5,8.4,6.3,78,1023.7
4,9E,JFK,SYR,4,20,5340,16,21,14,55,...,3.4,6.5,0,0,25.0,1007.5,8.4,6.3,78,1023.7
5,MQ,ORD,SYR,4,21,3402,22,52,19,59,...,1.9,2.7,0,0,13.19,1002.8,8.4,9.6,85,1018.7
6,9E,JFK,SYR,4,21,5340,16,21,14,55,...,1.9,2.7,0,0,13.19,1002.8,8.4,9.6,85,1018.7
7,B6,MCO,SYR,4,21,656,16,25,13,35,...,1.9,2.7,0,0,13.19,1002.8,8.4,9.6,85,1018.7
8,MQ,ORD,SYR,4,22,3402,22,52,19,59,...,5.5,10.9,0,0,17.38,999.5,8.5,8.5,78,1015.4
9,9E,JFK,SYR,4,22,5340,16,21,14,55,...,5.5,10.9,0,0,17.38,999.5,8.5,8.5,78,1015.4


In [96]:
Latter_Flight_weather = pd.get_dummies(Latter_Flight_weather, 
                                       columns = ['Carrier', 'Origin','Carrier1', 'Dest'], 
                                       drop_first = True)

Latter_Flight_weather

Unnamed: 0,Date_MM,Date_DD,Flight_num,Arr_t_h,Arr_t_m,Depart_t_h,Depart_t_m,Flight_num1,Arr_Status1,clouds,...,uv,dewpt,rh,slp,Carrier_B6,Carrier_MQ,Origin_MCO,Origin_ORD,Carrier1_UA,Carrier1_WN
0,4,19,3402,22,52,19,59,538,0,84,...,8.3,3.9,57,1019.5,0,1,0,1,1,0
1,4,19,5340,16,21,14,55,116,0,84,...,8.3,3.9,57,1019.5,0,0,0,0,0,0
2,4,19,656,16,25,13,35,491,0,84,...,8.3,3.9,57,1019.5,1,0,1,0,0,1
3,4,20,3402,22,52,19,59,538,0,49,...,8.4,6.3,78,1023.7,0,1,0,1,1,0
4,4,20,5340,16,21,14,55,116,0,49,...,8.4,6.3,78,1023.7,0,0,0,0,0,0
5,4,21,3402,22,52,19,59,538,0,90,...,8.4,9.6,85,1018.7,0,1,0,1,1,0
6,4,21,5340,16,21,14,55,116,0,90,...,8.4,9.6,85,1018.7,0,0,0,0,0,0
7,4,21,656,16,25,13,35,491,0,90,...,8.4,9.6,85,1018.7,1,0,1,0,0,1
8,4,22,3402,22,52,19,59,538,0,72,...,8.5,8.5,78,1015.4,0,1,0,1,1,0
9,4,22,5340,16,21,14,55,116,0,72,...,8.5,8.5,78,1015.4,0,0,0,0,0,0


### Predicting final test data of latter flights using pre-trained model - XGBoost classifier

In [97]:
Latter_Flight_weather['Arr_Status1'] = 0
y2_pred0 = xgb_classifier.predict(Latter_Flight_weather)

Latter_Flight_weather['Arr_Status1'] = 1
y2_pred1 = xgb_classifier.predict(Latter_Flight_weather)

Latter_Flight_weather['Arr_Status1'] = 2
y2_pred2 = xgb_classifier.predict(Latter_Flight_weather)

y2_pred = np.column_stack((y1_pred0, y1_pred1, y1_pred2))
y2_pred

array([[1, 1, 1],
       [1, 1, 1],
       [1, 1, 2],
       [2, 2, 2],
       [2, 2, 2],
       [2, 2, 2],
       [1, 1, 1],
       [1, 1, 1],
       [2, 2, 2],
       [0, 0, 0],
       [0, 0, 0]])

In [98]:
predictions = np.vectorize(label_mapping.get)(y2_pred)
predictions

array([['On Time', 'On Time', 'On Time'],
       ['On Time', 'On Time', 'On Time'],
       ['On Time', 'On Time', 'Late'],
       ['Late', 'Late', 'Late'],
       ['Late', 'Late', 'Late'],
       ['Late', 'Late', 'Late'],
       ['On Time', 'On Time', 'On Time'],
       ['On Time', 'On Time', 'On Time'],
       ['Late', 'Late', 'Late'],
       ['Early', 'Early', 'Early'],
       ['Early', 'Early', 'Early']], dtype='<U7')