In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [2]:
flights_df = pd.read_pickle("flights_with_weather.pickle")

In [3]:
region_map = {
    'CT': 'Northeast',
    'ME': 'Northeast',
    'MA': 'Northeast',
    'NH': 'Northeast',
    'RI': 'Northeast',
    'VT': 'Northeast',
    'NJ': 'Northeast',
    'NY': 'Northeast',
    'PA': 'Northeast',
    'IL': 'Midwest',
    'IN': 'Midwest',
    'MI': 'Midwest',
    'OH': 'Midwest',
    'WI': 'Midwest',
    'IA': 'Midwest',
    'KS': 'Midwest',
    'MN': 'Midwest',
    'MO': 'Midwest',
    'NE': 'Midwest',
    'ND': 'Midwest',
    'SD': 'Midwest',
    'DE': 'South',
    'FL': 'South',
    'GA': 'South',
    'MD': 'South',
    'NC': 'South',
    'SC': 'South',
    'VA': 'South',
    'WV': 'South',
    'AL': 'South',
    'KY': 'South',
    'MS': 'South',
    'TN': 'South',
    'AR': 'South',
    'LA': 'South',
    'OK': 'South',
    'TX': 'South',
    'AZ': 'West',
    'CO': 'West',
    'ID': 'West',
    'MT': 'West',
    'NV': 'West',
    'NM': 'West',
    'UT': 'West',
    'WY': 'West',
    'AK': 'West',
    'CA': 'West',
    'HI': 'West',
    'OR': 'West',
    'WA': 'West',
}

flights_df['ORIGIN_REGION'] = flights_df['ORIGIN_STATE'].map(region_map)
flights_df['DESTINATION_REGION'] = flights_df['DESTINATION_STATE'].map(region_map)

In [4]:
flights_df['MONTH'] = flights_df['DATE'].dt.month
flights_df['DAY_OF_WEEK'] = flights_df['DATE'].dt.dayofweek
flights_df['D_TEMP_TIME'] = pd.to_datetime(flights_df['SCHEDULED_DEPARTURE_TIME'], format='%H:%M:%S')
flights_df['D_HOUR'] = flights_df['D_TEMP_TIME'].dt.hour
flights_df['A_TEMP_TIME'] = pd.to_datetime(flights_df['SCHEDULED_ARRIVAL_TIME'], format='%H:%M:%S')
flights_df['A_HOUR'] = flights_df['A_TEMP_TIME'].dt.hour

# Categorize hours into four categories - 'Morning', 'Afternoon', 'Evening', and 'Night'
bins = [0, 6, 12, 18, 24]
labels = ['Night', 'Morning', 'Afternoon', 'Evening']
flights_df['SCHEDULED_DEP_TIME_CAT'] = pd.cut(flights_df['D_HOUR'], bins=bins, labels=labels, include_lowest=True)
flights_df['SCHEDULED_ARR_TIME_CAT'] = pd.cut(flights_df['A_HOUR'], bins=bins, labels=labels, include_lowest=True)
flights_df = flights_df.drop(columns=['D_HOUR','D_TEMP_TIME','A_HOUR','A_TEMP_TIME' ])

### Adding 'DELAY' column (1 if the flight was delayed, 0 if not)

In [5]:
flights_df['DELAYED'] = flights_df['DEPARTURE_DELAY'].apply(lambda x: 0 if x <= 0 else 1)

In [6]:
delayed_counts = flights_df['DELAYED'].value_counts()
print(delayed_counts)

0    3219152
1    1941358
Name: DELAYED, dtype: int64


### Categorizing Delay

In [7]:
# create new columns based on the following conditions
conditions = [(flights_df['DEPARTURE_DELAY'] <= 0),
    (flights_df['DEPARTURE_DELAY'] < 20) & (flights_df['DEPARTURE_DELAY'] >= 1),
    (flights_df['DEPARTURE_DELAY'] < 40) & (flights_df['DEPARTURE_DELAY'] >= 20),
    (flights_df['DEPARTURE_DELAY'] < 60) & (flights_df['DEPARTURE_DELAY'] >= 40),
    (flights_df['DEPARTURE_DELAY'] >= 60)
]

values = ['ON-TIME', 'DELAY<20', '20=<DELAY<40', '40=<DELAY<60', '60=<DELAY']

# Create a new column 'DELAY_CATEGORY' and assign values based on conditions
flights_df['DELAY_CATEGORY'] = np.select(conditions, values)

In [8]:
flights_df

Unnamed: 0,DATE,AIRLINE,ORIGIN_AIRPORT,ORIGIN_CITY,ORIGIN_STATE,DESTINATION_AIRPORT,DESTINATION_CITY,DESTINATION_STATE,SCHEDULED_DEPARTURE_TIME,ACTUAL_DEPARTURE_TIME,...,DESTINATION_WINDDIRECTION_10M,DESTINATION_WINDDIRECTION_100M,ORIGIN_REGION,DESTINATION_REGION,MONTH,DAY_OF_WEEK,SCHEDULED_DEP_TIME_CAT,SCHEDULED_ARR_TIME_CAT,DELAYED,DELAY_CATEGORY
0,2015-01-01,AS,ANC,Anchorage,AK,SEA,Seattle,WA,00:05:00,23:54:00,...,48.0,37.0,West,West,1,3,Night,Night,0,ON-TIME
1,2015-01-01,AS,ANC,Anchorage,AK,SEA,Seattle,WA,00:45:00,00:41:00,...,74.0,63.0,West,West,1,3,Night,Night,0,ON-TIME
2,2015-01-01,DL,ANC,Anchorage,AK,SEA,Seattle,WA,00:45:00,00:31:00,...,74.0,63.0,West,West,1,3,Night,Night,0,ON-TIME
3,2015-01-01,AS,ANC,Anchorage,AK,SEA,Seattle,WA,01:55:00,01:40:00,...,74.0,69.0,West,West,1,3,Night,Night,0,ON-TIME
4,2015-01-01,AS,ANC,Anchorage,AK,SEA,Seattle,WA,02:20:00,02:09:00,...,74.0,69.0,West,West,1,3,Night,Night,0,ON-TIME
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5231124,2015-12-29,OO,JMS,Jamestown,ND,DVL,Devils Lake,ND,14:31:00,14:28:00,...,343.0,347.0,Midwest,Midwest,12,1,Afternoon,Afternoon,0,ON-TIME
5231125,2015-12-29,OO,JMS,Jamestown,ND,DVL,Devils Lake,ND,23:56:00,23:35:00,...,104.0,109.0,Midwest,Midwest,12,1,Evening,Night,0,ON-TIME
5231126,2015-12-30,OO,JMS,Jamestown,ND,DVL,Devils Lake,ND,14:31:00,14:19:00,...,270.0,268.0,Midwest,Midwest,12,2,Afternoon,Afternoon,0,ON-TIME
5231127,2015-12-30,OO,JMS,Jamestown,ND,DVL,Devils Lake,ND,23:56:00,23:23:00,...,69.0,76.0,Midwest,Midwest,12,2,Evening,Night,0,ON-TIME


In [9]:
category_counts = flights_df['DELAY_CATEGORY'].value_counts()
print(category_counts)


ON-TIME         3219152
DELAY<20        1123062
20=<DELAY<40     346758
60=<DELAY        305987
40=<DELAY<60     165551
Name: DELAY_CATEGORY, dtype: int64


In [10]:
selected_cols1 = ['MONTH', 'DAY_OF_WEEK',
        'AIRLINE', 'ORIGIN_AIRPORT',
       'DESTINATION_AIRPORT', 'SCHEDULED_DEP_TIME_CAT',
       'DEPARTURE_DELAY', 'SCHEDULED_ARR_TIME_CAT',
       'ARRIVAL_DELAY',
       'SCHEDULED_TIME', 'DISTANCE', 'ORIGIN_REGION','DESTINATION_REGION','DELAYED','DELAY_CATEGORY']

selected_cols2 = ['MONTH', 'DAY_OF_WEEK',
        'AIRLINE', 'ORIGIN_AIRPORT', 
       'DESTINATION_AIRPORT', 'SCHEDULED_DEP_TIME_CAT',
       'DEPARTURE_DELAY', 'SCHEDULED_ARR_TIME_CAT',
       'ARRIVAL_DELAY',
       'SCHEDULED_TIME', 'DISTANCE', 'ORIGIN_TEMPERATURE_2M',
       'ORIGIN_RELATIVEHUMIDITY_2M', 'ORIGIN_DEWPOINT_2M',
       'ORIGIN_APPARENT_TEMPERATURE', 'ORIGIN_PRESSURE_MSL',
       'ORIGIN_SURFACE_PRESSURE', 'ORIGIN_PRECIPITATION', 'ORIGIN_RAIN',
       'ORIGIN_SNOWFALL', 'ORIGIN_WEATHERCODE', 'ORIGIN_CLOUDCOVER',
       'ORIGIN_CLOUDCOVER_LOW', 'ORIGIN_CLOUDCOVER_MID',
       'ORIGIN_CLOUDCOVER_HIGH', 'ORIGIN_WINDSPEED_10M',
       'ORIGIN_WINDSPEED_100M', 'ORIGIN_WINDDIRECTION_10M',
       'ORIGIN_WINDDIRECTION_100M', 'DESTINATION_TEMPERATURE_2M',
       'DESTINATION_RELATIVEHUMIDITY_2M', 'DESTINATION_DEWPOINT_2M',
       'DESTINATION_APPARENT_TEMPERATURE', 'DESTINATION_PRESSURE_MSL',
       'DESTINATION_SURFACE_PRESSURE', 'DESTINATION_PRECIPITATION',
       'DESTINATION_RAIN', 'DESTINATION_SNOWFALL', 'DESTINATION_WEATHERCODE',
       'DESTINATION_CLOUDCOVER', 'DESTINATION_CLOUDCOVER_LOW',
       'DESTINATION_CLOUDCOVER_MID', 'DESTINATION_CLOUDCOVER_HIGH',
       'DESTINATION_WINDSPEED_10M', 'DESTINATION_WINDSPEED_100M',
       'DESTINATION_WINDDIRECTION_10M', 'DESTINATION_WINDDIRECTION_100M','ORIGIN_REGION','DESTINATION_REGION',
       'DELAYED', 'DELAY_CATEGORY']

flights_wo_weather = flights_df.loc[:, selected_cols1]
flights_w_weather = flights_df.loc[:, selected_cols2]

# Flights within the Midwest

In [72]:
flights_MW = flights_w_weather[(flights_w_weather['ORIGIN_REGION'] == 'Midwest') & (flights_w_weather['DESTINATION_REGION'] == 'Midwest')]

## Binary Classification:

In [73]:
# Encode categorical variables using one-hot encoding
flights_MW = pd.get_dummies(flights_MW, columns=['MONTH', 'DAY_OF_WEEK','AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEP_TIME_CAT', 'SCHEDULED_ARR_TIME_CAT'])

# Split the dataset into training and testing sets
X_MW = flights_MW.drop(['DELAYED','ORIGIN_REGION','DESTINATION_REGION','DEPARTURE_DELAY','ARRIVAL_DELAY','DELAY_CATEGORY'], axis=1)
y_MW = flights_MW['DELAYED']
X_train_MW, X_test_MW, y_train_MW, y_test_MW = train_test_split(X_MW, y_MW, test_size=0.3, random_state=123)


### Balancing the data

In [74]:
from imblearn.over_sampling import RandomOverSampler

# Instantiate RandomOverSampler
ros = RandomOverSampler(random_state=123)

# Balance the data using RandomOverSampler
X_train_balanced_MW, y_train_balanced_MW = ros.fit_resample(X_train_MW, y_train_MW)


### Feature Selection

In [75]:
# Perform feature selection using ANOVA F-value between label/feature
print("Size before feature selection:",X_train_balanced_MW.shape, X_test_MW.shape)
selector = SelectKBest(f_classif, k=20)
X_train_selected_MW = selector.fit_transform(X_train_balanced_MW, y_train_balanced_MW)
X_test_MW = selector.transform(X_test_MW)
print("Size after feature selection:",X_train_selected_MW.shape, X_test_MW.shape)


Size before feature selection: (320862, 213) (100905, 213)
Size after feature selection: (320862, 20) (100905, 20)


#### Selected features for binary classification

In [76]:
# Print the selected feature names
selected_features_binC_MW = X_MW.columns[selector.get_support()]
print(selected_features_binC_MW)

Index(['ORIGIN_SNOWFALL', 'ORIGIN_WEATHERCODE', 'ORIGIN_CLOUDCOVER',
       'ORIGIN_CLOUDCOVER_LOW', 'ORIGIN_CLOUDCOVER_MID',
       'DESTINATION_WEATHERCODE', 'DESTINATION_CLOUDCOVER',
       'DESTINATION_CLOUDCOVER_LOW', 'MONTH_2', 'AIRLINE_UA', 'AIRLINE_WN',
       'ORIGIN_AIRPORT_MDW', 'ORIGIN_AIRPORT_ORD',
       'SCHEDULED_DEP_TIME_CAT_Night', 'SCHEDULED_DEP_TIME_CAT_Morning',
       'SCHEDULED_DEP_TIME_CAT_Afternoon', 'SCHEDULED_DEP_TIME_CAT_Evening',
       'SCHEDULED_ARR_TIME_CAT_Night', 'SCHEDULED_ARR_TIME_CAT_Morning',
       'SCHEDULED_ARR_TIME_CAT_Evening'],
      dtype='object')


### Training binary RFC

In [77]:
rfc = RandomForestClassifier(random_state= 123)

# Train the model on the training set
binC_MW = rfc.fit(X_train_selected_MW, y_train_balanced_MW)



In [78]:
# Evaluate the model on the testing set
y_pred_MW = binC_MW.predict(X_test_MW)
print("Binary Classification Results for Flights within the Midwest")
print("Accuracy Score:", accuracy_score(y_test_MW,y_pred_MW))
target_names = ['ON TIME', 'DELAYED']
print(classification_report(y_test_MW, y_pred_MW, target_names=target_names))


Binary Classification Results for Flights within the Midwest
Accuracy Score: 0.6531688221594569
              precision    recall  f1-score   support

     ON TIME       0.74      0.76      0.75     68920
     DELAYED       0.45      0.42      0.43     31985

    accuracy                           0.65    100905
   macro avg       0.59      0.59      0.59    100905
weighted avg       0.65      0.65      0.65    100905



## Multiclass Classification

In [87]:
X_MW = flights_MW.drop(['DELAYED','ORIGIN_REGION','DESTINATION_REGION','DEPARTURE_DELAY','ARRIVAL_DELAY','DELAY_CATEGORY'], axis=1)
y_MW = flights_MW['DELAY_CATEGORY']

X_train_MW, X_test_MW, y_train_MW, y_test_MW = train_test_split(X_MW, y_MW, test_size=0.3, random_state=123)


### Balancing the data

In [89]:
# Balance the dataset using RandomOverSampler
X_train_balanced_MW, y_train_balanced_MW = ros.fit_resample(X_train_MW, y_train_MW)

In [90]:
before_balancing = y_train_MW.value_counts()
print("Before balancing:",before_balancing)
after_balancing = y_train_balanced_MW.value_counts()
print("After balancing:",after_balancing)


Before balancing: ON-TIME         160431
DELAY<20         39484
60=<DELAY        14393
20=<DELAY<40     13793
40=<DELAY<60      7343
Name: DELAY_CATEGORY, dtype: int64
After balancing: 20=<DELAY<40    160431
ON-TIME         160431
40=<DELAY<60    160431
DELAY<20        160431
60=<DELAY       160431
Name: DELAY_CATEGORY, dtype: int64


### Feature selection

In [91]:
# Perform feature selection using ANOVA F-value between label/feature
print("Size before feature selection:",X_train_balanced_MW.shape, X_test_MW.shape)
selector = SelectKBest(f_classif, k=20)
X_train_selected_MW = selector.fit_transform(X_train_balanced_MW, y_train_balanced_MW)
X_test_MW = selector.transform(X_test_MW)
print("Size after feature selection:",X_train_selected_MW.shape, X_test_MW.shape)

Size before feature selection: (802155, 213) (100905, 213)
Size after feature selection: (802155, 20) (100905, 20)


#### Features selected for multicategory classifcation

In [92]:
selected_features_mulC_MW = X_MW.columns[selector.get_support()]
print(selected_features_mulC_MW)

Index(['ORIGIN_SNOWFALL', 'ORIGIN_WEATHERCODE', 'ORIGIN_CLOUDCOVER',
       'ORIGIN_CLOUDCOVER_LOW', 'ORIGIN_CLOUDCOVER_MID',
       'DESTINATION_TEMPERATURE_2M', 'DESTINATION_WEATHERCODE',
       'DESTINATION_CLOUDCOVER', 'DESTINATION_CLOUDCOVER_LOW',
       'DESTINATION_CLOUDCOVER_MID', 'MONTH_2', 'AIRLINE_DL', 'AIRLINE_WN',
       'ORIGIN_AIRPORT_ORD', 'SCHEDULED_DEP_TIME_CAT_Night',
       'SCHEDULED_DEP_TIME_CAT_Morning', 'SCHEDULED_DEP_TIME_CAT_Afternoon',
       'SCHEDULED_ARR_TIME_CAT_Night', 'SCHEDULED_ARR_TIME_CAT_Morning',
       'SCHEDULED_ARR_TIME_CAT_Evening'],
      dtype='object')


### Training multiclass RFC

In [93]:
# Train a Random Forest classifier
rfc_MW = rfc.fit(X_train_selected_MW, y_train_balanced_MW)
y_pred_MW = rfc_MW.predict(X_test_MW)

In [94]:
# Get the classification report for each class
print("Multiclass Classification Results for Flights within the Midwest")
print("Accuracy Score:", accuracy_score(y_test_MW,y_pred_MW))
print(classification_report(y_test_MW, y_pred_MW))

Multiclass Classification Results for Flights within the Midwest
Accuracy Score: 0.6077498637332144
              precision    recall  f1-score   support

20=<DELAY<40       0.10      0.06      0.07      5823
40=<DELAY<60       0.06      0.04      0.05      3134
   60=<DELAY       0.23      0.16      0.19      6112
    DELAY<20       0.23      0.17      0.20     16916
     ON-TIME       0.72      0.83      0.77     68920

    accuracy                           0.61    100905
   macro avg       0.27      0.25      0.26    100905
weighted avg       0.56      0.61      0.58    100905



# Flights within the Northeast:

In [65]:
flights_NE = flights_w_weather[(flights_w_weather['ORIGIN_REGION'] == 'Northeast') & (flights_w_weather['DESTINATION_REGION'] == 'Northeast')]

## Binary Classification:

In [66]:
# Encode categorical variables using one-hot encoding
flights_NE = pd.get_dummies(flights_NE, columns=['MONTH', 'DAY_OF_WEEK','AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEP_TIME_CAT', 'SCHEDULED_ARR_TIME_CAT'])

# Split the dataset into training and testing sets
X_NE = flights_NE.drop(['DELAYED','ORIGIN_REGION','DESTINATION_REGION','DEPARTURE_DELAY','ARRIVAL_DELAY','DELAY_CATEGORY'], axis=1)
y_NE = flights_NE['DELAYED']
X_train_NE, X_test_NE, y_train_NE, y_test_NE = train_test_split(X_NE, y_NE, test_size=0.3, random_state=123)


### Balancing the data

In [67]:
# Balance the data using RandomOverSampler
X_train_balanced_NE, y_train_balanced_NE = ros.fit_resample(X_train_NE, y_train_NE)


### Feature Selection

In [68]:
# Perform feature selection using ANOVA F-value between label/feature
print("Size before feature selection:",X_train_balanced_NE.shape, X_test_NE.shape)
X_train_selected_NE = selector.fit_transform(X_train_balanced_NE, y_train_balanced_NE)
X_test_NE = selector.transform(X_test_NE)
print("Size after feature selection:",X_train_selected_NE.shape, X_test_NE.shape)


Size before feature selection: (72424, 114) (22995, 114)
Size after feature selection: (72424, 20) (22995, 20)


#### Selected features for binary classification

In [69]:
# Print the selected feature names
selected_features_binC_NE = X_NE.columns[selector.get_support()]
print(selected_features_binC_NE)

Index(['ORIGIN_WEATHERCODE', 'ORIGIN_CLOUDCOVER', 'ORIGIN_CLOUDCOVER_LOW',
       'ORIGIN_CLOUDCOVER_MID', 'ORIGIN_WINDSPEED_10M',
       'ORIGIN_WINDSPEED_100M', 'DESTINATION_PRESSURE_MSL',
       'DESTINATION_WEATHERCODE', 'DESTINATION_CLOUDCOVER',
       'DESTINATION_CLOUDCOVER_LOW', 'DESTINATION_WINDSPEED_10M',
       'DESTINATION_WINDSPEED_100M', 'MONTH_2', 'AIRLINE_UA',
       'SCHEDULED_DEP_TIME_CAT_Night', 'SCHEDULED_DEP_TIME_CAT_Morning',
       'SCHEDULED_DEP_TIME_CAT_Afternoon', 'SCHEDULED_DEP_TIME_CAT_Evening',
       'SCHEDULED_ARR_TIME_CAT_Morning', 'SCHEDULED_ARR_TIME_CAT_Evening'],
      dtype='object')


### Training binary RFC

In [70]:
# Train the model on the training set
binC_NE = rfc.fit(X_train_selected_NE, y_train_balanced_NE)

In [71]:
# Evaluate the model on the testing set
y_pred_NE = binC_NE.predict(X_test_NE)
print("Binary Classification Results for Flights within the Northeast")
print("Accuracy Score:", accuracy_score(y_test_NE,y_pred_NE))
target_names = ['ON TIME', 'DELAYED']
print(classification_report(y_test_NE, y_pred_NE, target_names=target_names))

Binary Classification Results for Flights within the Northeast
Accuracy Score: 0.6908023483365949
              precision    recall  f1-score   support

 NOT DELAYED       0.76      0.80      0.78     15600
     DELAYED       0.52      0.47      0.49      7395

    accuracy                           0.69     22995
   macro avg       0.64      0.63      0.64     22995
weighted avg       0.68      0.69      0.69     22995



## Multiclass Classification

In [79]:
X_NE = flights_NE.drop(['DELAYED','ORIGIN_REGION','DESTINATION_REGION','DEPARTURE_DELAY','ARRIVAL_DELAY','DELAY_CATEGORY'], axis=1)
y_NE = flights_NE['DELAY_CATEGORY']

X_train_NE, X_test_NE, y_train_NE, y_test_NE = train_test_split(X_NE, y_NE, test_size=0.3, random_state=123)


### Balancing the data

In [80]:
# Balance the dataset using RandomOverSampler
X_train_balanced_NE, y_train_balanced_NE = ros.fit_resample(X_train_NE, y_train_NE)

In [81]:
before_balancing = y_train_NE.value_counts()
print("Before balancing:",before_balancing)
after_balancing = y_train_balanced_NE.value_counts()
print("After balancing:",after_balancing)


Before balancing: ON-TIME         36212
DELAY<20         8115
60=<DELAY        4061
20=<DELAY<40     3363
40=<DELAY<60     1903
Name: DELAY_CATEGORY, dtype: int64
After balancing: ON-TIME         36212
60=<DELAY       36212
20=<DELAY<40    36212
40=<DELAY<60    36212
DELAY<20        36212
Name: DELAY_CATEGORY, dtype: int64


### Feature selection

In [82]:
# Perform feature selection using ANOVA F-value between label/feature
print("Size before feature selection:",X_train_balanced_NE.shape, X_test_NE.shape)
X_train_selected_NE = selector.fit_transform(X_train_balanced_NE, y_train_balanced_NE)
X_test_NE = selector.transform(X_test_NE)
print("Size after feature selection:",X_train_selected_NE.shape, X_test_NE.shape)

Size before feature selection: (181060, 114) (22995, 114)
Size after feature selection: (181060, 20) (22995, 20)


#### Features selected for multicategory classifcation

In [83]:
selected_features_mulC_NE = X_NE.columns[selector.get_support()]
print(selected_features_mulC_NE)

Index(['ORIGIN_PRECIPITATION', 'ORIGIN_WEATHERCODE', 'ORIGIN_CLOUDCOVER',
       'ORIGIN_CLOUDCOVER_LOW', 'ORIGIN_CLOUDCOVER_MID',
       'ORIGIN_WINDSPEED_10M', 'ORIGIN_WINDSPEED_100M',
       'DESTINATION_RELATIVEHUMIDITY_2M', 'DESTINATION_PRECIPITATION',
       'DESTINATION_RAIN', 'DESTINATION_WEATHERCODE', 'DESTINATION_CLOUDCOVER',
       'DESTINATION_CLOUDCOVER_LOW', 'DESTINATION_WINDSPEED_10M',
       'DESTINATION_WINDSPEED_100M', 'SCHEDULED_DEP_TIME_CAT_Night',
       'SCHEDULED_DEP_TIME_CAT_Morning', 'SCHEDULED_DEP_TIME_CAT_Evening',
       'SCHEDULED_ARR_TIME_CAT_Morning', 'SCHEDULED_ARR_TIME_CAT_Evening'],
      dtype='object')


### Training multiclass RFC

In [84]:
# Train a Random Forest classifier
rfc_NE = rfc.fit(X_train_selected_NE, y_train_balanced_NE)
y_pred_NE = rfc_NE.predict(X_test_NE)

In [86]:
# Get the classification report for each class
print("Multiclass Classification Results for Flights within the Northeast")
print("Accuracy Score:", accuracy_score(y_test_NE,y_pred_NE))
print(classification_report(y_test_NE, y_pred_NE))

Multiclass Classification Results for Flights within the Northeast
Accuracy Score: 0.6187432050445749
              precision    recall  f1-score   support

20=<DELAY<40       0.11      0.06      0.08      1413
40=<DELAY<60       0.05      0.03      0.04       835
   60=<DELAY       0.33      0.28      0.30      1622
    DELAY<20       0.19      0.12      0.15      3525
     ON-TIME       0.73      0.85      0.78     15600

    accuracy                           0.62     22995
   macro avg       0.28      0.27      0.27     22995
weighted avg       0.55      0.62      0.58     22995



# Flights within the South:

In [95]:
flights_S = flights_w_weather[(flights_w_weather['ORIGIN_REGION'] == 'South') & (flights_w_weather['DESTINATION_REGION'] == 'South')]

## Binary Classification:

In [96]:
# Encode categorical variables using one-hot encoding
flights_S = pd.get_dummies(flights_S, columns=['MONTH', 'DAY_OF_WEEK','AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEP_TIME_CAT', 'SCHEDULED_ARR_TIME_CAT'])

# Split the dataset into training and testing sets
X_S = flights_S.drop(['DELAYED','ORIGIN_REGION','DESTINATION_REGION','DEPARTURE_DELAY','ARRIVAL_DELAY','DELAY_CATEGORY'], axis=1)
y_S = flights_S['DELAYED']
X_train_S, X_test_S, y_train_S, y_test_S = train_test_split(X_S, y_S, test_size=0.3, random_state=123)


### Balancing the data

In [97]:
# Balance the data using RandomOverSampler
X_train_balanced_S, y_train_balanced_S = ros.fit_resample(X_train_S, y_train_S)


### Feature Selection

In [98]:
# Perform feature selection using ANOVA F-value between label/feature
print("Size before feature selection:",X_train_balanced_S.shape, X_test_S.shape)
X_train_selected_S = selector.fit_transform(X_train_balanced_S, y_train_balanced_S)
X_test_S = selector.transform(X_test_S)
print("Size after feature selection:",X_train_selected_S.shape, X_test_S.shape)


Size before feature selection: (1019212, 280) (338483, 280)
Size after feature selection: (1019212, 20) (338483, 20)


#### Selected features for binary classification

In [99]:
# Print the selected feature names
selected_features_binC_S = X_S.columns[selector.get_support()]
print(selected_features_binC_S)

Index(['SCHEDULED_TIME', 'DISTANCE', 'ORIGIN_PRECIPITATION', 'ORIGIN_RAIN',
       'ORIGIN_WEATHERCODE', 'ORIGIN_CLOUDCOVER', 'ORIGIN_CLOUDCOVER_LOW',
       'ORIGIN_CLOUDCOVER_MID', 'ORIGIN_CLOUDCOVER_HIGH',
       'DESTINATION_WEATHERCODE', 'DESTINATION_CLOUDCOVER_HIGH', 'MONTH_9',
       'AIRLINE_EV', 'AIRLINE_WN', 'SCHEDULED_DEP_TIME_CAT_Night',
       'SCHEDULED_DEP_TIME_CAT_Morning', 'SCHEDULED_DEP_TIME_CAT_Afternoon',
       'SCHEDULED_DEP_TIME_CAT_Evening', 'SCHEDULED_ARR_TIME_CAT_Morning',
       'SCHEDULED_ARR_TIME_CAT_Evening'],
      dtype='object')


### Training binary RFC

In [100]:
# Train the model on the training set
binC_S = rfc.fit(X_train_selected_S, y_train_balanced_S)

In [101]:
# Evaluate the model on the testing set
y_pred_S = binC_S.predict(X_test_S)
print("Binary Classification Results for Flights within the South")
print("Accuracy Score:", accuracy_score(y_test_S,y_pred_S))
target_names = ['ON TIME', 'DELAYED']
print(classification_report(y_test_S, y_pred_S, target_names=target_names))

Binary Classification Results for Flights within the South
Accuracy Score: 0.6511612104596095
              precision    recall  f1-score   support

     ON TIME       0.71      0.77      0.74    218608
     DELAYED       0.51      0.43      0.47    119875

    accuracy                           0.65    338483
   macro avg       0.61      0.60      0.60    338483
weighted avg       0.64      0.65      0.64    338483



## Multiclass Classification

In [102]:
X_S = flights_S.drop(['DELAYED','ORIGIN_REGION','DESTINATION_REGION','DEPARTURE_DELAY','ARRIVAL_DELAY','DELAY_CATEGORY'], axis=1)
y_S = flights_S['DELAY_CATEGORY']

X_train_S, X_test_S, y_train_S, y_test_S = train_test_split(X_S, y_S, test_size=0.3, random_state=123)


### Balancing the data

In [103]:
# Balance the dataset using RandomOverSampler
X_train_balanced_S, y_train_balanced_S = ros.fit_resample(X_train_S, y_train_S)

In [104]:
before_balancing = y_train_S.value_counts()
print("Before balancing:",before_balancing)
after_balancing = y_train_balanced_S.value_counts()
print("After balancing:",after_balancing)


Before balancing: ON-TIME         509606
DELAY<20        163172
20=<DELAY<40     50388
60=<DELAY        43285
40=<DELAY<60     23342
Name: DELAY_CATEGORY, dtype: int64
After balancing: ON-TIME         509606
20=<DELAY<40    509606
DELAY<20        509606
40=<DELAY<60    509606
60=<DELAY       509606
Name: DELAY_CATEGORY, dtype: int64


### Feature selection

In [105]:
# Perform feature selection using ANOVA F-value between label/feature
print("Size before feature selection:",X_train_balanced_S.shape, X_test_S.shape)
X_train_selected_S = selector.fit_transform(X_train_balanced_S, y_train_balanced_S)
X_test_S = selector.transform(X_test_S)
print("Size after feature selection:",X_train_selected_S.shape, X_test_S.shape)

Size before feature selection: (2548030, 280) (338483, 280)
Size after feature selection: (2548030, 20) (338483, 20)


#### Features selected for multicategory classifcation

In [106]:
selected_features_mulC_S = X_S.columns[selector.get_support()]
print(selected_features_mulC_S)

Index(['ORIGIN_PRECIPITATION', 'ORIGIN_RAIN', 'ORIGIN_WEATHERCODE',
       'ORIGIN_CLOUDCOVER', 'ORIGIN_CLOUDCOVER_MID', 'ORIGIN_CLOUDCOVER_HIGH',
       'DESTINATION_RELATIVEHUMIDITY_2M', 'DESTINATION_PRECIPITATION',
       'DESTINATION_RAIN', 'DESTINATION_WEATHERCODE',
       'DESTINATION_CLOUDCOVER_HIGH', 'AIRLINE_DL', 'AIRLINE_EV', 'AIRLINE_WN',
       'SCHEDULED_DEP_TIME_CAT_Night', 'SCHEDULED_DEP_TIME_CAT_Morning',
       'SCHEDULED_DEP_TIME_CAT_Afternoon', 'SCHEDULED_DEP_TIME_CAT_Evening',
       'SCHEDULED_ARR_TIME_CAT_Morning', 'SCHEDULED_ARR_TIME_CAT_Evening'],
      dtype='object')


### Training multiclass RFC

In [107]:
# Train a Random Forest classifier
rfc_S = rfc.fit(X_train_selected_S, y_train_balanced_S)
y_pred_S = rfc_S.predict(X_test_S)

In [108]:
# Get the classification report for each class
print("Multiclass Classification Results for Flights within the South")
print("Accuracy Score:", accuracy_score(y_test_S,y_pred_S))
print(classification_report(y_test_S, y_pred_S))

Multiclass Classification Results for Flights within the South
Accuracy Score: 0.5539598739079954
              precision    recall  f1-score   support

20=<DELAY<40       0.10      0.07      0.08     21694
40=<DELAY<60       0.05      0.04      0.04      9963
   60=<DELAY       0.18      0.12      0.15     18423
    DELAY<20       0.26      0.22      0.24     69795
     ON-TIME       0.69      0.77      0.73    218608

    accuracy                           0.55    338483
   macro avg       0.26      0.24      0.25    338483
weighted avg       0.52      0.55      0.53    338483



# Flights within the West:

In [109]:
flights_W = flights_w_weather[(flights_w_weather['ORIGIN_REGION'] == 'West') & (flights_w_weather['DESTINATION_REGION'] == 'West')]

## Binary Classification:

In [110]:
# Encode categorical variables using one-hot encoding
flights_W = pd.get_dummies(flights_W, columns=['MONTH', 'DAY_OF_WEEK','AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEP_TIME_CAT', 'SCHEDULED_ARR_TIME_CAT'])

# Split the dataset into training and testing sets
X_W = flights_W.drop(['DELAYED','ORIGIN_REGION','DESTINATION_REGION','DEPARTURE_DELAY','ARRIVAL_DELAY','DELAY_CATEGORY'], axis=1)
y_W = flights_W['DELAYED']
X_train_W, X_test_W, y_train_W, y_test_W = train_test_split(X_W, y_W, test_size=0.3, random_state=123)


### Balancing the data

In [111]:
# Balance the data using RandomOverSampler
X_train_balanced_W, y_train_balanced_W = ros.fit_resample(X_train_W, y_train_W)


### Feature Selection

In [112]:
# Perform feature selection using ANOVA F-value between label/feature
print("Size before feature selection:",X_train_balanced_W.shape, X_test_W.shape)
X_train_selected_W = selector.fit_transform(X_train_balanced_W, y_train_balanced_W)
X_test_W = selector.transform(X_test_W)
print("Size after feature selection:",X_train_selected_W.shape, X_test_W.shape)


Size before feature selection: (965216, 276) (321563, 276)
Size after feature selection: (965216, 20) (321563, 20)


#### Selected features for binary classification

In [113]:
# Print the selected feature names
selected_features_binC_W = X_W.columns[selector.get_support()]
print(selected_features_binC_W)

Index(['ORIGIN_RELATIVEHUMIDITY_2M', 'ORIGIN_WINDSPEED_10M',
       'DESTINATION_WINDDIRECTION_10M', 'DESTINATION_WINDDIRECTION_100M',
       'MONTH_9', 'AIRLINE_AS', 'AIRLINE_HA', 'AIRLINE_OO', 'AIRLINE_UA',
       'AIRLINE_WN', 'ORIGIN_AIRPORT_DEN', 'ORIGIN_AIRPORT_SLC',
       'DESTINATION_AIRPORT_SLC', 'SCHEDULED_DEP_TIME_CAT_Night',
       'SCHEDULED_DEP_TIME_CAT_Morning', 'SCHEDULED_DEP_TIME_CAT_Afternoon',
       'SCHEDULED_DEP_TIME_CAT_Evening', 'SCHEDULED_ARR_TIME_CAT_Morning',
       'SCHEDULED_ARR_TIME_CAT_Afternoon', 'SCHEDULED_ARR_TIME_CAT_Evening'],
      dtype='object')


### Training binary RFC

In [114]:
# Train the model on the training set
binC_W = rfc.fit(X_train_selected_W, y_train_balanced_W)

In [115]:
# Evaluate the model on the testing set
y_pred_W = binC_W.predict(X_test_W)
print("Binary Classification Results for Flights within the West")
print("Accuracy Score:", accuracy_score(y_test_W,y_pred_W))
print(classification_report(y_test_W, y_pred_W, target_names=target_names))

Binary Classification Results for Flights within the West
Accuracy Score: 0.654036689544506
              precision    recall  f1-score   support

     ON TIME       0.72      0.76      0.74    207468
     DELAYED       0.51      0.46      0.49    114095

    accuracy                           0.65    321563
   macro avg       0.62      0.61      0.61    321563
weighted avg       0.65      0.65      0.65    321563



## Multiclass Classification

In [116]:
X_W = flights_W.drop(['DELAYED','ORIGIN_REGION','DESTINATION_REGION','DEPARTURE_DELAY','ARRIVAL_DELAY','DELAY_CATEGORY'], axis=1)
y_W = flights_W['DELAY_CATEGORY']

X_train_W, X_test_W, y_train_W, y_test_W = train_test_split(X_W, y_W, test_size=0.3, random_state=123)


### Balancing the data

In [117]:
# Balance the dataset using RandomOverSampler
X_train_balanced_W, y_train_balanced_W = ros.fit_resample(X_train_W, y_train_W)

In [118]:
before_balancing = y_train_W.value_counts()
print("Before balancing:",before_balancing)
after_balancing = y_train_balanced_W.value_counts()
print("After balancing:",after_balancing)


Before balancing: ON-TIME         482608
DELAY<20        159533
20=<DELAY<40     47947
60=<DELAY        37405
40=<DELAY<60     22820
Name: DELAY_CATEGORY, dtype: int64
After balancing: ON-TIME         482608
60=<DELAY       482608
DELAY<20        482608
40=<DELAY<60    482608
20=<DELAY<40    482608
Name: DELAY_CATEGORY, dtype: int64


### Feature selection

In [119]:
# Perform feature selection using ANOVA F-value between label/feature
print("Size before feature selection:",X_train_balanced_W.shape, X_test_W.shape)
X_train_selected_W = selector.fit_transform(X_train_balanced_W, y_train_balanced_W)
X_test_W = selector.transform(X_test_W)
print("Size after feature selection:",X_train_selected_W.shape, X_test_W.shape)

Size before feature selection: (2413040, 276) (321563, 276)
Size after feature selection: (2413040, 20) (321563, 20)


#### Features selected for multicategory classifcation

In [120]:
selected_features_mulC_W = X_W.columns[selector.get_support()]
print(selected_features_mulC_W)

Index(['ORIGIN_RELATIVEHUMIDITY_2M', 'ORIGIN_WINDDIRECTION_10M',
       'ORIGIN_WINDDIRECTION_100M', 'DESTINATION_WINDDIRECTION_100M',
       'MONTH_9', 'AIRLINE_AS', 'AIRLINE_HA', 'AIRLINE_OO', 'AIRLINE_UA',
       'AIRLINE_WN', 'ORIGIN_AIRPORT_HNL', 'DESTINATION_AIRPORT_HNL',
       'DESTINATION_AIRPORT_SFO', 'SCHEDULED_DEP_TIME_CAT_Night',
       'SCHEDULED_DEP_TIME_CAT_Morning', 'SCHEDULED_DEP_TIME_CAT_Afternoon',
       'SCHEDULED_DEP_TIME_CAT_Evening', 'SCHEDULED_ARR_TIME_CAT_Morning',
       'SCHEDULED_ARR_TIME_CAT_Afternoon', 'SCHEDULED_ARR_TIME_CAT_Evening'],
      dtype='object')


### Training multiclass RFC

In [121]:
# Train a Random Forest classifier
rfc_W = rfc.fit(X_train_selected_W, y_train_balanced_W)
y_pred_W = rfc_W.predict(X_test_W)

In [122]:
# Get the classification report for each class
print("Multiclass Classification Results for Flights within the West")
print("Accuracy Score:", accuracy_score(y_test_W,y_pred_W))
print(classification_report(y_test_W, y_pred_W))

Multiclass Classification Results for Flights within the West
Accuracy Score: 0.5779178574649446
              precision    recall  f1-score   support

20=<DELAY<40       0.12      0.08      0.10     20539
40=<DELAY<60       0.06      0.04      0.05      9491
   60=<DELAY       0.14      0.09      0.11     15938
    DELAY<20       0.31      0.28      0.29     68127
     ON-TIME       0.71      0.79      0.75    207468

    accuracy                           0.58    321563
   macro avg       0.27      0.25      0.26    321563
weighted avg       0.54      0.58      0.56    321563



# Flights between the Midwest and the Northeast:

In [123]:
flights_MW_NE = flights_w_weather[(flights_w_weather['ORIGIN_REGION'] == 'Northeast') & (flights_w_weather['DESTINATION_REGION'] == 'Midwest') | (flights_w_weather['ORIGIN_REGION'] == 'Midwest') & (flights_w_weather['DESTINATION_REGION'] == 'Northeast')]

## Binary Classification:

In [124]:
# Encode categorical variables using one-hot encoding
flights_MW_NE = pd.get_dummies(flights_MW_NE, columns=['MONTH', 'DAY_OF_WEEK','AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEP_TIME_CAT', 'SCHEDULED_ARR_TIME_CAT'])

# Split the dataset into training and testing sets
X_MW_NE = flights_MW_NE.drop(['DELAYED','ORIGIN_REGION','DESTINATION_REGION','DEPARTURE_DELAY','ARRIVAL_DELAY','DELAY_CATEGORY'], axis=1)
y_MW_NE = flights_MW_NE['DELAYED']
X_train_MW_NE, X_test_MW_NE, y_train_MW_NE, y_test_MW_NE = train_test_split(X_MW_NE, y_MW_NE, test_size=0.3, random_state=123)


### Balancing the data

In [125]:
# Balance the data using RandomOverSampler
X_train_balanced_MW_NE, y_train_balanced_MW_NE = ros.fit_resample(X_train_MW_NE, y_train_MW_NE)


### Feature Selection

In [126]:
# Perform feature selection using ANOVA F-value between label/feature
print("Size before feature selection:",X_train_balanced_MW_NE.shape, X_test_MW_NE.shape)
X_train_selected_MW_NE = selector.fit_transform(X_train_balanced_MW_NE, y_train_balanced_MW_NE)
X_test_MW_NE = selector.transform(X_test_MW_NE)
print("Size after feature selection:",X_train_selected_MW_NE.shape, X_test_MW_NE.shape)


Size before feature selection: (198088, 169) (68291, 169)
Size after feature selection: (198088, 20) (68291, 20)


#### Selected features for binary classification

In [127]:
# Print the selected feature names
selected_features_binC_MW_NE = X_MW_NE.columns[selector.get_support()]
print(selected_features_binC_MW_NE)

Index(['ORIGIN_PRECIPITATION', 'ORIGIN_WEATHERCODE', 'ORIGIN_CLOUDCOVER',
       'ORIGIN_CLOUDCOVER_LOW', 'ORIGIN_CLOUDCOVER_MID',
       'ORIGIN_WINDSPEED_10M', 'DESTINATION_WEATHERCODE', 'MONTH_2', 'MONTH_9',
       'AIRLINE_EV', 'AIRLINE_UA', 'AIRLINE_WN', 'ORIGIN_AIRPORT_MDW',
       'DESTINATION_AIRPORT_DTW', 'SCHEDULED_DEP_TIME_CAT_Night',
       'SCHEDULED_DEP_TIME_CAT_Morning', 'SCHEDULED_DEP_TIME_CAT_Afternoon',
       'SCHEDULED_DEP_TIME_CAT_Evening', 'SCHEDULED_ARR_TIME_CAT_Morning',
       'SCHEDULED_ARR_TIME_CAT_Evening'],
      dtype='object')


### Training binary RFC

In [128]:
# Train the model on the training set
binC_MW_NE = rfc.fit(X_train_selected_MW_NE, y_train_balanced_MW_NE)

In [129]:
# Evaluate the model on the testing set
y_pred_MW_NE = binC_MW_NE.predict(X_test_MW_NE)
print("Binary Classification Results for Flights between Midwest and Northeast")
print("Accuracy Score:", accuracy_score(y_test_MW_NE,y_pred_MW_NE))
print(classification_report(y_test_MW_NE, y_pred_MW_NE, target_names=target_names))

Binary Classification Results for Flights between Midwest and Northeast
Accuracy Score: 0.6334509671845485
              precision    recall  f1-score   support

     ON TIME       0.70      0.71      0.71     42412
     DELAYED       0.52      0.50      0.51     25879

    accuracy                           0.63     68291
   macro avg       0.61      0.61      0.61     68291
weighted avg       0.63      0.63      0.63     68291



## Multiclass Classification

In [130]:
X_MW_NE = flights_MW_NE.drop(['DELAYED','ORIGIN_REGION','DESTINATION_REGION','DEPARTURE_DELAY','ARRIVAL_DELAY','DELAY_CATEGORY'], axis=1)
y_MW_NE = flights_MW_NE['DELAY_CATEGORY']

X_train_MW_NE, X_test_MW_NE, y_train_MW_NE, y_test_MW_NE = train_test_split(X_MW_NE, y_MW_NE, test_size=0.3, random_state=123)


### Balancing the data

In [131]:
# Balance the dataset using RandomOverSampler
X_train_balanced_MW_NE, y_train_balanced_MW_NE = ros.fit_resample(X_train_MW_NE, y_train_MW_NE)

In [132]:
before_balancing = y_train_MW_NE.value_counts()
print("Before balancing:",before_balancing)
after_balancing = y_train_balanced_MW_NE.value_counts()
print("After balancing:",after_balancing)


Before balancing: ON-TIME         99044
DELAY<20        31071
60=<DELAY       12210
20=<DELAY<40    11146
40=<DELAY<60     5873
Name: DELAY_CATEGORY, dtype: int64
After balancing: 40=<DELAY<60    99044
ON-TIME         99044
60=<DELAY       99044
DELAY<20        99044
20=<DELAY<40    99044
Name: DELAY_CATEGORY, dtype: int64


### Feature selection

In [133]:
# Perform feature selection using ANOVA F-value between label/feature
print("Size before feature selection:",X_train_balanced_MW_NE.shape, X_test_MW_NE.shape)
X_train_selected_MW_NE = selector.fit_transform(X_train_balanced_MW_NE, y_train_balanced_MW_NE)
X_test_MW_NE = selector.transform(X_test_MW_NE)
print("Size after feature selection:",X_train_selected_MW_NE.shape, X_test_MW_NE.shape)

Size before feature selection: (495220, 169) (68291, 169)
Size after feature selection: (495220, 20) (68291, 20)


#### Features selected for multicategory classifcation

In [134]:
selected_features_mulC_MW_NE = X_MW_NE.columns[selector.get_support()]
print(selected_features_mulC_MW_NE)

Index(['ORIGIN_PRECIPITATION', 'ORIGIN_WEATHERCODE', 'ORIGIN_CLOUDCOVER',
       'ORIGIN_CLOUDCOVER_LOW', 'ORIGIN_CLOUDCOVER_MID',
       'ORIGIN_WINDSPEED_10M', 'DESTINATION_RELATIVEHUMIDITY_2M',
       'DESTINATION_PRECIPITATION', 'DESTINATION_WEATHERCODE',
       'DESTINATION_CLOUDCOVER', 'DESTINATION_WINDSPEED_100M', 'AIRLINE_EV',
       'AIRLINE_UA', 'ORIGIN_AIRPORT_MDW', 'SCHEDULED_DEP_TIME_CAT_Night',
       'SCHEDULED_DEP_TIME_CAT_Morning', 'SCHEDULED_DEP_TIME_CAT_Afternoon',
       'SCHEDULED_DEP_TIME_CAT_Evening', 'SCHEDULED_ARR_TIME_CAT_Morning',
       'SCHEDULED_ARR_TIME_CAT_Evening'],
      dtype='object')


### Training multiclass RFC

In [135]:
# Train a Random Forest classifier
rfc_MW_NE = rfc.fit(X_train_selected_MW_NE, y_train_balanced_MW_NE)
y_pred_MW_NE = rfc_MW_NE.predict(X_test_MW_NE)

In [136]:
# Get the classification report for each class
print("Multiclass Classification Results for Flights between Midwest and Northeast")
print("Accuracy Score:", accuracy_score(y_test_MW_NE,y_pred_MW_NE))
print(classification_report(y_test_MW_NE, y_pred_MW_NE))

Multiclass Classification Results for Flights between Midwest and Northeast
Accuracy Score: 0.5799007189820035
              precision    recall  f1-score   support

20=<DELAY<40       0.13      0.06      0.08      4612
40=<DELAY<60       0.09      0.03      0.05      2443
   60=<DELAY       0.30      0.18      0.22      5290
    DELAY<20       0.28      0.19      0.23     13534
     ON-TIME       0.68      0.84      0.75     42412

    accuracy                           0.58     68291
   macro avg       0.29      0.26      0.27     68291
weighted avg       0.51      0.58      0.54     68291



# Flights between the Midwest and the South:

In [137]:
flights_MW_S = flights_w_weather[(flights_w_weather['ORIGIN_REGION'] == 'South') & (flights_w_weather['DESTINATION_REGION'] == 'Midwest') | (flights_w_weather['ORIGIN_REGION'] == 'Midwest') & (flights_w_weather['DESTINATION_REGION'] == 'South')]

## Binary Classification:

In [138]:
# Encode categorical variables using one-hot encoding
flights_MW_S = pd.get_dummies(flights_MW_S, columns=['MONTH', 'DAY_OF_WEEK','AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEP_TIME_CAT', 'SCHEDULED_ARR_TIME_CAT'])

# Split the dataset into training and testing sets
X_MW_S = flights_MW_S.drop(['DELAYED','ORIGIN_REGION','DESTINATION_REGION','DEPARTURE_DELAY','ARRIVAL_DELAY','DELAY_CATEGORY'], axis=1)
y_MW_S = flights_MW_S['DELAYED']
X_train_MW_S, X_test_MW_S, y_train_MW_S, y_test_MW_S = train_test_split(X_MW_S, y_MW_S, test_size=0.3, random_state=123)


### Balancing the data

In [139]:
# Balance the data using RandomOverSampler
X_train_balanced_MW_S, y_train_balanced_MW_S = ros.fit_resample(X_train_MW_S, y_train_MW_S)


### Feature Selection

In [140]:
# Perform feature selection using ANOVA F-value between label/feature
print("Size before feature selection:",X_train_balanced_MW_S.shape, X_test_MW_S.shape)
X_train_selected_MW_S = selector.fit_transform(X_train_balanced_MW_S, y_train_balanced_MW_S)
X_test_MW_S = selector.transform(X_test_MW_S)
print("Size after feature selection:",X_train_selected_MW_S.shape, X_test_MW_S.shape)


Size before feature selection: (558966, 269) (191021, 269)
Size after feature selection: (558966, 20) (191021, 20)


#### Selected features for binary classification

In [141]:
# Print the selected feature names
selected_features_binC_MW_S = X_MW_S.columns[selector.get_support()]
print(selected_features_binC_MW_S)

Index(['ORIGIN_PRECIPITATION', 'ORIGIN_RAIN', 'ORIGIN_WEATHERCODE',
       'ORIGIN_CLOUDCOVER', 'ORIGIN_CLOUDCOVER_LOW', 'ORIGIN_CLOUDCOVER_MID',
       'ORIGIN_WINDSPEED_10M', 'MONTH_6', 'MONTH_9', 'AIRLINE_DL',
       'AIRLINE_UA', 'AIRLINE_WN', 'ORIGIN_AIRPORT_MDW', 'ORIGIN_AIRPORT_ORD',
       'SCHEDULED_DEP_TIME_CAT_Night', 'SCHEDULED_DEP_TIME_CAT_Morning',
       'SCHEDULED_DEP_TIME_CAT_Afternoon', 'SCHEDULED_DEP_TIME_CAT_Evening',
       'SCHEDULED_ARR_TIME_CAT_Morning', 'SCHEDULED_ARR_TIME_CAT_Evening'],
      dtype='object')


### Training binary RFC

In [142]:
# Train the model on the training set
binC_MW_S = rfc.fit(X_train_selected_MW_S, y_train_balanced_MW_S)

In [143]:
# Evaluate the model on the testing set
y_pred_MW_S = binC_MW_S.predict(X_test_MW_S)
print("Binary Classification Results for Flights between the Midwest and the South")
print("Accuracy Score:", accuracy_score(y_test_MW_S, y_pred_MW_S))
print(classification_report(y_test_MW_S, y_pred_MW_S, target_names=target_names))

Binary Classification Results for Flights between the Midwest and the South
Accuracy Score: 0.6173876170682805
              precision    recall  f1-score   support

     ON TIME       0.71      0.67      0.69    120235
     DELAYED       0.48      0.52      0.50     70786

    accuracy                           0.62    191021
   macro avg       0.59      0.60      0.60    191021
weighted avg       0.62      0.62      0.62    191021



## Multiclass Classification

In [144]:
X_MW_S = flights_MW_S.drop(['DELAYED','ORIGIN_REGION','DESTINATION_REGION','DEPARTURE_DELAY','ARRIVAL_DELAY','DELAY_CATEGORY'], axis=1)
y_MW_S = flights_MW_S['DELAY_CATEGORY']

X_train_MW_S, X_test_MW_S, y_train_MW_S, y_test_MW_S = train_test_split(X_MW_S, y_MW_S, test_size=0.3, random_state=123)


### Balancing the data

In [145]:
# Balance the dataset using RandomOverSampler
X_train_balanced_MW_S, y_train_balanced_MW_S = ros.fit_resample(X_train_MW_S, y_train_MW_S)

In [146]:
before_balancing = y_train_MW_S.value_counts()
print("Before balancing:",before_balancing)
after_balancing = y_train_balanced_MW_S.value_counts()
print("After balancing:",after_balancing)


Before balancing: ON-TIME         279483
DELAY<20         94623
20=<DELAY<40     29730
60=<DELAY        27598
40=<DELAY<60     14279
Name: DELAY_CATEGORY, dtype: int64
After balancing: ON-TIME         279483
DELAY<20        279483
40=<DELAY<60    279483
60=<DELAY       279483
20=<DELAY<40    279483
Name: DELAY_CATEGORY, dtype: int64


### Feature selection

In [147]:
# Perform feature selection using ANOVA F-value between label/feature
print("Size before feature selection:",X_train_balanced_MW_S.shape, X_test_MW_S.shape)
X_train_selected_MW_S = selector.fit_transform(X_train_balanced_MW_S, y_train_balanced_MW_S)
X_test_MW_S = selector.transform(X_test_MW_S)
print("Size after feature selection:",X_train_selected_MW_S.shape, X_test_MW_S.shape)

Size before feature selection: (1397415, 269) (191021, 269)
Size after feature selection: (1397415, 20) (191021, 20)


#### Features selected for multicategory classifcation

In [148]:
selected_features_mulC_MW_S = X_MW_S.columns[selector.get_support()]
print(selected_features_mulC_MW_S)

Index(['ORIGIN_PRECIPITATION', 'ORIGIN_RAIN', 'ORIGIN_WEATHERCODE',
       'ORIGIN_CLOUDCOVER', 'ORIGIN_CLOUDCOVER_LOW', 'ORIGIN_CLOUDCOVER_MID',
       'ORIGIN_CLOUDCOVER_HIGH', 'DESTINATION_WEATHERCODE', 'MONTH_6',
       'MONTH_9', 'AIRLINE_DL', 'AIRLINE_NK', 'AIRLINE_WN',
       'ORIGIN_AIRPORT_ORD', 'SCHEDULED_DEP_TIME_CAT_Night',
       'SCHEDULED_DEP_TIME_CAT_Morning', 'SCHEDULED_DEP_TIME_CAT_Afternoon',
       'SCHEDULED_DEP_TIME_CAT_Evening', 'SCHEDULED_ARR_TIME_CAT_Morning',
       'SCHEDULED_ARR_TIME_CAT_Evening'],
      dtype='object')


### Training multiclass RFC

In [149]:
# Train a Random Forest classifier
rfc_MW_S = rfc.fit(X_train_selected_MW_S, y_train_balanced_MW_S)
y_pred_MW_S = rfc_MW_S.predict(X_test_MW_S)

In [150]:
# Get the classification report for each class
print("Multiclass Classification Results for Flights between the Midwest and the South")
print("Accuracy Score:", accuracy_score(y_test_MW_S,y_pred_MW_S))
print(classification_report(y_test_MW_S, y_pred_MW_S))

Multiclass Classification Results for Flights between the Midwest and the South
Accuracy Score: 0.45850456232560816
              precision    recall  f1-score   support

20=<DELAY<40       0.09      0.14      0.11     12632
40=<DELAY<60       0.05      0.10      0.06      6053
   60=<DELAY       0.14      0.18      0.16     11844
    DELAY<20       0.25      0.23      0.24     40257
     ON-TIME       0.69      0.61      0.65    120235

    accuracy                           0.46    191021
   macro avg       0.24      0.25      0.24    191021
weighted avg       0.51      0.46      0.48    191021



# Flights between the Midwest and the West:

In [151]:
flights_MW_W = flights_w_weather[(flights_w_weather['ORIGIN_REGION'] == 'West') & (flights_w_weather['DESTINATION_REGION'] == 'Midwest') | (flights_w_weather['ORIGIN_REGION'] == 'Midwest') & (flights_w_weather['DESTINATION_REGION'] == 'West')]

## Binary Classification:

In [152]:
# Encode categorical variables using one-hot encoding
flights_MW_W = pd.get_dummies(flights_MW_W, columns=['MONTH', 'DAY_OF_WEEK','AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEP_TIME_CAT', 'SCHEDULED_ARR_TIME_CAT'])

# Split the dataset into training and testing sets
X_MW_W = flights_MW_W.drop(['DELAYED','ORIGIN_REGION','DESTINATION_REGION','DEPARTURE_DELAY','ARRIVAL_DELAY','DELAY_CATEGORY'], axis=1)
y_MW_W = flights_MW_W['DELAYED']
X_train_MW_W, X_test_MW_W, y_train_MW_W, y_test_MW_W = train_test_split(X_MW_W, y_MW_W, test_size=0.3, random_state=123)


### Balancing the data

In [153]:
# Balance the data using RandomOverSampler
X_train_balanced_MW_W, y_train_balanced_MW_W = ros.fit_resample(X_train_MW_W, y_train_MW_W)


### Feature Selection

In [154]:
# Perform feature selection using ANOVA F-value between label/feature
print("Size before feature selection:",X_train_balanced_MW_W.shape, X_test_MW_W.shape)
X_train_selected_MW_W = selector.fit_transform(X_train_balanced_MW_W, y_train_balanced_MW_W)
X_test_MW_W = selector.transform(X_test_MW_W)
print("Size after feature selection:",X_train_selected_MW_W.shape, X_test_MW_W.shape)


Size before feature selection: (280510, 222) (103697, 222)
Size after feature selection: (280510, 20) (103697, 20)


#### Selected features for binary classification

In [155]:
# Print the selected feature names
selected_features_binC_MW_W = X_MW_W.columns[selector.get_support()]
print(selected_features_binC_MW_W)

Index(['SCHEDULED_TIME', 'DISTANCE', 'ORIGIN_WEATHERCODE', 'ORIGIN_CLOUDCOVER',
       'ORIGIN_CLOUDCOVER_MID', 'ORIGIN_WINDSPEED_10M', 'MONTH_9',
       'AIRLINE_AA', 'AIRLINE_DL', 'AIRLINE_UA', 'AIRLINE_WN',
       'ORIGIN_AIRPORT_MDW', 'ORIGIN_AIRPORT_ORD', 'DESTINATION_AIRPORT_MSP',
       'SCHEDULED_DEP_TIME_CAT_Night', 'SCHEDULED_DEP_TIME_CAT_Morning',
       'SCHEDULED_DEP_TIME_CAT_Afternoon', 'SCHEDULED_DEP_TIME_CAT_Evening',
       'SCHEDULED_ARR_TIME_CAT_Morning', 'SCHEDULED_ARR_TIME_CAT_Evening'],
      dtype='object')


### Training binary RFC

In [156]:
# Train the model on the training set
binC_MW_W = rfc.fit(X_train_selected_MW_W, y_train_balanced_MW_W)

In [157]:
# Evaluate the model on the testing set
y_pred_MW_W = binC_MW_W.predict(X_test_MW_W)
print("Binary Classification Results for Flights between the Midwest and the West")
print("Accuracy Score:", accuracy_score(y_test_MW_W, y_pred_MW_W))
print(classification_report(y_test_MW_W, y_pred_MW_W, target_names=target_names))

Binary Classification Results for Flights between the Midwest and the West
Accuracy Score: 0.6290442346451681
              precision    recall  f1-score   support

     ON TIME       0.67      0.70      0.69     60104
     DELAYED       0.56      0.54      0.55     43593

    accuracy                           0.63    103697
   macro avg       0.62      0.62      0.62    103697
weighted avg       0.63      0.63      0.63    103697



## Multiclass Classification

In [158]:
X_MW_W = flights_MW_W.drop(['DELAYED','ORIGIN_REGION','DESTINATION_REGION','DEPARTURE_DELAY','ARRIVAL_DELAY','DELAY_CATEGORY'], axis=1)
y_MW_W = flights_MW_W['DELAY_CATEGORY']

X_train_MW_W, X_test_MW_W, y_train_MW_W, y_test_MW_W = train_test_split(X_MW_W, y_MW_W, test_size=0.3, random_state=123)


### Balancing the data

In [159]:
# Balance the dataset using RandomOverSampler
X_train_balanced_MW_W, y_train_balanced_MW_W = ros.fit_resample(X_train_MW_W, y_train_MW_W)

In [160]:
before_balancing = y_train_MW_W.value_counts()
print("Before balancing:",before_balancing)
after_balancing = y_train_balanced_MW_W.value_counts()
print("After balancing:",after_balancing)


Before balancing: ON-TIME         140255
DELAY<20         61637
20=<DELAY<40     17612
60=<DELAY        14373
40=<DELAY<60      8080
Name: DELAY_CATEGORY, dtype: int64
After balancing: 60=<DELAY       140255
ON-TIME         140255
DELAY<20        140255
20=<DELAY<40    140255
40=<DELAY<60    140255
Name: DELAY_CATEGORY, dtype: int64


### Feature selection

In [161]:
# Perform feature selection using ANOVA F-value between label/feature
print("Size before feature selection:",X_train_balanced_MW_W.shape, X_test_MW_W.shape)
X_train_selected_MW_W = selector.fit_transform(X_train_balanced_MW_W, y_train_balanced_MW_W)
X_test_MW_W = selector.transform(X_test_MW_W)
print("Size after feature selection:",X_train_selected_MW_W.shape, X_test_MW_W.shape)

Size before feature selection: (701275, 222) (103697, 222)
Size after feature selection: (701275, 20) (103697, 20)


#### Features selected for multicategory classifcation

In [162]:
selected_features_mulC_MW_W = X_MW_W.columns[selector.get_support()]
print(selected_features_mulC_MW_W)

Index(['ORIGIN_PRECIPITATION', 'ORIGIN_SNOWFALL', 'ORIGIN_WEATHERCODE',
       'ORIGIN_CLOUDCOVER', 'ORIGIN_CLOUDCOVER_MID', 'MONTH_6', 'MONTH_9',
       'AIRLINE_DL', 'AIRLINE_NK', 'AIRLINE_UA', 'AIRLINE_WN',
       'ORIGIN_AIRPORT_MDW', 'ORIGIN_AIRPORT_ORD', 'DESTINATION_AIRPORT_MSP',
       'SCHEDULED_DEP_TIME_CAT_Night', 'SCHEDULED_DEP_TIME_CAT_Morning',
       'SCHEDULED_DEP_TIME_CAT_Afternoon', 'SCHEDULED_DEP_TIME_CAT_Evening',
       'SCHEDULED_ARR_TIME_CAT_Morning', 'SCHEDULED_ARR_TIME_CAT_Evening'],
      dtype='object')


### Training multiclass RFC

In [163]:
# Train a Random Forest classifier
rfc_MW_W = rfc.fit(X_train_selected_MW_W, y_train_balanced_MW_W)
y_pred_MW_W = rfc_MW_W.predict(X_test_MW_W)

In [164]:
# Get the classification report for each class
print("Multiclass Classification Results for Flights between the Midwest and the West")
print("Accuracy Score:", accuracy_score(y_test_MW_W,y_pred_MW_W))
print(classification_report(y_test_MW_W, y_pred_MW_W))

Multiclass Classification Results for Flights between the Midwest and the West
Accuracy Score: 0.382701524634271
              precision    recall  f1-score   support

20=<DELAY<40       0.10      0.19      0.13      7523
40=<DELAY<60       0.05      0.17      0.07      3329
   60=<DELAY       0.11      0.20      0.14      6138
    DELAY<20       0.30      0.24      0.27     26603
     ON-TIME       0.68      0.50      0.58     60104

    accuracy                           0.38    103697
   macro avg       0.25      0.26      0.24    103697
weighted avg       0.49      0.38      0.42    103697



# Flights between the Northeast and the South:

In [169]:
flights_NE_S = flights_w_weather[(flights_w_weather['ORIGIN_REGION'] == 'Northeast') & (flights_w_weather['DESTINATION_REGION'] == 'South') | (flights_w_weather['ORIGIN_REGION'] == 'South') & (flights_w_weather['DESTINATION_REGION'] == 'Northeast')]

## Binary Classification:

In [170]:
# Encode categorical variables using one-hot encoding
flights_NE_S = pd.get_dummies(flights_NE_S, columns=['MONTH', 'DAY_OF_WEEK','AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEP_TIME_CAT', 'SCHEDULED_ARR_TIME_CAT'])

# Split the dataset into training and testing sets
X_NE_S = flights_NE_S.drop(['DELAYED','ORIGIN_REGION','DESTINATION_REGION','DEPARTURE_DELAY','ARRIVAL_DELAY','DELAY_CATEGORY'], axis=1)
y_NE_S = flights_NE_S['DELAYED']
X_train_NE_S, X_test_NE_S, y_train_NE_S, y_test_NE_S = train_test_split(X_NE_S, y_NE_S, test_size=0.3, random_state=123)


### Balancing the data

In [171]:
# Balance the data using RandomOverSampler
X_train_balanced_NE_S, y_train_balanced_NE_S = ros.fit_resample(X_train_NE_S, y_train_NE_S)


### Feature Selection

In [172]:
# Perform feature selection using ANOVA F-value between label/feature
print("Size before feature selection:",X_train_balanced_NE_S.shape, X_test_NE_S.shape)
X_train_selected_NE_S = selector.fit_transform(X_train_balanced_NE_S, y_train_balanced_NE_S)
X_test_NE_S = selector.transform(X_test_NE_S)
print("Size after feature selection:",X_train_selected_NE_S.shape, X_test_NE_S.shape)


Size before feature selection: (499246, 228) (176880, 228)


  f = msb / msw


Size after feature selection: (499246, 20) (176880, 20)


#### Selected features for binary classification

In [173]:
# Print the selected feature names
selected_features_binC_NE_S = X_NE_S.columns[selector.get_support()]
print(selected_features_binC_NE_S)

Index(['SCHEDULED_TIME', 'DISTANCE', 'ORIGIN_PRESSURE_MSL',
       'ORIGIN_PRECIPITATION', 'ORIGIN_RAIN', 'ORIGIN_WEATHERCODE',
       'ORIGIN_CLOUDCOVER', 'ORIGIN_CLOUDCOVER_MID', 'ORIGIN_WINDSPEED_10M',
       'DESTINATION_PRESSURE_MSL', 'DESTINATION_WINDSPEED_100M', 'MONTH_2',
       'AIRLINE_UA', 'SCHEDULED_DEP_TIME_CAT_Night',
       'SCHEDULED_DEP_TIME_CAT_Morning', 'SCHEDULED_DEP_TIME_CAT_Afternoon',
       'SCHEDULED_DEP_TIME_CAT_Evening', 'SCHEDULED_ARR_TIME_CAT_Morning',
       'SCHEDULED_ARR_TIME_CAT_Afternoon', 'SCHEDULED_ARR_TIME_CAT_Evening'],
      dtype='object')


### Training binary RFC

In [174]:
# Train the model on the training set
binC_NE_S = rfc.fit(X_train_selected_NE_S, y_train_balanced_NE_S)

In [175]:
# Evaluate the model on the testing set
y_pred_NE_S = binC_NE_S.predict(X_test_NE_S)
print("Binary Classification Results for Flights between the Northeast and the South")
print("Accuracy Score:", accuracy_score(y_test_NE_S, y_pred_NE_S))
print(classification_report(y_test_NE_S, y_pred_NE_S, target_names=target_names))

Binary Classification Results for Flights between the Northeast and the South
Accuracy Score: 0.6533525554047942
              precision    recall  f1-score   support

     ON TIME       0.70      0.74      0.72    106693
     DELAYED       0.57      0.52      0.54     70187

    accuracy                           0.65    176880
   macro avg       0.64      0.63      0.63    176880
weighted avg       0.65      0.65      0.65    176880



## Multiclass Classification

In [176]:
X_NE_S = flights_NE_S.drop(['DELAYED','ORIGIN_REGION','DESTINATION_REGION','DEPARTURE_DELAY','ARRIVAL_DELAY','DELAY_CATEGORY'], axis=1)
y_NE_S = flights_NE_S['DELAY_CATEGORY']

X_train_NE_S, X_test_NE_S, y_train_NE_S, y_test_NE_S = train_test_split(X_NE_S, y_NE_S, test_size=0.3, random_state=123)


### Balancing the data

In [177]:
# Balance the dataset using RandomOverSampler
X_train_balanced_NE_S, y_train_balanced_NE_S = ros.fit_resample(X_train_NE_S, y_train_NE_S)

In [178]:
before_balancing = y_train_NE_S.value_counts()
print("Before balancing:",before_balancing)
after_balancing = y_train_balanced_NE_S.value_counts()
print("After balancing:",after_balancing)


Before balancing: ON-TIME         249623
DELAY<20         88059
60=<DELAY        30164
20=<DELAY<40     29932
40=<DELAY<60     14940
Name: DELAY_CATEGORY, dtype: int64
After balancing: ON-TIME         249623
DELAY<20        249623
20=<DELAY<40    249623
60=<DELAY       249623
40=<DELAY<60    249623
Name: DELAY_CATEGORY, dtype: int64


### Feature selection

In [179]:
# Perform feature selection using ANOVA F-value between label/feature
print("Size before feature selection:",X_train_balanced_NE_S.shape, X_test_NE_S.shape)
X_train_selected_NE_S = selector.fit_transform(X_train_balanced_NE_S, y_train_balanced_NE_S)
X_test_NE_S = selector.transform(X_test_NE_S)
print("Size after feature selection:",X_train_selected_NE_S.shape, X_test_NE_S.shape)

Size before feature selection: (1248115, 228) (176880, 228)


  f = msb / msw


Size after feature selection: (1248115, 20) (176880, 20)


#### Features selected for multicategory classifcation

In [180]:
selected_features_mulC_NE_S = X_NE_S.columns[selector.get_support()]
print(selected_features_mulC_NE_S)

Index(['ORIGIN_PRESSURE_MSL', 'ORIGIN_PRECIPITATION', 'ORIGIN_RAIN',
       'ORIGIN_WEATHERCODE', 'ORIGIN_CLOUDCOVER', 'ORIGIN_CLOUDCOVER_MID',
       'ORIGIN_CLOUDCOVER_HIGH', 'ORIGIN_WINDSPEED_10M',
       'DESTINATION_RELATIVEHUMIDITY_2M', 'DESTINATION_PRESSURE_MSL',
       'DESTINATION_PRECIPITATION', 'DESTINATION_WEATHERCODE',
       'DESTINATION_WINDSPEED_100M', 'AIRLINE_UA',
       'SCHEDULED_DEP_TIME_CAT_Night', 'SCHEDULED_DEP_TIME_CAT_Morning',
       'SCHEDULED_DEP_TIME_CAT_Afternoon', 'SCHEDULED_DEP_TIME_CAT_Evening',
       'SCHEDULED_ARR_TIME_CAT_Morning', 'SCHEDULED_ARR_TIME_CAT_Evening'],
      dtype='object')


### Training multiclass RFC

In [181]:
# Train a Random Forest classifier
rfc_NE_S = rfc.fit(X_train_selected_NE_S, y_train_balanced_NE_S)
y_pred_NE_S = rfc_NE_S.predict(X_test_NE_S)

In [182]:
# Get the classification report for each class
print("Multiclass Classification Results for Flights between the Northeast and the South")
print("Accuracy Score:", accuracy_score(y_test_NE_S,y_pred_NE_S))
print(classification_report(y_test_NE_S, y_pred_NE_S))

Multiclass Classification Results for Flights between the Northeast and the South
Accuracy Score: 0.5487731795567616
              precision    recall  f1-score   support

20=<DELAY<40       0.13      0.07      0.09     12968
40=<DELAY<60       0.09      0.05      0.06      6550
   60=<DELAY       0.29      0.21      0.24     12857
    DELAY<20       0.28      0.21      0.24     37812
     ON-TIME       0.67      0.80      0.73    106693

    accuracy                           0.55    176880
   macro avg       0.29      0.27      0.27    176880
weighted avg       0.49      0.55      0.52    176880



# Flights between the Northeast and the West:

In [183]:
flights_NE_W = flights_w_weather[(flights_w_weather['ORIGIN_REGION'] == 'Northeast') & (flights_w_weather['DESTINATION_REGION'] == 'West') | (flights_w_weather['ORIGIN_REGION'] == 'West') & (flights_w_weather['DESTINATION_REGION'] == 'Northeast')]

## Binary Classification:

In [184]:
# Encode categorical variables using one-hot encoding
flights_NE_W = pd.get_dummies(flights_NE_W, columns=['MONTH', 'DAY_OF_WEEK','AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEP_TIME_CAT', 'SCHEDULED_ARR_TIME_CAT'])

# Split the dataset into training and testing sets
X_NE_W = flights_NE_W.drop(['DELAYED','ORIGIN_REGION','DESTINATION_REGION','DEPARTURE_DELAY','ARRIVAL_DELAY','DELAY_CATEGORY'], axis=1)
y_NE_W = flights_NE_W['DELAYED']
X_train_NE_W, X_test_NE_W, y_train_NE_W, y_test_NE_W = train_test_split(X_NE_W, y_NE_W, test_size=0.3, random_state=123)


### Balancing the data

In [185]:
# Balance the data using RandomOverSampler
X_train_balanced_NE_W, y_train_balanced_NE_W = ros.fit_resample(X_train_NE_W, y_train_NE_W)


### Feature Selection

In [186]:
# Perform feature selection using ANOVA F-value between label/feature
print("Size before feature selection:",X_train_balanced_NE_W.shape, X_test_NE_W.shape)
X_train_selected_NE_W = selector.fit_transform(X_train_balanced_NE_W, y_train_balanced_NE_W)
X_test_NE_W = selector.transform(X_test_NE_W)
print("Size after feature selection:",X_train_selected_NE_W.shape, X_test_NE_W.shape)


Size before feature selection: (146388, 151) (56323, 151)
Size after feature selection: (146388, 20) (56323, 20)


#### Selected features for binary classification

In [187]:
# Print the selected feature names
selected_features_binC_NE_W = X_NE_W.columns[selector.get_support()]
print(selected_features_binC_NE_W)

Index(['ORIGIN_PRECIPITATION', 'ORIGIN_RAIN', 'ORIGIN_WEATHERCODE',
       'ORIGIN_CLOUDCOVER_MID', 'ORIGIN_WINDSPEED_10M',
       'ORIGIN_WINDSPEED_100M', 'DESTINATION_RELATIVEHUMIDITY_2M', 'MONTH_9',
       'AIRLINE_AA', 'AIRLINE_UA', 'ORIGIN_AIRPORT_DEN', 'ORIGIN_AIRPORT_EWR',
       'ORIGIN_AIRPORT_JFK', 'SCHEDULED_DEP_TIME_CAT_Night',
       'SCHEDULED_DEP_TIME_CAT_Morning', 'SCHEDULED_DEP_TIME_CAT_Afternoon',
       'SCHEDULED_DEP_TIME_CAT_Evening', 'SCHEDULED_ARR_TIME_CAT_Morning',
       'SCHEDULED_ARR_TIME_CAT_Afternoon', 'SCHEDULED_ARR_TIME_CAT_Evening'],
      dtype='object')


### Training binary RFC

In [188]:
# Train the model on the training set
binC_NE_W = rfc.fit(X_train_selected_NE_W, y_train_balanced_NE_W)

In [189]:
# Evaluate the model on the testing set
y_pred_NE_W = binC_NE_W.predict(X_test_NE_W)
print("Binary Classification Results for Flights between the Northeast and the West")
print("Accuracy Score:", accuracy_score(y_test_NE_W, y_pred_NE_W))
print(classification_report(y_test_NE_W, y_pred_NE_W, target_names=target_names))

Binary Classification Results for Flights between the Northeast and the West
Accuracy Score: 0.6031283844965645
              precision    recall  f1-score   support

     ON TIME       0.64      0.66      0.65     31435
     DELAYED       0.55      0.53      0.54     24888

    accuracy                           0.60     56323
   macro avg       0.60      0.60      0.60     56323
weighted avg       0.60      0.60      0.60     56323



## Multiclass Classification

In [190]:
X_NE_W = flights_NE_W.drop(['DELAYED','ORIGIN_REGION','DESTINATION_REGION','DEPARTURE_DELAY','ARRIVAL_DELAY','DELAY_CATEGORY'], axis=1)
y_NE_W = flights_NE_W['DELAY_CATEGORY']

X_train_NE_W, X_test_NE_W, y_train_NE_W, y_test_NE_W = train_test_split(X_NE_W, y_NE_W, test_size=0.3, random_state=123)


### Balancing the data

In [191]:
# Balance the dataset using RandomOverSampler
X_train_balanced_NE_W, y_train_balanced_NE_W = ros.fit_resample(X_train_NE_W, y_train_NE_W)

In [192]:
before_balancing = y_train_NE_W.value_counts()
print("Before balancing:",before_balancing)
after_balancing = y_train_balanced_NE_W.value_counts()
print("After balancing:",after_balancing)


Before balancing: ON-TIME         73194
DELAY<20        35115
20=<DELAY<40     9883
60=<DELAY        8630
40=<DELAY<60     4596
Name: DELAY_CATEGORY, dtype: int64
After balancing: 60=<DELAY       73194
40=<DELAY<60    73194
20=<DELAY<40    73194
ON-TIME         73194
DELAY<20        73194
Name: DELAY_CATEGORY, dtype: int64


### Feature selection

In [193]:
# Perform feature selection using ANOVA F-value between label/feature
print("Size before feature selection:",X_train_balanced_NE_W.shape, X_test_NE_W.shape)
X_train_selected_NE_W = selector.fit_transform(X_train_balanced_NE_W, y_train_balanced_NE_W)
X_test_NE_W = selector.transform(X_test_NE_W)
print("Size after feature selection:",X_train_selected_NE_W.shape, X_test_NE_W.shape)

Size before feature selection: (365970, 151) (56323, 151)
Size after feature selection: (365970, 20) (56323, 20)


#### Features selected for multicategory classifcation

In [194]:
selected_features_mulC_NE_W = X_NE_W.columns[selector.get_support()]
print(selected_features_mulC_NE_W)

Index(['ORIGIN_PRECIPITATION', 'ORIGIN_RAIN', 'ORIGIN_SNOWFALL',
       'ORIGIN_WEATHERCODE', 'ORIGIN_CLOUDCOVER_MID', 'ORIGIN_WINDSPEED_10M',
       'ORIGIN_WINDSPEED_100M', 'DESTINATION_RELATIVEHUMIDITY_2M',
       'DESTINATION_WEATHERCODE', 'MONTH_9', 'AIRLINE_AA', 'AIRLINE_UA',
       'ORIGIN_AIRPORT_EWR', 'SCHEDULED_DEP_TIME_CAT_Night',
       'SCHEDULED_DEP_TIME_CAT_Morning', 'SCHEDULED_DEP_TIME_CAT_Afternoon',
       'SCHEDULED_DEP_TIME_CAT_Evening', 'SCHEDULED_ARR_TIME_CAT_Morning',
       'SCHEDULED_ARR_TIME_CAT_Afternoon', 'SCHEDULED_ARR_TIME_CAT_Evening'],
      dtype='object')


### Training multiclass RFC

In [195]:
# Train a Random Forest classifier
rfc_NE_W = rfc.fit(X_train_selected_NE_W, y_train_balanced_NE_W)
y_pred_NE_W = rfc_NE_W.predict(X_test_NE_W)

In [196]:
# Get the classification report for each class
print("Multiclass Classification Results for Flights between the Northeast and the West")
print("Accuracy Score:", accuracy_score(y_test_NE_W,y_pred_NE_W))
print(classification_report(y_test_NE_W, y_pred_NE_W))

Multiclass Classification Results for Flights between the Northeast and the West
Accuracy Score: 0.4869591463522895
              precision    recall  f1-score   support

20=<DELAY<40       0.11      0.08      0.09      4290
40=<DELAY<60       0.07      0.04      0.05      1956
   60=<DELAY       0.20      0.15      0.17      3657
    DELAY<20       0.31      0.29      0.30     14985
     ON-TIME       0.62      0.70      0.66     31435

    accuracy                           0.49     56323
   macro avg       0.26      0.25      0.25     56323
weighted avg       0.45      0.49      0.47     56323



# Flights between the South and the West:

In [197]:
flights_S_W = flights_w_weather[(flights_w_weather['ORIGIN_REGION'] == 'South') & (flights_w_weather['DESTINATION_REGION'] == 'West') | (flights_w_weather['ORIGIN_REGION'] == 'West') & (flights_w_weather['DESTINATION_REGION'] == 'South')]

## Binary Classification:

In [198]:
# Encode categorical variables using one-hot encoding
flights_S_W = pd.get_dummies(flights_S_W, columns=['MONTH', 'DAY_OF_WEEK','AIRLINE', 'ORIGIN_AIRPORT', 'DESTINATION_AIRPORT', 'SCHEDULED_DEP_TIME_CAT', 'SCHEDULED_ARR_TIME_CAT'])

# Split the dataset into training and testing sets
X_S_W = flights_S_W.drop(['DELAYED','ORIGIN_REGION','DESTINATION_REGION','DEPARTURE_DELAY','ARRIVAL_DELAY','DELAY_CATEGORY'], axis=1)
y_S_W = flights_S_W['DELAYED']
X_train_S_W, X_test_S_W, y_train_S_W, y_test_S_W = train_test_split(X_S_W, y_S_W, test_size=0.3, random_state=123)


### Balancing the data

In [199]:
# Balance the data using RandomOverSampler
X_train_balanced_S_W, y_train_balanced_S_W = ros.fit_resample(X_train_S_W, y_train_S_W)


### Feature Selection

In [200]:
# Perform feature selection using ANOVA F-value between label/feature
print("Size before feature selection:",X_train_balanced_S_W.shape, X_test_S_W.shape)
X_train_selected_S_W = selector.fit_transform(X_train_balanced_S_W, y_train_balanced_S_W)
X_test_S_W = selector.transform(X_test_S_W)
print("Size after feature selection:",X_train_selected_S_W.shape, X_test_S_W.shape)


Size before feature selection: (443700, 237) (168000, 237)
Size after feature selection: (443700, 20) (168000, 20)


#### Selected features for binary classification

In [201]:
# Print the selected feature names
selected_features_binC_S_W = X_S_W.columns[selector.get_support()]
print(selected_features_binC_S_W)

Index(['ORIGIN_PRECIPITATION', 'ORIGIN_RAIN', 'ORIGIN_WEATHERCODE',
       'ORIGIN_CLOUDCOVER', 'ORIGIN_CLOUDCOVER_LOW', 'ORIGIN_CLOUDCOVER_MID',
       'ORIGIN_WINDSPEED_10M', 'MONTH_9', 'AIRLINE_AA', 'AIRLINE_OO',
       'AIRLINE_UA', 'AIRLINE_WN', 'ORIGIN_AIRPORT_IAH',
       'DESTINATION_AIRPORT_DFW', 'SCHEDULED_DEP_TIME_CAT_Night',
       'SCHEDULED_DEP_TIME_CAT_Morning', 'SCHEDULED_DEP_TIME_CAT_Afternoon',
       'SCHEDULED_DEP_TIME_CAT_Evening', 'SCHEDULED_ARR_TIME_CAT_Morning',
       'SCHEDULED_ARR_TIME_CAT_Evening'],
      dtype='object')


### Training binary RFC

In [202]:
# Train the model on the training set
binC_S_W = rfc.fit(X_train_selected_S_W, y_train_balanced_S_W)

In [203]:
# Evaluate the model on the testing set
y_pred_S_W = binC_S_W.predict(X_test_S_W)
print("Binary Classification Results for Flights between the South and the West")
print("Accuracy Score:", accuracy_score(y_test_S_W, y_pred_S_W))
print(classification_report(y_test_S_W, y_pred_S_W, target_names=target_names))

Binary Classification Results for Flights between the South and the West
Accuracy Score: 0.5986190476190476
              precision    recall  f1-score   support

     ON TIME       0.65      0.64      0.64     95371
     DELAYED       0.54      0.55      0.54     72629

    accuracy                           0.60    168000
   macro avg       0.59      0.59      0.59    168000
weighted avg       0.60      0.60      0.60    168000



## Multiclass Classification

In [204]:
X_S_W = flights_S_W.drop(['DELAYED','ORIGIN_REGION','DESTINATION_REGION','DEPARTURE_DELAY','ARRIVAL_DELAY','DELAY_CATEGORY'], axis=1)
y_S_W = flights_S_W['DELAY_CATEGORY']

X_train_S_W, X_test_S_W, y_train_S_W, y_test_S_W = train_test_split(X_S_W, y_S_W, test_size=0.3, random_state=123)


### Balancing the data

In [205]:
# Balance the dataset using RandomOverSampler
X_train_balanced_S_W, y_train_balanced_S_W = ros.fit_resample(X_train_S_W, y_train_S_W)

In [206]:
before_balancing = y_train_S_W.value_counts()
print("Before balancing:",before_balancing)
after_balancing = y_train_balanced_S_W.value_counts()
print("After balancing:",after_balancing)


Before balancing: ON-TIME         221850
DELAY<20        105632
20=<DELAY<40     28995
60=<DELAY        22461
40=<DELAY<60     13060
Name: DELAY_CATEGORY, dtype: int64
After balancing: ON-TIME         221850
DELAY<20        221850
20=<DELAY<40    221850
40=<DELAY<60    221850
60=<DELAY       221850
Name: DELAY_CATEGORY, dtype: int64


### Feature selection

In [207]:
# Perform feature selection using ANOVA F-value between label/feature
print("Size before feature selection:",X_train_balanced_S_W.shape, X_test_S_W.shape)
X_train_selected_S_W = selector.fit_transform(X_train_balanced_S_W, y_train_balanced_S_W)
X_test_S_W = selector.transform(X_test_S_W)
print("Size after feature selection:",X_train_selected_S_W.shape, X_test_S_W.shape)

Size before feature selection: (1109250, 237) (168000, 237)
Size after feature selection: (1109250, 20) (168000, 20)


#### Features selected for multicategory classifcation

In [208]:
selected_features_mulC_S_W = X_S_W.columns[selector.get_support()]
print(selected_features_mulC_S_W)

Index(['ORIGIN_PRECIPITATION', 'ORIGIN_RAIN', 'ORIGIN_WEATHERCODE',
       'ORIGIN_CLOUDCOVER', 'ORIGIN_CLOUDCOVER_MID', 'ORIGIN_CLOUDCOVER_HIGH',
       'ORIGIN_WINDSPEED_10M', 'MONTH_6', 'MONTH_9', 'AIRLINE_DL',
       'AIRLINE_OO', 'AIRLINE_UA', 'AIRLINE_WN', 'DESTINATION_AIRPORT_DFW',
       'SCHEDULED_DEP_TIME_CAT_Night', 'SCHEDULED_DEP_TIME_CAT_Morning',
       'SCHEDULED_DEP_TIME_CAT_Afternoon', 'SCHEDULED_DEP_TIME_CAT_Evening',
       'SCHEDULED_ARR_TIME_CAT_Morning', 'SCHEDULED_ARR_TIME_CAT_Evening'],
      dtype='object')


### Training multiclass RFC

In [209]:
# Train a Random Forest classifier
rfc_S_W = rfc.fit(X_train_selected_S_W, y_train_balanced_S_W)
y_pred_S_W = rfc_S_W.predict(X_test_S_W)

In [210]:
# Get the classification report for each class
print("Multiclass Classification Results for Flights between the South and the West")
print("Accuracy Score:", accuracy_score(y_test_S_W,y_pred_S_W))
print(classification_report(y_test_S_W, y_pred_S_W))

Multiclass Classification Results for Flights between the South and the West
Accuracy Score: 0.40860714285714284
              precision    recall  f1-score   support

20=<DELAY<40       0.10      0.16      0.12     12475
40=<DELAY<60       0.05      0.12      0.07      5561
   60=<DELAY       0.11      0.17      0.13      9526
    DELAY<20       0.32      0.28      0.30     45067
     ON-TIME       0.65      0.54      0.59     95371

    accuracy                           0.41    168000
   macro avg       0.24      0.25      0.24    168000
weighted avg       0.47      0.41      0.43    168000

