In [2]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# import libraries.
import pandas as pd
import numpy as np
import glob
import holidays
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [4]:
# Set the path to the folder containing the CSV files.
folder_path = "/content/drive/MyDrive/data_Nazila"

In [5]:
# Use glob to create a list of all CSV files in the folder
csv_files = glob.glob(folder_path + "/*.csv")

# Read each CSV file into a DataFrame and store in a list
dfs = [pd.read_csv(file, skiprows=7, header=0, usecols=range(17))
       for file in csv_files]

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Save the combined DataFrame to a new CSV file. Overwrite everytime the code
# runs.
#combined_df.to_csv(folder_path + "/combined_file.csv", index=False, mode="w")

In [None]:
# display the first few rows of the combined Dataframe.
combined_df.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Destination Airport,Scheduled departure time,Actual departure time,Scheduled elapsed time (Minutes),Actual elapsed time (Minutes),Departure delay (Minutes),Wheels-off time,Taxi-Out time (Minutes),Delay Carrier (Minutes),Delay Weather (Minutes),Delay National Aviation System (Minutes),Delay Security (Minutes),Delay Late Aircraft Arrival (Minutes)
0,WN,1/1/2021,559,N8626B,DAL,7:45,7:43,175,172,-2,7:52,9,0,0,0,0,0
1,WN,1/1/2021,824,N8324A,SJC,6:00,5:58,80,86,-2,6:29,31,0,0,0,0,0
2,WN,1/1/2021,1149,N8308K,SMF,19:45,20:08,85,75,23,20:14,6,0,0,0,0,0
3,WN,1/1/2021,1265,N8302F,PHX,9:45,9:44,85,64,-1,9:51,7,0,0,0,0,0
4,WN,1/1/2021,1479,N8542Z,SMF,10:05,10:03,90,73,-2,10:09,6,0,0,0,0,0


In [None]:
# display the last few rows fo the combined Dataframe.
combined_df.tail()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Destination Airport,Scheduled departure time,Actual departure time,Scheduled elapsed time (Minutes),Actual elapsed time (Minutes),Departure delay (Minutes),Wheels-off time,Taxi-Out time (Minutes),Delay Carrier (Minutes),Delay Weather (Minutes),Delay National Aviation System (Minutes),Delay Security (Minutes),Delay Late Aircraft Arrival (Minutes)
484600,AA,12/31/2024,3185,N453AA,DFW,17:30,17:18,184,155,-12,17:34,16,0,0,0,0,0
484601,AA,12/31/2024,3231,N421UW,PHL,0:55,0:52,309,268,-3,1:05,13,0,0,0,0,0
484602,AA,12/31/2024,3258,N378SC,SFO,13:30,13:22,92,80,-8,13:41,19,0,0,0,0,0
484603,AA,12/31/2024,3269,N335RT,MIA,0:52,0:52,299,267,0,1:07,15,0,0,0,0,0
484604,AA,12/31/2024,3295,N198UW,DFW,15:45,15:42,185,162,-3,16:02,20,0,0,0,0,0


In [None]:
# see a random sample from the middle of the DataFrame.
combined_df.iloc[len(combined_df)//4 : 3*len(combined_df)//4].sample(7)

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Destination Airport,Scheduled departure time,Actual departure time,Scheduled elapsed time (Minutes),Actual elapsed time (Minutes),Departure delay (Minutes),Wheels-off time,Taxi-Out time (Minutes),Delay Carrier (Minutes),Delay Weather (Minutes),Delay National Aviation System (Minutes),Delay Security (Minutes),Delay Late Aircraft Arrival (Minutes)
170379,UA,9/27/2023,1730,N27526,MCO,22:52,22:47,291,284,-5,23:09,22,0,0,0,0,0
335406,AS,2/26/2023,1293,N594AS,SEA,6:00,6:00,180,176,0,6:36,36,0,0,0,0,0
218116,DL,3/7/2023,738,N327NW,DFW,8:40,8:30,182,193,-10,8:59,29,0,0,0,0,0
168461,UA,9/20/2023,2326,N17105,EWR,7:30,7:24,321,307,-6,7:40,16,0,0,0,0,0
207679,DL,2/2/2024,446,N513DA,MCO,15:55,15:54,283,258,-1,16:07,13,0,0,0,0,0
300716,DL,10/13/2023,2906,N518DQ,SLC,5:35,5:29,123,109,-6,5:50,21,0,0,0,0,0
301699,DL,10/16/2022,661,N371NW,DFW,18:35,18:31,183,181,-4,18:49,18,0,0,0,0,0


In [None]:
# get the numbers of rows and columns.
print(combined_df.shape)

(484605, 17)


In [6]:
# Convert 'Date (MM/DD/YYYY)' to datetime
combined_df['Date'] = pd.to_datetime(combined_df['Date (MM/DD/YYYY)'])

In [7]:
# Extract useful time-based features
combined_df['Month'] = combined_df['Date'].dt.month
combined_df['Day'] = combined_df['Date'].dt.day
combined_df['Weekday'] = combined_df['Date'].dt.weekday  # Monday = 0, Sunday = 6

In [8]:
# Add Holiday Indicator
us_holidays = holidays.US()
combined_df['Holiday'] = combined_df['Date'].apply(lambda x: 1 if x in us_holidays else 0)

In [9]:
# Create Season Feature based on the month
def get_season(date):
    month = date.month
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'
combined_df['Season'] = combined_df['Date'].apply(get_season)

In [10]:
# Extract hour from Scheduled Departure Time with error handling
def extract_hour(time_str):
    try:
        return int(time_str.split(":")[0])  # Extract hour part
    except:
        return np.nan  # Handle potential errors

# Apply function to extract the hour
combined_df['Scheduled Departure Hour'] = combined_df['Scheduled departure time'].apply(extract_hour)

# Drop any rows where Scheduled Departure Hour couldn't be extracted
combined_df = combined_df.dropna(subset=['Scheduled Departure Hour'])

# Convert Scheduled Departure Hour to integer type
combined_df['Scheduled Departure Hour'] = combined_df['Scheduled Departure Hour'].astype(int)

# Drop the original string column to avoid conversion issues
combined_df = combined_df.drop(columns=['Scheduled departure time'])


In [18]:
# Define features and target
feature_cols = ['Carrier Code', 'Destination Airport', 'Month', 'Day', 'Weekday', 'Holiday', 'Season',
                'Scheduled Departure Hour', 'Scheduled elapsed time (Minutes)']

target_col = 'Delay'


In [19]:
# Create binary target variable: 1 if (Departure delay (Minutes)) >= 15 min, else 0
combined_df[target_col] = (combined_df['Departure delay (Minutes)']>= 15).astype(int)

# (Optional) Filter rows where Delay is 1
delayed_rows = combined_df[combined_df['Delay'] == 1]
print(delayed_rows.head())


   Date (MM/DD/YYYY)  Flight Number Tail Number Actual departure time  \
2           1/1/2021           1149      N8308K                 20:08   
12          1/1/2021           2776      N8549Z                 18:05   
27          1/1/2021           3989      N8662F                 13:34   
36          1/1/2022            488      N8813Q                 18:29   
37          1/1/2022            492      N8750Q                  8:53   

    Scheduled elapsed time (Minutes)  Actual elapsed time (Minutes)  \
2                          -1.374873                             75   
12                         -1.320681                             64   
27                         -1.483257                             52   
36                         -1.374873                             73   
37                         -1.374873                             78   

    Departure delay (Minutes)  Taxi-Out time (Minutes)  \
2                          23                        6   
12                

In [None]:
combined_df['Delay'].value_counts()

Unnamed: 0_level_0,count
Delay,Unnamed: 1_level_1
0,398784
1,85821


In [20]:
# One-hot encode categorical features
combined_df = pd.get_dummies(combined_df, columns=['Carrier Code', 'Destination Airport', 'Scheduled Departure Hour', 'Season'])

KeyError: "None of [Index(['Carrier Code', 'Destination Airport', 'Scheduled Departure Hour',\n       'Season'],\n      dtype='object')] are in the [columns]"

In [21]:
# Standardize numerical features
scaler = StandardScaler()
combined_df['Scheduled elapsed time (Minutes)'] = scaler.fit_transform(combined_df[['Scheduled elapsed time (Minutes)']])

In [14]:
# Drop 'Wheels-off time' since it's not used
combined_df = combined_df.drop(columns=['Wheels-off time'])


In [22]:
# Split data into training and testing sets
X = combined_df.drop(columns=[
    'Date',
    'Date (MM/DD/YYYY)',
    'Flight Number',
    'Tail Number',
    'Actual departure time',
    'Actual elapsed time (Minutes)',
    'Departure delay (Minutes)',
    'Delay Carrier (Minutes)',  # Remove these delay-related features
    'Delay Weather (Minutes)',
    'Delay National Aviation System (Minutes)',
    'Delay Security (Minutes)',
    'Delay Late Aircraft Arrival (Minutes)',
    'Taxi-Out time (Minutes)',
    target_col  # This is 'Delay', your target variable
])

# Print all feature names used in training
print("Final Features used for training:", X.columns.tolist())

y = combined_df[target_col]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


Final Features used for training: ['Scheduled elapsed time (Minutes)', 'Month', 'Day', 'Weekday', 'Holiday', 'Carrier Code_AA', 'Carrier Code_AS', 'Carrier Code_DL', 'Carrier Code_UA', 'Carrier Code_WN', 'Destination Airport_ABQ', 'Destination Airport_ANC', 'Destination Airport_ATL', 'Destination Airport_AUS', 'Destination Airport_BHM', 'Destination Airport_BNA', 'Destination Airport_BOI', 'Destination Airport_BOS', 'Destination Airport_BTR', 'Destination Airport_BUF', 'Destination Airport_BWI', 'Destination Airport_BZN', 'Destination Airport_CID', 'Destination Airport_CLE', 'Destination Airport_CLT', 'Destination Airport_CVG', 'Destination Airport_DAL', 'Destination Airport_DCA', 'Destination Airport_DEN', 'Destination Airport_DFW', 'Destination Airport_DSM', 'Destination Airport_DTW', 'Destination Airport_EGE', 'Destination Airport_ELP', 'Destination Airport_EUG', 'Destination Airport_EWR', 'Destination Airport_FLL', 'Destination Airport_GEG', 'Destination Airport_HNL', 'Destination 

In [23]:
# Which columns in X_train are not numeric?
obj_cols = X_train.select_dtypes(include=['object']).columns
print("Columns with object dtype:", obj_cols)

Columns with object dtype: Index([], dtype='object')


In [24]:
# Train a Logistic Regression model
log_reg = LogisticRegression(penalty='l2', C=1.0, solver='lbfgs', random_state=42, max_iter=1000)
#log_reg = LogisticRegression(class_weight={0: 1, 1: 2.5}, random_state=42, max_iter=1000)

log_reg.fit(X_train, y_train)


In [27]:
# Make predictions
y_probs = log_reg.predict_proba(X_test)[:, 1]  # Get probabilities for class 1 (delayed flights)


threshold = 0.2  # Adjust as needed
y_pred_adjusted = (y_probs >= threshold).astype(int)  # Convert probabilities to class labels


In [28]:
# Evaluate model performance
cm = confusion_matrix(y_test, y_pred_adjusted)
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_test, y_pred_adjusted))

Confusion Matrix:
 [[50548 28545]
 [ 7416 10412]]

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.64      0.74     79093
           1       0.27      0.58      0.37     17828

    accuracy                           0.63     96921
   macro avg       0.57      0.61      0.55     96921
weighted avg       0.76      0.63      0.67     96921

