# Predicting Airline Delays Using Machine Learning

## Loading the Dataset

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
# Location of the raw CSV. NOTE: this is an absolute path on one specific
# machine; point DATA_PATH at your own copy of Airlines.csv when running
# the notebook elsewhere.
DATA_PATH = r"C:\Users\kruth\Documents\VSCode\Project2\Airlines.csv"

df = pd.read_csv(DATA_PATH)

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 539383 entries, 0 to 539382
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           539383 non-null  int64 
 1   Airline      539383 non-null  object
 2   Flight       539383 non-null  int64 
 3   AirportFrom  539383 non-null  object
 4   AirportTo    539383 non-null  object
 5   DayOfWeek    539383 non-null  int64 
 6   Time         539383 non-null  int64 
 7   Length       539383 non-null  int64 
 8   Delay        539383 non-null  int64 
dtypes: int64(6), object(3)
memory usage: 37.0+ MB
None
   id Airline  Flight AirportFrom AirportTo  DayOfWeek  Time  Length  Delay
0   1      CO     269         SFO       IAH          3    15     205      1
1   2      US    1558         PHX       CLT          3    15     222      1
2   3      AA    2400         LAX       DFW          3    20     165      1
3   4      AA    2466         SFO       DFW          3    20     195

## Pre-Processing Steps

In [5]:
# Remove the 'id' column before encoding and modelling.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=['id'], errors='ignore')

In [6]:
# One-hot encode the three string-valued columns. drop_first=True omits the
# first level of each column so the indicator columns are not redundant.
categorical_cols = ['Airline', 'AirportFrom', 'AirportTo']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [7]:
# Numerical columns to standardize.
num_features = ['Time', 'Length', 'DayOfWeek']
scaler = StandardScaler()

# NOTE(review): the scaler is fitted on the whole frame here, i.e. before the
# train/test split performed in the following cell, so rows that later land in
# the test set also contribute to the fitted statistics. Confirm this is
# acceptable for the intended evaluation.
scaled_values = scaler.fit_transform(df_encoded[num_features])
df_encoded[num_features] = scaled_values

In [8]:
# Separate the target column ('Delay') from the feature matrix.
y = df_encoded['Delay']
X = df_encoded.drop(columns=['Delay'])

# Hold out 20% of the rows for testing. The split is stratified on y so both
# parts keep the same class proportions, and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## Model

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Sample a smaller subset of the data for faster testing
X_train_sampled = X_train.sample(frac=0.1, random_state=42)  # 10% of the training data
# Select the labels through the sampled rows' index so every feature row is
# paired with its own label by construction, instead of depending on two
# separate random draws happening to pick the same rows.
y_train_sampled = y_train.loc[X_train_sampled.index]

# Initialize the model with fewer trees and limited depth
# (n_jobs=-1 uses all available CPU cores)
rf_model = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=42, n_jobs=-1)

# Train the model on the sampled data
rf_model.fit(X_train_sampled, y_train_sampled)

# Predicting on the full (unsampled) test set
y_pred = rf_model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Confusion Matrix (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Classification Report (per-class precision / recall / F1)
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)


Accuracy: 56.96%
Confusion Matrix:
 [[58880   944]
 [45487  2566]]
Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.98      0.72     59824
           1       0.73      0.05      0.10     48053

    accuracy                           0.57    107877
   macro avg       0.65      0.52      0.41    107877
weighted avg       0.64      0.57      0.44    107877

