In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [34]:
data = pd.read_csv("Airline_Delay_Cause.csv")
data.head()

Unnamed: 0,year,month,carrier,carrier_name,airport,airport_name,arr_flights,arr_del15,carrier_ct,weather_ct,...,security_ct,late_aircraft_ct,arr_cancelled,arr_diverted,arr_delay,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay
0,2023,8,9E,Endeavor Air Inc.,ABE,"Allentown/Bethlehem/Easton, PA: Lehigh Valley ...",89.0,13.0,2.25,1.6,...,0.0,5.99,2.0,1.0,1375.0,71.0,761.0,118.0,0.0,425.0
1,2023,8,9E,Endeavor Air Inc.,ABY,"Albany, GA: Southwest Georgia Regional",62.0,10.0,1.97,0.04,...,0.0,7.42,0.0,1.0,799.0,218.0,1.0,62.0,0.0,518.0
2,2023,8,9E,Endeavor Air Inc.,AEX,"Alexandria, LA: Alexandria International",62.0,10.0,2.73,1.18,...,0.0,4.28,1.0,0.0,766.0,56.0,188.0,78.0,0.0,444.0
3,2023,8,9E,Endeavor Air Inc.,AGS,"Augusta, GA: Augusta Regional at Bush Field",66.0,12.0,3.69,2.27,...,0.0,1.57,1.0,1.0,1397.0,471.0,320.0,388.0,0.0,218.0
4,2023,8,9E,Endeavor Air Inc.,ALB,"Albany, NY: Albany International",92.0,22.0,7.76,0.0,...,0.0,11.28,2.0,0.0,1530.0,628.0,0.0,134.0,0.0,768.0


In [35]:
top_50_airports = data.groupby('airport').agg({'arr_flights': 'sum'}).reset_index()
top_50_airports = top_50_airports.sort_values(by='arr_flights', ascending=False).head(50)

In [36]:
filtered_top_50 = data[data['airport'].isin(top_50_airports['airport'])]

In [37]:
features = [
    'arr_flights', 'carrier_ct', 'weather_ct', 'nas_ct', 
    'security_ct', 'late_aircraft_ct', 'arr_cancelled', 'arr_diverted'
]
target = 'arr_del15'

In [38]:
filtered_top_50 = filtered_top_50.dropna(subset=features + [target])

In [39]:
X = filtered_top_50[features]
y = (filtered_top_50[target] > 0).astype(int)

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [41]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [42]:
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train_scaled, y_train)

In [43]:
y_pred = rf_model.predict(X_test_scaled)

In [44]:
report_top_50 = classification_report(y_test, y_pred, target_names=["No Delay", "Delay"])
feature_importances_top_50 = rf_model.feature_importances_

In [45]:
report_top_50, dict(zip(features, feature_importances_top_50))

('              precision    recall  f1-score   support\n\n    No Delay       1.00      1.00      1.00       388\n       Delay       1.00      1.00      1.00     18305\n\n    accuracy                           1.00     18693\n   macro avg       1.00      1.00      1.00     18693\nweighted avg       1.00      1.00      1.00     18693\n',
 {'arr_flights': 0.12608830655745573,
  'carrier_ct': 0.43350643696198604,
  'weather_ct': 0.03907406392451505,
  'nas_ct': 0.24548074303478357,
  'security_ct': 0.004013034169642438,
  'late_aircraft_ct': 0.14053190970006757,
  'arr_cancelled': 0.009220626300190573,
  'arr_diverted': 0.002084879351359})