In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

cols = [
    "FL_DATE",
    "CRS_DEP_TIME",
    "DISTANCE",
    "AIR_TIME",
    "TAXI_OUT",
    "AIRLINE",
    "ORIGIN",
    "DEST",
    "DEP_DELAY"
]

df = pd.read_csv("/content/drive/MyDrive/flights_sample_3m.csv", usecols=cols, nrows=100000)

df["FL_DATE"] = pd.to_datetime(df["FL_DATE"])
df["MONTH"] = df["FL_DATE"].dt.month
df.drop("FL_DATE", axis=1, inplace=True)

df["IS_DELAYED"] = (df["DEP_DELAY"] > 0).astype(int)
df.drop("DEP_DELAY", axis=1, inplace=True)

df = pd.get_dummies(df, drop_first=True)
X = df.drop("IS_DELAYED", axis=1)
y = df["IS_DELAYED"]

X = X.dropna()
y = y[X.index]


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

lr = LogisticRegression(max_iter=100)
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

print("Logistic Regression Results")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Precision:", precision_score(y_test, y_pred_lr))
print("Recall:", recall_score(y_test, y_pred_lr))
print("F1 Score:", f1_score(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))

Logistic Regression Results
Accuracy: 0.6739365258988735
Precision: 0.5745577085088458
Recall: 0.20379500971163902
F1 Score: 0.30087129149663616
Confusion Matrix:
 [[11738  1010]
 [ 5329  1364]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
dt = DecisionTreeClassifier(random_state=42, class_weight='balanced')

dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)

# Evaluate
print("\nDecision Tree Results")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Precision:", precision_score(y_test, y_pred_dt))
print("Recall:", recall_score(y_test, y_pred_dt))
print("F1 Score:", f1_score(y_test, y_pred_dt))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))



Decision Tree Results
Accuracy: 0.5925106733192737
Precision: 0.4106181818181818
Recall: 0.42178395338413266
F1 Score: 0.416126179245283
Confusion Matrix:
 [[8696 4052]
 [3870 2823]]


In [None]:
rf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
    n_jobs=-1
)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)

print("\nRandom Forest Results")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Precision:", precision_score(y_test, y_pred_rf))
print("Recall:", recall_score(y_test, y_pred_rf))
print("F1 Score:", f1_score(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))



Random Forest Results
Accuracy: 0.6616943572861478
Precision: 0.5158469945355191
Recall: 0.2820857612430898
F1 Score: 0.36472520042499756
Confusion Matrix:
 [[10976  1772]
 [ 4805  1888]]


In [None]:
feature_names = X.columns
lr_importance = pd.DataFrame({
    "Feature": feature_names,
    "Coefficient": lr.coef_[0]
})
lr_importance["Abs_Coefficient"] = lr_importance["Coefficient"].abs()
lr_importance = lr_importance.sort_values(
    by="Abs_Coefficient", ascending=False
)
print("Top 10 Logistic Regression Features:")
print(lr_importance.head(10))


Top 10 Logistic Regression Features:
                            Feature  Coefficient  Abs_Coefficient
18    AIRLINE_SkyWest Airlines Inc.    -0.694052         0.694052
17         AIRLINE_Republic Airline    -0.591092         0.591092
8         AIRLINE_Endeavor Air Inc.    -0.579701         0.579701
19   AIRLINE_Southwest Airlines Co.     0.525078         0.525078
9                 AIRLINE_Envoy Air    -0.282307         0.282307
16        AIRLINE_PSA Airlines Inc.    -0.259660         0.259660
7      AIRLINE_Delta Air Lines Inc.    -0.247784         0.247784
119                      ORIGIN_DEN     0.238774         0.238774
417                        DEST_ATL    -0.210110         0.210110
14          AIRLINE_JetBlue Airways     0.162123         0.162123


In [None]:
dt_importance = pd.DataFrame({
    "Feature": X.columns,
    "Importance": dt.feature_importances_
})

dt_importance = dt_importance.sort_values(
    by="Importance", ascending=False
)

print("Top 10 Decision Tree Features:")
print(dt_importance.head(10))


Top 10 Decision Tree Features:
                            Feature  Importance
0                      CRS_DEP_TIME    0.173229
2                          AIR_TIME    0.126095
3                          DISTANCE    0.101909
1                          TAXI_OUT    0.092691
4                             MONTH    0.076473
19   AIRLINE_Southwest Airlines Co.    0.019747
6    AIRLINE_American Airlines Inc.    0.006892
7      AIRLINE_Delta Air Lines Inc.    0.005357
417                        DEST_ATL    0.005266
21    AIRLINE_United Air Lines Inc.    0.005154


In [None]:
dt_importance = pd.DataFrame({
    "Feature": X.columns,
    "Importance": dt.feature_importances_
})

dt_importance = dt_importance.sort_values(
    by="Importance", ascending=False
)

print("Top 10 Decision Tree Features:")
print(dt_importance.head(10))


Top 10 Random Forest Features:
                           Feature  Importance
0                     CRS_DEP_TIME    0.168991
2                         AIR_TIME    0.131991
1                         TAXI_OUT    0.119089
4                            MONTH    0.095393
3                         DISTANCE    0.073145
19  AIRLINE_Southwest Airlines Co.    0.014049
6   AIRLINE_American Airlines Inc.    0.005719
18   AIRLINE_SkyWest Airlines Inc.    0.005569
7     AIRLINE_Delta Air Lines Inc.    0.005233
21   AIRLINE_United Air Lines Inc.    0.005002
