In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [30]:
def load():
    return pd.read_csv("../data/Airlines_updated.csv")

def get_split(df):
    X = df.drop(['id', 'Delay'], axis=1)
    y = df['Delay']
    return train_test_split(X, y, test_size=0.2, random_state=0)

def print_metrics(y_test, y_pred):
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

In [31]:
df = load()
df.head()

Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay,Airline_DelayRate,Route,Route_AvgDelay
0,0,CO,269,SFO,IAH,3,15,205,1,0.566199,SFO_IAH,0.800866
1,1,US,1558,PHX,CLT,3,15,222,1,0.335971,PHX_CLT,0.418972
2,2,AA,2400,LAX,DFW,3,20,165,1,0.38847,LAX_DFW,0.347426
3,3,AA,2466,SFO,DFW,3,20,195,1,0.38847,SFO_DFW,0.525836
4,4,AS,108,ANC,SEA,3,30,202,0,0.33929,ANC_SEA,0.344519


In [32]:
categorical_columns = ['Airline', 'AirportFrom', 'AirportTo', 'Route']
numeric_columns = ['DayOfWeek', 'Time', 'Length', 'Flight', 'Airline_DelayRate', 'Route_AvgDelay']
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),
        ('cat', OneHotEncoder(handle_unknown='infrequent_if_exist'), categorical_columns)
    ])

In [33]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(random_state=0, max_iter=1000))
])
param_grid = {
    'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
    'clf__penalty': ['l1', 'l2'],
    'clf__solver': ['liblinear', 'saga']
}

In [34]:
X_train, X_test, y_train, y_test = get_split(df)
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)



Best Parameters: {'clf__C': 0.1, 'clf__penalty': 'l1', 'clf__solver': 'liblinear'}
Best Accuracy: 0.6549735092959692


In [36]:
import joblib
joblib.dump(grid_search.best_estimator_, "../models/log_reg_acc_6549")

['../models/log_reg_acc_6549']