In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import TargetEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import xgboost as xgb
from amplpy import AMPL, ampl_notebook
from joblib import load, dump
import sys
sys.path.append("../src")
from utils import cost_prediction

import warnings
# Silence every warning category for the remainder of the kernel session.
# NOTE(review): a blanket "ignore" also hides numerical RuntimeWarnings (e.g. np.log on
# non-positive values) and library deprecation notices — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Opt in to pandas' future behaviour: prevent silent data type changes (downcasting) during operations.
pd.set_option("future.no_silent_downcasting", True)

In [2]:
# Read the raw routes-and-fares table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../data/US Airline Flight Routes and Fares 1993-2024.csv")
display(df.head(n=5))

Unnamed: 0,tbl,Year,quarter,citymarketid_1,citymarketid_2,city1,city2,airportid_1,airportid_2,airport_1,...,fare,carrier_lg,large_ms,fare_lg,carrier_low,lf_ms,fare_low,Geocoded_City1,Geocoded_City2,tbl1apk
0,Table1a,2021,3,30135,33195,"Allentown/Bethlehem/Easton, PA","Tampa, FL (Metropolitan Area)",10135,14112,ABE,...,81.43,G4,1.0,81.43,G4,1.0,81.43,,,202131013514112ABEPIE
1,Table1a,2021,3,30135,33195,"Allentown/Bethlehem/Easton, PA","Tampa, FL (Metropolitan Area)",10135,15304,ABE,...,208.93,DL,0.4659,219.98,UA,0.1193,154.11,,,202131013515304ABETPA
2,Table1a,2021,3,30140,30194,"Albuquerque, NM","Dallas/Fort Worth, TX",10140,11259,ABQ,...,184.56,WN,0.9968,184.44,WN,0.9968,184.44,,,202131014011259ABQDAL
3,Table1a,2021,3,30140,30194,"Albuquerque, NM","Dallas/Fort Worth, TX",10140,11298,ABQ,...,182.64,AA,0.9774,183.09,AA,0.9774,183.09,,,202131014011298ABQDFW
4,Table1a,2021,3,30140,30466,"Albuquerque, NM","Phoenix, AZ",10140,14107,ABQ,...,177.11,WN,0.6061,184.49,AA,0.3939,165.77,,,202131014014107ABQPHX


In [4]:
# Keep only the fields the per-route fare model needs and normalise the year column name.
df = df[["Year", "airport_1", "airport_2", "passengers", "fare"]].rename(columns={"Year": "year"})

# np.log yields NaN for missing fares and -inf/NaN for non-positive ones, and scikit-learn
# estimators reject NaN/inf inputs. Remove unusable rows here so the problem does not surface
# later as an opaque fit error. On clean data this step drops nothing.
df = df.dropna(subset=["year", "airport_1", "airport_2", "passengers", "fare"])
df = df[df["fare"] > 0]

# Route key is "<origin>_<destination>"; the regression target is the natural log of the fare.
df = df.assign(
    route=df["airport_1"] + "_" + df["airport_2"],
    ln_fare=np.log(df["fare"]),
).drop(columns=["airport_1", "airport_2", "fare"])
df.head()

Unnamed: 0,year,passengers,route,ln_fare
0,2021,180,ABE_PIE,4.399744
1,2021,19,ABE_TPA,5.341999
2,2021,204,ABQ_DAL,5.217975
3,2021,264,ABQ_DFW,5.207517
4,2021,398,ABQ_PHX,5.176771


In [9]:
# Per-route fare models, keyed by route code; each value holds the fitted coefficients.
route_models = {}

# Hold out 20% of the rows (fixed seed); only the training split is used to fit the models below.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop("ln_fare", axis=1), df["ln_fare"], test_size=0.2, random_state=42
)
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

# One groupby pass hands over each route's rows directly, instead of re-scanning the whole
# training frame with a boolean mask once per route. sort=False keeps routes in order of first
# appearance, which matches the key order produced by iterating over .unique(). Rows whose
# route is missing are skipped by groupby rather than producing an empty, unfittable selection.
for route, route_data in df_train.groupby("route", sort=False):
    # Prepare features and target for this route
    X = route_data[["passengers", "year"]]
    y = route_data["ln_fare"]

    # Fit ln(fare) = intercept + c_passengers * passengers + c_year * year
    model = LinearRegression()
    model.fit(X, y)

    # Store the coefficients so that fare = a * exp(-b * passengers + y * year):
    # "a" is the exponentiated intercept, "b" the negated passenger coefficient,
    # and "y" the year coefficient.
    route_models[route] = {
        "a": np.exp(model.intercept_),
        "b": - 1 * model.coef_[0],
        "y": model.coef_[1],
    }

In [10]:
# Persist the per-route coefficient dictionary to disk with joblib.
dump(value=route_models, filename="../models/route_fare_models.joblib")

['../models/route_fare_models.joblib']