In [1]:
import joblib
import numpy as np
import pandas as pd
import time
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [5]:
# Read the raw flight-price records from the Excel workbook, then preview
# the first five rows (the default for ``head``) as the cell's output.
df = pd.read_excel("flightprice.xlsx")
df.head()

Unnamed: 0,Airline,Date_of_Journey,Source,Destination,Route,Dep_Time,Arrival_Time,Duration,Total_Stops,Additional_Info,Price
0,IndiGo,24/03/2019,Banglore,New Delhi,BLR → DEL,22:20,01:10 22 Mar,2h 50m,non-stop,No info,3897
1,Air India,1/05/2019,Kolkata,Banglore,CCU → IXR → BBI → BLR,05:50,13:15,7h 25m,2 stops,No info,7662
2,Jet Airways,9/06/2019,Delhi,Cochin,DEL → LKO → BOM → COK,09:25,04:25 10 Jun,19h,2 stops,No info,13882
3,IndiGo,12/05/2019,Kolkata,Banglore,CCU → NAG → BLR,18:05,23:30,5h 25m,1 stop,No info,6218
4,IndiGo,01/03/2019,Banglore,New Delhi,BLR → NAG → DEL,16:50,21:35,4h 45m,1 stop,No info,13302


In [None]:
# Show the raw journey-date strings as a one-column frame to inspect their format.
df.loc[:, ["Date_of_Journey"]]

Unnamed: 0,Date_of_Journey
0,24/03/2019
1,1/05/2019
2,9/06/2019
3,12/05/2019
4,01/03/2019
...,...
10678,9/04/2019
10679,27/04/2019
10680,27/04/2019
10681,01/03/2019


In [16]:
# ---- Duration ----
def duration_to_minutes(x):
    """Convert a duration label such as ``"2h 50m"`` into total minutes.

    Parameters
    ----------
    x : str
        Duration text with an optional hours part (an integer followed by
        ``h``) and an optional minutes part (an integer followed by ``m``),
        e.g. ``"7h 25m"``, ``"19h"`` or ``"5m"``. Whitespace between the two
        parts is optional.

    Returns
    -------
    int or float
        ``hours * 60 + minutes``. A string containing neither marker yields
        ``0``. A non-string input (``None`` or the NaN pandas uses for a
        missing cell) yields ``float("nan")`` instead of raising
        ``TypeError``.

    Raises
    ------
    ValueError
        If the text in front of an ``h`` or ``m`` marker is not an integer.
    """
    # Missing cells reach ``.apply`` as float NaN / None; ``'h' in x`` would
    # raise TypeError on those, so propagate them as NaN instead.
    if not isinstance(x, str):
        return float("nan")

    hours, minutes = 0, 0
    if "h" in x:
        hours = int(x.split("h")[0])
    if "m" in x:
        # Take what sits between the hour marker (if any) and the minute
        # marker, so both "2h 50m" and "2h50m" parse; int() ignores the
        # surrounding whitespace.
        minutes = int(x.split("m")[0].split("h")[-1])
    return hours * 60 + minutes

df["Duration_minutes"] = df["Duration"].apply(duration_to_minutes)

# ---- Stops ----
# Series.map (rather than Series.replace) avoids the pandas downcasting
# FutureWarning and turns any label missing from the lookup into NaN, so
# the column stays numeric instead of keeping stray strings.
stops_lookup = {
    "non-stop": 0,
    "1 stop": 1,
    "2 stops": 2,
    "3 stops": 3,
    "4 stops": 4,
}
df["stops_num"] = df["Total_Stops"].map(stops_lookup)

# ---- Time ----
# An explicit format avoids the "could not infer format" warning and the
# slow per-element dateutil fallback.
df["Dep_Hour"] = pd.to_datetime(df["Dep_Time"], format="%H:%M").dt.hour

# Arrival times may carry a trailing date (e.g. "01:10 22 Mar"); only the
# leading HH:MM token is needed to get the hour.
arrival_clock = df["Arrival_Time"].str.split().str[0]
df["Arrival_Hour"] = pd.to_datetime(arrival_clock, format="%H:%M").dt.hour

# ---- Date ----
# Dates are written day-first (e.g. "24/03/2019").
df["Journey_Date"] = pd.to_datetime(df["Date_of_Journey"], dayfirst=True)
df["Journey_Day"] = df["Journey_Date"].dt.day
df["Journey_Month"] = df["Journey_Date"].dt.month


  df["stops_num"] = df["Total_Stops"].replace({
  df["Dep_Hour"] = pd.to_datetime(df["Dep_Time"]).dt.hour
  df["Arrival_Hour"] = pd.to_datetime(df["Arrival_Time"]).dt.hour


In [17]:
# Columns handed to the model: the three categorical descriptors first,
# followed by the numeric features derived in the feature-engineering cell.
feature_columns = [
    "Airline",
    "Source",
    "Destination",
    "Duration_minutes",
    "stops_num",
    "Dep_Hour",
    "Arrival_Hour",
    "Journey_Day",
    "Journey_Month",
]
X = df.loc[:, feature_columns]

# The target is log(1 + Price); predictions made on this scale must be passed
# through np.expm1 to get back to price units.
y = np.log1p(df["Price"])


In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Columns sent through the numeric branch: median imputation, then scaling.
numeric_features = [
    "Duration_minutes", "stops_num", "Dep_Hour",
    "Arrival_Hour", "Journey_Day", "Journey_Month",
]

# Columns sent through the categorical branch: most-frequent imputation,
# then one-hot encoding.
categorical_features = ["Airline", "Source", "Destination"]

numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# handle_unknown="ignore" keeps transform from failing on categories that
# were not present when the encoder was fitted.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group to its branch and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)


In [19]:
from xgboost import XGBRegressor

# Hyperparameters for the boosted-tree regressor, gathered in one place so
# they can be read at a glance.
xgb_params = dict(
    objective="reg:squarederror",
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)
model = XGBRegressor(**xgb_params)

# Chain preprocessing and the regressor so one object takes raw feature
# columns through to a prediction.
final_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("reg", model),
    ]
)

# Fit on all of X / y (no hold-out split is made in this cell); the fitted
# pipeline is the cell's displayed result.
final_pipeline.fit(X, y)


0,1,2
,steps,"[('preprocessor', ...), ('reg', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [20]:
import joblib, os

# Persist the fitted pipeline for the backend, creating the target directory
# first if it does not exist yet.
model_dir = "backend/model"
model_path = "backend/model/flight_model.pkl"

os.makedirs(model_dir, exist_ok=True)
joblib.dump(final_pipeline, model_path)


['backend/model/flight_model.pkl']