## 1. Import Libraries

In [77]:
import os

import pickle

import numpy as np

import pandas as pd

import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
	OneHotEncoder,
	OrdinalEncoder,
	StandardScaler,
	MinMaxScaler,
	PowerTransformer,
	FunctionTransformer
)

from feature_engine.outliers import Winsorizer
from feature_engine.datetime import DatetimeFeatures
from feature_engine.selection import SelectBySingleFeaturePerformance
from feature_engine.encoding import (
	RareLabelEncoder,
	MeanEncoder,
	CountFrequencyEncoder
)

import matplotlib.pyplot as plt

import warnings

import xgboost as xgb

## 2. Display Settings

In [78]:
# Display every column of a DataFrame (None removes the column cap).
# The exact option name is used: pandas resolves abbreviated names by pattern
# matching, which can become ambiguous if similarly named options are added.
pd.set_option("display.max_columns", None)

In [79]:
# Ask scikit-learn transformers to return pandas DataFrames from transform(),
# so column names survive through the pipelines defined below.
sklearn.set_config(transform_output="pandas") # sklearn output pandas setting

In [80]:
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the
# libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

## 3. Read Dataset

In [81]:
# Load the training split and preview two random rows.
# NOTE(review): absolute, machine-specific path — the notebook cannot be re-run
# on another machine without editing it.
train=pd.read_csv(r"C:\Users\Debasish Das\Documents\01_ML Project\Ml Projects\Flight_Price_Prediction\Dataset\train.csv")
train.sample(2)

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
5691,Jet Airways,2019-06-24,Delhi,Cochin,23:05:00,19:00:00,1195,2.0,No Info,12819
5794,Jet Airways,2019-05-21,Delhi,Cochin,19:30:00,04:25:00,535,2.0,No Info,15129


In [82]:
# Load the held-out test split and preview two random rows.
# NOTE(review): absolute, machine-specific path — the notebook cannot be re-run
# on another machine without editing it.
test=pd.read_csv(r"C:\Users\Debasish Das\Documents\01_ML Project\Ml Projects\Flight_Price_Prediction\Dataset\test.csv")
test.sample(2)

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
1487,Jet Airways,2019-06-24,Delhi,Cochin,05:30:00,19:00:00,810,2.0,No Info,13292
1603,Air India,2019-04-21,Banglore,Delhi,17:00:00,19:45:00,165,0.0,No Info,5438


In [83]:
# Load the validation split and preview two random rows.
# NOTE(review): absolute, machine-specific path — the notebook cannot be re-run
# on another machine without editing it.
val=pd.read_csv(r"C:\Users\Debasish Das\Documents\01_ML Project\Ml Projects\Flight_Price_Prediction\Dataset\val.csv")
val.sample(2)

Unnamed: 0,airline,date_of_journey,source,destination,dep_time,arrival_time,duration,total_stops,additional_info,price
266,Jet Airways,2019-03-12,Banglore,New Delhi,05:45:00,21:20:00,935,1.0,No Info,13817
518,Jet Airways,2019-06-27,Delhi,Cochin,14:35:00,12:35:00,1320,2.0,In-flight meal not included,10919


## 4. Preprocessing

In [84]:
# airline: impute the most frequent value, group rare labels (tol=0.1) into
# "Other", then one-hot encode; unknown categories at transform time are ignored.
air_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore"))
])

# date_of_journey: extract the calendar features listed below and rescale them
# with MinMaxScaler.
feature_to_extract = ["month", "week", "day_of_week", "day_of_year"]

doj_transformer = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=feature_to_extract, yearfirst=True, format="mixed")),
    ("scaler", MinMaxScaler())
])

# source & destination (part 1): group rare labels, mean-encode the categories
# (this step needs the target at fit time), then apply a power transform.
location_pipe1 = Pipeline(steps=[
    ("grouper", RareLabelEncoder(tol=0.1, replace_with="Other", n_categories=2)),
    ("encoder", MeanEncoder()),
    ("scaler", PowerTransformer())
])

def is_north(X):
    """Replace every column of ``X`` with a 0/1 "is north city" flag.

    For each input column ``c`` the result holds a column ``c_is_north`` that
    is 1 where the value is one of the cities listed in ``north_cities`` and 0
    otherwise. The original columns are removed; ``X`` itself is not modified.
    """
    north_cities = ["Delhi", "Kolkata", "Mumbai", "New Delhi"]
    original_cols = list(X.columns)

    flagged = X.copy()
    for name in original_cols:
        # Always read from the untouched input so earlier flags cannot leak in.
        flagged[f"{name}_is_north"] = X[name].isin(north_cities).astype(int)

    return flagged.drop(columns=original_cols)

# source & destination: place the encoded location features (part1) next to
# the is_north flags (part2).
location_transformer = FeatureUnion(transformer_list=[
    ("part1", location_pipe1),
    ("part2", FunctionTransformer(func=is_north))
])

# dep_time & arrival_time (part 1): extract hour and minute, then min-max scale.
time_pipe1 = Pipeline(steps=[
    ("dt", DatetimeFeatures(features_to_extract=["hour", "minute"])),
    ("scaler", MinMaxScaler())
])

def part_of_day(X, morning=4, noon=12, eve=16, night=20):
    """Replace every time column of ``X`` with a part-of-day label.

    Each column is parsed with ``pd.to_datetime`` and reduced to its hour. The
    hour is then bucketed with half-open intervals:
    ``[morning, noon)`` -> "morning", ``[noon, eve)`` -> "afternoon",
    ``[eve, night)`` -> "evening", everything else -> "night".

    The result contains one ``<column>_part_of_day`` column per input column;
    the input columns themselves are dropped.
    """
    time_cols = list(X.columns)
    labels = ["morning", "afternoon", "evening"]

    # Hour of day for every input column, stored under the original names.
    hour_values = {name: pd.to_datetime(X[name]).dt.hour for name in time_cols}
    hours_frame = X.assign(**hour_values)

    bucketed = {}
    for name in time_cols:
        hour = hours_frame[name]
        conditions = [
            (hour >= morning) & (hour < noon),
            (hour >= noon) & (hour < eve),
            (hour >= eve) & (hour < night),
        ]
        bucketed[f"{name}_part_of_day"] = np.select(conditions, labels, default="night")

    return hours_frame.assign(**bucketed).drop(columns=time_cols)

# dep_time & arrival_time (part 2): label each time with its part of day,
# encode the labels with CountFrequencyEncoder and min-max scale the result.
time_pipe2 = Pipeline(steps=[
    ("part", FunctionTransformer(func=part_of_day)),
    ("encoder", CountFrequencyEncoder()),
    ("scaler", MinMaxScaler())
])

# Concatenate the hour/minute features (part1) with the part-of-day features (part2).
time_transformer = FeatureUnion(transformer_list=[
    ("part1", time_pipe1),
    ("part2", time_pipe2)
])

# duration
class RBFPercentileSimilarity(BaseEstimator, TransformerMixin):
    """Encode numeric columns as RBF similarities to their own percentiles.

    ``fit`` records, for every selected column, the values found at the
    requested percentiles. ``transform`` then emits one feature per
    (column, percentile) pair holding the RBF kernel between each row's value
    and that reference value.

    Parameters
    ----------
    variables : list of str or str, optional
        Columns to transform. When empty or None, every numeric column of the
        frame passed to ``fit`` is used.
    percentiles : list of float
        Quantiles (between 0 and 1) used as reference points.
    gamma : float
        ``gamma`` argument forwarded to ``rbf_kernel``.

    Attributes
    ----------
    variables_ : list
        Columns actually used, resolved during ``fit``.
    reference_values_ : dict
        Maps each column to a ``(n_percentiles, 1)`` array of reference values.
    """

    # The list default is only read, never mutated, so sharing it is harmless.
    def __init__(self, variables=None, percentiles=[0.25, 0.5, 0.75], gamma=0.1):
        self.variables = variables
        self.percentiles = percentiles
        self.gamma = gamma


    def fit(self, X, y=None):
        """Resolve the columns to use and store their percentile values."""
        # Keep the constructor parameter untouched (scikit-learn convention) so
        # that cloning or refitting on a different frame re-resolves the columns.
        if not self.variables:
            self.variables_ = X.select_dtypes(include="number").columns.to_list()
        elif isinstance(self.variables, str):
            self.variables_ = [self.variables]
        else:
            self.variables_ = list(self.variables)

        self.reference_values_ = {
            col: (
                X
                .loc[:, col]
                .quantile(self.percentiles)
                .values
                .reshape(-1, 1)
            )
            for col in self.variables_
        }

        return self


    def transform(self, X):
        """Return a DataFrame of RBF similarities, indexed like ``X``."""
        objects = []
        for col in self.variables_:
            # round() avoids float truncation (e.g. 0.29 * 100 -> 28.999...),
            # which would mislabel columns or create duplicates.
            columns = [
                f"{col}_rbf_{int(round(percentile * 100))}"
                for percentile in self.percentiles
            ]
            obj = pd.DataFrame(
                data=rbf_kernel(X.loc[:, [col]], Y=self.reference_values_[col], gamma=self.gamma),
                columns=columns,
                # Preserve the input index: downstream unions concatenate on
                # the index, so a fresh RangeIndex would misalign the rows.
                index=X.index
            )
            objects.append(obj)

        if not objects:
            # Nothing to encode; pd.concat would raise on an empty list.
            return pd.DataFrame(index=X.index)
        return pd.concat(objects, axis=1)
    

def duration_category(X, short=180, med=400):
    """Swap the ``duration`` column for a categorical ``duration_cat`` column.

    Durations below ``short`` become "short", durations in ``[short, med)``
    become "medium" and everything else becomes "long".
    """
    minutes = X["duration"]
    is_short = minutes < short
    is_medium = (minutes >= short) & (minutes < med)

    labels = np.select([is_short, is_medium], ["short", "medium"], default="long")
    return X.assign(duration_cat=labels).drop(columns="duration")

def is_over(X, value=1000):
    """Swap the ``duration`` column for a 0/1 flag ``duration_over_<value>``.

    The flag is 1 where ``duration`` is greater than or equal to ``value``.
    """
    flag_name = f"duration_over_{value}"
    reaches_threshold = (X["duration"] >= value).astype(int)
    return X.assign(**{flag_name: reaches_threshold}).drop(columns="duration")

# duration (part 1): RBF similarity to the column's percentiles, followed by a
# power transform.
duration_pipe1 = Pipeline(steps=[
    ("rbf", RBFPercentileSimilarity()),
    ("scaler", PowerTransformer())
])

# duration (part 2): short/medium/long label, ordinal-encoded in that order.
duration_pipe2 = Pipeline(steps=[
    ("cat", FunctionTransformer(func=duration_category)),
    ("encoder", OrdinalEncoder(categories=[["short", "medium", "long"]]))
])

# All duration views side by side: RBF features, ordinal category, the
# over-threshold flag and the standardised raw duration.
duration_union = FeatureUnion(transformer_list=[
    ("part1", duration_pipe1),
    ("part2", duration_pipe2),
    ("part3", FunctionTransformer(func=is_over)),
    ("part4", StandardScaler())
])

# Cap outliers (IQR method, fold=1.5) and impute the median before the union
# derives its features.
duration_transformer = Pipeline(steps=[
    ("outliers", Winsorizer(capping_method="iqr", fold=1.5)),
    ("imputer", SimpleImputer(strategy="median")),
    ("union", duration_union)
])

# total_stops
def is_direct(X):
    """Append ``is_direct_flight``: 1 where ``total_stops`` equals 0, else 0."""
    no_stops = X["total_stops"] == 0
    return X.assign(is_direct_flight=no_stops.astype(int))


# Impute the most frequent stop count, then append the is_direct_flight flag.
# Every step carries a descriptive, non-empty name so it can be looked up in
# named_steps and referenced unambiguously in parameter names.
total_stops_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("direct", FunctionTransformer(func=is_direct))
])

# additional_info (part 1): group rare labels (tol=0.1) into "Other", then
# one-hot encode; unknown categories at transform time are ignored.
info_pipe1 = Pipeline(steps=[
    ("group", RareLabelEncoder(tol=0.1, n_categories=2, replace_with="Other")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

def have_info(X):
    """Overwrite ``additional_info`` with 1 where it differs from "No Info", else 0.

    NOTE(review): any other value, including an imputed placeholder, counts as
    "has info" — confirm that is intended.
    """
    has_extra = X["additional_info"] != "No Info"
    return X.assign(additional_info=has_extra.astype(int))

# additional_info: one-hot view (part1) next to the binary "has info" flag (part2).
info_union = FeatureUnion(transformer_list=[
("part1", info_pipe1),
("part2", FunctionTransformer(func=have_info))
])

# Missing values are replaced by the literal "unknown" before the union runs.
info_transformer = Pipeline(steps=[
("imputer", SimpleImputer(strategy="constant", fill_value="unknown")),
("union", info_union)
])

# column transformer
# Routes each raw column to its dedicated transformer; columns not listed here
# are passed through unchanged (remainder="passthrough").
column_transformer = ColumnTransformer(transformers=[
("air", air_transformer, ["airline"]),
("doj", doj_transformer, ["date_of_journey"]),
("location", location_transformer, ["source", 'destination']),
("time", time_transformer, ["dep_time", "arrival_time"]),
("dur", duration_transformer, ["duration"]),
("stops", total_stops_transformer, ["total_stops"]),
("info", info_transformer, ["additional_info"])
], remainder="passthrough")

# feature selector
# A small random forest scores each feature on its own (R² scoring); the
# selector is configured with a 0.1 threshold on that score.
estimator = RandomForestRegressor(n_estimators=10, max_depth=3, random_state=42)

selector = SelectBySingleFeaturePerformance(
estimator=estimator,
scoring="r2",
threshold=0.1
) 

# preprocessor
# Full preprocessing: per-column feature engineering followed by feature selection.
preprocessor = Pipeline(steps=[
("ct", column_transformer),
("selector", selector)
])

In [85]:
# Fit the preprocessing pipeline on the training features. The target is
# passed as well because the pipeline contains supervised steps
# (the mean encoder and the feature selector).
preprocessor.fit(
    train.drop(columns="price"),
    train.price.copy()
)

In [86]:
# Inspect the engineered and selected features for the training split.
preprocessor.transform(train.drop(columns="price"))

Unnamed: 0,air__airline_Indigo,air__airline_Jet Airways,air__airline_Other,doj__date_of_journey_week,doj__date_of_journey_day_of_year,location__source,location__destination,dur__duration_cat,dur__duration_over_1000,dur__duration,stops__total_stops,stops__is_direct_flight
0,0.0,0.0,0.0,0.823529,0.796610,1.054282,1.053119,2.0,0,-0.005314,1.0,0
1,0.0,0.0,1.0,0.705882,0.711864,-0.255110,-0.256634,2.0,0,-0.398762,1.0,0
2,0.0,1.0,0.0,1.000000,1.000000,1.054282,1.053119,2.0,1,2.304935,2.0,0
3,0.0,0.0,0.0,0.882353,0.872881,-0.255110,-0.256634,2.0,1,2.082990,2.0,0
4,1.0,0.0,0.0,1.000000,0.974576,1.054282,1.053119,1.0,0,-0.509735,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
6395,0.0,0.0,1.0,0.647059,0.635593,-0.255110,-0.256634,1.0,0,-0.499647,1.0,0
6396,0.0,0.0,0.0,0.882353,0.898305,-1.933380,-0.779365,0.0,0,-1.004068,0.0,1
6397,0.0,0.0,0.0,1.000000,1.000000,1.054282,1.053119,2.0,0,-0.429028,1.0,0
6398,1.0,0.0,0.0,0.941176,0.949153,-0.820210,-1.886866,0.0,0,-0.933449,0.0,1


## 5. Preprocess Data and Export to CSV

In [87]:
import os
import pandas as pd

# Directory that receives the preprocessed CSV files written by export_data.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
file_path = r"C:\Users\Debasish Das\Documents\01_ML Project\Ml Projects\Flight_Price_Prediction\Preprocess file"

def get_file_name(name):
    """Return the CSV file name used for the preprocessed split ``name``."""
    return "{}-pre.csv".format(name)

def export_data(data, name, pre, out_dir=None):
    """Transform one data split with a fitted preprocessor and write it to CSV.

    The output file has the ``price`` target as its first column followed by
    the preprocessed features, and is written without the index.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw split containing the feature columns and a ``price`` column.
    name : str
        Split name; the file is named via ``get_file_name(name)``.
    pre : fitted transformer
        Object exposing ``transform``; it must already be fitted.
    out_dir : str, optional
        Destination directory. Defaults to the module-level ``file_path``.
    """
    target_dir = file_path if out_dir is None else out_dir
    # to_csv does not create missing directories, so make sure it exists.
    os.makedirs(target_dir, exist_ok=True)

    # split data into X and y subsets
    X = data.drop(columns="price")
    y = data["price"].copy()

    # transformation
    X_pre = pre.transform(X)
    if not isinstance(X_pre, pd.DataFrame):
        # Array output carries no index; reuse the input's so the column-wise
        # concat below pairs every row with its own target.
        X_pre = pd.DataFrame(X_pre, index=X.index)

    full_path = os.path.join(target_dir, get_file_name(name))
    pd.concat([y, X_pre], axis=1).to_csv(full_path, index=False)

# Export the data
# Every split goes through the same preprocessor, which was fitted on the
# training split only.
export_data(train, "train", preprocessor)
export_data(val, "val", preprocessor)
export_data(test, "test", preprocessor)

In [88]:
# Reload the preprocessed splits written by export_data
# (first column: price, remaining columns: engineered features).
# NOTE(review): these absolute paths duplicate file_path and the "<name>-pre.csv"
# naming used by the export step — keep them in sync.
pre_train=pd.read_csv(r"C:\Users\Debasish Das\Documents\01_ML Project\Ml Projects\Flight_Price_Prediction\Preprocess file\train-pre.csv")
pre_val=pd.read_csv(r"C:\Users\Debasish Das\Documents\01_ML Project\Ml Projects\Flight_Price_Prediction\Preprocess file\val-pre.csv")
pre_test=pd.read_csv(r"C:\Users\Debasish Das\Documents\01_ML Project\Ml Projects\Flight_Price_Prediction\Preprocess file\test-pre.csv")

## 6. Model and Hyperparameter Tuning Set-up

In [90]:

# Split the preprocessed training and validation frames into features and target.
X_train, y_train = pre_train.drop(columns="price"), pre_train["price"]
X_val, y_val = pre_val.drop(columns="price"), pre_val["price"]

In [91]:
import xgboost as xgb

from sklearn.metrics import r2_score

# Define the parameter grid for hyperparameter tuning
# (5 * 5 * 4 * 5 * 5 = 2,500 combinations, each fitted once per CV fold).
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [3, 4, 5, 6, 7],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'min_child_weight': [1, 2, 3, 4, 5]
}

# Create the XGBoost model
model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

# Create the GridSearchCV object
# Exhaustive search scored by R² with 5-fold cross-validation on all cores.
# NOTE(review): X_train was produced by a preprocessor fitted on the whole
# training split, including target-based steps (mean encoder, feature
# selector), so the CV folds are not fully independent — confirm this is acceptable.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    verbose=1,
    n_jobs=-1
)

# Fit the GridSearchCV object
grid_search.fit(X_train, y_train)

# Evaluate the best model on the validation set
y_val_pred = grid_search.best_estimator_.predict(X_val)
r2 = r2_score(y_val, y_val_pred)

print(f"Best R-squared on validation set: {r2:.4f}")
print(f"Best hyperparameters: {grid_search.best_params_}")

Fitting 5 folds for each of 2500 candidates, totalling 12500 fits
Best R-squared on validation set: 0.7660
Best hyperparameters: {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 5, 'n_estimators': 100}


In [93]:
# Final model: hyperparameters copied by hand from the best combination
# printed by the grid search, so this cell can run without repeating the search.
model = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=100,
    max_depth=7,
    learning_rate=0.1,
    min_child_weight = 5,
    gamma=0,
    random_state=42
)

In [94]:
# Train the final model on the preprocessed training split.
model.fit(X_train,y_train)

In [95]:
# Evaluate the model on the validation set
# (overwrites y_val_pred and r2 from the grid-search cell).
y_val_pred = model.predict(X_val)
r2 = r2_score(y_val, y_val_pred)
print(f"Validation R-squared: {r2:.4f}")

Validation R-squared: 0.7660


## 7. Model Evaluation

In [122]:
import pickle

# Saving the model
# The fitted model is pickled to "XGboost-model" in the current working directory.
with open("XGboost-model", "wb") as f:
    pickle.dump(model, f)

In [123]:
# Reload the pickled model as `best`, which evaluate_model uses for predictions.
# NOTE: pickle.load executes arbitrary code from the file — only load files
# produced by this notebook.
with open("XGboost-model", "rb") as f:
    best=pickle.load(f)

In [124]:
def evaluate_model(data, model=None):
    """Return the R² score of a fitted model on a preprocessed split.

    Parameters
    ----------
    data : pandas.DataFrame
        Preprocessed split holding the feature columns and a ``price`` column.
    model : fitted estimator, optional
        Object exposing ``predict``. Defaults to the module-level ``best``.

    Returns
    -------
    float
        ``r2_score`` between the ``price`` column and the predictions.
    """
    estimator = best if model is None else model

    X = data.drop(columns="price")
    # Select the target by name, matching how the features are selected, so
    # the result does not depend on "price" being the first column.
    y = data["price"]

    pred = estimator.predict(X)

    return r2_score(y, pred)

In [125]:
# R² on the training split (the data the model was fitted on).
evaluate_model(pre_train)

0.857946515083313

In [126]:
# R² on the validation split.
evaluate_model(pre_val)

0.7659902572631836

In [127]:
# R² on the held-out test split.
evaluate_model(pre_test)

0.7606847882270813