In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
import joblib

# Colab-only: expose Google Drive on the local filesystem so the dataset can be
# read from it and the trained model written back to it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# --- Load data ---------------------------------------------------------------
file_path = '/content/drive/MyDrive/Colab Notebooks/dataset.csv'
data = pd.read_csv(file_path)


def _encode_labels(column, mapping):
    """Map the string labels in ``column`` to integer codes via ``mapping``.

    ``Series.map`` silently turns every label missing from ``mapping`` into
    NaN, so a typo or a new category in the CSV would otherwise flow into
    training unnoticed. Values that are already missing in ``column`` are
    left as NaN, exactly as a plain ``.map`` would leave them.

    Args:
        column: pandas Series holding the raw labels.
        mapping: dict from raw label to integer code.

    Returns:
        A new Series with the mapped codes.

    Raises:
        ValueError: if ``column`` contains a non-null label absent from
            ``mapping``.
    """
    encoded = column.map(mapping)
    # Non-null in the input but null after mapping => label not covered.
    unexpected = column[encoded.isna() & column.notna()].unique()
    if len(unexpected) > 0:
        raise ValueError(
            f"Unmapped values in column '{column.name}': "
            f"{sorted(map(str, unexpected))}"
        )
    return encoded


# --- Encode ordinal / binary label columns -----------------------------------
data['stops'] = _encode_labels(data['stops'], {'zero': 0, 'one': 1, 'two_or_more': 2})
data['class'] = _encode_labels(data['class'], {'Economy': 0, 'Business': 1})

# --- Select features and target ----------------------------------------------
X = data[['airline', 'departure_time', 'arrival_time', 'source_city',
          'destination_city', 'stops', 'class', 'days_left']]
y = data['price']

# 'stops' is treated as numeric (scaled). 'class' is already 0/1 at this point
# but is still routed through the one-hot encoder with the other categoricals.
numeric_features = ['days_left', 'stops']
categorical_features = ['airline', 'departure_time', 'arrival_time',
                        'source_city', 'destination_city', 'class']

# --- Preprocessing -----------------------------------------------------------
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# handle_unknown='ignore': categories not seen during fit do not raise at
# transform time.
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features),
                  ('cat', categorical_transformer, categorical_features)])

# --- Model and hyperparameter search -----------------------------------------
xgb_model = XGBRegressor(random_state=42, objective='reg:squarederror')

# Preprocessing lives inside the pipeline so it is re-fit on each CV training
# fold and is saved together with the regressor.
model = Pipeline(steps=[('preprocessor', preprocessor), ('regressor', xgb_model)])

# 2 * 3 * 3 * 3 * 3 = 162 candidate settings; with cv=5 that is 810 fits plus
# the final refit on the whole training split.
param_grid = {
    'regressor__n_estimators': [100, 200],
    'regressor__max_depth': [3, 6, 10],
    'regressor__learning_rate': [0.01, 0.1, 0.3],
    'regressor__subsample': [0.8, 0.9, 1.0],
    'regressor__colsample_bytree': [0.8, 0.9, 1.0]
}

grid_search = GridSearchCV(model, param_grid, cv=5,
                           scoring='neg_mean_absolute_error', n_jobs=-1)

# --- Train / evaluate --------------------------------------------------------
# Hold out 20% of the rows; the grid search only ever sees the training split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")
print(f"Best Hyperparameters: {grid_search.best_params_}")

Mean Absolute Error: 1975.6288915196549
Best Hyperparameters: {'regressor__colsample_bytree': 0.8, 'regressor__learning_rate': 0.1, 'regressor__max_depth': 10, 'regressor__n_estimators': 200, 'regressor__subsample': 1.0}


In [None]:
# Persist the fitted pipeline (preprocessor + regressor) to Drive. The call is
# kept as the cell's last expression so the written path is displayed.
model_output_path = '/content/drive/MyDrive/Colab Notebooks/flight_price_model.pkl'
joblib.dump(best_model, model_output_path)

['/content/drive/MyDrive/Colab Notebooks/flight_price_model.pkl']