In [None]:
from IPython.display import clear_output
!pip3 install -U lazypredict
!pip3 install -U pandas
clear_output()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

%matplotlib inline

## Load the dataset

In [None]:
# Read the cleaned flight-price CSV. index_col=False tells pandas not to use
# the first column as the index.
dataset_path = '../input/flight-price-prediction/Clean_Dataset.csv'
data = pd.read_csv(dataset_path, index_col=False)

# Preview the first five rows.
data.head()

In [None]:
# Remove the first column of the frame.
# NOTE(review): presumably this is the row index saved into the CSV — confirm.
first_column = data.columns[0]
data = data.drop(columns=first_column)
data.head()

Verify that there are no missing values in the DataFrame.

In [None]:
# Number of missing entries per column (isnull is the pandas alias of isna).
data.isnull().sum()

## Encode the categorical variables

From exploratory analysis, it looks like there are a number of categorical data types. By looking at the 'object' types below, we can see there are 8 entries that are categorical in nature. Once we have inspected the categorical variables, we can convert these categories into numerical values using the One-Hot Encoding method.

In [None]:
# Columns whose distinct values we want to inspect before encoding.
columns = ["airline", "source_city", "departure_time", "stops", "destination_city", "class", "days_left"]

# One line per column: its name, how many distinct values it has, and the values.
report_line = "Column:{} Counts:{} Items:{}"
for col in columns:
    distinct_values = data[col].unique()
    print(report_line.format(col, len(distinct_values), distinct_values))

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Categorical columns to replace with one-hot indicator columns.
cat_columns = ['airline',
               'source_city',
               'departure_time',
               'stops',
               'arrival_time',
               'destination_city',
               'class']

ohe = OneHotEncoder(handle_unknown='ignore')

# fit_transform returns a sparse matrix; densify it to build a DataFrame.
encoded = ohe.fit_transform(data[cat_columns]).toarray()

# `get_feature_names` was removed in scikit-learn 1.2; prefer the replacement
# `get_feature_names_out` and fall back only on older releases.
if hasattr(ohe, 'get_feature_names_out'):
    feature_names = ohe.get_feature_names_out(cat_columns)
else:
    feature_names = ohe.get_feature_names(cat_columns)

# Reuse `data`'s index so the join below aligns row-for-row even if the frame
# no longer has a default RangeIndex.
ohe_df = pd.DataFrame(encoded, columns=feature_names, index=data.index)

# Swap the raw categorical columns for their encoded counterparts.
data = data.drop(columns=cat_columns).join(ohe_df)
data.head(10)

## Scaling the continuous variables

Next, we rescale the continuous variables so that they are on a consistent scale, using the StandardScaler class. First, we look at the columns whose values fall outside the range [0.0, 1.0]. `duration` is the column that requires scaling.

In [None]:
# Per-column minimum and maximum.
stats = data.agg(['min', 'max'])

# List every non-object column whose maximum exceeds 1.0.
range_line = "Column Name: {}, Min: {}, Max: {}"
for col in data.columns:
    is_object_column = data[col].dtype.type is np.object_
    # Object columns are skipped before their maximum is compared with a number.
    if not is_object_column and stats[col].max() > 1.0:
        print(range_line.format(col, stats[col].min(), stats[col].max()))

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# StandardScaler expects a 2-D input, so present `duration` as a single-column matrix.
duration_matrix = np.array(data['duration']).reshape(-1, 1)

# Overwrite the column with its standardized values.
data['duration'] = scaler.fit_transform(duration_matrix)
data.head(10)

## Baseline Model Evaluation

We have no idea which model will do well on this data. Let's design a test harness with 10-fold cross-validation. We will evaluate algorithms using the Mean Squared Error (MSE) metric. MSE gives a rough idea of how wrong the predictions are overall (0 being perfect).

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_error

## Prepare training data

Now prepare the train-test set from the original data set. The train-test ratio shall be 80:20.

In [None]:
# Target is the ticket price; features are every column except `flight` and `price`.
train_Y = data['price']
train_X = data.drop(['flight', 'price'], axis=1)

# 80/20 random split with a fixed seed.
train_x, test_x, train_y, test_y = train_test_split(
    train_X,
    train_Y,
    test_size=0.20,
    random_state=21,
)

# Shapes of the four resulting pieces.
tuple(part.shape for part in (train_x, test_x, train_y, test_y))

## Model Training and Evaluation

We have no idea which model will do well on this data. Let's design a test harness with 10-fold cross-validation. We will evaluate algorithms using the Mean Squared Error (MSE) metric. MSE gives a rough idea of how wrong the predictions are overall (0 being perfect). In addition, I draw a random sample of only 1% of the data for the test harness, so that the harness completes quickly.

In [None]:
# Draw 1% of the rows for the quick model screening. The fixed seed keeps the
# sample, and therefore the screening results, identical between runs.
sample_data = data.sample(frac=0.01, random_state=21)
sample_x = sample_data.drop(columns=['flight', 'price'])
sample_y = sample_data['price']

# 80/20 split of the sample, seeded like the main train/test split.
sample_train_x, sample_test_x, sample_train_y, sample_test_y = train_test_split(
    sample_x, sample_y, test_size=0.20, random_state=21
)
sample_train_x.shape

In [None]:
import lazypredict
from lazypredict.Supervised import LazyRegressor

In [None]:
# Configure LazyRegressor: quiet output, warnings ignored, no custom metric, fixed seed.
reg = LazyRegressor(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
    random_state=12,
)

# Fit on the sampled training split and evaluate on the sampled test split.
lazy_results = reg.fit(sample_train_x, sample_test_x, sample_train_y, sample_test_y)
models, predictions = lazy_results

In [None]:
# Display `models`, the first of the two values returned by `reg.fit`.
models

In [None]:
# StratifiedKFold stays imported in case another cell still relies on the name.
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold

model = XGBRegressor()
k = 10
param_grid = {'learning_rate': [0.2, 0.15, 0.1, 0.05],
              'n_estimators' : [50,100,200,300,400]}

# `price` is a continuous regression target, so use plain shuffled K-fold.
# StratifiedKFold is a classification splitter that stratifies on class labels;
# on a regression target it either rejects the input or treats every distinct
# price as its own class.
kfold = KFold(n_splits=k, shuffle=True, random_state=21)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, scoring='neg_mean_squared_error', cv=kfold)

# Time the full search over all parameter combinations and folds.
start = time.time()
grid_result = grid.fit(train_x, train_y)
end = time.time()

means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    # The scorer is *negative* MSE (higher is better); flip the sign so the
    # printed figure really is the MSE the label announces.
    print("MSE: {:.4f} STD: {:.4f} with: {}".format(-mean, stdev, param))

# best_score_ is also a negative MSE; report it as a positive MSE.
print("Best: {:.4f} using {} (run time : {:.3f})".format(-grid_result.best_score_, grid_result.best_params_, end-start))

In [None]:
# Fresh regressor configured with the winning grid-search parameters.
best_params = grid.best_params_
model = XGBRegressor()
model.set_params(**best_params)

print("Training model with best parameters={}".format(best_params))

# Time the fit on the full training split.
start = time.time()
model.fit(train_x, train_y)
end = time.time()
elapsed = end - start

print("Training completed with run time {:.3f} seconds".format(elapsed))

In [None]:
print("Validate model on test set")

# Predict the held-out rows, then take the square root of the MSE to get the RMSE.
predict_test = model.predict(test_x)
test_mse = mean_squared_error(test_y, predict_test)
test_rmse = test_mse ** 0.5
print('RMSE on test data: {:.4f}'.format(test_rmse))

## Model Evaluation (compare prediction against ground truths)

In [None]:
# Absolute error per test row, and the same error as a percentage of the true price.
diff = (test_y - predict_test).abs()
percent_diff = diff.div(test_y).mul(100)

# Side-by-side table of truth, prediction and both error measures (2 decimals).
compare = pd.DataFrame({
    'Ground Truths': test_y,
    'Prediction': predict_test.round(2),
    'Difference': diff.round(2),
    '% Difference': percent_diff.round(2),
})
compare.head(10)

In [None]:
import joblib

# Serialize the tuned model to "model.joblib" in the working directory.
joblib.dump(value=model, filename="model.joblib")

## Model training with only the more important features

After the preprocessing and encoding steps, the training data set has a total of 37 attributes, not all of which are useful in forecasting prices. We can select the top attributes that contribute the most to forecasting price values. Using fewer attributes to train a comparable model will result in a less complex model. From the chart below, there are only 4 attributes that seem to have a bigger impact on the model (quite intuitively so).

In [None]:
# plot the 15 most important features 
# Importance score of every training feature, labelled by column name.
feat_importances = pd.Series(model.feature_importances_, index=train_x.columns)
top_features = feat_importances.nlargest(15)

# Horizontal bar chart of the 15 most important features.
fig, ax = plt.subplots(figsize=(10, 7))
top_features.plot(kind='barh', ax=ax)
ax.set_xlabel('Importance Score')
ax.set_ylabel('Attribute Labels')
plt.show()

In [None]:
# Subset of features used to train the reduced model.
impact_columns = [
    'class_Business',
    'class_Economy',
    'duration',
    'days_left',
    'airline_Vistara',
    'airline_Air_India',
    'source_city_Delhi',
    'destination_city_Delhi',
    'source_city_Mumbai',
    'destination_city_Mumbai',
]

# Restrict both splits to the selected features.
train_x_if, test_x_if = train_x[impact_columns], test_x[impact_columns]

# Reuse the hyper-parameters chosen by the grid search, with a fixed seed.
tuned = grid.best_params_
model_with_if = XGBRegressor(
    random_state=21,
    n_estimators=tuned['n_estimators'],
    learning_rate=tuned['learning_rate'],
)

# Fit the reduced model and report how long the fit took.
start = time.time()
model_with_if.fit(train_x_if, train_y)
end = time.time()
print("Training completed with run time {:.3f} seconds".format(end - start))

# RMSE on the rows the model was trained on.
print("Validate model on train set")
predict_train_with_if = model_with_if.predict(train_x_if)
train_rmse_with_if = mean_squared_error(train_y, predict_train_with_if) ** 0.5
print('RMSE on train data: ', train_rmse_with_if)

# RMSE on the held-out rows.
print("Validate model on test set")
predict_test_with_if = model_with_if.predict(test_x_if)
test_rmse_with_if = mean_squared_error(test_y, predict_test_with_if) ** 0.5
print('RMSE on test data: ', test_rmse_with_if)