In [28]:
import kagglehub
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.ensemble import RandomForestRegressor
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Dense, Flatten, MaxPooling1D, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

In [27]:
path = kagglehub.dataset_download("shubhambathwal/flight-price-prediction")

# print("Path to dataset files:", path)
df = pd.read_csv(path + "/Clean_Dataset.csv")

# splitting the last 15 records for manual testing/future prediction
last_15 = df[-15:]
df = df[:-15]
df.head(5)

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [28]:
# dropping unnecessary columns
df.drop(columns=['Unnamed: 0', 'flight'], inplace=True)

# Inspecting the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300138 entries, 0 to 300137
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   airline           300138 non-null  object 
 1   source_city       300138 non-null  object 
 2   departure_time    300138 non-null  object 
 3   stops             300138 non-null  object 
 4   arrival_time      300138 non-null  object 
 5   destination_city  300138 non-null  object 
 6   class             300138 non-null  object 
 7   duration          300138 non-null  float64
 8   days_left         300138 non-null  int64  
 9   price             300138 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 22.9+ MB


In [29]:
print(f"amount of duplicated rows: {df.duplicated().sum()}\n")
print(f"NaN values: \n{df.isna().sum()}")

amount of duplicated rows: 2213

NaN values: 
airline             0
source_city         0
departure_time      0
stops               0
arrival_time        0
destination_city    0
class               0
duration            0
days_left           0
price               0
dtype: int64


In [30]:
# get all the unique times and classes so we can apply label encoding
print(f"Unique values in 'departure_time' column: {df['departure_time'].unique()}")
print(f"Unique values in 'class' column: {df['class'].unique()}")
print(f"Unique values in 'stops' column: {df['stops'].unique()}")

Unique values in 'departure_time' column: ['Evening' 'Early_Morning' 'Morning' 'Afternoon' 'Night' 'Late_Night']
Unique values in 'class' column: ['Economy' 'Business']
Unique values in 'stops' column: ['zero' 'one' 'two_or_more']


In [31]:
# preparing enumeration so we can use it to map the values to integers in label encoding
timing_enum = ['Early_Morning', 'Morning', 'Afternoon', 'Evening', 'Night', 'Late_Night']
class_enum = ['Economy', 'Business']
stops_enum = ['zero', 'one', 'two_or_more']

# Convert 'departure_time' and 'arrival_time' to ordinal categories
departure_dtype = pd.CategoricalDtype(categories=timing_enum, ordered=True)
arrival_dtype = pd.CategoricalDtype(categories=timing_enum, ordered=True)
class_dtype = pd.CategoricalDtype(categories=class_enum, ordered=True)
stops_dtype = pd.CategoricalDtype(categories=stops_enum, ordered=True)

# Applying label encoding
df['departure_time'] = df['departure_time'].astype(departure_dtype).cat.codes
df['arrival_time'] = df['arrival_time'].astype(arrival_dtype).cat.codes
df['class'] = df['class'].astype(class_dtype).cat.codes
df['stops'] = df['stops'].astype(stops_dtype).cat.codes
df.head(5)

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,Delhi,3,0,4,Mumbai,0,2.17,1,5953
1,SpiceJet,Delhi,0,0,1,Mumbai,0,2.33,1,5953
2,AirAsia,Delhi,0,0,0,Mumbai,0,2.17,1,5956
3,Vistara,Delhi,1,0,2,Mumbai,0,2.25,1,5955
4,Vistara,Delhi,1,0,1,Mumbai,0,2.33,1,5955


In [32]:
# Apply one-hot encoding to the categorical columns
df = pd.get_dummies(df, columns=['airline', 'source_city', 'destination_city'], drop_first=True)
df.head(5)

Unnamed: 0,departure_time,stops,arrival_time,class,duration,days_left,price,airline_Air_India,airline_GO_FIRST,airline_Indigo,...,source_city_Chennai,source_city_Delhi,source_city_Hyderabad,source_city_Kolkata,source_city_Mumbai,destination_city_Chennai,destination_city_Delhi,destination_city_Hyderabad,destination_city_Kolkata,destination_city_Mumbai
0,3,0,4,0,2.17,1,5953,False,False,False,...,False,True,False,False,False,False,False,False,False,True
1,0,0,1,0,2.33,1,5953,False,False,False,...,False,True,False,False,False,False,False,False,False,True
2,0,0,0,0,2.17,1,5956,False,False,False,...,False,True,False,False,False,False,False,False,False,True
3,1,0,2,0,2.25,1,5955,False,False,False,...,False,True,False,False,False,False,False,False,False,True
4,1,0,1,0,2.33,1,5955,False,False,False,...,False,True,False,False,False,False,False,False,False,True


In [33]:
temp = df.copy(deep=False) # so that it doesnt affect original df in next line
# randomizing feature selection
X = temp.drop(columns=['price'], axis=1).sample(frac=0.5, axis=1)
X

Unnamed: 0,destination_city_Mumbai,arrival_time,source_city_Mumbai,airline_Air_India,days_left,airline_Vistara,destination_city_Hyderabad,airline_GO_FIRST,destination_city_Kolkata,class
0,True,4,False,False,1,False,False,False,False,0
1,True,1,False,False,1,False,False,False,False,0
2,True,0,False,False,1,False,False,False,False,0
3,True,2,False,False,1,True,False,False,False,0
4,True,1,False,False,1,True,False,False,False,0
...,...,...,...,...,...,...,...,...,...,...
300133,False,2,False,False,49,True,True,False,False,1
300134,False,3,False,False,49,True,True,False,False,1
300135,False,3,False,False,49,True,True,False,False,1
300136,False,3,False,False,49,True,True,False,False,1


In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, df['price'], test_size=0.3, random_state=42)

naive bayes below

In [82]:
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.metrics import mean_squared_error, r2_score

# Preprocess features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create simple bins for target
binner = KBinsDiscretizer(n_bins=75, encode='ordinal', strategy='quantile')
y_train_binned = binner.fit_transform(y_train.values.reshape(-1, 1))
y_test_binned = binner.transform(y_test.values.reshape(-1, 1))

# Train basic Naive Bayes
nb_model = GaussianNB(var_smoothing=1e-6)
nb_model.fit(X_train_scaled, y_train_binned.ravel())

# Predict and transform back
train_pred_binned = nb_model.predict(X_train_scaled)
test_pred_binned = nb_model.predict(X_test_scaled)

train_predictions = binner.inverse_transform(train_pred_binned.reshape(-1, 1))
test_predictions = binner.inverse_transform(test_pred_binned.reshape(-1, 1))


train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)


print(f'Training R2 Score: {train_r2:.4f}')
print(f'Test R2 Score: {test_r2:.4f}')

Training R2 Score: -0.9163
Test R2 Score: -0.9238


In [29]:
rf_regress = RandomForestRegressor()

rf_regress.fit(X_train, y_train)

# Make predictions
train_predictions = rf_regress.predict(X_train)
test_predictions = rf_regress.predict(X_test)

# Calculate metrics
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

print(f'Training R2 Score: {train_r2:.4f}')
print(f'Test R2 Score: {test_r2:.4f}')

Training R2 Score: 0.9736
Test R2 Score: 0.9703


In [None]:
# KFold cross validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# performing cross-validation for testing the model
scores = cross_val_score(rf_regress, X, df['price'], cv=kf, scoring='r2')
# print(f'Cross-validated RMSE: {np.sqrt(-scores.mean()):.2f}')
print(f'Cross-validated R2: {scores.mean():.4f}')

Cross-validated R2: 0.9707


In [32]:
import xgboost as xgb

xgb_model = xgb.XGBRegressor()

xgb_model.fit(X_train, y_train)

train_predictions = xgb_model.predict(X_train)
test_predictions = xgb_model.predict(X_test)

train_rmse = np.sqrt(mean_squared_error(y_train, train_predictions))
test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
train_r2 = r2_score(y_train, train_predictions)
test_r2 = r2_score(y_test, test_predictions)

print(f'\nTraining RMSE: {train_rmse:.2f}')
print(f'Test RMSE: {test_rmse:.2f}')
print(f'Training R2 Score: {train_r2:.4f}')
print(f'Test R2 Score: {test_r2:.4f}')


Training RMSE: 4594.54
Test RMSE: 4629.21
Training R2 Score: 0.9591
Test R2 Score: 0.9582


In [34]:
# Perform cross-validation
scores = cross_val_score(xgb_model, X, df['price'], cv=kf, scoring='r2')
print(f'Cross-validated R2: {scores.mean():.4f}')

Cross-validated R2: 0.9579
