In [121]:
import kagglehub
import pandas as pd
from sklearn.model_selection import train_test_split


In [113]:
path = kagglehub.dataset_download("shubhambathwal/flight-price-prediction")

# print("Path to dataset files:", path)
df = pd.read_csv(path + "/Clean_Dataset.csv")

# splitting the last 15 records for manual testing/future prediction
last_15 = df[-15:]
df = df[:-15]
df.head(5)

Unnamed: 0.1,Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [114]:
# dropping unnecessary columns
df.drop(columns=['Unnamed: 0', 'flight'], inplace=True)

# Inspecting the data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300138 entries, 0 to 300137
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   airline           300138 non-null  object 
 1   source_city       300138 non-null  object 
 2   departure_time    300138 non-null  object 
 3   stops             300138 non-null  object 
 4   arrival_time      300138 non-null  object 
 5   destination_city  300138 non-null  object 
 6   class             300138 non-null  object 
 7   duration          300138 non-null  float64
 8   days_left         300138 non-null  int64  
 9   price             300138 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 22.9+ MB


In [115]:
print(f"amount of duplicated rows: {df.duplicated().sum()}\n")
print(f"NaN values: \n{df.isna().sum()}")

amount of duplicated rows: 2213

NaN values: 
airline             0
source_city         0
departure_time      0
stops               0
arrival_time        0
destination_city    0
class               0
duration            0
days_left           0
price               0
dtype: int64


In [116]:
# get all the unique times and classes so we can apply label encoding
print(f"Unique values in 'departure_time' column: {df['departure_time'].unique()}")
print(f"Unique values in 'class' column: {df['class'].unique()}")
print(f"Unique values in 'stops' column: {df['stops'].unique()}")

Unique values in 'departure_time' column: ['Evening' 'Early_Morning' 'Morning' 'Afternoon' 'Night' 'Late_Night']
Unique values in 'class' column: ['Economy' 'Business']
Unique values in 'stops' column: ['zero' 'one' 'two_or_more']


In [117]:
# preparing enumeration so we can use it to map the values to integers in label encoding
timing_enum = ['Early_Morning', 'Morning', 'Afternoon', 'Evening', 'Night', 'Late_Night']
class_enum = ['Economy', 'Business']
stops_enum = ['zero', 'one', 'two_or_more']

# Convert 'departure_time' and 'arrival_time' to ordinal categories
departure_dtype = pd.CategoricalDtype(categories=timing_enum, ordered=True)
arrival_dtype = pd.CategoricalDtype(categories=timing_enum, ordered=True)
class_dtype = pd.CategoricalDtype(categories=class_enum, ordered=True)
stops_dtype = pd.CategoricalDtype(categories=stops_enum, ordered=True)

# Applying label encoding
df['departure_time'] = df['departure_time'].astype(departure_dtype).cat.codes
df['arrival_time'] = df['arrival_time'].astype(arrival_dtype).cat.codes
df['class'] = df['class'].astype(class_dtype).cat.codes
df['stops'] = df['stops'].astype(stops_dtype).cat.codes
df.head(5)

Unnamed: 0,airline,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,Delhi,3,0,4,Mumbai,0,2.17,1,5953
1,SpiceJet,Delhi,0,0,1,Mumbai,0,2.33,1,5953
2,AirAsia,Delhi,0,0,0,Mumbai,0,2.17,1,5956
3,Vistara,Delhi,1,0,2,Mumbai,0,2.25,1,5955
4,Vistara,Delhi,1,0,1,Mumbai,0,2.33,1,5955


In [118]:
# Apply one-hot encoding to the categorical columns
df = pd.get_dummies(df, columns=['airline', 'source_city', 'destination_city'], drop_first=True)
df.head(5)

Unnamed: 0,departure_time,stops,arrival_time,class,duration,days_left,price,airline_Air_India,airline_GO_FIRST,airline_Indigo,...,source_city_Chennai,source_city_Delhi,source_city_Hyderabad,source_city_Kolkata,source_city_Mumbai,destination_city_Chennai,destination_city_Delhi,destination_city_Hyderabad,destination_city_Kolkata,destination_city_Mumbai
0,3,0,4,0,2.17,1,5953,False,False,False,...,False,True,False,False,False,False,False,False,False,True
1,0,0,1,0,2.33,1,5953,False,False,False,...,False,True,False,False,False,False,False,False,False,True
2,0,0,0,0,2.17,1,5956,False,False,False,...,False,True,False,False,False,False,False,False,False,True
3,1,0,2,0,2.25,1,5955,False,False,False,...,False,True,False,False,False,False,False,False,False,True
4,1,0,1,0,2.33,1,5955,False,False,False,...,False,True,False,False,False,False,False,False,False,True


In [130]:
temp = df.copy(deep=False) # so that it doesnt affect original df in next line
# randomizing feature selection
X = temp.drop(columns=['price'], axis=1).sample(frac=0.5, axis=1)
X

Unnamed: 0,airline_Vistara,source_city_Hyderabad,source_city_Kolkata,source_city_Chennai,destination_city_Hyderabad,source_city_Mumbai,class,destination_city_Mumbai,departure_time,destination_city_Delhi
0,False,False,False,False,False,False,0,True,3,False
1,False,False,False,False,False,False,0,True,0,False
2,False,False,False,False,False,False,0,True,0,False
3,True,False,False,False,False,False,0,True,1,False
4,True,False,False,False,False,False,0,True,1,False
...,...,...,...,...,...,...,...,...,...,...
300133,True,False,False,True,True,False,1,False,3,False
300134,True,False,False,True,True,False,1,False,4,False
300135,True,False,False,True,True,False,1,False,4,False
300136,True,False,False,True,True,False,1,False,3,False


In [129]:
X_train, X_test, y_train, y_test = train_test_split(X, df['price'], test_size=0.3, random_state=42)