<a href="https://colab.research.google.com/github/iamcfz/flight-price-prediction-catboost/blob/main/FlightPricePrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import missingno as msno
import datetime as dt
from sklearn.model_selection import train_test_split

In [None]:
# Load the cleaned flight dataset from GitHub, using the 'Unnamed: 0' column as the index
dataset_url = 'https://raw.githubusercontent.com/AVELURI12/Flight-Price-Prediction/refs/heads/main/Clean_Dataset.csv'
flight_data = pd.read_csv(dataset_url, index_col='Unnamed: 0')

In [None]:
# Preview the first rows to confirm the CSV parsed as expected
flight_data.head()

In [None]:
# Column names, non-null counts and dtypes of the raw dataset
flight_data.info()

In [None]:
# Count missing values per column in the raw dataset
flight_data.isnull().sum()

# Reducing the Dataset by Randomly Sampling 100k Rows

In [None]:
# Draw a reproducible random subset of 100,000 rows (fixed random_state)
sampled_df = flight_data.sample(n=100_000, random_state=42)

# Report the subset's deep memory footprint, converted from bytes to MB
bytes_used = sampled_df.memory_usage(deep=True).sum()
memory_in_MB = bytes_used / 1024**2
print(f"Sampled memory usage: {memory_in_MB:.2f} MB")

In [None]:
# Write the sample to disk (index omitted), then trigger a browser download via Colab
sample_csv_name = 'cleaned_flight_data.csv'
sampled_df.to_csv(sample_csv_name, index=False)

from google.colab import files
files.download(sample_csv_name)

In [None]:
# Frequency table of departure cities in the sample, printed one city per line
print("Number of flights from each source city:")
city_counts = sampled_df['source_city'].value_counts()
for city, count in zip(city_counts.index, city_counts.tolist()):
    print(f"{city}: {count} flights")
print(f"Total: {city_counts.sum()} flights")

In [None]:
# Break the 'flight' identifier on "-" into an airline code and a flight number
flight_long = (
    sampled_df['flight']
    .str.split("-", expand=True)
    .rename(columns={0: 'airline_code', 1: 'flight_number'})
)
flight_long.head()

In [None]:
 # Put new columns into dataframe and drop old one
# Copy both derived columns onto the sample (aligned on the index), then retire 'flight'
for part_name in ('airline_code', 'flight_number'):
    sampled_df[part_name] = flight_long[part_name]

sampled_df.drop(columns=['flight'], inplace=True)
sampled_df.head()

In [None]:
# Store the target as 64-bit floats
price_as_float = sampled_df['price'].astype(np.float64)
sampled_df['price'] = price_as_float

In [None]:
# Give 'days_left' a more descriptive name (renamed in place on the sample)
column_renames = {'days_left': 'days_until_departure'}
sampled_df.rename(columns=column_renames, inplace=True)

In [None]:
sampled_df.dtypes  # sanity check: inspect column dtypes after the rename and price cast

In [None]:
# Non-null counts and dtypes of the sample before the remaining type conversions
sampled_df.info()

In [None]:
# Cast each listed column to its target numeric dtype
target_dtypes = {'duration': 'float64', 'flight_number': 'int64'}
for column_name, dtype_name in target_dtypes.items():
    sampled_df[column_name] = sampled_df[column_name].astype(dtype_name)

# Confirm the new dtypes
sampled_df.info()

In [None]:

# Load the hand-edited copy of the dataset that is checked for redundant columns below
url = 'https://raw.githubusercontent.com/AVELURI12/Flight-Price-Prediction/main/edited_flight_data.csv'
Redunadant_df = pd.read_csv(filepath_or_buffer=url)

# Only the final expression of a cell is rendered, so the shape is what gets shown
Redunadant_df.head()
Redunadant_df.shape


In [None]:
# Columns whose every value is null carry no information
all_null_mask = Redunadant_df.isna().all()
redundant_columns = Redunadant_df.columns[all_null_mask]


In [None]:
# Remove the all-null columns and report the resulting (rows, columns)
df = Redunadant_df.drop(redundant_columns, axis=1)
df.shape


In [None]:
# Treat columns as rows (transpose) to flag columns that repeat an earlier column's values
repeated_column = df.T.duplicated(keep='first')
df = df.loc[:, ~repeated_column]
df.shape

(100000, 11)

In [None]:
df.head()  # not rendered: a cell only displays its last expression
df.shape

(100000, 11)

In [None]:
# Collect every row that repeats an earlier row
duplicate_rows = df[df.duplicated()]

# Report the outcome, listing the offending rows when there are any
if duplicate_rows.empty:
    print("No duplicate rows found.")
else:
    print("Duplicate Rows:")
    print(duplicate_rows)

No duplicate rows found.


In [None]:
# Per-column count of missing entries in the edited dataset
missing_values = df.isna().sum()
print(missing_values)

airline                 0
flight                  0
source_city             0
departure_time          0
stops                   0
arrival_time            0
destination_city        0
class                   0
duration                0
days_until_departure    0
price                   0
dtype: int64


In [None]:
# One-hot encode departure time, stops, arrival time and class.
# airline, airline_code, source_city and destination_city are NOT encoded here
# and remain plain string columns.
# NOTE(review): the original author was unsure this column selection is right - confirm.
one_hot_columns = ['departure_time', 'stops', 'arrival_time', 'class']
encoded_sampled_df = pd.get_dummies(sampled_df, columns=one_hot_columns)
encoded_sampled_df.head()

Unnamed: 0,airline,source_city,destination_city,duration,days_until_departure,price,airline_code,flight_number,departure_time_Afternoon,departure_time_Early_Morning,...,stops_two_or_more,stops_zero,arrival_time_Afternoon,arrival_time_Early_Morning,arrival_time_Evening,arrival_time_Late_Night,arrival_time_Morning,arrival_time_Night,class_Business,class_Economy
27131,Air_India,Delhi,Kolkata,19.75,40,7366.0,AI,506,False,False,...,False,False,False,True,False,False,False,False,False,True
266857,Vistara,Kolkata,Mumbai,9.83,42,64831.0,UK,706,False,False,...,False,False,False,False,False,False,False,True,True,False
141228,Vistara,Kolkata,Bangalore,10.5,41,6195.0,UK,772,False,False,...,False,False,False,False,False,False,False,True,False,True
288329,Vistara,Chennai,Delhi,14.5,14,60160.0,UK,824,False,False,...,False,False,False,False,False,False,True,False,True,False
97334,Air_India,Bangalore,Mumbai,8.25,20,6578.0,AI,501,True,False,...,False,False,False,False,False,False,False,True,False,True


In [None]:
# Safeguard: remove the pre-encoding columns if any are still present.
# errors='ignore' makes this a no-op for names that are already gone.
columns_to_drop = ['departure_time', 'stops', 'arrival_time', 'class']
encoded_sampled_df = encoded_sampled_df.drop(columns_to_drop, axis=1, errors='ignore')

# Summarize the columns, non-null counts and dtypes of the encoded frame
encoded_sampled_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 100000 entries, 27131 to 77708
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   airline                       100000 non-null  object 
 1   source_city                   100000 non-null  object 
 2   destination_city              100000 non-null  object 
 3   duration                      100000 non-null  float64
 4   days_until_departure          100000 non-null  int64  
 5   price                         100000 non-null  float64
 6   airline_code                  100000 non-null  object 
 7   flight_number                 100000 non-null  int64  
 8   departure_time_Afternoon      100000 non-null  bool   
 9   departure_time_Early_Morning  100000 non-null  bool   
 10  departure_time_Evening        100000 non-null  bool   
 11  departure_time_Late_Night     100000 non-null  bool   
 12  departure_time_Morning        100000 non-null 

In [None]:
# List every column whose dtype is not numeric
non_numeric_frame = encoded_sampled_df.select_dtypes(exclude='number')
non_numeric_columns = non_numeric_frame.columns
print(non_numeric_columns)



Index(['airline', 'source_city', 'destination_city', 'airline_code',
       'departure_time_Afternoon', 'departure_time_Early_Morning',
       'departure_time_Evening', 'departure_time_Late_Night',
       'departure_time_Morning', 'departure_time_Night', 'stops_one',
       'stops_two_or_more', 'stops_zero', 'arrival_time_Afternoon',
       'arrival_time_Early_Morning', 'arrival_time_Evening',
       'arrival_time_Late_Night', 'arrival_time_Morning', 'arrival_time_Night',
       'class_Business', 'class_Economy'],
      dtype='object')


In [None]:

# Names of the int64/float64 columns (kept for reference; not used by the check below)
numeric_cols = encoded_sampled_df.select_dtypes(include=['int64', 'float64']).columns

# Show only the columns that actually contain missing values
missing_summary = encoded_sampled_df.isna().sum()
has_missing = missing_summary > 0
print(missing_summary.loc[has_missing])





Series([], dtype: int64)


In [None]:
# Check data types after one-hot encoding (the dummy columns are expected to be bool here)
print(encoded_sampled_df.dtypes)

airline                          object
source_city                      object
destination_city                 object
duration                        float64
days_until_departure              int64
price                           float64
airline_code                     object
flight_number                     int64
departure_time_Afternoon           bool
departure_time_Early_Morning       bool
departure_time_Evening             bool
departure_time_Late_Night          bool
departure_time_Morning             bool
departure_time_Night               bool
stops_one                          bool
stops_two_or_more                  bool
stops_zero                         bool
arrival_time_Afternoon             bool
arrival_time_Early_Morning         bool
arrival_time_Evening               bool
arrival_time_Late_Night            bool
arrival_time_Morning               bool
arrival_time_Night                 bool
class_Business                     bool
class_Economy                      bool


In [None]:
# Count rows of the encoded frame that repeat an earlier row
is_repeated_row = encoded_sampled_df.duplicated()
duplicates = is_repeated_row.sum()
print(f"Duplicate rows: {duplicates}")

Duplicate rows: 0


In [None]:
# Convert the boolean dummy columns to integers (True/False -> 1/0)
bool_cols = encoded_sampled_df.select_dtypes(include='bool').columns
encoded_sampled_df = encoded_sampled_df.astype({flag_col: int for flag_col in bool_cols})
print(encoded_sampled_df.dtypes)

airline                          object
source_city                      object
destination_city                 object
duration                        float64
days_until_departure              int64
price                           float64
airline_code                     object
flight_number                     int64
departure_time_Afternoon          int64
departure_time_Early_Morning      int64
departure_time_Evening            int64
departure_time_Late_Night         int64
departure_time_Morning            int64
departure_time_Night              int64
stops_one                         int64
stops_two_or_more                 int64
stops_zero                        int64
arrival_time_Afternoon            int64
arrival_time_Early_Morning        int64
arrival_time_Evening              int64
arrival_time_Late_Night           int64
arrival_time_Morning              int64
arrival_time_Night                int64
class_Business                    int64
class_Economy                     int64


In [None]:
# Target is the ticket price; every other column is a feature
y = encoded_sampled_df['price']
X = encoded_sampled_df.drop('price', axis=1)

# Hold out 20% of the rows for testing, with a fixed seed for repeatability
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")


Training samples: 80000
Testing samples: 20000


In [None]:
# Preview the encoded frame after the bool -> int conversion
encoded_sampled_df.head()

Unnamed: 0,airline,source_city,destination_city,duration,days_until_departure,price,airline_code,flight_number,departure_time_Afternoon,departure_time_Early_Morning,...,stops_two_or_more,stops_zero,arrival_time_Afternoon,arrival_time_Early_Morning,arrival_time_Evening,arrival_time_Late_Night,arrival_time_Morning,arrival_time_Night,class_Business,class_Economy
27131,Air_India,Delhi,Kolkata,19.75,40,7366.0,AI,506,0,0,...,0,0,0,1,0,0,0,0,0,1
266857,Vistara,Kolkata,Mumbai,9.83,42,64831.0,UK,706,0,0,...,0,0,0,0,0,0,0,1,1,0
141228,Vistara,Kolkata,Bangalore,10.5,41,6195.0,UK,772,0,0,...,0,0,0,0,0,0,0,1,0,1
288329,Vistara,Chennai,Delhi,14.5,14,60160.0,UK,824,0,0,...,0,0,0,0,0,0,1,0,1,0
97334,Air_India,Bangalore,Mumbai,8.25,20,6578.0,AI,501,1,0,...,0,0,0,0,0,0,0,1,0,1


**ORIGINAL DATASET AFTER ONE-HOT ENCODING**


In [None]:
from google.colab import files

# Write the encoded frame to CSV without the index column
encoded_csv_name = 'originalDataset.csv'
encoded_sampled_df.to_csv(encoded_csv_name, index=False)

# Ask the browser to download the file that was just written
files.download(encoded_csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Synthetic-data approach: derive new rows from the patterns in the original dataset.
# Work on an independent (deep) copy so the original frame is left untouched.
additional_df = encoded_sampled_df.copy(deep=True)

# Create a function to add controlled random variation to numeric columns
def add_variation(value, variation_percent=0.1):
    """Return `value` perturbed by a random relative amount, floored at zero.

    The perturbation is `value * u`, where `u` is drawn uniformly from
    [-variation_percent, +variation_percent) using NumPy's global random state.

    Parameters
    ----------
    value : any
        The value to perturb. Real numbers (Python ints/floats and NumPy
        integer/floating scalars) are perturbed. Anything else, and any
        missing value (NaN/NA), is returned unchanged.
    variation_percent : float, default 0.1
        Maximum relative change, as a fraction (0.1 means +/-10%).

    Returns
    -------
    The perturbed number, never below 0, or `value` itself when it is
    non-numeric or missing.
    """
    # Local stdlib import so the function works regardless of which cells ran before
    import numbers

    # numbers.Real also matches NumPy scalars such as np.int64, which the plain
    # isinstance(value, (int, float)) check silently skipped.
    if not isinstance(value, numbers.Real) or pd.isna(value):
        return value

    variation = np.random.uniform(-variation_percent, variation_percent) * value
    return max(0, value + variation)  # Ensure non-negative values

# Seed NumPy's global RNG so the perturbations and shuffles below are reproducible,
# matching the fixed random_state already used for sampling.
np.random.seed(42)

# Apply variations to numeric columns (price, duration, days_until_departure)
numeric_cols = ['price', 'duration', 'days_until_departure']
for col in numeric_cols:
    if col in additional_df.columns:
        additional_df[col] = additional_df[col].apply(add_variation)

# The perturbation produces fractional values; days until departure are whole days,
# so round them and restore the dtype used by the original frame.
if 'days_until_departure' in additional_df.columns:
    additional_df['days_until_departure'] = (
        additional_df['days_until_departure']
        .round()
        .astype(encoded_sampled_df['days_until_departure'].dtype)
    )

# Shuffle related columns TOGETHER, one shared permutation per group.
# Shuffling every column on its own would break row-level invariants:
#   * one-hot groups could end up with several (or zero) active flags per row,
#   * airline / airline_code / flight_number would no longer describe one flight,
#   * source_city could equal destination_city, a route with no known distance.
one_hot_prefixes = ['departure_time_', 'stops_', 'arrival_time_', 'class_']
shuffle_groups = [
    [col for col in additional_df.columns if col.startswith(prefix)]
    for prefix in one_hot_prefixes
]
shuffle_groups.append(['airline', 'airline_code', 'flight_number'])
shuffle_groups.append(['source_city', 'destination_city'])

# Any other non-numeric-feature, non-object column is shuffled by itself, as before
already_grouped = {col for group in shuffle_groups for col in group}
for col in additional_df.columns:
    if (col not in already_grouped
            and col not in numeric_cols
            and additional_df[col].dtype != 'object'):
        shuffle_groups.append([col])

for group in shuffle_groups:
    present_cols = [col for col in group if col in additional_df.columns]
    if not present_cols:
        continue
    row_order = np.random.permutation(len(additional_df))
    # Reorder column by column so each column keeps its own dtype
    for col in present_cols:
        additional_df[col] = additional_df[col].to_numpy()[row_order]

# Create additional rows by sampling with replacement to get exactly 100K more rows
additional_rows = additional_df.sample(n=100000, replace=True, random_state=42).reset_index(drop=True)

# Combine original and new data to get 200K dataset
full_dataset_200k = pd.concat([encoded_sampled_df, additional_rows], ignore_index=True)

# Verify the size
print(f"Original dataset size: {len(encoded_sampled_df)}")
print(f"New dataset size: {len(full_dataset_200k)}")

# Save the 200K dataset
full_dataset_200k.to_csv('flight_data_200k.csv', index=False)

# Download the expanded dataset
from google.colab import files
files.download('flight_data_200k.csv')

Original dataset size: 100000
New dataset size: 200000


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Preview the combined frame; its first rows come from encoded_sampled_df (concatenated first)
full_dataset_200k.head()



Unnamed: 0,airline,source_city,destination_city,duration,days_until_departure,price,airline_code,flight_number,departure_time_Afternoon,departure_time_Early_Morning,...,stops_two_or_more,stops_zero,arrival_time_Afternoon,arrival_time_Early_Morning,arrival_time_Evening,arrival_time_Late_Night,arrival_time_Morning,arrival_time_Night,class_Business,class_Economy
0,Air_India,Delhi,Kolkata,19.75,40.0,7366.0,AI,506,0,0,...,0,0,0,1,0,0,0,0,0,1
1,Vistara,Kolkata,Mumbai,9.83,42.0,64831.0,UK,706,0,0,...,0,0,0,0,0,0,0,1,1,0
2,Vistara,Kolkata,Bangalore,10.5,41.0,6195.0,UK,772,0,0,...,0,0,0,0,0,0,0,1,0,1
3,Vistara,Chennai,Delhi,14.5,14.0,60160.0,UK,824,0,0,...,0,0,0,0,0,0,1,0,1,0
4,Air_India,Bangalore,Mumbai,8.25,20.0,6578.0,AI,501,1,0,...,0,0,0,0,0,0,0,1,0,1


In [None]:
# Summarize the combined 200k dataset. The printed labels name the frame that is
# actually inspected (full_dataset_200k), not additional_df.
print("Data types of full_dataset_200k:")
print(full_dataset_200k.dtypes)

# info() writes its report directly and returns None, so it is not wrapped in
# print() (which would add a stray "None" line).
print("\nDetailed summary of full_dataset_200k:")
full_dataset_200k.info()

# Null counts per column
print("\nNull values in full_dataset_200k:")
print(full_dataset_200k.isnull().sum())

# A few rows for visual inspection
print("\nSample of full_dataset_200k:")
print(full_dataset_200k.head())

Data types of additional_df:
airline                          object
source_city                      object
destination_city                 object
duration                        float64
days_until_departure            float64
price                           float64
airline_code                     object
flight_number                     int64
departure_time_Afternoon          int64
departure_time_Early_Morning      int64
departure_time_Evening            int64
departure_time_Late_Night         int64
departure_time_Morning            int64
departure_time_Night              int64
stops_one                         int64
stops_two_or_more                 int64
stops_zero                        int64
arrival_time_Afternoon            int64
arrival_time_Early_Morning        int64
arrival_time_Evening              int64
arrival_time_Late_Night           int64
arrival_time_Morning              int64
arrival_time_Night                int64
class_Business                    int64
class_Econo

In [None]:
# Distinct departure cities in the combined dataset, alphabetically
a = full_dataset_200k['source_city'].unique()
source_cities_sorted = sorted(a)
print(source_cities_sorted)

['Bangalore', 'Chennai', 'Delhi', 'Hyderabad', 'Kolkata', 'Mumbai']


In [None]:
# Distinct arrival cities in the combined dataset, alphabetically
a = full_dataset_200k['destination_city'].unique()
destination_cities_sorted = sorted(a)
print(destination_cities_sorted)

['Bangalore', 'Chennai', 'Delhi', 'Hyderabad', 'Kolkata', 'Mumbai']


In [None]:
# Bangalore to Chennai = 166.6 miles
# Bangalore to Delhi = 1058.1 miles
# Bangalore to Hyderabad = 281.5 miles
#

In [None]:
# Random miles: give every row a pre-existing mileage balance between 1 and 1000
import random

# Dedicated seeded generator so the balances are the same on every run and the
# global `random` state is left alone.
miles_rng = random.Random(42)

numbers = miles_rng.choices(range(1, 1001), k=len(encoded_sampled_df))
encoded_sampled_df['PreExisting_Miles'] = numbers

# NOTE: this is a separate draw, so rows shared by both frames get different
# balances. The column name below is also spelled differently from the one above;
# it is kept as-is because the discount cell reads it under this exact name.
numbers = miles_rng.choices(range(1, 1001), k=len(full_dataset_200k))
full_dataset_200k['PreExisitingMiles'] = numbers


In [None]:
# Miles between Cities
# Each unordered city pair is listed once; the lookup table below is filled in
# for both travel directions so the two can never disagree.
_one_way_distances = {
    ('Bangalore', 'Chennai'): 166.6,
    ('Bangalore', 'Delhi'): 1058.1,
    ('Bangalore', 'Hyderabad'): 281.5,
    ('Bangalore', 'Kolkata'): 959.9,
    ('Bangalore', 'Mumbai'): 517.1,

    ('Chennai', 'Delhi'): 1089.7,
    ('Chennai', 'Hyderabad'): 313.9,
    ('Chennai', 'Kolkata'): 859.1,
    ('Chennai', 'Mumbai'): 641.3,

    ('Delhi', 'Hyderabad'): 784.6,
    ('Delhi', 'Kolkata'): 816.0,
    ('Delhi', 'Mumbai'): 705.1,

    ('Hyderabad', 'Kolkata'): 750.4,
    ('Hyderabad', 'Mumbai'): 387.3,

    ('Kolkata', 'Mumbai'): 1036.1,
}

# (source_city, destination_city) -> distance in miles, in both directions
distance_map = {}
for (_city_a, _city_b), _miles in _one_way_distances.items():
    distance_map[(_city_a, _city_b)] = _miles
    distance_map[(_city_b, _city_a)] = _miles


def get_distance(distance):
    """Look up the distance in miles for one flight row.

    Parameters
    ----------
    distance : mapping-like row
        Must support ['source_city'] and ['destination_city'] (for example a
        DataFrame row passed by `DataFrame.apply(..., axis=1)`).

    Returns
    -------
    float
        The distance from `distance_map`, or NaN when the city pair is not in
        the table (including source == destination). NaN is used instead of a
        text placeholder so the resulting column stays numeric.
    """
    route = (distance['source_city'], distance['destination_city'])
    return distance_map.get(route, float('nan'))


# Add a row-wise 'Distance' column to both frames (original first, then the 200k frame)
for flights_frame in (encoded_sampled_df, full_dataset_200k):
    flights_frame['Distance'] = flights_frame.apply(get_distance, axis=1)



In [None]:
# Pre existing miles * 0.5 'currency' per mile
# Discount from the formula above
discount_per_mile = 0.5

# Both frames use the same rate. The original frame previously multiplied by 0.5
# twice, which gave 0.25 per mile and disagreed with the 200k frame.
encoded_sampled_df['Discount'] = encoded_sampled_df['PreExisting_Miles'] * discount_per_mile
full_dataset_200k['Discount'] = full_dataset_200k['PreExisitingMiles'] * discount_per_mile
# NOTE: the miles column is spelled 'PreExisitingMiles' in the 200k frame and
# 'PreExisting_Miles' in the original frame. Renaming it must be done together
# with the cell that creates these columns.

# Discounted price
encoded_sampled_df['Discounted_Price'] = encoded_sampled_df['price'] - encoded_sampled_df['Discount']
full_dataset_200k['Discounted_Price'] = full_dataset_200k['price'] - full_dataset_200k['Discount']

# Only the last expression of a cell is rendered, so the 200k preview is what shows
encoded_sampled_df.head()
full_dataset_200k.head()

Unnamed: 0,airline,source_city,destination_city,duration,days_until_departure,price,airline_code,flight_number,departure_time_Afternoon,departure_time_Early_Morning,...,arrival_time_Evening,arrival_time_Late_Night,arrival_time_Morning,arrival_time_Night,class_Business,class_Economy,PreExisitingMiles,Distance,Discount,Discounted_Price
0,Air_India,Delhi,Kolkata,19.75,40.0,7366.0,AI,506,0,0,...,0,0,0,0,0,1,834,816.0,417.0,6949.0
1,Vistara,Kolkata,Mumbai,9.83,42.0,64831.0,UK,706,0,0,...,0,0,0,1,1,0,229,1036.1,114.5,64716.5
2,Vistara,Kolkata,Bangalore,10.5,41.0,6195.0,UK,772,0,0,...,0,0,0,1,0,1,715,959.9,357.5,5837.5
3,Vistara,Chennai,Delhi,14.5,14.0,60160.0,UK,824,0,0,...,0,0,1,0,1,0,866,1089.7,433.0,59727.0
4,Air_India,Bangalore,Mumbai,8.25,20.0,6578.0,AI,501,1,0,...,0,0,0,1,0,1,262,517.1,131.0,6447.0


In [None]:
# Export the final 200k frame (index omitted), then trigger a browser download via Colab
final_csv_name = 'full_dataset_200k.csv'
full_dataset_200k.to_csv(final_csv_name, index=False)

from google.colab import files
files.download(final_csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>