In [4]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

In [5]:
# Read the raw flights CSV (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='airlines_flights_data.csv')

In [7]:
# First look at the frame: dimensions, column names, then the leading rows
print('shape:', df.shape)
print('\ncolumns:', list(df.columns))
df.head(5)

shape: (300153, 12)

columns: ['index', 'airline', 'flight', 'source_city', 'departure_time', 'stops', 'arrival_time', 'destination_city', 'class', 'duration', 'days_left', 'price']


Unnamed: 0,index,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


In [9]:
# Column dtypes and non-null counts, followed by a per-column tally of missing values
df.info()
df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300153 entries, 0 to 300152
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   index             300153 non-null  int64  
 1   airline           300153 non-null  object 
 2   flight            300153 non-null  object 
 3   source_city       300153 non-null  object 
 4   departure_time    300153 non-null  object 
 5   stops             300153 non-null  object 
 6   arrival_time      300153 non-null  object 
 7   destination_city  300153 non-null  object 
 8   class             300153 non-null  object 
 9   duration          300153 non-null  float64
 10  days_left         300153 non-null  int64  
 11  price             300153 non-null  int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 27.5+ MB


index               0
airline             0
flight              0
source_city         0
departure_time      0
stops               0
arrival_time        0
destination_city    0
class               0
duration            0
days_left           0
price               0
dtype: int64

In [10]:
# Statistical summary of the numeric columns (describe() leaves out the
# object-typed columns by default).
# NOTE(review): `index` is summarised here too; it looks like a plain row
# counter, so its mean/std are presumably not meaningful - confirm and
# consider excluding it from the summary.
df.describe()

Unnamed: 0,index,duration,days_left,price
count,300153.0,300153.0,300153.0,300153.0
mean,150076.0,12.221021,26.004751,20889.660523
std,86646.852011,7.191997,13.561004,22697.767366
min,0.0,0.83,1.0,1105.0
25%,75038.0,6.83,15.0,4783.0
50%,150076.0,11.25,26.0,7425.0
75%,225114.0,16.17,38.0,42521.0
max,300152.0,49.83,49.0,123071.0


In [2]:
# import custom modules
from src.data_preprocessing import load_data, clean_data, get_features_and_target
from src.model_training import train_model

In [4]:
# Step 1: load the data through the project helper and preview the first rows
df = load_data('airlines_flights_data.csv')
print('data loaded successfully!', df.head(), sep='\n')

data loaded successfully!
   index   airline   flight source_city departure_time stops   arrival_time  \
0      0  SpiceJet  SG-8709       Delhi        Evening  zero          Night   
1      1  SpiceJet  SG-8157       Delhi  Early_Morning  zero        Morning   
2      2   AirAsia   I5-764       Delhi  Early_Morning  zero  Early_Morning   
3      3   Vistara   UK-995       Delhi        Morning  zero      Afternoon   
4      4   Vistara   UK-963       Delhi        Morning  zero        Morning   

  destination_city    class  duration  days_left  price  
0           Mumbai  Economy      2.17          1   5953  
1           Mumbai  Economy      2.33          1   5953  
2           Mumbai  Economy      2.17          1   5956  
3           Mumbai  Economy      2.25          1   5955  
4           Mumbai  Economy      2.33          1   5955  


In [5]:
# Step 2: run the project's cleaning helper and preview the cleaned frame
df_cleaned = clean_data(df)
print('data cleaned successfully!', df_cleaned.head(), sep='\n')

data cleaned successfully!
    airline   flight source_city departure_time stops   arrival_time  \
0  SpiceJet  SG-8709       Delhi        Evening  zero          Night   
1  SpiceJet  SG-8157       Delhi  Early_Morning  zero        Morning   
2   AirAsia   I5-764       Delhi  Early_Morning  zero  Early_Morning   
3   Vistara   UK-995       Delhi        Morning  zero      Afternoon   
4   Vistara   UK-963       Delhi        Morning  zero        Morning   

  destination_city    class  duration  days_left  price  
0           Mumbai  Economy      2.17          1   5953  
1           Mumbai  Economy      2.33          1   5953  
2           Mumbai  Economy      2.17          1   5956  
3           Mumbai  Economy      2.25          1   5955  
4           Mumbai  Economy      2.33          1   5955  


In [12]:
# List the columns that remain after cleaning (useful for checking which
# raw columns clean_data removed).
print(df_cleaned.columns)

Index(['airline', 'flight', 'source_city', 'departure_time', 'stops',
       'arrival_time', 'destination_city', 'class', 'duration', 'days_left',
       'price'],
      dtype='object')


In [19]:
import inspect

# Print the source of the feature/target splitter to see exactly which
# columns feed the model. The function is referenced through the name
# imported from src.data_preprocessing; the previous `dp` alias is never
# defined in this notebook and raises NameError on a fresh
# Restart Kernel -> Run All.
print(inspect.getsource(get_features_and_target))

def get_features_and_target(df):
    '''Separate independent and dependent variables'''
    X = df[['duration', 'days_left']]
    y = df['price']
    return X, y



In [4]:
from src.data_preprocessing import load_data, clean_data, get_features_and_target

In [6]:
# End-to-end preparation: raw CSV -> cleaned frame -> (features, target).
# Each stage keeps its own name so intermediate frames stay inspectable.
df = load_data('airlines_flights_data.csv')
df_cleaned = clean_data(df)
# X holds the model inputs and y the value to predict; which columns go
# where is decided inside get_features_and_target.
X, y = get_features_and_target(df_cleaned)

In [7]:
# Sanity check: X and y should have the same number of rows.
print("Features and target extracted successfully!")
print("X shape:", X.shape)
print("y shape:", y.shape)

Features and target extracted successfully!
X shape: (300153, 2)
y shape: (300153,)


In [8]:
from src.model_training import train_model
# Fit the model on the extracted features. Judging by the recorded output,
# train_model also prints MAE / MSE / R2 and saves the fitted model to
# models/flight_price_model.pkl - presumably inside the helper; verify there.
# NOTE(review): the recorded R2 is about 0.05, i.e. the current features
# explain very little of the price variance - revisit feature selection.
model = train_model(X, y)

Model Performance:
MAE: 19129.17
MSE: 489490905.72
R2 Score: 0.0504

Model saved to models/flight_price_model.pkl


In [11]:
import os
# Confirm the trained model was written to disk. The path is relative, so the
# result depends on the notebook's current working directory.
print(os.path.exists('models/flight_price_model.pkl'))

True


In [12]:
import joblib

# Load the saved model back from disk (rebinds `model`).
# NOTE(review): joblib.load unpickles the file, so only load model files
# produced by a trusted source.
model = joblib.load('models/flight_price_model.pkl')

# Example: Predict flight price for a flight
# (duration = 12 hours, days_left = 25)
# One row, two values; presumably the order must match the feature columns
# the model was trained on (duration first, then days_left) - TODO confirm
# against get_features_and_target.
sample_data = [[12.0, 25]]
# predict returns one value per input row, hence the bracketed output.
predicted_price = model.predict(sample_data)
print("Predicted Price:", predicted_price)

Predicted Price: [20896.58889003]


