In [1]:
import pandas as pd 
import numpy as np  
import seaborn as sns  
import matplotlib.pyplot as plt 
%matplotlib inline

print("Warning: When this code is run, warnings from pandas and seaborn libraries may appear.")

try:
    import warnings
    warnings.filterwarnings('ignore')  
except ImportError:
    print("Warning: Failed to load module 'warnings'. It may not be possible to filter alerts.")



In [2]:
file_path = r"C:\Users\kadir\Desktop\Flight Price Predication using Machine Learning\PRICE_OF_FLIGHT_DATASET.xlsx"
data = pd.read_excel(file_path)
print(data.head())


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\kadir\\Desktop\\Flight Price Predication using Machine Learning\\PRICE_OF_FLIGHT_DATASET.xlsx'

In [None]:
num_rows = len(data)
print('Number of Rows:', num_rows)

num_columns = len(data.columns)
print('Number of Columns:', num_columns)

pd.set_option('display.max_columns', None)
data.head()


In [None]:
data.drop(columns='Unnamed: 0', inplace =True)

In [None]:
grouped_columns = data.columns.to_series().groupby(data.dtypes)

for dtype, columns in grouped_columns:
    print(f"Data type: {dtype}")
    print(", ".join(columns.tolist()))
    print()

# Data Consistency Check

In [None]:
data.duplicated().sum()

In [None]:
unwanted_values = [' ', '?', '-', 'null', 'NA']

has_unwanted_values = data.isin(unwanted_values).any().any()

print(has_unwanted_values)


In [None]:
plt.figure(figsize=(10,7))
sns.heatmap(data.isnull(), cmap= 'cool_r')
plt.show()

# Data Preprocessing

In [None]:
data['Duration'] = data['Duration'].map(lambda x : x.replace('06m','6m'))

In [None]:
data['Duration'] = data['Duration'].apply(lambda x: int(x.split('h')[0]) * 60 + int(x.split('h')[1].split('m')[0]))

data['Duration'] = pd.to_numeric(data['Duration'])

In [None]:
data['Day']= data['Date'].map(lambda x :x[:3])

In [None]:
data['Date']= data['Date'].map(lambda x :x[4:])

In [None]:
Categorical = ['Airline','Class','Day','Stops','Aeroplane']

In [None]:
pd.set_option('display.max_rows',None)
for i in Categorical :
    print(i)
    print(data[i].value_counts())
    print('='*100)

In [None]:
summary = data.describe(include='object')

styled_summary = summary.style.background_gradient(cmap='summer_r')

styled_summary


# EDA

In [None]:
plt.figure(figsize=(16, 8))

plt.subplot(1, 2, 1)
data['Airline'].value_counts().plot.pie(autopct='%2.1f%%', textprops={'fontsize': 13, 'fontweight': 'bold'}, shadow=True)
plt.title('Flight-based Distribution of Airlines', fontsize=18, fontweight='bold')
plt.ylabel('')

plt.subplot(1, 2, 2)
sns.countplot(x='Airline', data=data)
plt.title('Airline Vs Number of Flights', fontsize=18, fontweight='bold')
plt.xlabel("Airline", fontsize=18, fontweight='bold')
plt.ylabel("Number of Flights", fontsize=18, fontweight='bold')
plt.xticks(fontsize=12, rotation=90)

plt.tight_layout()
plt.show()


In [None]:
plt.rcParams["figure.autolayout"] = True
sns.set_palette('Set1_r')
f,ax=plt.subplots(1,2,figsize=(14,6))
data['Class'].value_counts().plot.pie(autopct='%2.1f%%',
                                    textprops={'fontsize': 13, 'fontweight': 'bold'}, ax=ax[0], shadow=True)
ax[0].set_title('Class-Wise Distribution of Flights', fontsize=20, fontweight='bold')
ax[0].set_ylabel('')
sns.countplot(x='Class', data=data, ax=ax[1])

ax[1].set_title('Class Vs Number of Flights', fontsize=18, fontweight='bold')
ax[1].set_xlabel("Class", fontsize=18, fontweight='bold')
ax[1].set_ylabel("Number of Flights", fontsize=18, fontweight='bold')
ax[1].set_xticklabels(ax[1].get_xticklabels(), fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()








In [None]:
# Counting No of stops
plt.rcParams["figure.autolayout"] = True
sns.set_palette('hsv')
f,ax=plt.subplots(1,2,figsize=(14,6))
data['Stops'].value_counts().plot.pie(autopct='%2.1f%%',
                                          textprops ={ 'fontsize':14,'fontweight' :'bold'}, ax=ax[0],shadow=True)
ax[0].set_title('Stops-Wise Distribution of Flights', fontsize=20,fontweight ='bold')
ax[0].set_ylabel('')
sns.countplot(x='Stops', data=data, ax=ax[1])

ax[1].set_title('Stops Vs Number of Flights',fontsize=18,fontweight ='bold')
ax[1].set_xlabel("Stops",fontsize=18,fontweight ='bold')
ax[1].set_ylabel("Number of Flights",fontsize=18,fontweight ='bold')
plt.xticks(fontsize=12,fontweight ='bold')
plt.tight_layout()
plt.show()

In [None]:
plt.rcParams["figure.autolayout"] = True
sns.set_palette('hsv')
f, ax = plt.subplots(1, 2, figsize=(14, 6))
data['Day'].value_counts().plot.pie(autopct='%2.1f%%',
                                   textprops={'fontsize': 14, 'fontweight': 'bold'}, ax=ax[0], shadow=True)
ax[0].set_title('Day-Wise Distribution of Flights', fontsize=20, fontweight='bold')
ax[0].set_ylabel('')

sns.countplot(x='Day', data=data, ax=ax[1])
ax[1].set_title('Day Vs Number of Flights', fontsize=18, fontweight='bold')
ax[1].set_xlabel("Day", fontsize=18, fontweight='bold')
ax[1].set_ylabel("Number of Flights", fontsize=18, fontweight='bold')
plt.xticks(fontsize=12, fontweight='bold', rotation=45)  
plt.tight_layout()
plt.show()


In [None]:
plt.rcParams["figure.autolayout"] = True
sns.set_palette('gnuplot')
plt.figure(figsize =(16,8))
sns.barplot(x=data['Day'],y=data['Price'],hue=data['Class'])
plt.title("Day Vs Price",fontsize=20,fontweight ='bold')
plt.xlabel('Day',fontsize = 20,fontweight ='bold')
plt.ylabel('Average Price of Flights',fontsize = 22,fontweight ='bold')
plt.tight_layout()
plt.show()

In [None]:
plt.rcParams["figure.autolayout"] = True
sns.set_palette('gnuplot')
plt.figure(figsize =(14,7))
sns.barplot(x=data['Class'],y=data['Duration'],hue=data['Stops'])
plt.title("Class Vs Duration",fontsize=20,fontweight ='bold')
plt.xlabel('Class',fontsize = 20,fontweight ='bold')
plt.ylabel('Average Duration of Flights',fontsize = 22,fontweight ='bold')
plt.tight_layout()
plt.show()

In [None]:
plt.rcParams["figure.autolayout"] = True
sns.set_palette('viridis')
plt.figure(figsize =(14,7))
sns.countplot(x=data['Date'])
plt.title("Date-wise Flight Distribution",fontsize=20,fontweight ='bold')
plt.xlabel('Date',fontsize = 20,fontweight ='bold')
plt.ylabel('Number of Flights',fontsize = 21,fontweight ='bold')
plt.tight_layout()
plt.show()


In [None]:
plt.rcParams["figure.autolayout"] = True
palette = sns.color_palette('hsv', n_colors=3)  # Renk paletini oluştur
dark_palette = [tuple(color * 0.8 for color in rgb) for rgb in palette]  # Renkleri biraz daha koyu yap
sns.set_palette(dark_palette)
plt.figure(figsize=(9,6))
sns.swarmplot(y=data['Duration'], x=data['Stops'], hue=data['Class'], alpha=0.7)  # Noktaları daha belirgin yapmak için alpha parametresi eklendi
plt.title("Stops VS Duration", fontsize=20, fontweight='bold')
plt.xlabel('Number of Stops', fontsize=20, fontweight='bold')
plt.ylabel('Duration', fontsize=20, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
plt.rcParams["figure.autolayout"] = True
sns.set_palette('mako')
plt.figure(figsize =(12,8))
sns.swarmplot(y=data['Price'],x=data['Stops'], hue= data['Day'])
plt.title("Price VS Stops",fontsize=20,fontweight ='bold')
plt.xlabel('Number of Stop',fontsize = 20,fontweight ='bold')
plt.ylabel('Price',fontsize = 20,fontweight ='bold')
plt.tight_layout()
plt.show()

In [None]:
plt.rcParams["figure.autolayout"] = True
sns.set_palette("deep")
plt.figure(figsize =(10,8))
sns.swarmplot(x=data['Airline'],y=data['Duration'], hue =data['Class'])
plt.title("Airline VS Duration",fontsize=20,fontweight ='bold')
plt.xlabel('Airline',fontsize = 20,fontweight ='bold')
plt.ylabel('Duration',fontsize = 20,fontweight ='bold')
plt.tight_layout()
plt.show()

In [None]:
plt.rcParams["figure.autolayout"] = True
palette = sns.color_palette('hsv', n_colors=3)  
dark_palette = [tuple(color * 0.8 for color in rgb) for rgb in palette]  
sns.set_palette(dark_palette)

plt.figure(figsize=(9,6))
sns.scatterplot(x=data['Stops'], y=data['Duration'], hue=data['Class'], alpha=0.7, palette='hsv')  

plt.title("Stops VS Duration", fontsize=20, fontweight='bold')
plt.xlabel('Number of Stops', fontsize=20, fontweight='bold')
plt.ylabel('Duration', fontsize=20, fontweight='bold')
plt.tight_layout()
plt.show()


# Feature Engineering

In [None]:
data.drop(columns=['Departure_Time','Arrival_Time','Source','Destination'], inplace= True)

In [None]:
for column_name, dtype in zip(data.columns, data.dtypes):
    print(f"Column Name: {column_name}, Data Type: {dtype}")


In [None]:
Categorical = ['Airline', 'Class', 'Aeroplane', 'Date', 'Stops', 'Day']
Numerical = ['Duration', 'Price']

In [None]:
def clean_value(value):
    return str(value).replace('-', '').replace('/', '')

data['Aeroplane'] = data['Aeroplane'].apply(clean_value)



In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in Categorical:
    data[i] =le.fit_transform(data[i])
data.head()

In [None]:
plt.figure(figsize=(15,4), facecolor='white')
plt_num = 1

for i in Numerical:
    if plt_num <= 2:
        ax = plt.subplot(1,2,plt_num)
        custom_palette = sns.color_palette('husl')  
        sns.boxplot(data[i], palette=custom_palette) 
        plt.xlabel(i, fontsize=16)
    plt_num += 1

plt.show()


In [None]:
data.corr()

In [None]:
plt.figure(figsize=(12,8))
sns.heatmap(data.corr(),linecolor='black', vmin=-1, vmax=1, annot=True, square=True, fmt='0.4f', 
            annot_kws={'size':9}, cmap='gist_stern')
plt.xticks(fontsize=11)
plt.yticks(fontsize=11)
plt.show()

In [None]:
plt.figure(figsize = (12,6))
data.corr()['Price'].drop(['Price']).sort_values(ascending=False).plot(kind='bar',cmap = 'spring')
plt.xlabel('Features',fontsize=15,fontweight='bold')
plt.ylabel('Price (USD)',fontsize=15,fontweight='bold')
plt.title('Correlation of features with Target Variable Price (USD)',fontsize = 20,fontweight='bold')
plt.show()

In [None]:
data.skew()

In [None]:
X = data.drop(['Price'], axis=1)
Y = data['Price']

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scale = scaler.fit_transform(X)

# Model Building

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import Lasso
from xgboost import XGBRegressor

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X_scale, Y, random_state=99, test_size=0.33)
print('Training Feature Matrix Size:', X_train.shape)
print('Training Target Vector Size :', Y_train.shape)
print('Test Feature Matrix Size:', X_test.shape)
print('Test Target Vector Size:', Y_test.shape)

In [None]:
models = {
    'Linear Regression': LinearRegression(),
    'Lasso Regression': Lasso(),
    'Random Forest Regressor': RandomForestRegressor(),
    'Extra Trees Regressor': ExtraTreesRegressor(),
    'Decision Tree Regressor': DecisionTreeRegressor(),
    'XGBoost Regressor': XGBRegressor()
}

In [None]:
maxR2_score=0
maxRS=0
for i in range(50,500):
    X_train, X_test, Y_train, Y_test = train_test_split(X_scale, Y, random_state=i, test_size=.33)
    lin_reg=LinearRegression()
    lin_reg.fit(X_train,Y_train)
    y_pred=lin_reg.predict(X_test)
    R2=r2_score(Y_test,y_pred)
    if R2>maxR2_score:
        maxR2_score=R2
        maxRS=i
print('Best R2 Score is', maxR2_score ,'on Random_state', maxRS)

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X_scale, Y, random_state= 70, test_size=0.33)
lin_reg= LinearRegression()
lin_reg.fit(X_train, Y_train)
y_pred = lin_reg.predict(X_test)
print('\033[1m'+ 'Error :'+ '\033[0m')
print('Mean absolute error :', mean_absolute_error(Y_test,y_pred))
print('Mean squared error :', mean_squared_error(Y_test, y_pred))
print('Root Mean squared error :', np.sqrt(mean_squared_error(Y_test, y_pred)))
print('\033[1m'+' R2 Score :'+'\033[0m')
print(r2_score(Y_test,y_pred)*100)

In [None]:
from sklearn.model_selection import cross_val_score
score = cross_val_score(lin_reg, X_scale, Y, cv=5)
print('\033[1m'+'Cross Validation Score :',lin_reg,":"+'\033[0m\n')
print("Mean CV Score :",score.mean())

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X_scale, Y, random_state= 70, test_size=0.33)
rfc = RandomForestRegressor()
rfc.fit(X_train, Y_train)
y_pred = rfc.predict(X_test)
print('\033[1m'+ 'Error of Random Forest Regressor:'+ '\033[0m')
print('Mean absolute error :', mean_absolute_error(Y_test,y_pred))
print('Mean squared error :', mean_squared_error(Y_test, y_pred))
print('Root Mean squared error :', np.sqrt(mean_squared_error(Y_test, y_pred)))
print('\033[1m'+'R2 Score of Random Forest Regressor :'+'\033[0m')
print(r2_score(Y_test,y_pred)*100)

In [None]:
from sklearn.model_selection import cross_val_score
score = cross_val_score(rfc, X_scale, Y, cv=5)
print('\033[1m'+'Cross Validation Score :',rfc,":"+'\033[0m\n')
print("Mean CV Score :",score.mean())

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X_scale, Y, random_state= 70, test_size=0.33)
dtc = DecisionTreeRegressor()
dtc.fit(X_train, Y_train)
y_pred = dtc.predict(X_test)
print('\033[1m'+ 'Error of Decision Tree Regressor:'+ '\033[0m')
print('Mean absolute error :', mean_absolute_error(Y_test,y_pred))
print('Mean squared error :', mean_squared_error(Y_test, y_pred))
print('Root Mean squared error :', np.sqrt(mean_squared_error(Y_test, y_pred)))
print('\033[1m'+'R2 Score of Decision Tree Regressor :'+'\033[0m')
print(r2_score(Y_test,y_pred)*100)

In [None]:
from sklearn.model_selection import cross_val_score
score = cross_val_score(dtc, X_scale, Y, cv=5)
print('\033[1m'+'Cross Validation Score :',dtc,":"+'\033[0m\n')
print("Mean CV Score :",score.mean())

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X_scale, Y, random_state= 70, test_size=0.33)
etc = ExtraTreesRegressor()
dtc.fit(X_train, Y_train)
y_pred = dtc.predict(X_test)
print('\033[1m'+ 'Error of Extra Tree Regressor:'+ '\033[0m')
print('Mean absolute error :', mean_absolute_error(Y_test,y_pred))
print('Mean squared error :', mean_squared_error(Y_test, y_pred))
print('Root Mean squared error :', np.sqrt(mean_squared_error(Y_test, y_pred)))
print('\033[1m'+'R2 Score of Extra Tree Regressor :'+'\033[0m')
print(r2_score(Y_test,y_pred)*100)

In [None]:
from sklearn.model_selection import cross_val_score
score = cross_val_score(etc, X_scale, Y, cv=5)
print('\033[1m'+'Cross Validation Score :',etc,":"+'\033[0m\n')
print("Mean CV Score :",score.mean())

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X_scale, Y, random_state=70, test_size=0.33)

lasso = Lasso()
lasso.fit(X_train, Y_train)

y_pred = lasso.predict(X_test)

mae = mean_absolute_error(Y_test, y_pred)
mse = mean_squared_error(Y_test, y_pred)
rmse = np.sqrt(mean_squared_error(Y_test, y_pred))
r2 = r2_score(Y_test, y_pred) * 100

print('\033[1mError of Lasso Regressor:\033[0m')
print(f'Mean absolute error : {mae}')
print(f'Mean squared error : {mse}')
print(f'Root Mean squared error : {rmse}')
print('\033[1mR2 Score of Lasso Regressor :\033[0m')
print(f'{r2}')

In [None]:
lasso = Lasso()

scores = cross_val_score(lasso, X_scale, Y, cv=5)

mean_score = scores.mean()

print('\033[1mCross Validation Score for Lasso Regression:\033[0m\n')
print("Mean CV Score:", mean_score)

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X_scale, Y, random_state= 70, test_size=0.33)
xgb = XGBRegressor()
xgb.fit(X_train, Y_train)
y_pred = xgb.predict(X_test)
print('\033[1m'+ 'Error of XGB Regressor:'+ '\033[0m')
print('Mean absolute error :', mean_absolute_error(Y_test,y_pred))
print('Mean squared error :', mean_squared_error(Y_test, y_pred))
print('Root Mean squared error :', np.sqrt(mean_squared_error(Y_test, y_pred)))
print('\033[1m'+'R2 Score of XGB Regressor :'+'\033[0m')
print(r2_score(Y_test,y_pred)*100)

In [None]:
from sklearn.model_selection import cross_val_score
score = cross_val_score(xgb, X_scale, Y, cv=5)
print('\033[1m'+'Cross Validation Score :',xgb,":"+'\033[0m\n')
print("Mean CV Score :",score.mean())

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X_scale, Y, random_state=70, test_size=0.33)

In [None]:
parameter = {'n_estimators':[400,500],'gamma':np.arange(0,0.2,0.1),
              'booster' : ['gbtree','dart','gblinear'], 'max_depth':[6,8],
              'eta' : [0.01, 0.1] }

In [None]:
GCV = GridSearchCV(XGBRegressor(),parameter,verbose =10)

In [None]:
GCV.fit(X_train,Y_train)

In [None]:
GCV.best_params_

# FINAL STAGE MODEL

In [None]:
Final_mod=XGBRegressor(booster='gbtree', max_depth=6, eta=0.1,
                                gamma=0.1, n_estimators=400)
Final_mod.fit(X_train,Y_train)
pred=Final_mod.predict(X_test)
print('R2_Score:',r2_score(Y_test,pred)*100)
print('mean_squared_error:',mean_squared_error(Y_test,pred))
print('mean_absolute_error:',mean_absolute_error(Y_test,pred))
print("RMSE value:",np.sqrt(mean_squared_error(Y_test, pred)))


In [None]:
import joblib
joblib.dump(Final_mod,"Prediction_Of _Flight _Prices.pkl")

In [None]:
model = joblib.load("Prediction_Of _Flight _Prices.pkl")

prediction = model.predict(X_test)
prediction

In [None]:
pd.DataFrame([model.predict(X_test)[:], Y_test[:]], index = ['Predicted', 'Actual'])

In [None]:
from xgboost import XGBRegressor


model = XGBRegressor()
model.fit(X_train, y_train)


model.save_model("flight_price_model.xgb")
