## Flight Price Prediction (EDA + Analysis)

In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import xgboost as xgb
%matplotlib inline

In [None]:
# Read the training data from the Excel workbook in the working directory.
try:
    train_df = pd.read_excel('Data_Train.xlsx')
except FileNotFoundError:
    # Give a readable hint first, then let the original error propagate.
    print("Error: 'Data_Train.xlsx' not found. Please ensure the file is in the working directory.")
    raise
else:
    print("Dataset loaded successfully!")

# All later processing happens on this copy; train_df keeps the loaded data.
final_df = train_df.copy()
print("Dataset Shape:", final_df.shape)
final_df.head()

In [None]:
# Basic Exploratory Data Analysis: structure, missing values, and row previews.
print("Dataset Info:")
final_df.info()  # info() writes its own report to stdout

# Number of null entries in each column.
missing_per_column = final_df.isnull().sum()
print("\nMissing Values:")
print(missing_per_column)

# Preview both ends of the frame.
for heading, preview in (
    ("\nFirst Few Rows:", final_df.head()),
    ("\nLast Few Rows:", final_df.tail()),
):
    print(heading)
    print(preview)

In [None]:
# Analyze Categorical Features: which airlines appear and how often.
airline_counts = final_df['Airline'].value_counts()

print("Unique Airlines:")
print(final_df['Airline'].unique())

print("\nAirline Distribution:")
print(airline_counts)

# Count plot of flights per airline, with bars in value_counts() order.
plt.figure(figsize=(12, 6))
airline_axes = sns.countplot(data=final_df, x='Airline', order=airline_counts.index)
airline_axes.set_title('Distribution of Airlines')
airline_axes.set_xlabel('Airline')
airline_axes.set_ylabel('Count')
# Tilt the tick labels so long airline names do not overlap.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Feature Engineering for Analysis
# Discard every row that contains at least one missing value.
final_df.dropna(inplace=True)

# Date_of_Journey is '/'-separated: its first, second and third fields become
# the integer columns Date, Month and Year.
journey_parts = final_df['Date_of_Journey'].str.split('/')
for part_index, part_name in enumerate(('Date', 'Month', 'Year')):
    final_df[part_name] = journey_parts.str[part_index].astype(int)

# Departure hour and minute, parsed once from the HH:MM strings in Dep_Time.
departure_clock = pd.to_datetime(final_df['Dep_Time'], format='%H:%M')
final_df['Dep_hour'] = departure_clock.dt.hour
final_df['Dep_min'] = departure_clock.dt.minute

# Keep only the text before the first space in Arrival_Time (the column is
# overwritten in place), then parse that remainder as HH:MM.
final_df['Arrival_Time'] = final_df['Arrival_Time'].str.split(' ').str[0]
arrival_clock = pd.to_datetime(final_df['Arrival_Time'], format='%H:%M')
final_df['Arrival_hour'] = arrival_clock.dt.hour
final_df['Arrival_min'] = arrival_clock.dt.minute

# Convert Duration to minutes
def duration_to_minutes(duration):
    """Convert a duration string such as '2h 50m' into total minutes.

    The text before the first 'h' (if any) is read as whole hours; in what
    follows, the text before the first 'm' (if any) is read as minutes.
    A missing component counts as zero.

    Args:
        duration: Duration text, e.g. '19h', '5m' or '2h 50m'.

    Returns:
        The duration in minutes as an int.

    Raises:
        ValueError: If an hours or minutes component is not a valid integer.
    """
    total = 0
    remainder = duration
    if 'h' in remainder:
        pieces = remainder.split('h')
        total += 60 * int(pieces[0])
        # Only the segment right after the first 'h' is searched for minutes.
        remainder = pieces[1]
    if 'm' in remainder:
        total += int(remainder.split('m')[0])
    return total

# Total journey length in minutes, parsed from the raw 'Duration' strings.
final_df['Duration_minutes'] = final_df['Duration'].apply(duration_to_minutes)

# Encode Total_Stops as an integer stop count.
stop_counts = {
    'non-stop': 0, '1 stop': 1, '2 stops': 2, '3 stops': 3, '4 stops': 4
}


def _encode_stops(label):
    """Return the stop count for one Total_Stops entry.

    String labels are looked up in ``stop_counts``; a string that is not in
    the mapping becomes NaN, which is what ``Series.map`` with a dict yields.
    Non-string values are returned unchanged, so re-running this cell on an
    already-encoded column leaves it intact instead of turning every value
    into NaN.
    """
    if isinstance(label, str):
        return stop_counts.get(label, np.nan)
    return label


final_df['Total_Stops'] = final_df['Total_Stops'].map(_encode_stops)

print("Processed Dataframe Head:")
final_df.head()

In [None]:
# Additional Visualizations
price_axis_label = 'Price (₹)'  # shared axis label for every price plot below

# Histogram of ticket prices with a KDE curve overlaid.
plt.figure(figsize=(10, 6))
price_hist_axes = sns.histplot(final_df['Price'], bins=50, kde=True, color='blue')
price_hist_axes.set_title('Distribution of Flight Prices')
price_hist_axes.set_xlabel(price_axis_label)
price_hist_axes.set_ylabel('Frequency')
plt.show()

# Box plot of price grouped by the encoded number of stops.
plt.figure(figsize=(10, 6))
stops_box_axes = sns.boxplot(data=final_df, x='Total_Stops', y='Price')
stops_box_axes.set_title('Price vs Total Stops')
stops_box_axes.set_xlabel('Number of Stops')
stops_box_axes.set_ylabel(price_axis_label)
plt.show()

# Box plot of price grouped by airline.
plt.figure(figsize=(12, 6))
airline_box_axes = sns.boxplot(data=final_df, x='Airline', y='Price')
airline_box_axes.set_title('Price vs Airline')
airline_box_axes.set_xlabel('Airline')
airline_box_axes.set_ylabel(price_axis_label)
# Tilt the tick labels so long airline names do not overlap.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Correlation Analysis
# Columns included in the pairwise correlation: the target plus the numeric
# features built earlier in the notebook.
numeric_cols = [
    'Price', 'Total_Stops', 'Date', 'Month',
    'Dep_hour', 'Dep_min',
    'Arrival_hour', 'Arrival_min',
    'Duration_minutes',
]
numeric_frame = final_df[numeric_cols]
correlation_matrix = numeric_frame.corr()

# Annotated heatmap; each cell shows the coefficient with two decimals.
plt.figure(figsize=(10, 8))
heatmap_axes = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
heatmap_axes.set_title('Correlation Matrix of Numerical Features')
plt.show()

In [None]:
# Model Training and Evaluation (for Scatter Plot)
# Raw columns that are not passed to the model.
unused_columns = ['Date_of_Journey', 'Dep_Time', 'Arrival_Time',
                  'Duration', 'Route', 'Additional_Info']
# Categorical columns expanded into one indicator column per category.
categorical_columns = ['Airline', 'Source', 'Destination']

final_df_encoded = pd.get_dummies(final_df.drop(columns=unused_columns),
                                  columns=categorical_columns)

# Target is Price; every remaining column is a feature.
y = final_df_encoded['Price']
X = final_df_encoded.drop(columns='Price')

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the XGBoost regressor on the training split.
xgb_params = {
    'n_estimators': 500,
    'learning_rate': 0.1,
    'max_depth': 5,
    'random_state': 42,
}
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

# Predict prices for the held-out rows.
y_pred = xgb_model.predict(X_test)

# NOTE: despite its name, `accuracy` holds the R² score of the predictions.
accuracy = r2_score(y_test, y_pred)
print(f"R² Score: {accuracy:.4f}")

In [None]:
# Actual vs predicted prices on the held-out test split.
plt.figure(figsize=(8, 8))
plt.scatter(y_test, y_pred, alpha=0.5, color='blue')

# Dashed red y = x reference line spanning the range of actual prices;
# points on this line are exact predictions.
actual_price_range = [y_test.min(), y_test.max()]
plt.plot(actual_price_range, actual_price_range, 'r--', lw=2)

plt.title('Actual vs Predicted Flight Prices')
plt.xlabel('Actual Price (y_test)')
plt.ylabel('Predicted Price (y_pred)')
plt.tight_layout()
plt.show()