# Phase 1: Data Acquisition & Preprocessing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive

# Expose Google Drive inside the Colab runtime so the CSV can be read from it.
drive.mount('/content/drive')

# Location of the flights dataset on Drive.
# Adjust this if 'flights.csv' was saved in a different folder.
path = '/content/drive/MyDrive/dataset/flights.csv'

# low_memory=False makes pandas read the file in one pass instead of in
# chunks, which avoids mixed-dtype guesses on individual columns.
df = pd.read_csv(filepath_or_buffer=path, low_memory=False)

print(f"Dataset loaded with {df.shape[0]} rows.")

In [None]:
# Classification target derived from ARRIVAL_DELAY:
#   1 -> the flight arrived late (delay strictly greater than 0)
#   0 -> the flight arrived on time or early
df['is_delayed'] = df['ARRIVAL_DELAY'].gt(0).astype(int)

# Data cleaning: a missing ARRIVAL_DELAY compares as "not > 0", so such rows
# were labelled 0 above without really being on time. Keep only the rows
# whose ARRIVAL_DELAY is present so the target is trustworthy.
df = df[df['ARRIVAL_DELAY'].notna()]

print("Target variable 'is_delayed' created.")

In [None]:
# Feature selection: rank the numeric columns by the absolute value of their
# correlation with the target and keep the ten strongest.
#
# Columns removed up front: the target itself, the raw arrival delay it was
# derived from, and the other delay/cancellation columns listed below.
# NOTE(review): the remaining numeric columns are NOT checked for being known
# before arrival; verify against the dataset schema that none of the selected
# features is only recorded after the flight lands.
cols_to_exclude = ['ARRIVAL_DELAY', 'is_delayed', 'CANCELLED', 'DIVERTED',
                   'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
                   'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'DEPARTURE_DELAY']

numeric_df = df.drop(columns=cols_to_exclude).select_dtypes(include=[np.number])

# Calculate Correlation
# A constant or all-NaN column has an undefined (NaN) correlation. Drop those
# scores so such a column can never end up among the "top" features.
correlations = (
    numeric_df.corrwith(df['is_delayed'])
    .abs()
    .dropna()
    .sort_values(ascending=False)
)

# Select Top 10 (fewer if the dataset has fewer than 10 usable numeric columns)
top_10_features = correlations.head(10).index.tolist()
print("Your Top 10 Features are:")
for i, feat in enumerate(top_10_features, 1):
    print(f"{i}. {feat}")

# Keep only these 10 features + our target
df_final = df[top_10_features + ['is_delayed']]

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

# Scaling: standardise every selected feature with StandardScaler.
# X and y hold the FULL (still imbalanced) data; the balanced copy is built
# separately below as df_final_balanced.
# NOTE(review): the scaler is fitted on all rows. If a train/test split is
# done later, fit it on the training portion only to avoid leakage - confirm.
scaler = StandardScaler()
X = scaler.fit_transform(df_final.drop('is_delayed', axis=1))
y = df_final['is_delayed'].values

# Handling Imbalance
# Re-attach the target to the scaled features. The DataFrame built from X has
# a fresh 0..n-1 index, so the target's index is reset to align row by row.
temp_df = pd.concat([pd.DataFrame(X, columns=top_10_features),
                     df_final['is_delayed'].reset_index(drop=True)], axis=1)

# Work out which class is actually larger instead of assuming it is class 0.
# Sampling without replacement fails if asked for more rows than the class
# contains, which would happen if the smaller class were treated as majority.
n_on_time = int((temp_df['is_delayed'] == 0).sum())
n_delayed = int((temp_df['is_delayed'] == 1).sum())
majority_label = 0 if n_on_time >= n_delayed else 1

df_majority = temp_df[temp_df['is_delayed'] == majority_label]
df_minority = temp_df[temp_df['is_delayed'] == 1 - majority_label]

# Downsample majority to match minority (fixed seed for reproducibility)
df_balanced = resample(df_majority, replace=False,
                       n_samples=len(df_minority),
                       random_state=42)

# Rows are ordered majority-sample first, then minority; shuffle before any
# order-sensitive use.
df_final_balanced = pd.concat([df_balanced, df_minority])

print("Data is scaled and balanced.")
print(df_final_balanced['is_delayed'].value_counts(normalize=True))

# Phase 2: Exploratory Data Analysis

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between every column of the balanced dataset
# (the selected features plus the 'is_delayed' target).
corr_matrix = df_final_balanced.corr()

# Draw the matrix as an annotated heatmap. The diverging colour map is
# centred on 0 so positive and negative correlations get opposite colours.
plt.figure(figsize=(12, 8))
sns.heatmap(
    data=corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='RdBu',
    center=0,
)
plt.title('Correlation Heatmap of Selected Features')
plt.show()

print("Correlation heatmap generated.")

In [None]:
# Univariate Analysis: Distribution of the target variable
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
sns.countplot(x='is_delayed', data=df_final_balanced, palette='viridis')
plt.title('Class Distribution (Balanced)')

# Bivariate Analysis: Feature vs Target
# The feature set was chosen by correlation rank, so TAXI_OUT is not
# guaranteed to be present. Use it when available; otherwise fall back to the
# first (highest-ranked) feature column so this cell does not crash.
feature_columns = [col for col in df_final_balanced.columns
                   if col != 'is_delayed']
if 'TAXI_OUT' in feature_columns:
    box_feature = 'TAXI_OUT'
    box_title = 'Taxi-Out Time vs Flight Delay'
else:
    box_feature = feature_columns[0]
    box_title = f'{box_feature} vs Flight Delay'

# Note: feature values in df_final_balanced are the standardised values
# produced by the scaler, not the raw units from the CSV.
plt.subplot(1, 2, 2)
sns.boxplot(x='is_delayed', y=box_feature, data=df_final_balanced)
plt.title(box_title)

plt.tight_layout()
plt.show()

# Statistical Summaries as required by the instructions
print("\n--- Statistical Summary of Features ---")
print(df_final_balanced.describe())