In [None]:
from google.colab import drive
import pandas as pd

# Attach Google Drive to the Colab filesystem at /content/drive
drive.mount('/content/drive')

# Location of the flights CSV inside the mounted Drive
path = '/content/drive/MyDrive/dataset/flights.csv'

# Read the full CSV into memory.
# low_memory=False makes pandas parse the file in one pass rather than in
# chunks, which avoids mixed-type inference on columns with mixed data types.
df = pd.read_csv(filepath_or_buffer=path, low_memory=False)

# Preview the first five rows as the cell output
df.head(5)

In [None]:
# Create the binary target variable
# 1 = Delayed (>15 mins), 0 = On Time (<=15 mins)
# Caveat: a NaN ARRIVAL_DELAY compares False against 15, so rows with no
# recorded arrival delay are stored as 0 here even though their outcome is
# unknown. The column is kept as a plain int for backward compatibility.
df['Is_Delayed'] = (df['ARRIVAL_DELAY'] > 15).astype(int)

# Check how many of each we have.
# Only rows with a known ARRIVAL_DELAY are counted; otherwise the unknown rows
# would be reported as "on time" and understate the share of delayed flights.
has_arrival_delay = df['ARRIVAL_DELAY'].notna()
print("Target Variable Distribution:")
print(df.loc[has_arrival_delay, 'Is_Delayed'].value_counts(normalize=True))
print(f"Rows excluded (missing ARRIVAL_DELAY): {(~has_arrival_delay).sum()}")

In [None]:
# Tally NaN entries per column; only columns with at least one gap are printed
missing_data = df.isna().sum()
columns_with_gaps = missing_data.loc[missing_data > 0]
print("Missing values per column:\n", columns_with_gaps)

# Count rows that exactly repeat an earlier row
duplicate_count = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicate_count}")

In [None]:
import math

# Keep only columns that are at least 90% populated.
# `thresh` is the minimum number of NON-missing values a column needs in order
# to be kept, so this drops every column with more than 10% missing values
# (it is NOT a "more than 90% missing" filter).
# math.ceil turns the threshold into the integer dropna documents, without
# changing which columns pass (count >= x is the same test as count >= ceil(x)).
limit = math.ceil(len(df) * 0.9)
df_cleaned = df.dropna(thresh=limit, axis=1)

# Drop rows with missing values in critical columns
critical_cols = ['DEPARTURE_TIME', 'DEPARTURE_DELAY', 'ARRIVAL_DELAY']
df_cleaned = df_cleaned.dropna(subset=critical_cols)

# Report how many rows survived and how many NaN cells remain in total
print(f"Original rows: {len(df)}")
print(f"Cleaned rows: {len(df_cleaned)}")
print("\nRemaining missing values:")
print(df_cleaned.isnull().sum().sum())

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Select only numerical columns for correlation
numerical_df = df_cleaned.select_dtypes(include=[np.number])

# Calculate absolute correlation with the target 'Is_Delayed'
# (this series still contains the target itself, as before)
correlations = numerical_df.corr()['Is_Delayed'].abs().sort_values(ascending=False)

# Columns that must never be offered as features:
#   - 'Is_Delayed' is the target itself;
#   - 'ARRIVAL_DELAY' is the column the target was thresholded from, so using
#     it as a feature leaks the label directly.
# NOTE(review): other columns that are only known once the flight has landed
# may leak too, depending on when predictions are made -- confirm and extend.
excluded_cols = ['Is_Delayed', 'ARRIVAL_DELAY']

# Get the top 10 features. The excluded columns are removed by label rather
# than by position, and columns whose correlation is undefined (NaN, e.g. a
# constant column) are skipped instead of being selected.
top_10_features = (
    correlations
    .drop(labels=excluded_cols, errors='ignore')
    .dropna()
    .head(10)
)

print("Top 10 Features Selected by Correlation")
print(top_10_features)

# EDA figure: horizontal bars, one per selected feature, with bar length equal
# to the absolute correlation with the target
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=top_10_features.values,
    y=top_10_features.index,
    palette='viridis',
    ax=ax,
)
ax.set_title('Top 10 Features Correlated with Flight Delays')
ax.set_xlabel('Absolute Correlation Coefficient')
ax.set_ylabel('Features')
plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler

# Narrow the cleaned frame down to the selected features plus the target.
# .copy() detaches the result from df_cleaned.
selected_cols = [*top_10_features.index, 'Is_Delayed']
df_final = df_cleaned.loc[:, selected_cols].copy()

# Split into target vector (y) and feature matrix (X)
y = df_final['Is_Delayed']
X = df_final.drop(columns='Is_Delayed')

# Standardize every feature to zero mean and unit variance.
# The scaler is fitted on all rows of X.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Data successfully transformed and scaled.")
print(f"Final shape for modeling: {X_scaled.shape}")

In [None]:
from imblearn.over_sampling import SMOTE
from collections import Counter
import pandas as pd

# To save time and memory, work on a random subset of at most SAMPLE_SIZE rows.
# This keeps the project manageable for SVM and KNN.
# The request is capped at the rows actually available: DataFrame.sample raises
# ValueError when n exceeds the frame length (sampling without replacement).
SAMPLE_SIZE = 100000
df_sample = df_final.sample(n=min(SAMPLE_SIZE, len(df_final)), random_state=42)

X_sample = df_sample.drop('Is_Delayed', axis=1)
y_sample = df_sample['Is_Delayed']

# Balance Report (Before SMOTE) -- shows the actual class counts in the sample
print(f"Distribution before SMOTE: {Counter(y_sample)}")

# Apply SMOTE to oversample the minority class with synthetic rows so that the
# classes are balanced (compare the two printed reports).
# NOTE(review): SMOTE interpolates between nearest neighbours, and X_sample is
# in raw, unscaled units -- confirm whether standardized features were intended.
# NOTE(review): if a train/test split follows, resampling before the split lets
# synthetic rows derived from test rows reach training -- confirm the order.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_sample, y_sample)

# Balance Report (After SMOTE)
print(f"Distribution after SMOTE: {Counter(y_resampled)}")