# Phase 1: Data Acquisition & Preprocessing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive

# Make the user's Google Drive visible inside the Colab filesystem.
drive.mount('/content/drive')

# Location of the flights dataset.
# Adjust this if 'flights.csv' lives somewhere else in your Drive.
path = '/content/drive/MyDrive/dataset/flights.csv'

# low_memory=False: infer each column's dtype from the whole file instead of
# chunk by chunk, which avoids mixed-type columns.
df = pd.read_csv(filepath_or_buffer=path, low_memory=False)

print(f"Dataset loaded with {len(df)} rows.")

In [None]:
# Classification target derived from ARRIVAL_DELAY:
#   1 -> the flight arrived late (delay strictly greater than zero)
#   0 -> the flight arrived on time or early
# A missing ARRIVAL_DELAY compares as False here; those rows are removed below.
arrived_late = df['ARRIVAL_DELAY'].gt(0)
df['is_delayed'] = arrived_late.astype(int)

# Keep only rows whose ARRIVAL_DELAY is known, so every remaining label
# reflects a real measurement.
df = df.dropna(axis='index', how='any', subset=['ARRIVAL_DELAY'])

print("Step 2 Complete: Target variable 'is_delayed' created.")

In [None]:
# Feature selection: rank numeric columns by absolute correlation with the
# target and keep the ten strongest.
#
# Columns that must never be used as predictors: the target itself, the raw
# arrival delay it is derived from, and the other delay / status columns.
# NOTE(review): this list only removes delay-related columns. Any other numeric
# column that is recorded during or after the flight is still eligible below,
# which would contradict the "available BEFORE the flight arrives" intent --
# confirm against the dataset schema.
cols_to_exclude = ['ARRIVAL_DELAY', 'is_delayed', 'CANCELLED', 'DIVERTED',
                   'AIR_SYSTEM_DELAY', 'SECURITY_DELAY', 'AIRLINE_DELAY',
                   'LATE_AIRCRAFT_DELAY', 'WEATHER_DELAY', 'DEPARTURE_DELAY']

# errors='ignore' lets the cell run even if the CSV lacks some of these columns.
numeric_df = (
    df.drop(columns=cols_to_exclude, errors='ignore')
      .select_dtypes(include=[np.number])
)

# Absolute correlation of every candidate column with the target.
# A constant column has no variance and yields NaN; drop those so they can
# never be picked as a "top" feature when few valid candidates exist.
correlations = (
    numeric_df.corrwith(df['is_delayed'])
              .abs()
              .dropna()
              .sort_values(ascending=False)
)

# Select Top 10
top_10_features = correlations.head(10).index.tolist()
print("Your Top 10 Features are:")
for i, feat in enumerate(top_10_features, 1):
    print(f"{i}. {feat}")

# Keep only these 10 features + our target
df_final = df[top_10_features + ['is_delayed']]

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

# Scaling: standardise every selected feature (zero mean, unit variance).
# NOTE(review): the scaler is fit on ALL rows here, while the train/test split
# happens later in the notebook, so test-set statistics leak into the scaling.
# Fitting on the training portion only would be the stricter approach.
scaler = StandardScaler()
feature_frame = df_final.drop('is_delayed', axis=1)
X = scaler.fit_transform(feature_frame)
y = df_final['is_delayed'].values

# Handling Imbalance
# Rebuild a frame from the scaled array and attach the labels positionally
# (both come from df_final in the same row order).
temp_df = pd.DataFrame(X, columns=feature_frame.columns)
temp_df['is_delayed'] = y

# Decide which class is the minority from the data instead of assuming it is
# class 1: resample(replace=False) raises if asked for more rows than exist.
class_counts = temp_df['is_delayed'].value_counts()
minority_label = class_counts.idxmin()

df_minority = temp_df[temp_df.is_delayed == minority_label]
df_majority = temp_df[temp_df.is_delayed != minority_label]

# Downsample majority to match minority
df_balanced = resample(df_majority, replace=False,
                       n_samples=len(df_minority),
                       random_state=42)

df_final_balanced = pd.concat([df_balanced, df_minority])

print("Data is scaled and balanced.")
print(df_final_balanced['is_delayed'].value_counts(normalize=True))

# Phase 2: Exploratory Data Analysis

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between every column of the balanced dataset
# (selected features plus the target).
corr_matrix = df_final_balanced.corr()

# Draw the matrix as an annotated heatmap on an explicit Axes.
# center=0 puts the midpoint of the diverging colormap at zero correlation.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='RdBu', fmt='.2f', center=0, ax=ax)
ax.set_title('Correlation Heatmap of Selected Features')
plt.show()

print("Correlation heatmap generated.")

In [None]:
# Two side-by-side panels: class balance (left) and one feature vs. the
# target (right).
fig, (ax_counts, ax_box) = plt.subplots(1, 2, figsize=(12, 5))

# Univariate Analysis: Distribution of the target variable.
# seaborn >= 0.13 deprecates `palette` without `hue`; assigning the x variable
# to hue with legend=False is the documented equivalent.
sns.countplot(x='is_delayed', hue='is_delayed', data=df_final_balanced,
              palette='viridis', legend=False, ax=ax_counts)
ax_counts.set_title('Class Distribution (Balanced)')

# Bivariate Analysis: Feature vs Target.
# TAXI_OUT is only present if the correlation ranking selected it, so fall
# back to the first available feature instead of failing on a missing column.
feature_cols = [col for col in df_final_balanced.columns if col != 'is_delayed']
if 'TAXI_OUT' in feature_cols:
    box_feature = 'TAXI_OUT'
    box_title = 'Taxi-Out Time vs Flight Delay'
else:
    box_feature = feature_cols[0]
    box_title = f'{box_feature} vs Flight Delay'

# Values on the y axis are the standardised (scaled) feature values.
sns.boxplot(x='is_delayed', y=box_feature, data=df_final_balanced, ax=ax_box)
ax_box.set_title(box_title)

fig.tight_layout()
plt.show()

# Statistical Summaries as required by the instructions
print("\n--- Statistical Summary of Features ---")
print(df_final_balanced.describe())

In [None]:
from sklearn.model_selection import train_test_split

# 1. Prepare Features (X) and Target (y) from the balanced dataframe
X = df_final_balanced.drop('is_delayed', axis=1)
y = df_final_balanced['is_delayed']

# 2. Split into Training (80%) and Testing (20%) sets.
# stratify=y keeps the class ratio identical in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 3. Create a smaller sample for computationally heavy models (KNN, SVM, RF).
# We take up to 20,000 rows from the training set so those models run quickly
# in Colab. train_test_split raises if train_size is not smaller than the
# number of available rows, so a small training set is used as-is.
small_train_rows = 20000
if len(X_train) > small_train_rows:
    X_train_small, _, y_train_small, _ = train_test_split(
        X_train, y_train,
        train_size=small_train_rows,
        random_state=42,
        stratify=y_train,
    )
else:
    X_train_small, y_train_small = X_train, y_train

print("Step Complete: Data split into Training and Testing sets.")
print(f"Full Train set: {X_train.shape[0]} rows")
print(f"Small Train sample: {X_train_small.shape[0]} rows")

# Phase 3: Machine Learning Analysis & Model Building

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

# Shared accumulator: each model cell further down appends one dict of metrics.
# Run this cell a single time -- re-running it discards the collected scores.
results = list()
print("Metrics imported and results list initialized.")

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

# Fast baseline models, trained on the full training set.
base_models = {
    "Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(max_iter=1000)
}

# Train and evaluate
for name, model in base_models.items():
    print(f"Training {name}...")
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    # Remove any row left by an earlier run of this cell so that re-running it
    # replaces the model's scores instead of duplicating them. Slice
    # assignment keeps the same list object other cells already reference.
    results[:] = [row for row in results if row["Model"] != name]

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, preds),
        "Precision": precision_score(y_test, preds),
        "Recall": recall_score(y_test, preds),
        "F1-Score": f1_score(y_test, preds)
    })

print("Base models trained and added to results.")

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

# Models that are expensive to fit; they are trained on the small sample.
slow_models = {
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "SVM": LinearSVC(max_iter=1000, dual=False)
}

# Train and evaluate using the small sample
for name, model in slow_models.items():
    print(f"Training {name} on sample...")
    model.fit(X_train_small, y_train_small)
    preds = model.predict(X_test)

    # Remove any row left by an earlier run of this cell so that re-running it
    # replaces the model's scores instead of duplicating them. Slice
    # assignment keeps the same list object other cells already reference.
    results[:] = [row for row in results if row["Model"] != name]

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, preds),
        "Precision": precision_score(y_test, preds),
        "Recall": recall_score(y_test, preds),
        "F1-Score": f1_score(y_test, preds)
    })

print("KNN and SVM trained on sample.")

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Define and train a depth-limited Random Forest on the small sample.
# n_jobs=-1 uses all available CPU cores; random_state fixes the result.
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
print("Training Random Forest...")
rf_model.fit(X_train_small, y_train_small)

# Evaluate
rf_preds = rf_model.predict(X_test)

# Remove any row left by an earlier run of this cell so that re-running it
# replaces the Random Forest scores instead of duplicating them. Slice
# assignment keeps the same list object other cells already reference.
results[:] = [row for row in results if row["Model"] != "Random Forest"]

results.append({
    "Model": "Random Forest",
    "Accuracy": accuracy_score(y_test, rf_preds),
    "Precision": precision_score(y_test, rf_preds),
    "Recall": recall_score(y_test, rf_preds),
    "F1-Score": f1_score(y_test, rf_preds)
})

print("Random Forest trained.")

In [None]:
import pandas as pd

# One row per evaluated model, one column per metric collected in `results`.
comparison_df = pd.DataFrame(data=results)

print("\n--- Final Model Comparison Table ---")
# Rich (HTML) rendering of the table in the notebook output.
display(comparison_df)

In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 8))

# Combine all trained models for plotting.
# SVM is included so that every model in the comparison table gets a curve;
# without it the decision_function branch below would never run.
plot_models = {
    "Naive Bayes": base_models["Naive Bayes"],
    "Logistic Regression": base_models["Logistic Regression"],
    "KNN": slow_models["KNN"],
    "SVM": slow_models["SVM"],
    "Random Forest": rf_model
}

for name, model in plot_models.items():
    # roc_curve needs a continuous score per sample. Use the positive-class
    # probability when the model offers one; LinearSVC (SVM) has no
    # predict_proba, so its signed decision_function value is used instead.
    if hasattr(model, "predict_proba"):
        probs = model.predict_proba(X_test)[:, 1]
    else:
        probs = model.decision_function(X_test)

    fpr, tpr, _ = roc_curve(y_test, probs)
    plt.plot(fpr, tpr, label=f"{name} (AUC = {auc(fpr, tpr):.2f})")

# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Model Performance: ROC Curves')
plt.legend(loc='lower right')
plt.grid(alpha=0.3)
plt.show()

In [None]:
import os

# Destination folder on the mounted Drive for the exported artifacts.
drive_path = '/content/drive/MyDrive/dataset/'

# Create the folder if it is missing so joblib.dump cannot fail on a
# non-existent directory; exist_ok=True makes this a no-op otherwise.
os.makedirs(drive_path, exist_ok=True)

# Persist the Random Forest together with the scaler that was fit on the
# selected features, so new data can be transformed the same way.
# NOTE(review): the Random Forest is exported as "best" unconditionally;
# confirm against the comparison table that it is the model you want.
joblib.dump(rf_model, os.path.join(drive_path, 'best_model.pkl'))
joblib.dump(scaler, os.path.join(drive_path, 'scaler.pkl'))

print(f"Exported: 'best_model.pkl' and 'scaler.pkl' saved to {drive_path}")