In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer

# Load the dataset
df = pd.read_csv('flights.csv')

# Features and target
features = ['month', 'day', 'hour', 'minute', 'carrier', 'origin', 'dest', 'distance']
target = 'arr_delay'

# Binary target: 1 = delayed, 0 = on time
df['is_delayed'] = (df[target] > 15).astype(int)

# Drop rows with missing values in essential columns
df = df.dropna(subset=['is_delayed'] + features)

# Encode categorical variables
label_encoders = {}
for col in ['carrier', 'origin', 'dest']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Split dataset
X = df[features]
y = df['is_delayed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values (if any)
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define and train MLP model
mlp_model = MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', max_iter=500, random_state=42)
mlp_model.fit(X_train_scaled, y_train)

# Evaluate
y_pred = mlp_model.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Function for predicting delay
def predict_delay(new_flight_data):
    """
    Predict whether a flight will be delayed using the MLP model.
    
    Parameters:
    new_flight_data (dict): Dictionary with flight details
    
    Returns:
    str: Prediction result
    """
    input_df = pd.DataFrame([new_flight_data])

    for col in ['carrier', 'origin', 'dest']:
        input_df[col] = label_encoders[col].transform(input_df[col])
    
    input_imputed = imputer.transform(input_df)
    input_scaled = scaler.transform(input_imputed)
    
    prediction = mlp_model.predict(input_scaled)
    probability = mlp_model.predict_proba(input_scaled)

    if prediction[0] == 1:
        return f"Flight is likely to be delayed (probability: {probability[0][1]:.2f})"
    else:
        return f"Flight is likely to be on time (probability: {probability[0][0]:.2f})"

# Example usage
example_flight = {
    'month': 1,
    'day': 2,
    'hour': 15,
    'minute': 30,
    'carrier': 'UA',
    'origin': 'EWR',
    'dest': 'ORD',
    'distance': 719
}

print("\nExample Prediction:")
print(predict_delay(example_flight))


Accuracy: 0.8089490186169908

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.97      0.89     69663
           1       0.58      0.17      0.26     17408

    accuracy                           0.81     87071
   macro avg       0.70      0.57      0.58     87071
weighted avg       0.77      0.81      0.76     87071


Confusion Matrix:
[[67517  2146]
 [14489  2919]]

Example Prediction:
Flight is likely to be delayed (probability: 0.55)
