# Machine learning
In this notebook we train and compare several regression models that predict a flight's arrival delay.

In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('df_cleaned.csv')
df_cancel = pd.read_csv('df_cancel.csv')

In [37]:
# Add the missing columns to the regular flights dataframe
df['CANCELLED'] = False
df['CANCELLATION_CODE'] = np.nan  # or '' depending on your preference
df['DIVERTED'] = False

# Ensure column order and types match
# You might need to adjust dtypes to match
df['CANCELLED'] = df['CANCELLED'].astype(bool)
df['DIVERTED'] = df['DIVERTED'].astype(bool)

# Combine the dataframes
combined_flights = pd.concat([df, df_cancel], ignore_index=True)

# Optional: Verify the combination
print(combined_flights['CANCELLED'].value_counts())
print(combined_flights['DIVERTED'].value_counts())

CANCELLED
False    6273470
True       96012
Name: count, dtype: int64
DIVERTED
False    6355322
True       14160
Name: count, dtype: int64


Here we convert the time attributes from `hh:mm` strings into minutes since midnight:

In [38]:
# Show the DEP_TIME value of row 39 before the time columns are converted below
print(combined_flights.iloc[39]['DEP_TIME'])

14:34


In [39]:
def time_to_minutes(time_str):
    """Convert an ``hh:mm`` time string into minutes since midnight.

    Parameters
    ----------
    time_str : str, number or missing value
        A string such as ``"14:34"``. Missing values (``None``/``NaN``) and
        values that are already numeric are accepted as well.

    Returns
    -------
    int, number or None
        ``hours * 60 + minutes`` for a string, ``None`` for a missing value,
        and the input itself when it is already a number.

    Raises
    ------
    ValueError
        If a string does not consist of exactly two integer parts separated
        by ``":"``.
    """
    if pd.isna(time_str):
        return None
    # The conversion loop overwrites the time columns in place, so re-running
    # that cell feeds already-converted numbers back into this function.
    # Pass them through instead of failing on ``.split``.
    if isinstance(time_str, (int, float, np.integer, np.floating)):
        return time_str
    hours, minutes = map(int, time_str.split(":"))
    return hours * 60 + minutes

time_columns = [
    'CRS_DEP_TIME', 'DEP_TIME',
    'WHEELS_OFF', 'WHEELS_ON',
    'CRS_ARR_TIME', 'ARR_TIME',
]

# Overwrite every time column with its value in minutes since midnight
for time_column in time_columns:
    converted = combined_flights[time_column].apply(time_to_minutes)
    combined_flights[time_column] = converted

# Same row as inspected before the conversion, now expressed in minutes
print(combined_flights.iloc[39]['DEP_TIME'])

874.0


Now we decide what attribute we will try to predict and what attributes should be available for the prediction.

For this, let's once again review the list of attributes:

<font color='green'> **Green-colored attributes are available before the flight**</font>
 
<font color='lightblue'> **Blue-colored attributes are the ones we can or want to predict**</font>

- <font color='green'>OP_CARRIER        </font> 
- <font color='green'>OP_CARRIER_FL_NUM </font> 
- <font color='green'>ORIGIN            </font> 
- <font color='green'>DEST              </font> 
- <font color='green'>CRS_DEP_TIME      </font> 
- <font color='green'>CRS_ARR_TIME      </font> 
- <font color='green'>CRS_ELAPSED_TIME  </font> 
- <font color='green'>DISTANCE          </font> 
- <font color='green'>MONTH             </font> 
- <font color='green'>DAY               </font> 
- <font color='green'>YEAR              </font> 
- <font color='green'>DAY_OF_WEEK       </font> 
- DEP_TIME               
- DEP_DELAY              
- TAXI_OUT               
- WHEELS_OFF             
- WHEELS_ON              
- TAXI_IN                
- ARR_TIME               
- <font color='lightblue'>ARR_DELAY</font>
- ACTUAL_ELAPSED_TIME    
- AIR_TIME               
- CARRIER_DELAY          
- WEATHER_DELAY          
- NAS_DELAY              
- SECURITY_DELAY         
- LATE_AIRCRAFT_DELAY    
- <font color='lightblue'> CANCELLED        </font>     
- <font color='lightblue'> CANCELLATION_CODE</font>    
- <font color='lightblue'> DIVERTED         </font>       

In [40]:
# List the dtype of every column of the combined dataframe
print(combined_flights.dtypes)

OP_CARRIER              object
OP_CARRIER_FL_NUM        int64
ORIGIN                  object
DEST                    object
CRS_DEP_TIME             int64
DEP_TIME               float64
DEP_DELAY              float64
TAXI_OUT               float64
WHEELS_OFF             float64
WHEELS_ON              float64
TAXI_IN                float64
CRS_ARR_TIME             int64
ARR_TIME               float64
ARR_DELAY              float64
CRS_ELAPSED_TIME         int64
ACTUAL_ELAPSED_TIME    float64
AIR_TIME               float64
DISTANCE                 int64
CARRIER_DELAY          float64
WEATHER_DELAY          float64
NAS_DELAY              float64
SECURITY_DELAY         float64
LATE_AIRCRAFT_DELAY    float64
MONTH                    int64
DAY                      int64
YEAR                     int64
DAY_OF_WEEK              int64
CANCELLED                 bool
CANCELLATION_CODE       object
DIVERTED                  bool
dtype: object


In [41]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [42]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [56]:
# Preprocessing function
def preprocess_for_arrival_delay(combined_flights):
    """Build the feature matrix and target used by the arrival-delay models.

    Only attributes known before the flight are kept as features. Rows with a
    missing value in any feature or in ``ARR_DELAY`` are dropped. The input
    dataframe is not modified.

    The scheduled time columns of ``combined_flights`` were already converted
    to minutes since midnight earlier in the notebook (``"14:34"`` -> ``874``),
    so they are taken over as they are. Interpreting them as ``HHMM`` numbers
    again (``874`` -> ``8 * 60 + 74``) would corrupt both time features.
    ``"hh:mm"`` strings are still accepted for unconverted data.

    Parameters
    ----------
    combined_flights : pandas.DataFrame
        Must contain the feature columns listed below and ``ARR_DELAY``.

    Returns
    -------
    X : pandas.DataFrame
        Features, with ``CRS_DEP_TIME`` / ``CRS_ARR_TIME`` replaced by
        ``CRS_DEP_TIME_MINUTES`` / ``CRS_ARR_TIME_MINUTES`` (appended last).
    y : pandas.Series
        The ``ARR_DELAY`` target, aligned with ``X``.
    """
    # Features available before the flight
    features = [
        'OP_CARRIER',
        'ORIGIN',
        'DEST',
        'CRS_DEP_TIME',
        'CRS_ARR_TIME',
        'CRS_ELAPSED_TIME',
        'DISTANCE',
        'MONTH',
        'DAY',
        'YEAR',
        'DAY_OF_WEEK'
    ]

    def as_minutes(time_value):
        """Return minutes since midnight for an 'hh:mm' string or a number."""
        if isinstance(time_value, str):
            hours, minutes = map(int, time_value.split(':'))
            return hours * 60 + minutes
        # Already minutes since midnight: only normalise to int
        return int(time_value)

    # Column selection plus dropna() yields a new frame, so the caller's
    # dataframe is left untouched.
    df = combined_flights[features + ['ARR_DELAY']].dropna()

    # Append the minute columns, then remove the originals they replace
    df = df.assign(
        CRS_DEP_TIME_MINUTES=df['CRS_DEP_TIME'].apply(as_minutes),
        CRS_ARR_TIME_MINUTES=df['CRS_ARR_TIME'].apply(as_minutes),
    ).drop(columns=['CRS_DEP_TIME', 'CRS_ARR_TIME'])

    # Separate features and target
    X = df.drop(columns='ARR_DELAY')
    y = df['ARR_DELAY']

    return X, y

# Example prediction
def predict_arrival_delay(model, flight_data):
    """Predict the arrival delay for the flights in ``flight_data``.

    Parameters
    ----------
    model : fitted estimator
        Object with a ``predict`` method, e.g. a pipeline returned by one of
        the ``train_arrival_delay_model*`` functions.
    flight_data : pandas.DataFrame
        Flights to predict. Only the feature columns are needed; an
        ``ARR_DELAY`` column may be absent or contain missing values.

    Returns
    -------
    The output of ``model.predict`` for the preprocessed flights.
    """
    # preprocess_for_arrival_delay() selects ARR_DELAY and drops rows where
    # it is missing. A flight to be predicted does not have a known delay
    # yet, so feed a placeholder target: it is split off again and discarded.
    # assign() returns a new frame, so the caller's data stays untouched.
    prediction_input = flight_data.assign(ARR_DELAY=0.0)

    # Apply the same preprocessing as for the training data, keep features only
    processed_data, _ = preprocess_for_arrival_delay(prediction_input)

    return model.predict(processed_data)

# Optional: Feature importance
def get_feature_importance(model):
    """Rank the transformed features of a fitted pipeline by importance.

    Parameters
    ----------
    model : fitted pipeline
        Must expose ``named_steps`` with a ``'preprocessor'`` step (holding
        fitted ``'num'`` and ``'cat'`` transformers) and a ``'regressor'``
        step that provides ``feature_importances_``.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature`` and ``importance``, sorted by descending
        importance.
    """
    fitted_transformers = model.named_steps['preprocessor'].named_transformers_

    # Names are collected numeric first, categorical second
    feature_names = [
        name
        for transformer_key in ('num', 'cat')
        for name in fitted_transformers[transformer_key].get_feature_names_out().tolist()
    ]

    ranking = pd.DataFrame({
        'feature': feature_names,
        'importance': model.named_steps['regressor'].feature_importances_,
    })
    return ranking.sort_values('importance', ascending=False)

In [55]:
def train_arrival_delay_model(combined_flights):
    """Train and evaluate a random-forest regressor for ``ARR_DELAY``.

    The data is preprocessed with ``preprocess_for_arrival_delay``, split
    80/20 (``random_state=42``), and fitted inside a pipeline that scales the
    numeric columns and one-hot encodes the categorical ones. MAE, RMSE and
    R-squared on the held-out 20% are printed.

    Parameters
    ----------
    combined_flights : pandas.DataFrame
        Flights dataframe accepted by ``preprocess_for_arrival_delay``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline (steps ``'preprocessor'`` and ``'regressor'``).
    """
    features, target = preprocess_for_arrival_delay(combined_flights)

    # Hold out 20% of the rows for evaluation
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Columns that get standardised
    scaled_columns = [
        'CRS_DEP_TIME_MINUTES',
        'CRS_ARR_TIME_MINUTES',
        'CRS_ELAPSED_TIME',
        'DISTANCE',
        'MONTH',
        'DAY',
        'YEAR',
        'DAY_OF_WEEK',
    ]
    # Columns that get one-hot encoded; categories unseen during fitting are ignored
    encoded_columns = ['OP_CARRIER', 'ORIGIN', 'DEST']

    model = Pipeline(steps=[
        ('preprocessor', ColumnTransformer(transformers=[
            ('num', StandardScaler(), scaled_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), encoded_columns),
        ])),
        ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
    ])
    model.fit(X_train, y_train)

    # Score the fitted pipeline on the held-out rows
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)

    report_lines = (
        "Model Performance:",
        f"Mean Absolute Error: {mae:.2f} minutes",
        f"Root Mean Squared Error: {rmse:.2f} minutes",
        f"R-squared Score: {r2:.4f}",
    )
    for report_line in report_lines:
        print(report_line)

    return model

In [44]:
def train_arrival_delay_model_linear_regression(combined_flights):
    """Train and evaluate a linear-regression model for ``ARR_DELAY``.

    The data is preprocessed with ``preprocess_for_arrival_delay``, split
    80/20 (``random_state=42``), and fitted inside a pipeline that scales the
    numeric columns and one-hot encodes the categorical ones. MAE, RMSE and
    R-squared on the held-out 20% are printed.

    ``LinearRegression`` comes from ``sklearn.linear_model`` and must be
    imported alongside the notebook's other scikit-learn imports; without it
    this function fails with ``NameError``.

    Parameters
    ----------
    combined_flights : pandas.DataFrame
        Flights dataframe accepted by ``preprocess_for_arrival_delay``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline (steps ``'preprocessor'`` and ``'regressor'``).
    """
    # Preprocess data
    X, y = preprocess_for_arrival_delay(combined_flights)

    # Hold out 20% of the rows for evaluation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Columns that get standardised
    numeric_features = [
        'CRS_DEP_TIME_MINUTES',
        'CRS_ARR_TIME_MINUTES',
        'CRS_ELAPSED_TIME',
        'DISTANCE',
        'MONTH',
        'DAY',
        'YEAR',
        'DAY_OF_WEEK'
    ]

    # Columns that get one-hot encoded
    categorical_features = [
        'OP_CARRIER',
        'ORIGIN',
        'DEST'
    ]

    # Categories unseen during fitting are ignored at prediction time
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])

    # Create a pipeline with Linear Regression
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])

    # Train the model
    model.fit(X_train, y_train)

    # Score the fitted pipeline on the held-out rows
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print("Linear Regression Model Performance:")
    print(f"Mean Absolute Error: {mae:.2f} minutes")
    print(f"Root Mean Squared Error: {rmse:.2f} minutes")
    print(f"R-squared Score: {r2:.4f}")

    return model

In [51]:
from sklearn.ensemble import GradientBoostingRegressor

def train_arrival_delay_model_gbm(combined_flights):
    """Train and evaluate a gradient-boosting regressor for ``ARR_DELAY``.

    The data is preprocessed with ``preprocess_for_arrival_delay``, split
    80/20 (``random_state=42``), and fitted inside a pipeline that scales the
    numeric columns and one-hot encodes the categorical ones. MAE, RMSE and
    R-squared on the held-out 20% are printed.

    Parameters
    ----------
    combined_flights : pandas.DataFrame
        Flights dataframe accepted by ``preprocess_for_arrival_delay``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline (steps ``'preprocessor'`` and ``'regressor'``).
    """
    features, target = preprocess_for_arrival_delay(combined_flights)

    # Hold out 20% of the rows for evaluation
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Columns that get standardised
    scaled_columns = [
        'CRS_DEP_TIME_MINUTES',
        'CRS_ARR_TIME_MINUTES',
        'CRS_ELAPSED_TIME',
        'DISTANCE',
        'MONTH',
        'DAY',
        'YEAR',
        'DAY_OF_WEEK',
    ]
    # Columns that get one-hot encoded; categories unseen during fitting are ignored
    encoded_columns = ['OP_CARRIER', 'ORIGIN', 'DEST']

    model = Pipeline(steps=[
        ('preprocessor', ColumnTransformer(transformers=[
            ('num', StandardScaler(), scaled_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), encoded_columns),
        ])),
        ('regressor', GradientBoostingRegressor(random_state=42)),
    ])
    model.fit(X_train, y_train)

    # Score the fitted pipeline on the held-out rows
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)

    report_lines = (
        "Gradient Boosting Machine (GBM) Model Performance:",
        f"Mean Absolute Error: {mae:.2f} minutes",
        f"Root Mean Squared Error: {rmse:.2f} minutes",
        f"R-squared Score: {r2:.4f}",
    )
    for report_line in report_lines:
        print(report_line)

    return model

In [63]:
%pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [84]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def train_arrival_delay_model_lightgbm(combined_flights):
    """Tune, train and evaluate a LightGBM regressor for ``ARR_DELAY``.

    The data is preprocessed with ``preprocess_for_arrival_delay`` and split
    80/20 (``random_state=42``). A randomized search (100 sampled settings,
    5-fold cross-validation, scored by negative mean squared error, all
    cores) is run on the training part over a pipeline that scales the
    numeric columns, one-hot encodes the categorical ones and fits an
    ``LGBMRegressor``. MAE, RMSE and R-squared of the best pipeline on the
    held-out 20% are printed.

    Parameters
    ----------
    combined_flights : pandas.DataFrame
        Flights dataframe accepted by ``preprocess_for_arrival_delay``.

    Returns
    -------
    tuple
        ``(best_model, best_params)``: the best pipeline found by the search
        and the parameter setting that produced it.
    """
    features, target = preprocess_for_arrival_delay(combined_flights)

    # Hold out 20% of the rows for the final evaluation
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Columns that get standardised
    scaled_columns = [
        'CRS_DEP_TIME_MINUTES',
        'CRS_ARR_TIME_MINUTES',
        'CRS_ELAPSED_TIME',
        'DISTANCE',
        'MONTH',
        'DAY',
        'YEAR',
        'DAY_OF_WEEK',
    ]
    # Columns that get one-hot encoded; categories unseen during fitting are ignored
    encoded_columns = ['OP_CARRIER', 'ORIGIN', 'DEST']

    untuned_pipeline = Pipeline(steps=[
        ('preprocessor', ColumnTransformer([
            ('num', StandardScaler(), scaled_columns),
            ('cat', OneHotEncoder(handle_unknown='ignore'), encoded_columns),
        ])),
        ('regressor', lgb.LGBMRegressor(verbosity=-1, random_state=42)),
    ])

    # Candidate values per hyperparameter; the 'regressor__' prefix routes
    # each one to the pipeline's regressor step.
    search_space = {
        'regressor__num_leaves': [31, 50, 100],
        'regressor__learning_rate': [0.01, 0.05, 0.1],
        'regressor__n_estimators': [100, 200, 300],
        'regressor__max_depth': [5, 10, 15],
        'regressor__colsample_bytree': [0.7, 0.8, 0.9],
    }

    tuner = RandomizedSearchCV(
        estimator=untuned_pipeline,
        param_distributions=search_space,
        n_iter=100,
        cv=5,
        scoring='neg_mean_squared_error',
        random_state=42,
        n_jobs=-1,
    )
    tuner.fit(X_train, y_train)

    # Best pipeline found by the search
    best_model = tuner.best_estimator_

    # Score it on the held-out rows
    predictions = best_model.predict(X_test)
    mae = mean_absolute_error(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)

    report_lines = (
        "LightGBM Model Performance with Tuning:",
        f"Mean Absolute Error: {mae:.2f} minutes",
        f"Root Mean Squared Error: {rmse:.2f} minutes",
        f"R-squared Score: {r2:.4f}",
    )
    for report_line in report_lines:
        print(report_line)

    return best_model, tuner.best_params_


In [None]:
# Demo: train on a sample of the data and predict the delay of one flight.
chosen_flight = 67
# Double brackets keep the selection a one-row DataFrame, which is what
# predict_arrival_delay() passes on to the preprocessing function.
single_flight = combined_flights.iloc[[chosen_flight]]
# Prints the whole one-element Series (index, value, name and dtype)
print(f"Actual delay : {single_flight['ARR_DELAY']}")

# Define feature lists before model training
# NOTE(review): in the cells visible here, every train_* function builds its
# own local feature lists, so these two module-level lists are not read by
# them. They also name DEP_TIME / ARR_TIME, which the attribute list above
# does not mark as available before the flight. Confirm whether they are
# still needed.
numeric_features = ['DISTANCE', 'CRS_ELAPSED_TIME', 'DEP_TIME', 'ARR_TIME']
categorical_features = ['OP_CARRIER', 'ORIGIN', 'DEST', 'DAY_OF_WEEK', 'MONTH']

# Train on a reproducible 1% random sample of the rows
combined_flights_2 = combined_flights.sample(frac=0.01, random_state=52)

## RFR prediction.
arrival_delay_model = train_arrival_delay_model(combined_flights_2)
predicted_delay = predict_arrival_delay(arrival_delay_model, single_flight)
print("Predicted Delay (RFR):", predicted_delay)

# Get feature importances
# feature_importances = get_feature_importance(arrival_delay_model)
# print(feature_importances.head(10))  # Top 10 most important features


## LINEAR REGRESSION prediction.
#print("---")
#arrival_delay_model_2 = train_arrival_delay_model_linear_regression(combined_flights_2)
#predicted_delay = predict_arrival_delay(arrival_delay_model_2, single_flight)
#print("Predicted Delay (LR):", predicted_delay)


## GBM prediction.
# Only the separator below is active; the GBM run itself is commented out.
print("---")
#arrival_delay_model_3 = train_arrival_delay_model_gbm(combined_flights_2)
#predicted_delay = predict_arrival_delay(arrival_delay_model_3, single_flight)
#print("Predicted Delay (GBM):", predicted_delay)


## lightGBM prediction.
#arrival_delay_model_4 = train_arrival_delay_model_lightgbm(combined_flights_2)[0]
#predicted_delay = predict_arrival_delay(arrival_delay_model_4, single_flight)
#print("Predicted Delay (lightGBM):", predicted_delay)



Actual delay : 67    10.0
Name: ARR_DELAY, dtype: float64


# Machine learning

Ideas to look into: 
- **Using only the data we know before the flight to predict something about the flight**
- **Using data about the flight to predict something about its delays, e.g. how long one particular delay is, given the total delay**

