In [2]:
# Initial imports

In [1]:
import pandas as pd

# Load the full training set into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
data = pd.read_csv(filepath_or_buffer='Training_Data_v5.csv')

In [30]:
# Show the loaded frame; the printed repr below is abbreviated to the first and last rows.
print(data)

         FlightDate                                    Airline Origin Dest  \
0        2022-04-04  Commutair Aka Champlain Enterprises, Inc.    GJT  DEN   
1        2022-04-04  Commutair Aka Champlain Enterprises, Inc.    HRL  IAH   
2        2022-04-04  Commutair Aka Champlain Enterprises, Inc.    DRO  DEN   
3        2022-04-04  Commutair Aka Champlain Enterprises, Inc.    IAH  GPT   
4        2022-04-04  Commutair Aka Champlain Enterprises, Inc.    DRO  DEN   
...             ...                                        ...    ...  ...   
4078313  2022-03-31                          Republic Airlines    MSY  EWR   
4078314  2022-03-17                          Republic Airlines    CLT  EWR   
4078315  2022-03-08                          Republic Airlines    ALB  ORD   
4078316  2022-03-25                          Republic Airlines    EWR  PIT   
4078317  2022-03-07                          Republic Airlines    EWR  RDU   

         Cancelled  Diverted  CRSDepTime  DepTime  DepDelayMinu

In [2]:
from river import compose
from river import linear_model
from river import metrics
from river import preprocessing
from river import optim
from river import stream


In [3]:
# Step 1: Stream the data
# iter_pandas yields one (x, y) pair per row. Without an explicit `y` every
# pair carries y=None, so the target series is passed here to make `y` the
# arrival delay of the row.
# The column name is spelled out because `target_column` is only defined in a
# later cell. The target is intentionally left inside `x` as well, so code that
# selects it from the feature dict keeps working.
# NOTE: this is a single-pass generator; once a loop has consumed it, a second
# loop over the same object sees no rows. Build a new stream for each pass.
streaming_data = stream.iter_pandas(data, y=data['ArrDelayMinutes'])

In [4]:
# Step 2: Data Preprocessing
# Column the regressor is trained to predict.
target_column = 'ArrDelayMinutes'

# Raw input columns used as predictors.
selected_features = [
    'DayOfWeek',
    'Operating_Airline',
    'OriginAirportID',
    'DestAirportID',
]

# Every selected predictor is handled as a categorical value.
categorical_features = list(selected_features)

# NOTE(review): the target is the only entry in this list, so any pipeline
# that selects `numerical_features` receives the label as an input feature.
# Confirm that this is intended.
numerical_features = [target_column]

# One transformer instance per feature family.
cat_encoder = preprocessing.OneHotEncoder()
num_scaler = preprocessing.StandardScaler()


In [5]:
# Create a processing pipeline
# Only the categorical predictors are routed through the pipeline. The target
# column is deliberately NOT selected: scaling the label and merging it into
# the feature dict would hand the regressor the answer it is meant to predict
# (label leakage). It would also force every prediction request to carry a
# dummy target value.
# `model` still exposes learn_one / transform_one, so callers use it unchanged.
model = compose.Select(*categorical_features) | cat_encoder

In [None]:
################# TRAIN MODEL OPTION 1 ##################

# Apply transformations
# Each stored row is the encoded feature dict plus the RAW target under
# `target_column`, so later cells can read both from a single dict.
# Note that this keeps every transformed row in memory.
transformed_data = []

# A fresh stream is created here instead of draining a shared generator, so
# this cell can be re-run and does not starve other loops of rows.
for x, _ in stream.iter_pandas(data):
    target = x[target_column]

    # A row without a target (None / NaN) cannot be used for training or scoring.
    if pd.isna(target):
        continue

    # learn_one only updates the transformers' running state; the transformed
    # features come from transform_one.
    model.learn_one(x)
    transformed_row = model.transform_one(x)

    # Overwrite any transformed copy of the target with the raw value.
    transformed_row[target_column] = target
    transformed_data.append(transformed_row)

In [None]:
# Step 3: Define and Train the Model

# Initialize the linear regression model
lr = linear_model.LinearRegression()

# Train the model
for row in transformed_data:
    y = row[target_column]

    # After one-hot encoding the keys are per-category indicator names, not the
    # raw column names in `selected_features`, so looking rows up by the raw
    # names fails. Every key except the target is a feature.
    x = {key: value for key, value in row.items() if key != target_column}

    lr.learn_one(x, y)



In [None]:
# Step 4: Evaluate the Model

# Define metric for evaluation
metric = metrics.MAE()  # Mean Absolute Error

# Score the model on the rows in `transformed_data`.
# NOTE: these are the same rows the model was trained on, so this is an
# in-sample error, not a measure of performance on unseen flights.
for row in transformed_data:
    y = row[target_column]

    # One-hot encoded rows are keyed by indicator names rather than the raw
    # column names, so every non-target key is used as a feature.
    x = {key: value for key, value in row.items() if key != target_column}

    y_pred = lr.predict_one(x)
    metric.update(y, y_pred)

# Get the mean absolute error
mae = metric.get()

# Output the mean absolute error
print(f"Mean Absolute Error: {mae}")

In [None]:
######### ALTERNATIVE AND MIGHT BE BETTER AND SIMULATES REAL TIME: #######
# Initialize the linear regression model
lr = linear_model.LinearRegression()

# Define metric for evaluation
metric = metrics.MAE()  # Mean Absolute Error

# Train and evaluate in a single pass, one row at a time.
# A fresh stream is built here because a generator that an earlier loop has
# consumed yields no rows. The target is passed as `y`; otherwise every `y`
# would be None.
for x, y in stream.iter_pandas(data, y=data[target_column]):
    # A row without a target (None / NaN) cannot be scored or learned from.
    if pd.isna(y):
        continue

    # Update the transformers, then read the transformed features.
    # learn_one does not return them, transform_one does.
    model.learn_one(x)
    x_transformed = {
        key: value
        for key, value in model.transform_one(x).items()
        if key != target_column  # never feed the label back in as a feature
    }

    # Test-then-train: predict BEFORE the regressor sees this sample, so the
    # metric reflects performance on data that is unseen at that moment.
    y_pred = lr.predict_one(x_transformed)
    metric.update(y, y_pred)

    # Train the model with the new sample
    lr.learn_one(x_transformed, y)

# Get the mean absolute error
mae = metric.get()

# Output the mean absolute error
print(f"Mean Absolute Error: {mae}")



In [None]:
#### OUTPUT ###

# Function to predict delay for user input
def predict_delay(user_input):
    """Predict the arrival delay for one flight described by raw column values.

    Args:
        user_input: dict mapping raw column names to values, in the same form
            as the rows passed to the preprocessing pipeline ``model``.

    Returns:
        Whatever ``lr.predict_one`` returns for the transformed features.
    """
    # Use transform_one, not learn_one: a prediction request must not update
    # the pipeline's learned state, and learn_one does not return features.
    transformed_input = model.transform_one(user_input)

    # Drop the target column if the pipeline passed it through, so a
    # placeholder label in the request cannot influence the prediction.
    features = {
        key: value
        for key, value in transformed_input.items()
        if key != target_column
    }

    # Predict the delay using the trained model
    return lr.predict_one(features)

# User input to predict delay
# NOTE(review): these are illustrative values. Confirm they use the same
# representation as the training columns (for example, whether DayOfWeek and
# the airport IDs are stored as numbers rather than names).
user_input = dict([
    ('DayOfWeek', 'Monday'),
    ('Operating_Airline', 'ABC Airlines'),
    ('OriginAirportID', 'JFK'),
    ('DestAirportID', 'LAX'),
    # Placeholder so that a pipeline selecting the target column finds the key.
    ('ArrDelayMinutes', 0),
])

# Run the example request through the predictor.
predicted_delay = predict_delay(user_input)

# Report the result rounded to two decimals.
print(f"Predicted delay: {predicted_delay:.2f} minutes")