In [2]:
import pandas as pd
import torch
from torch_geometric.data import Data
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib
import os

# Locations used by this script; all are relative to the working directory.
DATA_PATH = "../data/processed/flights_final_dataset.csv"   # input flight table (CSV)
OUTPUT_PATH = "../data/processed/graph_data_full.pt"        # serialized graph object
STATE_PATH = "../state/models"                              # fitted scaler / encoders

# Create the artifact directory before any work is done; a no-op when it
# already exists.
os.makedirs(name=STATE_PATH, exist_ok=True)

# Load the whole table into memory and report its (rows, columns) size.
df = pd.read_csv(DATA_PATH)
input_shape = df.shape
print(f"Input data shape: {input_shape}")


print("Encoding Airports and Carriers...")

# A single encoder is fitted on the union of origin and destination codes, so
# an airport gets the same integer index whichever end of a flight it is on.
all_airports = pd.concat([df['ORIGIN'], df['DEST']]).unique()
le_airport = LabelEncoder().fit(all_airports)

# Map both endpoint columns through the shared encoder.
for node_col, code_col in (('Src_Node', 'ORIGIN'), ('Dst_Node', 'DEST')):
    df[node_col] = le_airport.transform(df[code_col])

airport_count = len(le_airport.classes_)
print(f" - Number of airports (Nodes): {airport_count}")

# Carriers get their own encoder; each distinct carrier code becomes an
# integer id stored in the 'Carrier_ID' column.
le_carrier = LabelEncoder().fit(df['OP_UNIQUE_CARRIER'])
df['Carrier_ID'] = le_carrier.transform(df['OP_UNIQUE_CARRIER'])

carrier_count = len(le_carrier.classes_)
print(f" - Number of carriers: {carrier_count}")


print("Feature Scaling (StandardScaler)...")

# Columns that become per-flight (edge) features. They are assembled from
# three groups; the concatenation order fixes the column order of the scaled
# matrix, so it must not be changed.
calendar_cols = ['DAY_OF_MONTH', 'DAY_OF_WEEK', 'CRS_MINUTES']
flight_cols = ['DISTANCE', 'Carrier_ID']
# NOTE(review): presumably weather observations - confirm against the dataset.
weather_cols = ['temp', 'rhum', 'prcp', 'wspd', 'coco']
feature_cols = calendar_cols + flight_cols + weather_cols

# NOTE(review): 'Carrier_ID' is an integer label produced by a LabelEncoder,
# yet it is standardized here like a continuous quantity - confirm this is
# intended.
# NOTE(review): the scaler is fitted on every row of df; if a train/test split
# happens downstream, the test rows influence the scaling statistics.
scaler = StandardScaler()
scaler.fit(df[feature_cols])
features_scaled = scaler.transform(df[feature_cols])


print("Building Graph Data...")

# Connectivity as a [2, num_flights] integer tensor: row 0 holds the source
# node of every flight, row 1 the destination node.
# Each column is converted on its own and the two are combined with
# torch.stack. Passing a Python list of NumPy arrays straight to torch.tensor
# goes through a slow element-wise path and triggers a UserWarning.
src_nodes = torch.tensor(df['Src_Node'].to_numpy(), dtype=torch.long)
dst_nodes = torch.tensor(df['Dst_Node'].to_numpy(), dtype=torch.long)
edge_index = torch.stack([src_nodes, dst_nodes], dim=0)

# One row of scaled features per flight, in the same row order as edge_index
# columns (both are derived from df without reordering).
edge_attr = torch.tensor(features_scaled, dtype=torch.float)

# Per-flight target taken from the 'DEP_DEL15' column, reshaped from
# [num_flights] to [num_flights, 1].
y = torch.tensor(df['DEP_DEL15'].to_numpy(), dtype=torch.float).unsqueeze(1)

# Node features: an identity matrix, i.e. one one-hot row per airport.
num_nodes = len(le_airport.classes_)
x = torch.eye(num_nodes, dtype=torch.float)

data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)

# Validate BEFORE writing anything: with raise_on_error=True an inconsistent
# graph raises here, so a broken graph and its preprocessors are never
# persisted. The result is kept so the summary below prints it unchanged.
is_valid = data.validate(raise_on_error=True)

print("Saving files...")

# Make sure the destination directory of the graph file exists (skipped when
# OUTPUT_PATH has no directory component).
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
torch.save(data, OUTPUT_PATH)

# Persist the fitted preprocessors next to each other under STATE_PATH.
for filename, artifact in (
    ('scaler.pkl', scaler),
    ('airport_encoder.pkl', le_airport),
    ('carrier_encoder.pkl', le_carrier),
):
    joblib.dump(artifact, os.path.join(STATE_PATH, filename))

# Human-readable summary of the graph that was just written.
print("\n" + "="*30)
print(data)
print(f" - Number of Nodes: {data.num_nodes}")
print(f" - Number of Edges (Flights): {data.num_edges}")
print(f" - Edge features: {data.num_edge_features}")
print(f" - Validation check: {is_valid}")


Input data shape: (190679, 13)
Encoding Airports and Carriers...
 - Number of airports (Nodes): 273
 - Number of carriers: 14
Feature Scaling (StandardScaler)...
Building Graph Data...
Saving files...

Data(x=[273, 273], edge_index=[2, 190679], edge_attr=[190679, 10], y=[190679, 1])
 - Number of Nodes: 273
 - Number of Edges (Flights): 190679
 - Edge features: 10
 - Validation check: True
