In [30]:
import numpy as np
import pandas as pd
import xarray as xr
import scipy.sparse as sp
import glob
import os
from tqdm.notebook import tqdm

In [2]:
filedir_path = "../../data/raw/btsdelay/"
files = glob.glob(f"{filedir_path}*.parquet")

raw = pd.read_parquet(files[0]).sample(100_000)


In [29]:
raw.head()

Unnamed: 0,FlightDate,Airline,Origin,Dest,Cancelled,Diverted,CRSDepTime,DepTime,DepDelayMinutes,DepDelay,...,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,ArrTimeBlk,DistanceGroup,DivAirportLandings
311538,2018-12-10,Comair Inc.,PHL,DAY,False,False,1548,1716.0,88.0,88.0,...,1728.0,1836.0,5.0,1746,55.0,1.0,3.0,1700-1759,2,0.0
373480,2018-01-21,Horizon Air,SEA,MSO,False,False,2138,2130.0,0.0,-8.0,...,2139.0,2338.0,4.0,2358,-16.0,0.0,-2.0,2300-2359,2,0.0
62766,2018-11-15,Spirit Air Lines,FLL,LGA,True,False,1632,1806.0,94.0,94.0,...,1824.0,,,1928,,,,1900-1959,5,9.0
139759,2018-03-11,Southwest Airlines Co.,MKE,MCO,False,False,1325,1400.0,35.0,35.0,...,1410.0,1728.0,11.0,1710,29.0,1.0,1.0,1700-1759,5,0.0
143113,2018-12-15,Frontier Airlines Inc.,LAS,SLC,False,False,1804,1815.0,11.0,11.0,...,1827.0,2024.0,6.0,2030,0.0,0.0,0.0,2000-2059,2,0.0


In [5]:
def construct_adj_mat(flights: pd.DataFrame) -> pd.DataFrame:
    """ajd matrix of form Aij => flight from i to j, id est rows are origin, cols are dest """

    squared_delays = (flights
                      .assign(sqrd_delay = lambda df_: df_['ArrDelay']**2,
                              dtime=lambda df_: pd.to_datetime((df_['FlightDate'].astype(str) + " " +
                                                                df_['CRSDepTime'].astype(int).astype(str)),
                                    format="%Y-%m-%d %H%M%S", errors='coerce'))
                      [['Origin', 'Dest', 'sqrd_delay', 'dtime']]
                      .groupby(['Origin', 'Dest', pd.Grouper(key='dtime', freq="H")])
                      .agg(mean_sqr_delay = pd.NamedAgg('sqrd_delay', np.sum),)
                      .reset_index()
                      )

    locs = np.unique(np.concatenate((flights['Origin'].unique(), flights['Dest'].unique())))
    
    time_grouped = (squared_delays
                    .groupby(pd.Grouper(key="dtime", freq="H")))

    adj_mat = xr.DataArray(np.zeros((len(locs), len(locs), len(time_grouped)), dtype=np.single),
                           dims=("Origin", "Dest", "timestamp"),
                           coords={"Origin": locs, 'Dest': locs, 'timestamp': [t for t, _ in time_grouped]})
    
    for timestamp, group in time_grouped:
        for idx, origin, dest, dtime, mean_sqr_delay in group.itertuples():
            adj_mat.loc[{'Origin': origin, 'Dest': dest, 'timestamp': timestamp}] = mean_sqr_delay

    return adj_mat


adj_mat = construct_adj_mat(raw)

In [36]:
def adj_mat_to_file(adj_mat: xr.DataArray=None, filepath: str=None) -> None:
    if not filepath:
        filepath = os.path.join(os.path.dir(os.getcwd()), "data", "processed", "flight_ts_graphs")
    os.makedirs(filepath, exist_ok=True)
    amt = adj_mat.transpose('timestamp', ...)
    for hr in amt:
        coo_adj_mat = sp.coo_matrix(hr.values)
        sp.save_npz(os.path.join(filepath, f"file_{hr.timestamp.item()}.npz"), coo_adj_mat, compressed=True)
adj_mat_to_file(adj_mat)

In [8]:
unique, counts = np.unique(adj_mat.to_numpy(), return_counts=True)

In [9]:
dict(zip(unique, counts))

{0.0: 1192538272,
 1.0: 3867,
 2.0: 1,
 4.0: 3859,
 5.0: 1,
 9.0: 3912,
 10.0: 1,
 16.0: 3757,
 17.0: 1,
 18.0: 1,
 25.0: 3741,
 26.0: 1,
 36.0: 3873,
 41.0: 2,
 49.0: 3916,
 50.0: 1,
 52.0: 1,
 53.0: 1,
 61.0: 1,
 64.0: 3954,
 65.0: 1,
 68.0: 1,
 81.0: 3903,
 85.0: 2,
 90.0: 1,
 97.0: 1,
 98.0: 2,
 100.0: 3833,
 101.0: 1,
 104.0: 2,
 109.0: 1,
 116.0: 1,
 117.0: 1,
 121.0: 3798,
 122.0: 2,
 128.0: 1,
 144.0: 3616,
 145.0: 1,
 149.0: 1,
 153.0: 2,
 157.0: 1,
 164.0: 1,
 169.0: 3489,
 170.0: 3,
 178.0: 1,
 185.0: 2,
 186.0: 1,
 196.0: 3403,
 221.0: 1,
 225.0: 3206,
 226.0: 1,
 232.0: 2,
 233.0: 1,
 241.0: 1,
 245.0: 1,
 256.0: 2895,
 260.0: 1,
 265.0: 3,
 274.0: 1,
 288.0: 1,
 289.0: 2739,
 290.0: 3,
 292.0: 1,
 293.0: 1,
 305.0: 1,
 306.0: 1,
 313.0: 1,
 317.0: 1,
 324.0: 2578,
 333.0: 1,
 361.0: 2408,
 365.0: 1,
 369.0: 2,
 370.0: 2,
 377.0: 2,
 400.0: 2106,
 405.0: 1,
 410.0: 1,
 425.0: 1,
 433.0: 1,
 441.0: 1956,
 442.0: 1,
 457.0: 1,
 461.0: 1,
 464.0: 1,
 466.0: 1,
 481.0: 2,
 484

In [None]:
#raw = pd.concat([pd.read_parquet(i) for i in files]).sample(1_000_000)


In [21]:
for graph in tqdm(adj_mat_ts):
    eigvecs, eigvals = np.linalg.eig(graph)
    print(type(eigvecs))

ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [37]:
len("1530306000000000000")

19