In [None]:
import dask.dataframe as dd
import pandas as pd
import glob
import os
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

# Load Challenge Data

In [None]:
# Read the challenge and final-submission flight lists and report their row counts
challenge_data = pd.read_csv("../data/challenge_set.csv")
submission_data = pd.read_csv("../data/final_submission_set.csv")
print(f"{challenge_data.shape[0]=}, {submission_data.shape[0]=}")

# Stack both sets into a single frame with a fresh 0..n-1 index
challenge_data = pd.concat([challenge_data, submission_data], axis=0).reset_index(drop=True)
print(f"{challenge_data.shape[0]=}")

# takeoff_time = off-block time + taxi-out duration (taxiout_time is read as minutes);
# both time columns are parsed as UTC timestamps
challenge_data = challenge_data.assign(
    takeoff_time=lambda flights: (
        pd.to_datetime(flights['actual_offblock_time'], utc=True)
        + pd.to_timedelta(flights['taxiout_time'], unit='m')
    ),
    arrival_time=lambda flights: pd.to_datetime(flights['arrival_time'], utc=True),
)
print(challenge_data.dtypes)

# Number of distinct flight ids after combining the two sets
print(f"{challenge_data[['flight_id']].drop_duplicates().shape[0]=}")

# Preview the columns involved in the time computation
challenge_data[[
    'flight_id',
    'date',
    'actual_offblock_time',
    'taxiout_time',
    'takeoff_time',
    'arrival_time',
    'flight_duration',
    'flown_distance',
]]

In [None]:
# Keep only the flight key and the takeoff/arrival timestamps
challenge_data = challenge_data.loc[:, ['flight_id', 'takeoff_time', 'arrival_time']]
challenge_data

# Load _ALL_ `parquet` files

In [None]:
# Define input and output directories
input_dir = Path("../data/")
# Cleaned files go to a sibling directory; it is created if it does not exist yet
output_dir = Path("../data_cleaned/")
output_dir.mkdir(parents=True, exist_ok=True)

# List all .parquet files in the input directory as path strings
# (sorted so the processing order is reproducible between runs)
parquet_files = sorted(glob.glob(str(input_dir / "*.parquet")))

# Function to process each file
def process_file(file):
    """Trim one trajectory parquet file to the first half of each flight and save it.

    The file is read, ordered by ``flight_id`` and ``timestamp``, and inner-joined
    with the module-level ``challenge_data`` on ``flight_id``. Rows are kept when
    their ``timestamp`` lies between ``takeoff_time`` and the midpoint of
    ``takeoff_time`` and ``arrival_time`` (both bounds inclusive). The result is
    written to the module-level ``output_dir`` under the same file name.

    Args:
        file: Path of the input parquet file.

    Returns:
        None. The processed frame is written to disk and a message is printed.
    """
    kept_columns = ['flight_id', 'timestamp', 'temperature', 'altitude', 'groundspeed', 'vertical_rate']

    # Read with the pyarrow engine, order rows per flight by time, then attach the
    # takeoff/arrival times; the inner join drops flights absent from challenge_data.
    track = (
        pd.read_parquet(file, engine='pyarrow')
        .sort_values(["flight_id", "timestamp"])
        .merge(challenge_data, on='flight_id', how='inner')
    )

    # Time window: from takeoff up to halfway between takeoff and arrival
    midpoint = track.takeoff_time + (track.arrival_time - track.takeoff_time) / 2
    after_takeoff = track.timestamp >= track.takeoff_time
    before_midpoint = track.timestamp <= midpoint
    track = track.loc[after_takeoff & before_midpoint, kept_columns]

    # Write next to the other cleaned files, keeping the original file name
    destination = output_dir / os.path.basename(file)
    track.to_parquet(destination, engine='pyarrow')

    print(f"Converted {file=}")

In [None]:
# Use ThreadPoolExecutor for parallel processing
with ThreadPoolExecutor() as executor:
    # executor.map only re-raises a worker's exception when its result is retrieved
    # from the returned iterator. Draining the iterator makes a failure in
    # process_file surface here instead of being silently dropped.
    list(executor.map(process_file, parquet_files))