## Explore and get schemas by year

### Parallel processing with `dask.delayed`

In [None]:
import json
import os

from dask import delayed
import dask.dataframe as dd

from dask_jobqueue import SLURMCluster
from dask.distributed import Client
from pprint import pprint

# Directory that holds the yellow_tripdata_<year>*.parquet input files.
INPUT_DATA_PATH = '/d/hpc/projects/FRI/bigdata/data/Taxi'
# Directory where the collected schemas (schemas.json) are written.
HOME_DIR = '/d/hpc/projects/FRI/bigdata/students/in7357'
# Configure the SLURM-backed Dask cluster. Each SLURM job is described by the
# keyword arguments below: partition 'all', one worker process, 8 cores,
# 32GB of memory and a 10-minute walltime limit.
# NOTE(review): recent dask-jobqueue releases rename job_extra/env_extra to
# job_extra_directives/job_script_prologue — confirm against the installed
# version before changing them.
cluster = SLURMCluster(
    queue='all',
    processes=1,
    cores=8,
    memory='32GB',
    walltime='00:10:00',
    # Serve the scheduler dashboard on port 8087.
    scheduler_options={'dashboard_address': ':8087'},
    # Extra sbatch directive: per-job log file named after the SLURM job id.
    job_extra=['--output=slurm-%j.out'],
    # Shell lines forcing a UTF-8 locale in the job environment.
    env_extra=['export LANG="en_US.utf8"', 'export LC_ALL="en_US.utf8"']
)

# Request 10 SLURM jobs for the cluster.
cluster.scale(jobs=10)

# Connect a client to the cluster's scheduler; later cells submit work through it.
client = Client(cluster)

# Lazily evaluated helper that reports a dataset's column schema
@delayed
def get_schema(path):
    """Describe the columns of the Parquet dataset found at ``path``.

    The function is wrapped in ``dask.delayed``, so calling it only builds a
    task; the body runs when that task is computed.

    Args:
        path: Path or glob pattern handed unchanged to ``dd.read_parquet``.

    Returns:
        A dict with two entries: ``'columns'``, the list of column names, and
        ``'dtypes'``, a mapping from column name to that column's dtype.
    """
    frame = dd.read_parquet(path)
    column_names = frame.columns.tolist()
    dtype_by_column = frame.dtypes.to_dict()
    return {'columns': column_names, 'dtypes': dtype_by_column}

# Build one delayed schema lookup per year (2009 through 2024); nothing runs yet.
schemas_futures = dict(
    (year, get_schema(f"{INPUT_DATA_PATH}/yellow_tripdata_{year}*.parquet"))
    for year in range(2009, 2025)
)

# Hand the whole mapping to the cluster; sync=True waits for the results.
schemas = client.compute(schemas_futures, sync=True)

# Persist the schemas as JSON. default=str stringifies any value the json
# module cannot encode on its own.
with open(os.path.join(HOME_DIR, 'schemas.json'), 'w') as f:
    json.dump(schemas, f, default=str, indent=2)

pprint(schemas)
