# Setup: this notebook only needs to be run once

### Copy data to current server

In [16]:
import os

# --- Source side: the NetCDF file as it lives on the remote machine ---------
# Note: despite its name, remote_data_dir holds the path of the file itself.
remote_data_dir = "/data/2m_temperature_GL_2015-2024.nc"
remote_access = dict(
    user='uribe055',
    host='cs-u-spatial-513.cs.umn.edu',
    path=remote_data_dir,
)

# --- Destination side: where the raw file is stored on this server ----------
home_dir = "/home/uribe055/sedona_experiments"
local_data_dir = os.path.join(home_dir, "unprocessed_data")
local_access = dict(
    user='uribe055',
    host='cs-spatial-501.cs.umn.edu',
    path=local_data_dir,
)

In [None]:
''' 
Run in notebook once
'''
# Disabled variant 1: builds an scp argument list from the remote_access and
# local_access dicts and runs it through subprocess, then prints the captured
# stdout and stderr. The destination is local_access['path'] only (no
# user@host prefix). Un-comment the lines below to run it.
# import subprocess
# scp_command = [
#     "scp",
#     f"{remote_access['user']}@{remote_access['host']}:{remote_access['path']}",
#     local_access['path']
# ]

# result = subprocess.run(scp_command, capture_output=True, text=True)
# print("STDOUT:", result.stdout)
# print("STDERR:", result.stderr)

'''
Run in terminal once
'''
# Disabled variant 2: a shell command (not Python) copying the same source
# file. Unlike variant 1, the destination is written as user@host:path and
# names the target file explicitly.
# scp uribe055@cs-u-spatial-513.cs.umn.edu:/data/2m_temperature_GL_2015-2024.nc uribe055@cs-spatial-501.cs.umn.edu:/home/uribe055/sedona_experiments/unprocessed_data/2m_temperature_GL_2015-2024.nc

### Prepare data

**Notes:** 
* When I ran the data splitting without changing the *time* type, it took about 2 hours.
* The *time* column must be stored with millisecond precision to be read into Spark. I originally did not convert it, so I ran additional code (see the timestamp-fix section below) to repair my copy of the data. The code below should write the parquet files with millisecond timestamps directly (I have not tested it), thanks to the following line:
    
    `df_single["time"] = df_single["time"].astype("datetime64[ms]")`

In [None]:
# Disabled preprocessing cell: every statement below is commented out.
# It relies on local_data_dir and home_dir from the configuration cell.
# import xarray as xr
# import os

# Opens the copied NetCDF file, renames "validtime" -> "time" and
# "t2m" -> "2m_temperature", then keeps 2020-01-01 through 2024-12-31.
# # Get latest 5 years of data
# ds_full = xr.open_dataset(os.path.join(local_data_dir, "2m_temperature_GL_2015-2024.nc"))
# ds = ds_full.rename({"validtime": "time", "t2m": "2m_temperature"})
# ds = ds.sel(time=slice('2020-01-01', '2024-12-31'))

# # No splitting data         # TOO LARGE A FILE TO READ INTO SEDONA IN NOTEBOOK
# # ds.to_netcdf(os.path.join(home_dir, f"processed_data/2m_temperature_GL_2020-2024.nc"))

# NOTE(review): ds.time[::24] visits every 24th time coordinate and
# ds.sel(time=t) selects that single time step, so each output file holds one
# snapshot rather than all steps of a day. Presumably the data is hourly;
# confirm that one snapshot per day is the intended sampling.
# The file name uses the first 10 characters of the timestamp string
# (presumably the YYYY-MM-DD part; TODO confirm the string format).
# # Split data into daily .parquet files      # TOOK ~115 MINUTES
# for t in ds.time[::24]:
#     date = str(t.values)[:10]
#     ds_single = ds.sel(time=t)
#     df_single = ds_single.to_dataframe().reset_index()
#     df_single["time"] = df_single["time"].astype("datetime64[ms]")

#     df_single.to_parquet(os.path.join(home_dir, f"processed_data/t2m_{date}.parquet"))
#     ds_single.close()

### Code to fix timestamp type to ms

**Notes:**
* Took ~2 minutes to run.

In [18]:
import pyarrow.parquet as pq
import pyarrow as pa
from pathlib import Path

# Convert the "time" column of every processed parquet file to millisecond
# timestamps and write the result to a separate output directory, leaving the
# original files untouched.
processed_dir = Path("/home/uribe055/sedona_experiments/processed_data")
output_dir = Path("/home/uribe055/sedona_experiments/data")

# Create the destination up front so the first write does not fail on a
# machine where data/ has not been made by hand.
output_dir.mkdir(parents=True, exist_ok=True)

# Sorted so the files are processed in a reproducible order.
parquet_files = sorted(processed_dir.rglob("*.parquet"))

for f in parquet_files:
    table = pq.read_table(f)

    # Swap the column for its millisecond cast, keeping position and name.
    time_index = table.schema.get_field_index("time")
    time_ms = table.column("time").cast(pa.timestamp("ms"))
    table = table.set_column(time_index, "time", time_ms)

    # Files found in sub-directories are flattened into output_dir by name,
    # so two inputs sharing a stem would overwrite each other.
    pq.write_table(table, str(output_dir / f"{f.stem}.parquet"))