### Description

This notebook takes the raw xarray files (which can be found at https://figshare.com/collections/Large_ensemble_pCO2_testbed/4568555), applies feature transformations, and saves the result for each ensemble member as a pandas dataframe.

### Inputs

In [None]:
# -----------------------------------------
# Directory configuration
# -----------------------------------------
# Set this to the path of the project
root_dir = "/local/data/artemis/workspace/jfs2167/recon_eval"
# Set this to where you have placed the raw data
ensemble_dir_head = "/local/data/artemis/simulations/LET"

# Output locations, both derived from the project root
data_output_dir = root_dir + "/data/processed"
reference_output_dir = root_dir + "/references"

# Forcing is the same across members, so it is only referenced once
# (through the file of CESM member 001).
xco2_path = (
    ensemble_dir_head
    + "/CESM/member_001/XCO2_1D_mon_CESM001_native_198201-201701.nc"
)

### Modules

In [1]:
# standard imports
import os
import datetime
from pathlib import Path
from collections import defaultdict
import scipy
import random
import numpy as np
import xarray as xr
import pandas as pd
import joblib
import pickle

# machine learning libraries
from sklearn.model_selection import train_test_split

# Python file with supporting functions
import pre 

Using TensorFlow backend.


### Predefined values

In [None]:
# Locations of the reference files inside the references directory
path_LET = reference_output_dir + "/members_LET_dict.pickle"
path_seeds = reference_output_dir + "/random_seeds.npy"
path_loc = reference_output_dir + "/members_seed_loc_dict.pickle"

# NOTE(review): pickle.load executes whatever the file encodes; these files
# are presumably produced by this project -- confirm before pointing the
# paths at data from an untrusted source.

# Mapping that is later iterated as ensemble name -> list of members
with open(path_LET, mode='rb') as handle:
    mems_dict = pickle.load(handle)

# Array stored in the .npy reference file
random_seeds = np.load(path_seeds)

# Second pickled reference object
with open(path_loc, mode='rb') as handle:
    seed_loc_dict = pickle.load(handle)

In [3]:
# =========================================
# Setting the date range to unify the date type
# =========================================

# Define date range
date_range_start = '1982-01-01T00:00:00.000000000'
date_range_end = '2017-01-31T00:00:00.000000000'

# Create the date vector: one entry per month start (freq='MS') between the
# two bounds, each shifted forward by 14 days so it lands on the 15th.
dates = pd.date_range(start=date_range_start,
                      end=date_range_end, freq='MS') + np.timedelta64(14, 'D')

# Select the start and end. The end is taken as the last element rather than
# a fixed position (previously index 420), so it stays correct if the date
# range above is ever changed.
date_start = dates[0]
date_end = dates[-1]

### Loop to load in data, clean it, and save it

In [None]:
# ensemble_list = ['CanESM2', 'CESM', 'GFDL', 'MPI']
ensemble_list = []

# Walk every ensemble and each of its members listed in mems_dict
for ens in mems_dict:
    mem_list = mems_dict[ens]
    for member in mem_list:
        # Load the data, clean it, and build a pandas data frame
        df = pre.create_inputs(
            ensemble_dir_head,
            ens,
            member,
            dates,
            xco2_path=xco2_path,
        )

        # Write the pandas data frame out under the processed-data directory
        pre.save_clean_data(df, data_output_dir, ens, member)