In [47]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import joblib

In [54]:
# Input/output locations. The defaults are the original hard-coded paths; set
# CO2_INPUT_DIR / CO2_OUTPUT_DIR in the environment to run on another machine
# without editing the notebook.
folder_path = os.environ.get("CO2_INPUT_DIR", "../../../../../../../Volumes/T7 Shield/exp_1")
output_path = os.environ.get("CO2_OUTPUT_DIR", "../data/preprocessed_data")

# Model input columns, in the channel order used for the saved feature arrays.
features = ['SST', 'SAL', 'ice_frac', 'mixed_layer_depth', 'heat_flux_down',
            'water_flux_up', 'stress_X', 'stress_Y', 'currents_X', 'currents_Y',
            'nav_lat', 'nav_lon', 'month', 'tmask', 'year']

# Every feature except 'month' and 'tmask' is min-max scaled. Deriving the list
# from `features` keeps the two in sync and preserves the column order the
# scaler is fitted with.
features_to_scale = [name for name in features if name not in ('month', 'tmask')]

# Target columns, in the channel order used for the saved target arrays.
targets = ['fco2', 'fco2_pre', 'co2flux', 'co2flux_pre']

# Shared scaler instance; setUpScaler() fits it in place and pickles it.
# feature_range=(0, 1) is the MinMaxScaler default, spelled out for readers.
scaler = MinMaxScaler(feature_range=(0, 1))

def setUpScaler():
    """Fit the module-level ``scaler`` on the 1959 data and save it to ``scaler.pkl``.

    Reads the 1959 pickle from ``folder_path``, adds a constant ``year`` column,
    fits ``scaler`` on the ``features_to_scale`` columns and dumps the fitted
    scaler to ``scaler.pkl`` in the current working directory.

    Returns:
        None. The fitted state lives in the global ``scaler`` and in ``scaler.pkl``.
    """
    file_1959 = os.path.join(
        folder_path,
        "ORCA025.L46.LIM2vp.CFCSF6.MOPS.JRA.LP04-KLP002.hind_1959_df.pkl",
    )
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    data_1959 = pd.read_pickle(file_1959)
    data_1959["year"] = 1959

    # 'month' is not in features_to_scale, so it is not needed for the fit and
    # is deliberately not computed here (it was a costly per-row apply).
    #
    # NOTE(review): 'year' is constant (1959) in the fitting data, so its
    # min and max coincide. The fitted scaler therefore presumably cannot map
    # later years into [0, 1]; fitting on additional years (e.g. the last year
    # of the run) would be needed for that -- TODO confirm intended behaviour.
    scaler.fit(data_1959[features_to_scale])

    # Persist the fitted scaler so preprocessing can reload it.
    joblib.dump(scaler, 'scaler.pkl')

def preprocess_and_store(year, grid_shape=(1442, 1021)):
    """Scale one year of data and save one gridded feature/target array per month.

    Loads the pickle for ``year`` from ``folder_path``, adds ``year`` and
    ``month`` columns, zeroes ``fco2`` / ``fco2_pre`` where ``tmask == 0``,
    applies the scaler stored in ``scaler.pkl`` to ``features_to_scale``, and
    for each month scatters the rows into ``(nx, ny, channels)`` float32 arrays
    indexed by the ``x`` and ``y`` columns. Cells with no row stay NaN.

    Args:
        year: Year to process; used in the input filename, the ``year``
            column and the output filenames.
        grid_shape: ``(nx, ny)`` size of the output grid. Defaults to
            ``(1442, 1021)``, the size previously hard-coded here.

    Side effects:
        Writes ``{year}_{month}_features.npy`` and ``{year}_{month}_targets.npy``
        into ``output_path`` (created if missing). Months with no rows are
        skipped with a printed message instead of writing an empty grid.

    Raises:
        FileNotFoundError: if the input pickle or ``scaler.pkl`` is missing.
        IndexError: if an ``x`` / ``y`` value falls outside ``grid_shape``.
    """
    input_file = os.path.join(
        folder_path,
        f"ORCA025.L46.LIM2vp.CFCSF6.MOPS.JRA.LP04-KLP002.hind_{year}_df.pkl",
    )
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    df = pd.read_pickle(input_file)

    df["year"] = year
    df["month"] = df["time_counter"].apply(lambda x: x.month)
    # Take an explicit copy: the subset is modified below, and writing into a
    # slice of the original frame is chained assignment (SettingWithCopyWarning).
    df = df[features + targets + ['x', 'y']].copy()

    # Zero the fco2 targets wherever tmask is 0.
    masked = df['tmask'] == 0
    df.loc[masked, ['fco2_pre', 'fco2']] = 0

    scaler_loaded = joblib.load('scaler.pkl')
    df[features_to_scale] = scaler_loaded.transform(df[features_to_scale])

    os.makedirs(output_path, exist_ok=True)

    for month in range(1, 13):
        month_data = df.loc[df['month'] == month]
        if month_data.empty:
            print(f"No rows for month {month}. Skipping...")
            continue

        x_indices = month_data['x'].to_numpy().astype(int)
        y_indices = month_data['y'].to_numpy().astype(int)

        # Allocate fresh NaN grids for every month so that cells missing in
        # this month cannot inherit values written for an earlier month.
        features_array = np.full((*grid_shape, len(features)), np.nan, dtype=np.float32)
        target_array = np.full((*grid_shape, len(targets)), np.nan, dtype=np.float32)

        # Scatter each row's values to its (x, y) cell.
        features_array[x_indices, y_indices, :] = month_data[features].to_numpy()
        target_array[x_indices, y_indices, :] = month_data[targets].to_numpy()

        np.save(os.path.join(output_path, f"{year}_{month}_features.npy"), features_array)
        np.save(os.path.join(output_path, f"{year}_{month}_targets.npy"), target_array)





In [56]:
# Fit and persist the scaler only when no saved scaler exists yet, so a fresh
# Restart & Run All works without manually un-commenting a call, while reruns
# keep reusing the already-fitted scaler.pkl.
if not os.path.exists('scaler.pkl'):
    setUpScaler()

for year in range(1960, 1970):
    print(year)
    preprocess_and_store(year)

1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
