In [7]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import joblib
from utils.global_co2 import getGlobalCo2ForYear

In [8]:
# Directory holding the yearly "*_df.pkl" input frames, and the directory the
# per-year .npy arrays are written to.
folder_path = "../../../../../../../Volumes/T7 Shield/exp_1"
output_path = "../../../../../../../Volumes/T7 Shield/exp_1_point_data"

# Columns that are min-max scaled and written to "<year>_features.npy".
# The order here is the column order of the saved array.
features = [
    'SST',
    'SAL',
    'ice_frac',
    'mixed_layer_depth',
    'heat_flux_down',
    'water_flux_up',
    'stress_X',
    'stress_Y',
    'currents_X',
    'currents_Y',
    'month',
    'tmask',
    'year',
    'nav_lat',
    'nav_lon',
    'global_co2',
]

# Columns written unscaled to "<year>_targets.npy". The two "*_unscaled"
# entries are copies of nav_lat / nav_lon taken before scaling.
targets = [
    'fco2',
    'fco2_pre',
    'co2flux',
    'co2flux_pre',
    'nav_lat_unscaled',
    'nav_lon_unscaled',
]

# Shared scaler instance; setUpScaler() fits it and dumps it to disk.
scaler = MinMaxScaler()

def setUpScaler():
    """Fit the module-level ``scaler`` and save it as 'point_scaler.pkl'.

    The scaler is fitted on the concatenation of the 1959 and 2018 yearly
    frames read from ``folder_path``. Each frame is given constant ``year``
    and ``global_co2`` columns, and a ``month`` column is derived from
    ``time_counter``, so that every name in ``features`` is present.

    NOTE(review): 1959 and 2018 are presumably the first and last year of the
    experiment, chosen so that ``year`` / ``global_co2`` span their full
    range -- confirm. Unlike ``preprocess_and_store``, no ``tmask`` filter is
    applied here, so the fit sees every row of both files.

    Returns:
        None. Side effects: fits ``scaler`` in place and writes
        'point_scaler.pkl' to the current working directory.
    """
    yearly_frames = []
    for fit_year in (1959, 2018):
        pickle_path = os.path.join(
            folder_path,
            f"ORCA025.L46.LIM2vp.CFCSF6.MOPS.JRA.LP04-KLP002.hind_{fit_year}_df.pkl",
        )
        frame = pd.read_pickle(pickle_path)
        frame["year"] = fit_year
        frame['global_co2'] = getGlobalCo2ForYear(fit_year)
        yearly_frames.append(frame)

    df = pd.concat(yearly_frames, ignore_index=True)
    # Vectorised accessor, same as in preprocess_and_store, instead of a
    # Python-level lambda applied to every row.
    df["month"] = df["time_counter"].dt.month

    scaler.fit(df[features])
    joblib.dump(scaler, 'point_scaler.pkl')


def preprocess_and_store(year):
    """Scale one year's ocean points and save them as two .npy arrays.

    Reads the yearly pickle for ``year`` from ``folder_path``, keeps the rows
    where ``tmask == 1``, adds the ``year``, ``global_co2`` and ``month``
    columns plus unscaled copies of ``nav_lat`` / ``nav_lon``, scales the
    ``features`` columns with the scaler stored in 'point_scaler.pkl', and
    writes ``<year>_features.npy`` and ``<year>_targets.npy`` to
    ``output_path``.

    Args:
        year: Year used in the input filename, in the output filenames, as
            the value of the ``year`` column, and as the argument to
            ``getGlobalCo2ForYear``.

    Returns:
        None.

    Raises:
        FileNotFoundError: if the yearly pickle or 'point_scaler.pkl' does
            not exist (raised by the underlying loaders).
    """
    input_file = os.path.join(folder_path, f"ORCA025.L46.LIM2vp.CFCSF6.MOPS.JRA.LP04-KLP002.hind_{year}_df.pkl")
    df = pd.read_pickle(input_file)

    # just use ocean points here
    # .copy() makes the filtered frame independent, so the column assignments
    # below are not flagged as writes to a slice of the original frame.
    df = df[df['tmask'] == 1].copy()
    df["year"] = year
    df['global_co2'] = getGlobalCo2ForYear(year)
    df["month"] = df["time_counter"].dt.month
    # Keep the raw coordinates: nav_lat / nav_lon are also in `features` and
    # get overwritten with their scaled values further down.
    df["nav_lat_unscaled"] = df["nav_lat"]
    df["nav_lon_unscaled"] = df["nav_lon"]
    # Same reason for .copy(): the scaled values are assigned back into it.
    df = df[features + targets].copy()

    scaler_loaded = joblib.load('point_scaler.pkl')
    df[features] = scaler_loaded.transform(df[features])

    # np.save does not create missing directories.
    os.makedirs(output_path, exist_ok=True)
    np.save(os.path.join(output_path, f"{year}_features.npy"), df[features].to_numpy())
    np.save(os.path.join(output_path, f"{year}_targets.npy"), df[targets].to_numpy())






In [10]:
# Fit and save the scaler only if it is not on disk yet. preprocess_and_store
# loads 'point_scaler.pkl' on every call, so without this guard the cell only
# works after setUpScaler() was run by hand in an earlier session.
if not os.path.exists('point_scaler.pkl'):
    setUpScaler()

for year in range(2000, 2018):
    print(year)
    preprocess_and_store(year)

2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
