In [8]:
!pip install healpix



In [27]:
# requires to install eofs and gpytorch
import xarray as xr
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
#import gpytorch
import os
import glob
from eofs.xarray import Eof

from torch.utils.data import DataLoader
from torch.utils.data import Dataset

from typing import Dict, Optional, List, Callable, Tuple, Union

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [34]:
def load_train_data(mode: str = 'train'):
    X, (so2_solver, bc_solver) = get_input_data(input_dir, mode)
    y = get_output_data(target_dir, mode)
    return torch.tensor(X), torch.tensor(y), (so2_solver, bc_solver)


def load_test_data(mode: str = 'train', solvers = None):
    X, (so2_solver, bc_solver) = get_input_data(input_dir, mode, solvers)
    y = get_output_data(target_dir, mode)
    return torch.tensor(X), torch.tensor(y), (so2_solver, bc_solver)


def load_data_npz(path: str): #If np data already exists
    X_train, y_train = np.load(os.path.join(base_dir, ''))
    X_test, y_test = np.load(os.path.join(base_dir, ''))
    return X_train, y_train, X_test, y_test


def get_input_data(path: str, mode: str, solvers = None, n_eofs : int = 5):
    # train_experiments = ["ssp126", "ssp370"]
    train_experiments = ["ssp126"]
    # TODO: 585 har annat format
    test_experiments = ["ssp245"]
    input_gases = ['BC_sum', 'CH4_sum', 'CO2_sum', 'SO2_sum']
    fire_type = 'all-fires'

    
    BC = []
    CH4 = []
    CO2 = []
    SO2 = []
    
    if mode == 'train':      
        experiments = train_experiments
    elif mode == 'test':
        experiments = test_experiments
        
    for exp in experiments:
        print(exp)
        for gas in input_gases:
            input_dir = os.path.join(datapath, "inputs", "input4mips")
            var_dir = os.path.join(input_dir, exp, gas, '250_km', 'mon')
            files = glob.glob(var_dir + '/**/*.nc', recursive=True)
            #print("var dir", var_dir)
            #print("files", files)
            for f in files:
                if gas == 'BC_sum' and fire_type in f:
                    BC.append(f)
            for f in files:
                if gas == 'CH4_sum' and fire_type in f:
                    CH4.append(f)
            for f in files:
                if gas == 'BC_sum' and fire_type in f:
                    SO2.append(f)
            for f in files:
                if gas == 'CO2_sum':
                    CO2.append(f)
    #print("BC", BC)
    print("opening datsets from paths")
    BC_data = xr.open_mfdataset(BC, concat_dim='time', combine='nested').compute().to_array()  # .to_numpy()
    SO2_data = xr.open_mfdataset(SO2, concat_dim='time', combine='nested').compute() .to_array()  #.to_numpy()
    CH4_data = xr.open_mfdataset(CH4, concat_dim='time', combine='nested').compute().to_array().to_numpy()
    CO2_data = xr.open_mfdataset(CO2, concat_dim='time', combine='nested').compute().to_array().to_numpy()
    print(BC_data.shape)
    # BC_data = np.moveaxis(BC_data, 0, 1)
    # SO2_data = np.moveaxis(SO2_data, 0, 1)
    print("configuring data")
    CH4_data = np.moveaxis(CH4_data, 0, 1)
    CO2_data = np.moveaxis(CO2_data, 0, 1)
    CH4_data = CH4_data.reshape(CH4_data.shape[0], -1)
    CO2_data = CO2_data.reshape(CO2_data.shape[0], -1)

    return 
    BC_data = BC_data.transpose('time', 'variable', 'lat', 'lon')
    SO2_data = SO2_data.transpose('time', 'variable', 'lat', 'lon')
    BC_data = BC_data.assign_coords(time=np.arange(len(BC_data.time)))
    SO2_data = SO2_data.assign_coords(time=np.arange(len(SO2_data.time)))

    
    # Compute EOFs for BC
    print("Solvers...")
    if solvers is None:
        # print(BC_data.shape)
        bc_solver = Eof(BC_data)
        bc_eofs = bc_solver.eofsAsCorrelation(neofs=n_eofs)
        bc_pcs = bc_solver.pcs(npcs=n_eofs, pcscaling=1)

        # Compute EOFs for SO2
        so2_solver = Eof(SO2_data)
        so2_eofs = so2_solver.eofsAsCorrelation(neofs=n_eofs)
        so2_pcs = so2_solver.pcs(npcs=n_eofs, pcscaling=1)

        print(bc_pcs)

        # Convert to pandas
        bc_df = bc_pcs.to_dataframe().unstack('mode')
        bc_df.columns = [f"BC_{i}" for i in range(n_eofs)]

        so2_df = so2_pcs.to_dataframe().unstack('mode')
        so2_df.columns = [f"SO2_{i}" for i in range(n_eofs)]
    else:
        so2_solver = solvers[0]
        bc_solver = solvers[1]
        
        so2_pcs = so2_solver.projectField(SO2_data, neofs=n_eofs, eofscaling=1)
        so2_df = so2_pcs.to_dataframe().unstack('mode')
        so2_df.columns = [f"SO2_{i}" for i in range(n_eofs)]

        bc_pcs = bc_solver.projectField(BC_data, neofs=n_eofs, eofscaling=1)
        bc_df = bc_pcs.to_dataframe().unstack('mode')
        bc_df.columns = [f"BC_{i}" for i in range(n_eofs)]
    
    CH4_data = CH4_data[:, :1]
    CO2_data = CO2_data[:, :1]

    print(bc_df.shape)
    print(CH4_data.shape)
    print(CO2_data.shape)
    print(so2_df.shape)
    print("merging data...")
    merged_data = np.concatenate((bc_df, CH4_data, CO2_data, so2_df), axis=1)
    return merged_data, (so2_solver, bc_solver)


def get_output_data(path: str, mode: str):
    total_ensembles = 1
    nc_files = []
    
    if mode == 'train':
        experiments = train_experiments
    elif mode == 'test':
        experiments = test_experiments
        
    for mod in models:

        model_dir = os.path.join(path, mod)
        print(model_dir)
        ensembles = os.listdir(model_dir)

        if total_ensembles == 1:
            ensembles = ensembles[0]
        
        exp_counter = 0
        for exp in experiments:
            for var in variables:
                var_dir = os.path.join(path, mod, ensembles, exp, var, '250_km/mon')
                files = glob.glob(var_dir + '/**/*.nc', recursive=True)
                nc_files += files
        
            if exp_counter == 0:
                dataset = xr.open_mfdataset(nc_files).compute().to_array().to_numpy()
        
            else: #concatenate dataset in time dimension
                other_experiment = xr.open_mfdataset(nc_files).compute().to_array().to_numpy()
                dataset = np.concatenate((dataset, other_experiment), axis=1)
                
                
            exp_counter += 1
            
        dataset = np.moveaxis(dataset, 0, 1)
        print(dataset.shape)
        dataset = dataset.reshape(dataset.shape[0], -1)
        
        # TODO: remove next line, only used for making quick tests
        dataset = dataset[:, :1]
    
    return dataset

In [29]:
import os
datapath = "/mnt/c/Users/Tage/00_Programming/5_Masters_thesis/Climateset_test/Dataset"

#current_dir = os.getcwd()
input_dir = os.path.join(datapath, "inputs", "input4mips")
target_dir = os.path.join(datapath, "outputs", "CMIP6")

 # MOVED INSIDE FUNCTION!!
fire_type = 'all-fires'
variables = ['pr']
models = ['CAS-ESM2-0']
#train_experiments = ["ssp585", "ssp126", "ssp370"] 
#test_experiments = ["ssp245"]
# ------------------------

input_gases = ['BC_sum', 'CH4_sum', 'CO2_sum', 'SO2_sum']

In [35]:
X_train, y_train, (so2_solver, bc_solver) = load_train_data('train')

ssp126
opening datsets from paths
(1, 1032, 96, 144)
configuring data


TypeError: cannot unpack non-iterable NoneType object

In [31]:
type(X_train)

torch.Tensor

In [None]:
import healpix
import healpy

def interpolate_dh_to_hp(nside, variable: xr.DataArray):
    npix = healpix.nside2npix(nside)
    hlong, hlat = healpix.pix2ang(nside, np.arange(0, npix, 1), lonlat=True, nest=True)
    hlong = np.mod(hlong, 360)
    xlong = xr.DataArray(hlong, dims="z")
    xlat = xr.DataArray(hlat, dims="z")

    xhp = variable.interp(latitude=xlat, longitude=xlong, kwargs={"fill_value": None})
    hp_image = np.array(xhp.to_array().to_numpy(), dtype=np.float32)
    return hp_image


def e5_to_numpy_hp(e5xr, nside: int, normalized: bool):

    hp_surface = interpolate_dh_to_hp(nside, e5xr.surface)
    hp_upper = interpolate_dh_to_hp(nside, e5xr.upper)

    if normalized:
        stats = deserialize_dataset_statistics(nside)
        hp_surface, hp_upper = normalize_sample(stats.item(), hp_surface, hp_upper)

    return hp_surface, hp_upper