# A simple classification model for Ocean Heat Uptake

In [1]:
import os
import random
import numpy as np
import pandas as pd
from os import listdir
import matplotlib.pyplot as plt
import xarray as xr
import numpy as np

import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import *

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## Load datasets

In [2]:
hfls = xr.open_dataset('data/CMIP6_allmodels.hfls.NA.piC.FFT.OHUlabel.112022.nc')
hfss = xr.open_dataset('data/CMIP6_allmodels.hfss.NA.piC.FFT.OHUlabel.112022.nc')
pr = xr.open_dataset('data/CMIP6_allmodels.pr.NA.piC.FFT.OHUlabel.112022.nc')
psl = xr.open_dataset('data/CMIP6_allmodels.psl.NA.piC.FFT.OHUlabel.112022.nc')
tas = xr.open_dataset('data/CMIP6_allmodels.tas.NA.piC.FFT.OHUlabel.112022.nc')

## Assign labels

In [3]:
labels = hfls.model_OHU.data
target = pd.DataFrame(labels)

target.columns = ['labels']

We are converting the numerical labels to categories. <4 = low (assigned '0'), 4-8 is medium (assigned '1') and >8 is high (assigned '2'). Eventually, we have the map this to AMOC index.

In [4]:
bins = [0, 4, 8]
names = [0, 1, 2]

# 0 i low, 1 is medium, 2 is high

d = dict(enumerate(names, 1))
target['ocn_cat'] = np.vectorize(d.get)(np.digitize(target['labels'], bins))

## Classification model

Ocean heat uptake is ONE value. Aggregating all the locations together to train on one number might not be a good idea, so this is a workaround.

The function below takes in a single lat, lon location, trains a classifier for that location on different model ensembles and tests the accuracy. It can be used in two ways: 
- Use the function to deduce accuracy for one location
- Use the function in a loop to deduce accuracies for all the locations to understand which location(s) better predict Ocean Heat Uptake.

In [5]:
def set_seed(seed):
    
    tf.random.set_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)

In [112]:

def ocn_classification(lat, lon):
    
    set_seed(42)
    
    tas_npy = tas.tas.data[:,:,lat,lon]
    hfls_npy = hfls.hfls.data[:,:,lat,lon]
    hfss_npy = hfss.hfss.data[:,:,lat,lon]
    pr_npy = pr.pr.data[:,:,lat,lon]
    psl_npy = psl.psl.data[:,:,lat,lon]
    
    scaler = MinMaxScaler(feature_range=(0, 1))
    
    scaler_tas = scaler.fit(tas_npy)
    norm_tas = scaler_tas.transform(tas_npy)

    scaler_hfls = scaler.fit(hfls_npy)
    norm_hfls = scaler_hfls.transform(hfls_npy)

    scaler_pr = scaler.fit(pr_npy)
    norm_pr = scaler_pr.transform(pr_npy)

    scaler_psl = scaler.fit(psl_npy)
    norm_psl = scaler_psl.transform(psl_npy)

    scaler_hfss = scaler.fit(hfss_npy)
    norm_hfss = scaler_hfss.transform(hfss_npy)
    
    data = []
    data.append(np.expand_dims(norm_tas, axis=2))
    data.append(np.expand_dims(norm_hfls, axis=2))
    data.append(np.expand_dims(norm_hfss, axis=2))
    data.append(np.expand_dims(norm_pr, axis=2))
    data.append(np.expand_dims(norm_psl, axis=2))
    data = np.concatenate(data, axis = 2)
    
    X_train, X_test, y_train, y_test = train_test_split(data, target.ocn_cat, test_size=0.1, random_state=42, stratify=target.ocn_cat)
    
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.4, random_state=42, stratify=y_train)

    
    
    
    model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(601,5)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(3, activation='softmax')
    ])
    
    model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])
    
    es = EarlyStopping(monitor='val_loss', mode='auto', restore_best_weights=True, verbose=0, patience=0)

    model.fit(X_train, y_train, epochs=10, verbose=0, validation_data=(X_val,y_val) , callbacks=[es])
    
    test_loss, test_acc = model.evaluate(X_test,  y_test, verbose=0)
    
    return test_acc
    
    #print('Test accuracy for Latitude {} and Longitude {} is {}'.format(lat, lon, test_acc))

In [113]:
## One location

print(ocn_classification(26,102))

0.6000000238418579


In [107]:
## Multiple locations--assign an empty matrix first, and run the function in a loop
## Usually it takes a long time to go through all the locations and it kills the kernel
## it might be a good practice to 'split' the locations into different for loops for efficient usage
## for now I am assigning just a small number to demonstrate

lat_max = 4 #36 (max lat of this problem)
lon_max = 4 #111 (max lon of this problem)
pred_ocn = np.zeros((lat_max,lon_max))

In [108]:

for lat in range(0,lat_max):
    for lon in range(0,lon_max):
        print('Lat {} Lon {}', lat, lon)
        pred_ocn[lat][lon] = ocn_classification(lat,lon)

Lat {} Lon {} 0 0
Lat {} Lon {} 0 1
Lat {} Lon {} 0 2
Lat {} Lon {} 0 3
Lat {} Lon {} 1 0
Lat {} Lon {} 1 1
Lat {} Lon {} 1 2
Lat {} Lon {} 1 3
Lat {} Lon {} 2 0
Lat {} Lon {} 2 1
Lat {} Lon {} 2 2
Lat {} Lon {} 2 3
Lat {} Lon {} 3 0
Lat {} Lon {} 3 1
Lat {} Lon {} 3 2
Lat {} Lon {} 3 3


In [109]:
pred_ocn

array([[0.55000001, 0.5       , 0.5       , 0.55000001],
       [0.44999999, 0.40000001, 0.5       , 0.55000001],
       [0.55000001, 0.5       , 0.60000002, 0.55000001],
       [0.44999999, 0.55000001, 0.5       , 0.5       ]])