In [1]:
import numpy as np
import xarray as xr
import scipy.io as sio
import datetime as dt
import sys

import torch
import torchvision
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from skimage.transform import rescale as skrescale
from scipy import signal as ssignal

In [2]:
import random

# Seed every random number generator used in this notebook with the same
# fixed value (Python's `random`, NumPy, and PyTorch on CPU and CUDA).
rseed = 330
for _seed_fn in (random.seed,
                 np.random.seed,
                 torch.manual_seed,
                 torch.cuda.manual_seed,
                 torch.cuda.manual_seed_all):
    _seed_fn(rseed)

In [3]:
# Encoder selector. Later cells compare it against 'R18' (torchvision resnet18)
# and 'R50' (torchvision resnet50), and interpolate it into the names of the
# encoder parameter file and of the output .mat file.
opt_model = 'R18'

In [4]:
class Identity(nn.Module):
    """Parameter-free pass-through layer: ``forward`` returns its input unchanged."""

    def forward(self, x):
        # No computation and no copy; the very same object is handed back.
        return x

In [5]:
class SCL(nn.Module):
    """
    SimCLR-style wrapper around a ResNet encoder patched to accept 6-channel input.

    We opt for simplicity and adopt the commonly used ResNet (He et al., 2016) to obtain
    h_i = f(x_i) = ResNet(x_i), where h_i in R^d is the output after the average pooling layer.

    The wrapper
      * replaces ``encoder.conv1`` by a 6-input-channel convolution whose weights are the
        original 3-channel weights copied twice,
      * replaces ``encoder.fc`` by an identity so the encoder returns the pooled features,
      * builds a two-layer projection head (``self.projector``); note that ``forward`` does
        not apply it and returns the encoder features only,
      * loads the encoder weights from ``param_file`` and freezes them.

    Parameters
    ----------
    train : bool
        Flag supplied by the caller. It is stored as ``self.train_flag`` and not used
        anywhere else in this class.
    encoder : nn.Module
        Backbone exposing ``conv1`` (an ``nn.Conv2d`` with 3 input channels) and ``fc``.
    projection_dim : int
        Output width of the projection head.
    n_features : int
        Width of the encoder output (input width of the projection head).
    param_file : str, optional
        Path of the encoder ``state_dict`` to load. When omitted, the path is built from
        the module-level ``rootdir`` and ``opt_model`` variables, exactly as before.
    """

    def __init__(self, train, encoder, projection_dim, n_features, param_file=None):
        super(SCL, self).__init__()
        # Deliberately NOT stored as `self.train`: an instance attribute of that name
        # shadows nn.Module.train(), after which .train() and .eval() raise
        # "TypeError: 'bool' object is not callable".
        self.train_flag = train
        self.encoder = encoder
        self.n_features = n_features

        # Increase the number of input channels of the first convolution to 6.
        old_conv = self.encoder.conv1
        new_nc = 6
        new_conv = nn.Conv2d(in_channels=new_nc,
                             out_channels=old_conv.out_channels,
                             kernel_size=old_conv.kernel_size,
                             stride=old_conv.stride,
                             padding=old_conv.padding,
                             # `bias` expects a bool, not the bias Parameter itself.
                             bias=old_conv.bias is not None)
        # Extend the weights by copying the old 3 channels into both halves of the new 6.
        with torch.no_grad():
            new_conv.weight[:, 0:3, :, :] = old_conv.weight
            new_conv.weight[:, 3:6, :, :] = old_conv.weight
            if old_conv.bias is not None:
                new_conv.bias.copy_(old_conv.bias)
        self.encoder.conv1 = new_conv

        # Replace the fc layer with an identity so the encoder returns pooled features.
        self.encoder.fc = nn.Identity()

        # We use a MLP with one hidden layer to obtain z_i = g(h_i) = W(2) sigma(W(1) h_i),
        # where sigma is a ReLU non-linearity. This is the part that would be trained.
        self.projector = nn.Sequential(
            nn.Linear(self.n_features, self.n_features, bias=False),
            nn.ReLU(),
            nn.Linear(self.n_features, projection_dim, bias=False),
        )

        # Encoder parameters obtained from the simCLR repo, patched to 6 channels at conv1.
        if param_file is None:
            param_file = rootdir + 'model_lib/SCL_param.encoder.%s.6_channel.init.tar' % opt_model
        self.encoder.load_state_dict(torch.load(param_file, map_location='cpu'))

        # Freeze the encoder so it is not re-trained.
        for param in self.encoder.parameters():
            param.requires_grad = False

    def forward(self, x_i):
        """Return the encoder features of ``x_i`` as float32 on the input's device."""
        # `.float()` casts to float32 without moving the tensor. The previous
        # `.type(torch.FloatTensor)` always produced a CPU tensor, which fails with a
        # device mismatch as soon as the model lives on a GPU.
        return self.encoder(x_i.float())

In [11]:
def collect_norm_data_by_var(my_var, vname):
    """
    Load one anomaly field and scale it to [0, 1] using a symmetric value range.

    Reads ``{datadir}/{my_var}.2001-2020.daymean.anomaly.nc`` (``datadir`` is a
    module-level variable), takes variable ``vname`` and maps the interval
    ``[-m, m]`` linearly onto ``[0, 1]``, where ``m`` is the largest absolute value
    found over the ``time``, ``latitude`` and ``longitude`` dimensions. A value of
    zero therefore maps to 0.5.

    Parameters
    ----------
    my_var : str
        File-name prefix of the variable (e.g. ``'t500'``).
    vname : str
        Name of the variable inside the NetCDF file (e.g. ``'T'``).

    Returns
    -------
    xarray.DataArray
        The normalised field, with the same dimensions as the stored variable.
    """
    reduce_dims = ['time', 'latitude', 'longitude']

    fname = f'{datadir}/{my_var}.2001-2020.daymean.anomaly.nc'
    with xr.open_dataset(fname) as ds:
        # Pull the values into memory while the file is still open.
        da = ds[vname].load()
        da_max = da.max(reduce_dims).data
        # Bug fix: this used to call .max() a second time, so the lower bound was
        # never inspected and a field whose largest excursion is negative was
        # scaled with too small a range.
        da_min = da.min(reduce_dims).data

        # Symmetric bound: the larger of |max| and |min|.
        my_vmax = np.maximum(da_max, -1*da_min)
        vmax, vmin = my_vmax, -1*my_vmax
        print(f'{my_var}: {vmin}, {vmax}')

        out = (da - vmin) / (vmax - vmin)

    return out

In [8]:
# 5x5 box filter (all weights 1/25) used to smooth every rescaled field.
mean5kernel = np.ones((5,5))/25

class TrainDataset(Dataset):
    '''
    Aggregates the six pre-normalised fields into one 6-channel sample per time step.

    Since the data must be normalised manually, the normalised arrays are created
    elsewhere and only aggregated here.
    Requires the module-level variables t500_full, t850_full, z500_full, z850_full,
    t2_full and sp_full to exist before the dataset is indexed.
    '''

    def __init__(self, root_dir):
        self.root_dir = root_dir
        # File that is only used to determine the number of time steps.
        self.sample_data = root_dir + 't500.2001-2020.daymean.anomaly.nc'
        # Filled on the first len() call so the file is opened once, not on every call.
        self._n_samples = None

    def __len__(self):
        if self._n_samples is None:
            with xr.open_dataset(self.sample_data) as inds:
                self._n_samples = int(inds['T'].shape[0])
        return self._n_samples

    def __getitem__(self, idx):
        '''Return ``(sample, idx)`` where ``sample`` has shape (6, 92, 112).'''
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # TODO: find a corresponding idx_pair outside the 360-length window of idx and
        # build its input the same way (not implemented yet).

        fields = (t500_full, t850_full, z500_full, z850_full, t2_full, sp_full)
        # NOTE(review): assumes every field rescaled by 2.5 comes out as 92 x 112 - confirm
        # if the input grid changes.
        sample_raw = np.zeros((len(fields), 92, 112))

        for i, fullds in enumerate(fields):
            # Step 1: upscale the field by a factor of 2.5 along both axes.
            data_step1 = skrescale(fullds[idx], (2.5, 2.5), anti_aliasing=True)
            # Step 2: 5x5 running mean with symmetric boundary handling.
            sample_raw[i] = ssignal.convolve2d(data_step1, mean5kernel, boundary='symm', mode='same')

        return sample_raw, idx

In [9]:
def SCLloss(my_x, my_y, my_temperature=0.5):
    '''
    Symmetric contrastive loss between two batches of paired embeddings.

    my_x and my_y have a one-to-one pairing, so there are N*N pairs in total. Of
    these, the diagonal pairs are positive and the rest are negative: we want to
    maximise the diagonal while suppressing the rest. For every i the loss adds

        -log( s_ii / sum_{j != i} s_ij )  -  log( s_ii / sum_{j != i} s_ji )

    with s = exp(cosine_similarity / my_temperature), and the total is divided by 2N.
    Note that the positive pair is excluded from the denominators.

    Parameters
    ----------
    my_x, my_y : torch.Tensor of shape (N, D)
        Paired embeddings; row i of my_x is the positive partner of row i of my_y.
    my_temperature : float
        Temperature dividing the cosine similarities.

    Returns
    -------
    torch.Tensor of shape (1,)
        The mean loss, on the device and in the dtype of the inputs.

    Raises
    ------
    ValueError
        If the inputs differ in shape or contain fewer than two pairs (there is no
        negative pair then and the loss is undefined).
    '''
    if my_x.shape != my_y.shape:
        raise ValueError(f'my_x and my_y must have the same shape, got {tuple(my_x.shape)} and {tuple(my_y.shape)}')
    ns = my_x.shape[0]
    if ns < 2:
        raise ValueError('SCLloss needs at least two pairs to form negative examples')

    # Use broadcasting to obtain all pairwise cosines: (N, D, 1) against (1, D, N) -> (N, N).
    logits = torch.nn.functional.cosine_similarity(my_x[:, :, None], my_y.t()[None, :, :]) / my_temperature

    positives = torch.diagonal(logits)
    # Work in log space: log(s_ii / sum_neg) = logit_ii - logsumexp(negative logits).
    # Exponentiating first overflows float32 once cosine / temperature exceeds ~88.
    diagonal_mask = torch.eye(ns, dtype=torch.bool, device=logits.device)
    negatives = logits.masked_fill(diagonal_mask, float('-inf'))
    row_term = torch.logsumexp(negatives, dim=1) - positives   # x_i against every y_j, j != i
    col_term = torch.logsumexp(negatives, dim=0) - positives   # y_i against every x_j, j != i

    loss = (row_term + col_term).sum() / (2 * ns)

    # Keep the one-element shape callers received from the previous implementation.
    return loss.reshape(1)

In [14]:
# Project root: the encoder parameter file is read from, and the encoder output
# is written to, paths built from this prefix (trailing slash required because
# the paths are formed by plain string concatenation).
rootdir = '/global/cfs/projectdirs/m1657/liuy351/TallTower/SCL/'

# Directory holding the '<var>.2001-2020.daymean.anomaly.nc' input files.
datadir = '/global/cfs/projectdirs/m1657/liuy351/TallTower/ERA5_reduced/'

In [15]:
# 0./1. major parameters and encoder: choose the batch size together with the
# architecture so the two can never disagree (the batch-size branch used to test
# for 'R15', which left batch_size undefined for the 'R50' encoder).
if opt_model == 'R18':
    batch_size = 128
    encoder = torchvision.models.resnet18(weights=None)
elif opt_model == 'R50':
    batch_size = 64
    encoder = torchvision.models.resnet50(weights=None)
else:
    raise ValueError(f"unsupported opt_model {opt_model!r}; expected 'R18' or 'R50'")

n_features = encoder.fc.in_features  # get dimensions of fc layer

# 2. wrap the encoder; the wrapper loads the saved encoder parameters and freezes them
projection_dim = 256
# NOTE: this rebinds the name `SCL` from the class to the model instance, so the
# class-definition cell has to be re-run before this cell can be executed again.
SCL = SCL(True, encoder, projection_dim, n_features)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
SCL = SCL.to(device)

In [16]:
# 3. load data
# One (file prefix, NetCDF variable name) pair per channel; the fields are read and
# normalised in this order and bound to the names the dataset class expects.
_field_specs = [
    ('t500', 'T'),
    ('t850', 'T'),
    ('z500', 'Z'),
    ('z850', 'Z'),
    ('2t', 'VAR_2T'),
    ('sp', 'SP'),
]
t500_full, t850_full, z500_full, z850_full, t2_full, sp_full = (
    collect_norm_data_by_var(_prefix, _vname) for _prefix, _vname in _field_specs
)

print(t500_full.shape)

t500: -271.6439208984375, 271.6439208984375
t850: -307.06951904296875, 307.06951904296875
z500: -58777.48046875, 58777.48046875
z850: -16449.490234375, 16449.490234375
2t: -310.2423400878906, 310.2423400878906
sp: -104705.5390625, 104705.5390625
(7305, 37, 45)


In [17]:
# Display the batch size selected for the chosen encoder.
batch_size

128

In [18]:
train_dataset = TrainDataset(root_dir=datadir)

# Shuffling stays off so that batches arrive in time order; the inference loop
# writes its results sequentially and relies on that ordering.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [19]:
# Pre-allocate one feature row and one time index per sample.
outdata = np.zeros((t500_full.shape[0], n_features))
outindex = np.zeros(t500_full.shape[0])

sindex = 0
eindex = 0

# Put the frozen encoder in evaluation mode. In training mode its BatchNorm layers
# normalise with the statistics of the current batch (and update their running
# statistics), so a sample's features would depend on which other samples share
# its batch - in particular for the short final batch.
# `SCL.encoder.eval()` is used rather than `SCL.eval()` so this cell does not
# depend on how the wrapper class treats its `train` attribute.
SCL.encoder.eval()

print(dt.datetime.now())
# Pure inference: no autograd bookkeeping is needed.
with torch.no_grad():
    for data in train_dataloader:
        # data[0]: batch of 6-channel samples, data[1]: their time indices.
        testout = SCL(data[0].to(device))

        # Batches arrive in time order (shuffle=False), so results are written
        # into consecutive slices.
        sindex = eindex
        eindex += testout.shape[0]

        outdata[sindex:eindex, :] = testout.detach().cpu().numpy()
        outindex[sindex:eindex] = data[1]

        print(sindex, eindex)

print(dt.datetime.now())

assert eindex == outdata.shape[0], 'not every sample was processed'

2024-06-25 11:57:59.156274
0 128
128 256
256 384
384 512
512 640
640 768
768 896
896 1024
1024 1152
1152 1280
1280 1408
1408 1536
1536 1664
1664 1792
1792 1920
1920 2048
2048 2176
2176 2304
2304 2432
2432 2560
2560 2688
2688 2816
2816 2944
2944 3072
3072 3200
3200 3328
3328 3456
3456 3584
3584 3712
3712 3840
3840 3968
3968 4096
4096 4224
4224 4352
4352 4480
4480 4608
4608 4736
4736 4864
4864 4992
4992 5120
5120 5248
5248 5376
5376 5504
5504 5632
5632 5760
5760 5888
5888 6016
6016 6144
6144 6272
6272 6400
6400 6528
6528 6656
6656 6784
6784 6912
6912 7040
7040 7168
7168 7296
7296 7305
2024-06-25 12:00:59.490795


In [20]:
# Write the encoder features and their time indices to a MATLAB file.
outfile = rootdir + 'ResNet_output/%s_output.anomaly.2001-2020.ERA5.mat' % opt_model
print('writing to %s ...' % outfile)
# Report the actual feature width instead of a hard-coded 512, which is only
# correct for the R18 encoder.
description = 'Just the simCLR encoder output. So in %d dimension. Use %s model' % (outdata.shape[1], opt_model)
# Provenance: the notebook that produced the file.
script = '/global/cfs/projectdirs/m1657/liuy351/TallTower/SCL/02.ResNet_encoder_production_run.ipynb'
sio.savemat(outfile, {'ResNetoutput':outdata, 'tindex':outindex, 'description':description, 'script':script})

writing to /global/cfs/projectdirs/m1657/liuy351/TallTower/SCL/ResNet_output/R18_output.anomaly.2001-2020.ERA5.mat ...


In [30]:
# `mp` was never imported anywhere in this notebook, so this cell only worked
# through leftover kernel state; import it here so a fresh kernel can run it.
import multiprocessing as mp

# Number of CPUs reported by the operating system.
mp.cpu_count()

256