Note: 
    
Only run for winter, since we do clustering here.

In [1]:
import numpy as np
import scipy.io as sio

import pandas as pd
import datetime as dt

In [2]:
from sklearn.cluster import HDBSCAN

In [3]:
# Base directory (note the trailing slash) from which the input and output file paths are built.
rootdir = '/global/cfs/projectdirs/m1657/liuy351/TallTower/'


In [4]:
# Model tag substituted into the ResNet output and distance-matrix file names.
opt_model = 'R18'

In [5]:
# Hourly timestamps spanning 2001-01-01 00:00 through 2020-12-31 23:00 (inclusive).
ts_full = pd.date_range(
    start='2001-01-01 00:00',
    end='2020-12-31 23:00',
    freq='h',
)

## 1. Run a classifier to cluster weather systems

### 1.1 2015–2020 (6-year subset of the 20-year record)

In [6]:
# Boolean mask over ts_full that is True for hours in calendar years 2015 through 2020.
# NOTE: despite the "20year" in the name, the selected window is 2015-2020 only.
tag_20year = np.logical_and(ts_full.year >= 2015, ts_full.year <= 2020)
print(tag_20year.shape, tag_20year.sum())

(175320,) 52608


In [7]:
# Path of the ResNet output file for the selected model.
infile = rootdir + 'ResNet_output/%s_output.anomaly.2001-2020.ERA5.mat' % opt_model
print(infile)

# Read the MATLAB file and keep only the 'ResNetoutput' rows selected by the boolean mask.
inds = sio.loadmat(infile)
full_NCLtag = inds['ResNetoutput'][tag_20year]
print(full_NCLtag.shape)

/global/cfs/projectdirs/m1657/liuy351/TallTower/ResNet_output/R18_output.anomaly.2001-2020.ERA5.mat
(52608, 512)


In [8]:
# import dask.array as da
# full_NCLtag = da.from_array(full_NCLtag, chunks=(128, 512))

In [9]:
import numpy as np
from scipy.spatial.distance import cdist
from multiprocessing import Pool
import functools

def compute_distances(data_chunk, full_data):
    """Return Euclidean distances from each row of ``data_chunk`` to each row of ``full_data``.

    The result is the array produced by ``scipy.spatial.distance.cdist``: one row per
    row of ``data_chunk`` and one column per row of ``full_data``.
    """
    pairwise = cdist(data_chunk, full_data, 'euclidean')
    return pairwise

def parallel_distance_matrix(data, num_splits):
    """Build the full pairwise distance matrix of ``data`` using a process pool.

    ``data`` is cut row-wise into ``num_splits`` pieces with ``np.array_split``. Each
    piece is handed to a ``multiprocessing.Pool`` worker, which calls
    ``compute_distances`` for that piece against the whole of ``data``. The per-piece
    results are then stacked vertically in their original order.

    Note that ``num_splits`` only controls how many pieces are created; the pool itself
    is created with its default number of worker processes.
    """
    row_blocks = np.array_split(data, num_splits)

    # Bind the full dataset once so every task differs only in the chunk it receives.
    distances_to_all = functools.partial(compute_distances, full_data=data)

    with Pool() as workers:
        # Pool.map returns its results in the same order as the input chunks.
        block_results = workers.map(distances_to_all, row_blocks)

    # Stack the per-chunk row blocks back into a single matrix.
    return np.vstack(block_results)

# Time the construction of the full pairwise distance matrix.
num_processors = 128  # passed as num_splits, i.e. the number of row chunks
t1 = dt.datetime.now()
distance_matrix = parallel_distance_matrix(data=full_NCLtag, num_splits=num_processors)
t2 = dt.datetime.now()

# Report the matrix shape and the elapsed wall-clock time.
print(distance_matrix.shape)
print(t2 - t1)

(52608, 52608)
0:02:09.870056


In [10]:
import os

import xarray as xr

# Wrap the distance matrix in a DataArray so it can be written to netCDF with metadata.
da = xr.DataArray(distance_matrix, dims=['x', 'y'])
da.attrs['description'] = 'Calculate the distance matrix for reducing the computation of clustering'
da.attrs['script'] = '/global/cfs/projectdirs/m1657/liuy351/TallTower/03.clustering_HDBSCAN.ipynb'

# Build the output path with os.path.join so it is well-formed whether or not rootdir
# ends with a path separator (it currently does, and plain string interpolation with an
# extra '/' would yield a doubled slash).
outfile = os.path.join(rootdir, 'ResNet_output', f'{opt_model}_distance_matrix.2015-2020.ERA5.nc')
da.to_netcdf(outfile)
