In [None]:
# Render matplotlib figures inline, directly beneath the cell that draws them.
%matplotlib inline
# Optional one-time setup: uncomment the lines below to install the packages
# imported later in this notebook.
#!pip install UMAP-learn
#!pip install netCDF4

# Clustering with UMAP on MNIST
UMAP is a dimensionality-reduction library that has been showing great potential as a first step for clustering and classification. Before we apply UMAP to radar data, we will apply it to the MNIST-style handwritten digits dataset, following an example in the UMAP documentation (https://umap-learn.readthedocs.io/en/).

In [None]:
from sklearn.datasets import load_iris, load_digits
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import umap
import numpy as np

In [None]:
# Load the handwritten-digits dataset; later cells read its .images, .data and
# .target attributes.
digits = load_digits()


In [None]:
# Show the first 400 digit images as a 20 x 20 mosaic of bare thumbnails.
fig, ax_array = plt.subplots(nrows=20, ncols=20)
axes = ax_array.flatten()
# zip() stops once every axes has an image, so only the first len(axes) digits
# are drawn.
for ax, image in zip(axes, digits.images):
    ax.imshow(image, cmap='gray_r')
    # Strip ticks and the frame so each thumbnail is just the digit.
    ax.set(xticks=[], yticks=[], frame_on=False)
fig.tight_layout(h_pad=0.5, w_pad=0.01)

We can also look at an individual digit image.

In [None]:
# Enlarge one sample (index 5) on its own 5 x 5 inch figure.
single_fig, single_ax = plt.subplots(figsize=(5, 5))
single_ax.imshow(digits.images[5], cmap='gray_r')

In [None]:
# Every UMAP setting used for the digits embedding, spelled out explicitly so
# each one is easy to find and tweak.
umap_params = dict(
    # Output size and neighbourhood settings.
    n_components=2,
    n_neighbors=15,
    min_dist=0.1,
    spread=1.0,
    local_connectivity=1.0,
    set_op_mix_ratio=1.0,
    # Distance measures.
    metric='euclidean',
    metric_kwds={},
    target_metric='categorical',
    target_metric_kwds={},
    # Curve parameters, initialisation and optimisation.
    a=1.576943460405378,
    b=0.8950608781227859,
    init='spectral',
    negative_sample_rate=5,
    angular_rp_forest=False,
    # Fixed seeds so that reruns produce the same layout.
    random_state=42,
    transform_seed=42,
    transform_queue_size=4.0,
    verbose=False,
)
reducer = umap.UMAP(**umap_params)
reducer.fit(digits.data)


In [None]:
# Project the digits through the fitted reducer; the result is inspected and
# plotted in the following cells.
embedding = reducer.transform(digits.data)


So we've specified we want 2 output components, which makes this much easier to visualize. 

In [None]:
# Sanity check: print the shape of the embedding array.
print(embedding.shape)

In [None]:
# Scatter the two embedding columns, coloured by the true digit label.
digits_fig, digits_ax = plt.subplots(figsize=(8, 8))
digits_points = digits_ax.scatter(
    embedding[:, 0],
    embedding[:, 1],
    c=digits.target,
    cmap='tab10',
    s=5,
)
digits_ax.set_aspect('equal', adjustable='datalim')

# Colour-bar boundaries sit at -0.5, 0.5, ..., 9.5 so each integer tick 0..9
# lands in the middle of its colour band.
digits_cbar = digits_fig.colorbar(
    digits_points, ax=digits_ax, boundaries=np.arange(11) - 0.5
)
digits_cbar.set_ticks(np.arange(10))
digits_ax.set_title('UMAP projection of the Digits dataset', fontsize=24);

__Exercise__
1. Play with some of the parameters, rerun the cells above, and see what happens.

In [None]:
# Embed the digits with t-SNE for comparison with the UMAP projection.
# random_state pins the otherwise random optimisation so reruns give the same
# picture (the digits UMAP reducer is seeded with the same value).
tsne_kwargs = dict(n_components=2, verbose=1, perplexity=40, random_state=42)
try:
    # scikit-learn >= 1.5 names the iteration budget `max_iter`.
    tsne = TSNE(max_iter=300, **tsne_kwargs)
except TypeError:
    # Older scikit-learn releases only accept the original `n_iter` spelling.
    tsne = TSNE(n_iter=300, **tsne_kwargs)
tsne_results = tsne.fit_transform(digits.data)

In [None]:
# Scatter the two t-SNE columns, coloured by the true digit label.
tsne_fig, tsne_ax = plt.subplots()
tsne_points = tsne_ax.scatter(
    tsne_results[:, 0],
    tsne_results[:, 1],
    c=digits.target,
    cmap='tab10',
)
tsne_fig.colorbar(tsne_points, ax=tsne_ax)

# Radar HID
Next we will apply UMAP to some radar data. We've taken some radar data and already run an HID (hydrometeor identification) algorithm over it. The goal is to explore the output of the HID algorithm and look into what clustering shows about the separability of the HID classes.

In [None]:
%matplotlib inline
import os, sys, glob
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from scipy.signal import medfilt
from matplotlib.colors import LogNorm
from netCDF4 import Dataset
# import pyart
# from skewt import SkewT
# import xarray as xr


In [None]:
# Uncomment to download the sample radar file that the next cell opens.
# !wget https://github.com/josephhardinee/weather_radar_ml_course/raw/master/XSAPR20110511_150603.nc

In [None]:
# Open the radar file from the current working directory.
# The handle is deliberately left open: later cells read variables from `grid`.
filename = './XSAPR20110511_150603.nc'
grid = Dataset(filename)


In [None]:
# Form dataset
rhohv = grid.variables['cross_correlation_ratio'][:].squeeze()
height = grid.variables['z']
x = grid.variables['x']
y = grid.variables['y']
zh = grid.variables['attenuation_corrected_reflectivity'][:].squeeze()
zdr = grid.variables['attenuation_corrected_differential_reflectivity'][:].squeeze()
kdp = grid.variables['specific_differential_phase_pos_lp'][:].squeeze()

HID = grid.variables['HID'][:].squeeze()

In [None]:
# Map slice 4 along the first axis of HID onto the x/y grid
# (presumably one height level -- TODO confirm against the file's dimensions).
hid_fig, hid_ax = plt.subplots()
hid_mesh = hid_ax.pcolormesh(x, y, HID[4, :, :], cmap='tab10')
hid_fig.colorbar(hid_mesh, ax=hid_ax)

In [None]:
# Build the feature matrix with columns [zh, zdr, rhohv, kdp], keeping only the
# grid points whose cross-correlation ratio exceeds 0.7.
feature_fields = (zh, zdr, rhohv, kdp)

# The fields may be masked arrays. Boolean indexing and np.array() both ignore
# masks, which would let fill values leak into the features, so any point that
# is masked in at least one field is dropped explicitly. For unmasked input
# getmaskarray() is all False and the selection is unchanged.
any_masked = np.zeros(rhohv.size, dtype=bool)
for field in feature_fields:
    any_masked |= np.ma.getmaskarray(field).ravel()

# Plain boolean selector; reused later to pick the matching HID labels.
rhv_idx = (np.ma.getdata(rhohv).ravel() > .7) & ~any_masked

# One row per retained grid point, one column per field.
data = np.column_stack(
    [np.ma.getdata(field).ravel()[rhv_idx] for field in feature_fields]
)
print(data.shape)
print(HID.shape)


In [None]:
# Embed the filtered radar samples with UMAP.
# random_state fixes the stochastic layout so the figure is reproducible across
# kernel restarts, matching the seeded digits reducer earlier in the notebook.
# NOTE: the UMAP documentation describes a fixed seed as trading some speed for
# repeatability; drop it if runtime matters more than identical reruns.
reducer = umap.UMAP(n_neighbors=25, random_state=42)
embedding = reducer.fit_transform(data)
# Supervised variant: pass the HID labels as the target.
# embedding = reducer.fit_transform(data, y=HID.ravel()[rhv_idx])

In [None]:
# Scatter the radar embedding, coloured by the HID class of each retained point.
hid_labels = HID.ravel()[rhv_idx]
umap_fig, umap_ax = plt.subplots(figsize=(12, 12))
umap_points = umap_ax.scatter(
    embedding[:, 0],
    embedding[:, 1],
    c=hid_labels,
    cmap='tab10',
)
umap_ax.set_aspect('equal', adjustable='datalim')
umap_ax.set_title('UMAP projection of xsapr HID', fontsize=24)
# Optional zoom onto a sub-region of the embedding:
# umap_ax.set_xlim(-10, 10)
# umap_ax.set_ylim(10, 25)
umap_fig.colorbar(umap_points, ax=umap_ax)

    1. Drizzle
    2. Rain
    3. Ice Crystals
    4. Aggregates
    5. Wet/Melting Snow
    6. Vertically Aligned Ice
    7. Low-Density Graupel
    8. High-Density Graupel
    9. Hail
    10. Big Drops

Exercises:
1. Play with the parameters. Can you improve this?
2. What does this say about HID? Do we necessarily trust the classes?
3. How can we improve this?