# Clone the repository from GitHub and install the dependencies

In [None]:
!git clone https://github.com/poyentung/autoencoder.git
%cd /content/autoencoder
!pip install -r requirements.txt

Cloning into 'autoencoder'...
remote: Enumerating objects: 33, done.[K
remote: Counting objects: 100% (33/33), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 33 (delta 14), reused 27 (delta 8), pack-reused 0[K
Unpacking objects: 100% (33/33), done.
/content/autoencoder
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hyperspy==1.7.3
  Downloading hyperspy-1.7.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (32.6 MB)
[K     |███████████████████▌            | 19.9 MB 153 kB/s eta 0:01:23

!!! Please restart the runtime after installation !!!

Runtime -> Restart runtime (or press `Ctrl + M`, then `.`)

# Download the dataset

In [None]:
import os

from google.colab import drive

# Mount Google Drive at /gdrive, requesting a forced remount.
drive.mount(
    '/gdrive',
    force_remount=True,
)

In [None]:
def load_and_rebin_data(p):
  """Create the rebinned dataset at ``p`` from its un-rebinned counterpart.

  The source path is derived from ``p`` by replacing
  ``"calibrated_data_nav2rebin_radial2d"`` with ``"calibrated_data_radial2d"``.
  The source is loaded lazily with HyperSpy, the last index along each of the
  two ``inav`` axes is dropped, the result is rebinned with
  ``scale=[2,2,1,1]``, computed and saved to ``p``.

  Args:
    p: Destination path of the rebinned file.

  Raises:
    FileNotFoundError: If the derived source file does not exist.
  """
  import gc
  import os

  path_no_rebin = p.replace("calibrated_data_nav2rebin_radial2d", "calibrated_data_radial2d")
  # Fail early with a clear message instead of an opaque loader error.
  if not os.path.exists(path_no_rebin):
    raise FileNotFoundError(
        f"Cannot create '{p}': source file '{path_no_rebin}' does not exist.")

  # Imported only when a rebin is actually needed.
  import hyperspy.api as hs
  s = hs.load(path_no_rebin, lazy=True)
  # Presumably the last row/column is dropped so both extents divide evenly
  # by the rebin factor of 2 — TODO confirm.
  s = s.inav[:-1,:-1].rebin(scale=[2,2,1,1])
  s.compute()
  s.save(p)
  # Drop the signal and trigger a collection before the notebook continues.
  del s
  gc.collect()

In [None]:
# Alternative to Google Drive: uncomment to download one dataset from Dropbox instead.
#!wget -O 20200209_163154_corrected_nav_rebin_radial2d_crop2d.hspy https://www.dropbox.com/s/bjvxbbkz2x4e0ib/20200209_163154_corrected_nav_rebin_radial2d_crop2d.hspy?dl=0

## OR

# Folder on the mounted Google Drive that holds every experimental dataset.
DATA_ROOT = '/gdrive/MyDrive/PhD/projects/sed_ML/clustering_poyen/data/experimental'

# Available datasets: folder name -> rebinned file name inside that folder.
DATASETS = {
    'tcdh_thin_20211117_182521': '20211117_182521_data_calibrated_data_nav2rebin_radial2d_crop2d.hspy',
    'tcdh_thin_20211117_182814': '20211117_182814_data_calibrated_data_nav2rebin_radial2d_crop2d.hspy',
    'dcdh_20220628_114631': '20220628_114631_data_calibrated_data_nav2rebin_radial2d_crop2d.hspy',
    'tiarnan_science_paper': '20200209_163154_calibrated_data_nav2rebin_radial2d_crop2d.hspy',
    'polytype_all_20220720_155557': '20220720_155557_data_calibrated_data_nav2rebin_radial2d_crop2d.hspy',
}

# Change this key to work on a different dataset.
DATASET_NAME = 'tcdh_thin_20211117_182521'

path = f'{DATA_ROOT}/{DATASET_NAME}/{DATASETS[DATASET_NAME]}'

# Build the rebinned file from the un-rebinned one the first time it is needed.
if not os.path.exists(path):
  load_and_rebin_data(path)

# Explicit check rather than `assert`, so it also runs when assertions are disabled.
if not os.path.exists(path):
  raise FileNotFoundError(f"Dataset file not found: {path}")
print(path)

/gdrive/MyDrive/PhD/projects/sed_ML/clustering_poyen/data/experimental/tcdh_thin_20211117_182521/20211117_182521_data_calibrated_data_nav2rebin_radial2d_crop2d.hspy


# Import libraries

In [None]:
import os

import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks.progress import RichProgressBar
from pytorch_lightning.loggers import TensorBoardLogger

from dimension_reduction.data_module import DPDataModule, DPDatasetMultiChannel1D
from dimension_reduction.lightning_module import VAE
from dimension_reduction.model import VariationalAutoEncoderMultiChannel1D

# Initialise parameters

In [None]:
# Folder on the mounted Google Drive under which each model gets its own sub-folder.
ml_path = "/gdrive/MyDrive/PhD/projects/sed_ML/clustering_poyen/models"

# The model name is suffixed with the name of the folder that contains `path`.
model_name = f'multichannel_inplane240_bn_bs64_{os.path.basename(os.path.dirname(path))}'
# When False, the model cell loads "last.ckpt" and the trainer cell skips fitting.
train_model = True
#model_name = "multichannel_inplane240_bn_bs64_permutation_tcdh_thin_20211117_182521"

# Single source of truth for the run configuration; later cells read from it.
args = dict(path = path,
            checkpoint_path=os.path.join(ml_path, model_name),
            model_name=model_name,
            version='0',
            val_data_ratio = 0.0,
            batch_size = 64,
            n_cpu = 2,
            num_epoch = 20,
            cube_root = False,
            model = VariationalAutoEncoderMultiChannel1D,
            dataset = DPDatasetMultiChannel1D,
            random_permutation=False,
            inplanes = 240,
            learning_rate = 1e-4,
            decay = 0.0,
            optimizer = 'adam',
            precision_for_training = 16,
            lr_scheduler_kw = None)

# save hyperparameters
# makedirs(exist_ok=True) also creates missing parent folders and is safe to re-run.
os.makedirs(args['checkpoint_path'], exist_ok=True)
with open(os.path.join(args['checkpoint_path'], 'parameters.txt'), 'w') as fp:
    for key, value in args.items():
        fp.write(f'{key}: {value}\n')

# Set Logger
logger = TensorBoardLogger(save_dir="lightning_logs",
                           name=args['model_name'],
                           version=args['version'])

logger.log_hyperparams(params=args)

# Set checkpoints paths
# NOTE(review): the monitored metric is "Epoch_val_loss" while the filename
# template formats "val_loss", and val_data_ratio is 0.0 — confirm which metric
# names the LightningModule actually logs.
checkpoint_callback = ModelCheckpoint(
    save_top_k=5,
    monitor="Epoch_val_loss",
    # every_n_epochs=5,
    mode="min",
    save_last=True,
    dirpath=args['checkpoint_path'],
    filename= args['model_name'] + "-{epoch:02d}-{val_loss:.2f}")

# Start training

In [None]:
# Load model
pl.seed_everything(0)

model = VAE(model = args['model'],
            inplanes = args['inplanes'],
            optimizer = args['optimizer'],
            lr_scheduler_kw = args['lr_scheduler_kw'],
            learning_rate = args['learning_rate'],
            decay = args['decay'])

if not train_model:
  model = model.load_from_checkpoint(os.path.join(ml_path, model_name, "last.ckpt"))


INFO:pytorch_lightning.utilities.seed:Global seed set to 0


In [None]:
# Load dataset
data_module = DPDataModule(path=args['path'],
                           dataset=args['dataset'],
                           random_permutation=args['random_permutation'],
                           val_data_ratio = args['val_data_ratio'],
                           batch_size = args['batch_size'],
                           n_cpu = args['n_cpu'],
                           cube_root = args['cube_root'])




In [None]:
# Choose the hardware settings explicitly. The previous bare `except:` retried on
# a default trainer after *any* error (including KeyboardInterrupt and genuine
# training failures), which hid the real problem and restarted training.
if torch.cuda.is_available():
  hardware_kwargs = dict(precision=args['precision_for_training'],
                         accelerator='gpu',
                         devices=1)
else:
  # No CUDA device: fall back to the Trainer's default accelerator and precision.
  hardware_kwargs = {}

trainer = pl.Trainer(max_epochs=args['num_epoch'],
                     callbacks=[RichProgressBar(), checkpoint_callback],
                     logger=logger,
                     **hardware_kwargs)

if train_model:
  trainer.fit(model, data_module)

INFO:pytorch_lightning.utilities.rank_zero:Using 16bit native Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True, used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

# Evaluation on data

In [None]:
# Run the test loop on the data module.
# Presumably this pass fills `model.latent`, which the clustering cell reads — TODO confirm.
trainer.test(model=model, datamodule=data_module)

INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

[{}]

In [None]:
from dimension_reduction import gui
from dimension_reduction.clustering import PixelSegmenter

# Clustering settings handed to PixelSegmenter as `method_args`.
hdbscan_settings = {
    'min_cluster_size': 20,
    'min_samples': 1,
    'cluster_selection_epsilon': 0.05,
}

# Segment using the model's latent attribute and the full dataset of the data module.
PS = PixelSegmenter(
    latent=model.latent,
    dataset=data_module.dataset_full.seddataset,
    method="HDBSCAN",
    method_args=hdbscan_settings,
)

In [None]:
# Print the name of the folder containing the dataset, to label the plot below.
dataset_folder = os.path.dirname(path)
print(os.path.basename(dataset_folder))
# NOTE: `ckeck_latent_space` is spelled exactly as in the original call —
# presumably the name exported by `gui`; verify before renaming.
gui.ckeck_latent_space(PS, ratio_to_be_shown=1.0)

# Check cluster data

(hold `Ctrl` while clicking to select multiple clusters)

In [None]:
# Interactive view of the clusters held by `PS`; the recorded output shows a
# cluster multi-select list and a "Folder name:" text box.
gui.show_cluster_distribution(PS)

SelectMultiple(options=('All', 'cluster_0', 'cluster_1', 'cluster_2', 'cluster_3', 'cluster_4', 'cluster_5', '…

HBox(children=(Text(value='results', description='Folder name:', layout=Layout(width='auto'), placeholder='Typ…

Output()

Output()