# Intro to Training and Configurations

First we import fibad and create a new fibad object, which is instantiated (implicitly) with the default configuration file.

In [None]:
import fibad

# Build the Fibad object without arguments so that it is instantiated with the
# default configuration file, as this notebook's text describes. To start from
# your own settings instead, pass `config_file="<path to your .toml>"`.
fibad_instance = fibad.Fibad()

For this demo, we'll make a few adjustments to the default configuration settings that the `fibad` object was instantiated with. By accessing the `.config` attribute of the fibad instance, we can modify any configuration value. Here we change which built-in model to use, the dataset, the batch size, and the number of epochs for training.

In [None]:
# Apply the demo overrides through a short alias for the config object.
config = fibad_instance.config
config["model"]["name"] = "ExampleAutoencoder"  # built-in model to use
config["data_set"]["name"] = "HSCDataSet"  # dataset to use
config["data_loader"]["batch_size"] = 64  # batch size
config["train"]["epochs"] = 20  # number of training epochs

We call the `.train()` method to train the model.

In [None]:
# Train the model using the configuration currently held by `fibad_instance`.
fibad_instance.train()

The output of the training will be stored in a time-stamped directory under `./results/`. By default, a copy of the final configuration used in training is persisted as `runtime_config.toml`. To run fibad again with the same configuration, you can reference the `runtime_config.toml` file.

If running in another notebook, instantiate a fibad object like so:
```
new_fibad_instance = fibad.Fibad(config_file='./results/<timestamped_directory>/runtime_config.toml')
```

Or from the command line:
```
>> fibad train --runtime-config ./results/<timestamped_directory>/runtime_config.toml
```

In [None]:
# Load the TensorBoard notebook extension; it provides the `%tensorboard` magic used next.
%load_ext tensorboard

In [None]:
# Start TensorBoard inline, reading logs from the ./results directory.
%tensorboard --logdir ./results

# If running on a remote server and tunnelling a connection,
# pass the --bind_all flag:
# %tensorboard --logdir ./results --bind_all
# and then forward the selected port to your local machine.

Update the config with the trained model that we want to use and set a few other parameters.

In [None]:
# Point prediction at the trained weights and put the whole data set into the
# "test" split (test_size 1.0, train and validate 0.0).
# NOTE: the weights path below is specific to the original author's machine and
# training run; replace it with the `example_model.pth` from your own
# time-stamped training directory under `./results/`.
predict_config = fibad_instance.config["predict"]
predict_config["model_weights_file"] = (
    "/home/drew/code/fibad/docs/notebooks/results/20241216-203332-train/example_model.pth"
)
predict_config["split"] = "test"

data_set_config = fibad_instance.config["data_set"]
data_set_config["test_size"] = 1.0
data_set_config["train_size"] = 0.0
data_set_config["validate_size"] = 0.0

fibad_instance.config["data_loader"]["batch_size"] = 128

Run inference on the data set using the specified data and trained model.

In [None]:
# Run inference; later cells load the `.npy` files saved by this step.
fibad_instance.predict()

Grab a copy of the PyTorch data_set object to use as a reference for file names.

In [None]:
# Keep the prepared data set object: later cells read
# `prepped_output.container.files` to map query results back to file names.
prepped_output = fibad_instance.prepare()

Define a couple of functions to help with plotting and open a connection to our vector database

In [None]:
import chromadb
import numpy as np
import matplotlib.pyplot as plt
from astropy.io import fits


# Function to normalize the data to the range [0, 1]
def normalize(data):
    """Linearly rescale ``data`` to the range [0, 1].

    Parameters
    ----------
    data : array_like
        Values to rescale (here, the pixel data of one FITS image).

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``data`` in which the minimum maps to 0
        and the maximum maps to 1. When every value is identical the result is
        all zeros instead of the NaNs that a 0/0 division would produce.
    """
    data = np.asarray(data)
    data_min = np.min(data)
    data_range = np.max(data) - data_min
    if data_range == 0:
        # Constant input: there is no contrast to stretch, and dividing by the
        # zero range would fill the output with NaN (and emit a warning).
        return np.zeros(data.shape)
    return (data - data_min) / data_range


# Plot our 3 filter images
def plotter(
    file_name,
    base_path="/home/drew/code/fibad/docs/notebooks/data/hsc_example/hsc_8asec_1000/",
):
    """Display an RGB composite built from the three HSC band images of one object.

    Parameters
    ----------
    file_name : str
        File-name stem shared by the three band files, i.e. everything before
        the ``_HSC-<band>.fits`` suffix.
    base_path : str, optional
        Directory that contains the FITS files. The default is the location
        used when this notebook was written; pass your own data directory to
        run the notebook elsewhere.
    """
    import os

    # Band-to-channel mapping: HSC-I -> red, HSC-R -> green, HSC-G -> blue.
    band_suffixes = ("_HSC-I.fits", "_HSC-R.fits", "_HSC-G.fits")

    # Read each band and rescale it to [0, 1] independently of the others.
    channels = [
        normalize(fits.getdata(os.path.join(base_path, file_name + suffix)))
        for suffix in band_suffixes
    ]

    # Stack the three 2-D images along a new last axis -> (rows, cols, 3).
    rgb_image = np.dstack(channels)

    # Display the image
    plt.imshow(rgb_image, origin="lower")
    plt.axis("off")  # Hide the axis
    plt.show()


# Connect to the Chroma vector database stored at `vdb_path` and fetch the
# collection that the following cells query.
# NOTE: this absolute path is specific to the original author's machine.
vdb_path = "/home/drew/code/fibad/docs/notebooks/results/vdb"
client = chromadb.PersistentClient(path=vdb_path)
collection = client.get_collection("fibad_collection")

Load one of the .npy files that was saved when we ran the data through the trained model.

In [None]:
import numpy as np

# One of the embedding files written by the predict run.
# NOTE: this absolute path is specific to the original author's machine and run.
embeddings_file = "/home/drew/code/fibad/docs/notebooks/results/20241216-203830-predict/0.npy"
a = np.load(embeddings_file)

Pick a few random embeddings from the file and query the vector database to find the most similar data samples.
"Similar" in this case means close under the squared L2 distance metric, $d = \sum_{i} (A_i - B_i)^2$.

Cosine similarity and Inner product distance metrics are also supported.

In [None]:
# Index (row of `a`) of the embedding to use as the query.
# 97 is a cool example
# 93 is a clean example
# Pleasantly, 42 is a nice face-on spiral
indx = 92

# Ask the vector database for the 5 stored entries closest to the query embedding.
query_results = collection.query(
    query_embeddings=[a[indx]],
    n_results=5,
)

print(query_results["distances"])

metadatas = query_results["metadatas"]

# Each match's "filename" metadata is used as an index into the data set's
# file table. `plotter` wants the file-name stem, so the trailing
# "_HSC-G.fits" (11 characters) is cut off the G-band file name.
# (Assumes every G-band file name ends with exactly that suffix.)
band_suffix_length = len("_HSC-G.fits")
files_to_plot = []
for metadata in metadatas[0]:
    files = prepped_output.container.files[int(metadata["filename"])]
    files_to_plot.append(files["HSC-G"][:-band_suffix_length])

for file_name in files_to_plot:
    plotter(file_name)

Now let's look for outliers. For every entry in the database, find a number to represent the distance to its nearest neighbor.

For instance, it could be the distance to its closest neighbor, the mean (or in this case median) distance to its closest N neighbors, etc.

Note that this is an inefficient way to query the database - Chromadb recommends batching the queries.

In [None]:
import glob

# Sort the matches so the files are always processed in the same order.
found_files = sorted(
    glob.glob("/home/drew/code/fibad/docs/notebooks/results/20241216-203830-predict/*.npy")
)

distances = []
file_names = []

# For each embedding in each output file from inference, calculate a
# representation of the distance to its nearest neighbors.
for embeddings_file in found_files:
    # Deliberately not named `a`: the similarity-search cell above reads `a`,
    # and overwriting it here would silently change that cell on a re-run.
    file_embeddings = np.load(embeddings_file)
    for query_embedding in file_embeddings:
        query_results = collection.query(
            query_embeddings=[query_embedding],
            n_results=10,
        )
        # Drop the first hit (presumably the query embedding itself) and
        # summarise the remaining hits by their median distance.
        distances.append(np.median(query_results["distances"][0][1:]))
        # Record the "filename" metadata of the first hit.
        file_names.append(query_results["metadatas"][0][0]["filename"])

# print some statistics about the distances
print(f"Total values: {len(distances)}")
print(f"Max: {max(distances)} Min: {min(distances)}")
print(f"Mean: {np.mean(distances)} Median: {np.median(distances)} Std: {np.std(distances)}")

# create a histogram of the distances (values above 3500 are not shown)
plt.hist(distances, bins=50, range=(0, 3500))
plt.xlabel("Median distance to nearest neighbors")
plt.ylabel("Number of embeddings")
plt.show()

So let's look more closely at the objects in the tail of the histogram, i.e. the ones that are "far" from their nearest neighbors.

In [None]:
# Get the indexes of `distances` whose value lies strictly inside a range.
# Near by range: 2750 and 3250
# Far away range: 5_000 and 30_000
indexes = [i for i, x in enumerate(distances) if 5_000 < x < 30_000]

# Use those indexes to get the matching entries from the file_names list.
files_to_plot = [file_names[i] for i in indexes]

# Each entry is used as an index into the data set's file table. Cut the
# trailing "_HSC-G.fits" (11 characters) off the G-band file name to get the
# stem that `plotter` expects.
band_suffix_length = len("_HSC-G.fits")
names_to_plot = [
    prepped_output.container.files[int(file_index)]["HSC-G"][:-band_suffix_length]
    for file_index in files_to_plot
]

# Plot the images that are in the range of distances we specified, printing
# each file-name stem below its image.
for name in names_to_plot:
    plotter(name)
    print(name)

In [None]:
# Sort the matches so the rows of the stacked array have a stable order.
found_files = sorted(
    glob.glob("/home/drew/code/fibad/docs/notebooks/results/20241216-203830-predict/*.npy")
)

# Load every embedding file written by inference and stack them into a single
# array. `distances` and `file_names` are intentionally NOT reset here, so the
# results of the outlier cell above stay available.
latent_spaces = np.concatenate([np.load(f) for f in found_files])
latent_spaces.shape

In [None]:
import umap

# UMAP is stochastic; fixing `random_state` makes the projection reproducible
# from run to run (at the cost of UMAP's parallelism).
reducer = umap.UMAP(random_state=42)
embedding = reducer.fit_transform(latent_spaces)
print(embedding.shape)

# Scatter plot of the projection, one small point per embedding.
plt.scatter(embedding[:, 0], embedding[:, 1], s=1)
plt.xlabel("UMAP 1")
plt.ylabel("UMAP 2")
plt.show()