In [None]:
import sys
# Extend the module search path with the project checkout so that modules living
# there can be imported by later cells.
# NOTE(review): presumably this is what lets `from datasets import SeriesEmbedDataset`
# resolve further down — confirm. `append` puts the directory LAST on sys.path, so
# an installed package with the same module name would be found first.
sys.path.append("/workdir/unsupervised_pretrain/")

In [None]:
import numpy as np
import json
import torch
from InstructorEmbedding import INSTRUCTOR
from tqdm.notebook import tqdm
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Load and embed #

In [None]:
# Prefer the GPU, but fall back to the CPU so that moving the model onto `device`
# does not fail on machines without a usable CUDA runtime.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Instantiate the instructor-xl checkpoint first, then place it on the selected device.
embed_model = INSTRUCTOR("hkunlp/instructor-xl")
embed_model = embed_model.to(device)

# Override the model's sequence-length setting.
# NOTE(review): presumably raised so long inputs are not truncated — confirm the
# checkpoint behaves sensibly at this length.
embed_model.max_seq_length = 4096

In [None]:
# Read the 16SEF test-set JSON file into memory.
testset_json_path = "/datasets/datasets/unsupervised-sentinel2/testset-16SEF/testset-16SEF-512.json"
with open(testset_json_path, mode="r") as json_file:
    data = json.load(json_file)

In [None]:
# Instruction text that is paired with every entry of `data` before encoding.
# NOTE(review): the wording is part of what gets encoded, so rewording it will
# presumably change the resulting embeddings — confirm before editing.
instruction = "Represent the geospatial data (the number, proportion, and description of geographic features) for clustering:"

In [None]:
# One [instruction, entry] pair per element of `data`; the instruction is the same for all.
pairs = [[instruction, datum] for datum in data]

In [None]:
# Encode every [instruction, entry] pair with the embedding model.
# NOTE(review): the next cell takes a norm over axis=1, so the result is expected
# to be a 2-D array-like with one row per pair — confirm.
embeddings = embed_model.encode(pairs)

In [None]:
# Scale each row to unit L2 length; keepdims=True keeps the norms as a column so
# the division broadcasts row-wise.
embedding_norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
normalized_embeddings = embeddings / embedding_norms

# Visualize #

In [None]:
# 2-D t-SNE projector with a fixed seed. This single instance is shared by all
# three visualisation cells below, each of which calls `fit_transform` on its own
# set of embeddings.
tsne = TSNE(n_components=2, random_state=0)

## Directly from `instructor-xl` ##

In [None]:
# Project the unit-normalised embeddings computed above by instructor-xl to 2-D.
data_2d = tsne.fit_transform(normalized_embeddings)

# Scatter the projection. The title names the embedding source so that this figure
# stays identifiable on its own (the axis labels alone do not say which embeddings
# were projected).
fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(data_2d[:, 0], data_2d[:, 1])
ax.set_title("t-SNE of instructor-xl embeddings")
ax.set_xlabel("t-SNE feature 0")
ax.set_ylabel("t-SNE feature 1")
plt.show()

## From `.npy` files ##

In [None]:
import glob

# Collect the .npy files that sit directly in the test-set directory.
# - The pattern contains no `**`, so glob's `recursive` flag would have no effect
#   and is omitted.
# - glob returns matches in arbitrary order; sorting makes the row order of the
#   stacked array (and therefore the downstream projection) reproducible.
npy_paths = sorted(glob.glob("/datasets/datasets/unsupervised-sentinel2/testset-16SEF/*.npy"))
if not npy_paths:
    # Fail with a clear message instead of np.concatenate's "need at least one array".
    raise FileNotFoundError(
        "No .npy files found in /datasets/datasets/unsupervised-sentinel2/testset-16SEF/"
    )

# Stack the per-file arrays along the first axis.
# NOTE(review): each file is assumed to hold a 2-D (rows, dim) array — the axis=1
# norm below needs at least two dimensions; confirm.
npy_raw = np.concatenate([np.load(path) for path in npy_paths], axis=0)

# Scale each row to unit L2 length, keeping the raw array under its own name.
npy_embeddings = npy_raw / np.linalg.norm(npy_raw, axis=1, keepdims=True)

In [None]:
# Project the unit-normalised embeddings loaded from the .npy files to 2-D.
data_2d = tsne.fit_transform(npy_embeddings)

# Scatter the projection. The title names the embedding source so that this figure
# stays identifiable on its own (the axis labels alone do not say which embeddings
# were projected).
fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(data_2d[:, 0], data_2d[:, 1])
ax.set_title("t-SNE of embeddings loaded from .npy files")
ax.set_xlabel("t-SNE feature 0")
ax.set_ylabel("t-SNE feature 1")
plt.show()

## From `SeriesEmbedDataset` ##

In [None]:
from datasets import SeriesEmbedDataset

In [None]:
# Directories handed to the dataset: just the 16SEF test-set folder.
testset_roots = ["/datasets/datasets/unsupervised-sentinel2/testset-16SEF/"]

# NOTE(review): the meaning of `size` and `series_length` is defined by
# SeriesEmbedDataset, which is not visible here — confirm against its definition.
ds = SeriesEmbedDataset(testset_roots, size=512, series_length=20)

In [None]:
# Gather the third element of every second dataset item as a (1, dim) row.
# NOTE(review): the stride of 2 skips every other item — presumably deliberate
# (e.g. to avoid near-duplicate samples); confirm.
ds_rows = []
with torch.inference_mode():
    sample_indices = range(0, len(ds), 2)
    for idx in tqdm(sample_indices):
        # Each item unpacks to exactly three values; only the last one is kept.
        _, _, embedding = ds[idx]
        ds_rows.append(embedding.reshape(1, -1))

# Stack the rows into one (num_samples, dim) array.
ds_embeddings = np.concatenate(ds_rows, axis=0)

In [None]:
# Scale each row to unit L2 length, writing the result back into the same array
# (in place, via `out=`), then rebinding the name to that same array.
ds_row_norms = np.linalg.norm(ds_embeddings, axis=1, keepdims=True)
ds_embeddings = np.divide(ds_embeddings, ds_row_norms, out=ds_embeddings)

In [None]:
# Project the unit-normalised embeddings gathered from SeriesEmbedDataset to 2-D.
data_2d = tsne.fit_transform(ds_embeddings)

# Scatter the projection. The title names the embedding source so that this figure
# stays identifiable on its own (the axis labels alone do not say which embeddings
# were projected).
fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(data_2d[:, 0], data_2d[:, 1])
ax.set_title("t-SNE of SeriesEmbedDataset embeddings")
ax.set_xlabel("t-SNE feature 0")
ax.set_ylabel("t-SNE feature 1")
plt.show()