In [1]:
import numpy as np
import os
import glob
from sklearn.decomposition import IncrementalPCA, PCA
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import multiprocessing  # <--- Importado para paralelismo

In [12]:
# Define input and output directories
in_directory = "./../../hyspecnet-11k/hyspecnet-11k/patches/"
out_directory = "./../../hyspecnet11k-PCA/"

# Collect every patch file. Without recursive=True each "**" behaves like "*",
# so this matches files exactly two directory levels below in_directory.
# glob returns matches in arbitrary order; sorting makes the processing order
# (and anything derived from a file's position in this list) reproducible.
npy_files = sorted(glob.glob(f"{in_directory}**/**/*DATA.npy"))

n_files = len(npy_files)


# Create the output directory if it does not exist yet
os.makedirs(out_directory, exist_ok=True)

In [3]:

# Group-wise PCA parameters
TOTAL_NC = 32    # total number of principal components wanted in the end
NUM_GROUPS = 4   # number of groups the spectral bands are split into

# Expected dimensions of each patch (bands, height, width)
DIMS = (202, 128, 128)

BATCH_SIZE = 128

# --- Initial checks ---
# Components are shared equally between the groups, so the total must divide evenly.
if TOTAL_NC % NUM_GROUPS:
    raise ValueError("O número total de componentes (TOTAL_NC) deve ser divisível pelo número de grupos (NUM_GROUPS).")
NC_PER_GROUP = TOTAL_NC // NUM_GROUPS





In [4]:
# Função para processar um único arquivo .npy
def process_single_file(file_path):
    """Load one .npy patch and flatten it into a (pixels, bands) matrix.

    Args:
        file_path: Path of the .npy file to load.

    Returns:
        A 2-D array with DIMS[0] columns (one row per pixel) when the loaded
        array has exactly the shape DIMS; otherwise None. None is also
        returned when anything raises while loading or reshaping, so
        unreadable or corrupted files are skipped silently.
    """
    try:
        patch = np.load(file_path)
        if patch.shape != DIMS:
            # Unexpected patch size: signal the caller to skip this file
            return None
        # Move the band axis last, then collapse the two spatial axes
        return np.moveaxis(patch, 0, -1).reshape(-1, DIMS[0])
    except Exception:
        # Best-effort: ignore corrupted files or read errors
        return None


def split_data(data_list, group=4):
    """Split every 2-D array in ``data_list`` column-wise into ``group`` parts.

    The split is done by repeatedly halving each array along axis 1, so
    ``group`` must be a power of two. When the number of columns is odd the
    left half receives the smaller share (``c // 2`` columns); e.g. 202
    columns with ``group=4`` yields widths 50, 51, 50, 51.

    Args:
        data_list: List of 2-D arrays of shape (n_samples, n_columns).
        group: Number of parts each array is split into (1, 2, 4, 8, ...).

    Returns:
        A flat list with ``group`` column slices per input array, in
        left-to-right column order.

    Raises:
        ValueError: If ``group`` is not a positive power of two.
    """
    n_groups = int(group)
    if n_groups != group or n_groups < 1 or n_groups & (n_groups - 1):
        raise ValueError(f"group must be a positive power of two, got {group!r}")

    output_data = data_list
    # Each halving round doubles the number of parts, so log2(group) rounds
    # are needed (the former `group // 2` was only right for group <= 4).
    for _ in range(n_groups.bit_length() - 1):
        halves = []
        for data in output_data:
            _, n_cols = data.shape
            mid = n_cols // 2
            halves.append(data[:, :mid])
            halves.append(data[:, mid:])
        output_data = halves
    return output_data


def applyGWPCA(X, nc=32, group=4, whiten=True):
    """Apply group-wise PCA to a single image.

    The image is flattened to (pixels, channels), min-max scaled with its
    global minimum and maximum, split into channel groups with
    ``split_data``, and each group is reduced by its own PCA fitted on that
    image alone.

    Args:
        X: Array of shape (h, w, c).
        nc: Total number of components; each group keeps ``nc // group``.
        group: Number of channel groups passed to ``split_data``.
        whiten: Forwarded to ``PCA(whiten=...)``.

    Returns:
        Array of shape (h, w, k) whose last axis concatenates the PCA outputs
        of all groups.
    """
    h, w, c = X.shape
    flat = np.reshape(X, (-1, c))

    # Global min-max scaling over the whole image (not per channel)
    lowest = flat.min()
    scaled = (flat - lowest) / (np.max(flat) - lowest)

    # One independent PCA per channel group, fitted on this image only
    reduced = [
        PCA(n_components=nc // group, whiten=whiten, random_state=42).fit_transform(block)
        for block in split_data([scaled], group)
    ]

    stacked = np.concatenate(reduced, axis=-1)
    return np.reshape(stacked, (h, w, -1))





In [5]:
# Exploratory run: group-wise PCA fitted on the first patch only.
whiten = True
group = 4
nc = 32

# Load the patch and move the channel axis to the last dimension
X = np.moveaxis(np.load(npy_files[0]), 0, -1)
h, w, c = X.shape

# Flatten to one row per pixel, then split the bands into groups
X = X.reshape(-1, DIMS[0])
X_split = split_data([X], group)

# Fit a separate PCA per group; `pca` keeps the last fitted model afterwards
pca_data_list = []
for i, x in enumerate(X_split):
    pca = PCA(n_components=nc // group, whiten=whiten, random_state=42)
    pca_data = pca.fit_transform(x)
    pca_data_list.append(pca_data)

# Stitch the groups back together and restore the spatial layout
out = np.concatenate(pca_data_list, axis=-1).reshape(h, w, -1)


In [6]:
# Shape of the component matrix of `pca`, i.e. the model fitted on the last
# group in the single-patch loop above (assumes cells were run in order).
pca.components_.shape

(8, 51)

In [7]:
# Shape of the PCA-transformed data for the first band group
pca_data_list[0].shape

(16384, 8)

In [8]:
# Shape of the first band group that split_data (default group=4) produces
# from the flattened first patch
split_data([process_single_file(npy_files[0])])[0].shape

(16384, 50)

In [9]:
# Incremental PCA: one model per band group, fitted one patch at a time.

# Per-group scalers and IPCA models.
# NOTE(review): the scalers are only instantiated here; nothing in this cell
# fits or applies them - confirm whether standardisation was intended.
standardscaler = [StandardScaler() for _ in range(NUM_GROUPS)]
ipcas = [IncrementalPCA(n_components=NC_PER_GROUP, whiten=True) for _ in range(NUM_GROUPS)]


for npy_file in tqdm(npy_files, desc="Loading and processing files"):
    pixel_matrix = process_single_file(npy_file)
    if pixel_matrix is None:
        # Unreadable patch or unexpected shape: nothing to fit on
        continue

    # Split the bands into groups and update each group's model incrementally
    grouped_data = split_data([pixel_matrix], group=NUM_GROUPS)
    for i, group_data in enumerate(grouped_data):
        ipcas[i].partial_fit(group_data)


Loading and processing files: 100%|██████████| 11483/11483 [16:28<00:00, 11.62it/s]


In [None]:
# Apply the fitted per-group IPCA transforms to every patch and save the result.

# process_single_file returns a flattened 2-D (pixels, bands) matrix, and only
# for patches whose shape equals DIMS, so the spatial size needed to restore
# the image layout is taken from DIMS rather than from the matrix itself.
c, h, w = DIMS

# name_file is the 1-based position of the patch in npy_files; it advances for
# skipped files too, so the output numbering follows the order of that list.
for name_file, npy_file in enumerate(
    tqdm(npy_files, desc="Transforming and saving files"), start=1
):
    pixel_matrix = process_single_file(npy_file)
    if pixel_matrix is None:
        # Unreadable patch or unexpected shape: skip it before touching .shape
        continue

    # Split the bands into the same groups used when fitting
    grouped_data = split_data([pixel_matrix], group=NUM_GROUPS)

    # Transform each group with its own fitted IPCA
    pca_data_list = []
    for i, group_data in enumerate(grouped_data):
        pca_data = ipcas[i].transform(group_data)
        pca_data_list.append(pca_data)

    # Concatenate the PCA outputs of all groups and restore the (h, w, k) layout
    out = np.concatenate(pca_data_list, axis=-1)
    out = np.reshape(out, (h, w, -1))

    # Save the result as a .npy file
    output_file = os.path.join(out_directory, f"pca_{name_file:05d}_DATA.npy")
    np.save(output_file, out)


Transforming and saving files: 100%|██████████| 11483/11483 [11:04<00:00, 17.28it/s]
