In [1]:
import os
import numpy as np
from glob import glob
from tensorflow.keras.utils import to_categorical

2025-07-25 17:58:28.441971: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-25 17:58:28.457688: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753477108.475765   11437 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753477108.481280   11437 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1753477108.495773   11437 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [8]:
# Configuration
MAX_LEN = 754       # number of rows every per-protein array is padded to
NUM_FEATURES = 20   # number of score columns kept from each PSSM row
NUM_CLASSES = 3     # width of the one-hot label vectors

# Root of the project checkout. The default is the original hard-coded
# location; set the RNA_PROJECT_ROOT environment variable to run the notebook
# from another machine or directory without editing this cell.
PROJECT_ROOT = os.environ.get(
    "RNA_PROJECT_ROOT", "/home/jf/GitRepositories/RedeNeuralArtificial"
)
DATA_PATH = os.path.join(PROJECT_ROOT, "data", "pssm_rs126")  # folder with PSSM files
LABELS_FILE = os.path.join(PROJECT_ROOT, "data", "RS126.data.txt")

In [9]:
# Label character -> integer class index, in the order H=0, E=1, C=2.
label_map = {state: index for index, state in enumerate("HEC")}

In [10]:
def load_pssm(file_path, max_len=MAX_LEN):
    """Read one PSSM text file into a z-scored, fixed-size score matrix.

    A line counts as a data row when it has at least ``2 + NUM_FEATURES``
    whitespace-separated fields and its first field consists only of digits.
    The ``NUM_FEATURES`` fields that follow the first two are parsed as floats.

    Every column is standardised (z-score) using the mean and standard
    deviation over all rows of this file. The result is then zero-padded, or
    truncated, to exactly ``max_len`` rows.

    Args:
        file_path: Path of the PSSM file to read.
        max_len: Number of rows in the returned matrix.

    Returns:
        A tuple ``(padded, length)``. ``padded`` is a float array of shape
        ``(max_len, NUM_FEATURES)``; ``length`` is how many of its leading rows
        hold real data (never more than ``max_len``).

    Raises:
        ValueError: If the file contains no parsable score rows, or a score
            field cannot be converted to float.
    """
    first_col = 2  # fields 0 and 1 are skipped; scores start at field 2
    last_col = first_col + NUM_FEATURES

    rows = []
    with open(file_path) as f:
        for line in f:
            parts = line.split()
            if len(parts) >= last_col and parts[0].isdigit():
                rows.append([float(x) for x in parts[first_col:last_col]])

    # Fail with a clear message instead of an opaque broadcasting error below.
    if not rows:
        raise ValueError(f"No PSSM score rows found in {file_path!r}")

    scores = np.array(rows)

    # Z-score normalisation per column; the epsilon avoids division by zero
    # for a column that is constant within this file.
    mean = scores.mean(axis=0)
    std = scores.std(axis=0) + 1e-6
    scores = (scores - mean) / std

    # Pad with zeros, or truncate, so the result always has max_len rows.
    length = min(scores.shape[0], max_len)
    padded = np.zeros((max_len, NUM_FEATURES))
    padded[:length] = scores[:length]
    return padded, length

In [11]:
def encode_labels(seq, max_len=MAX_LEN):
    """One-hot encode a label string, padded or truncated to ``max_len``.

    Each character of ``seq`` is mapped to its class index through
    ``label_map`` and the index vector is passed to ``to_categorical``.

    Note that padding positions are filled with index 0, so after encoding
    they are indistinguishable from real positions whose label maps to 0.
    Callers need a separate mask to tell padding from data.

    Args:
        seq: String of label characters; every character within the first
            ``max_len`` positions must be a key of ``label_map``.
        max_len: Number of positions in the returned array. Characters beyond
            this length are dropped.

    Returns:
        Array of shape ``(max_len, NUM_CLASSES)`` as produced by
        ``to_categorical``.

    Raises:
        KeyError: If a character is not present in ``label_map``.
    """
    # Truncate first so an over-long label cannot overflow the padded buffer.
    encoded = [label_map[ch] for ch in seq[:max_len]]
    padded = np.zeros(max_len, dtype=int)
    padded[:len(encoded)] = encoded
    return to_categorical(padded, num_classes=NUM_CLASSES)

In [12]:
# Read the labels file: its non-blank lines alternate between a sequence line
# and the label line that belongs to it.
with open(LABELS_FILE) as f:
    lines = [line.strip() for line in f if line.strip()]

# An odd count means a sequence without labels (or vice versa); stop here with
# a clear message rather than failing on an out-of-range index.
if len(lines) % 2 != 0:
    raise ValueError(
        f"{LABELS_FILE} has {len(lines)} non-blank lines; "
        "expected sequence/label pairs (an even number of lines)"
    )

sequences = lines[0::2]
labels = lines[1::2]

# NOTE(review): the PSSM files are sorted by file name and later paired with
# `labels` by position - confirm that file-name order matches the record order
# in LABELS_FILE.
pssm_files = sorted(glob(os.path.join(DATA_PATH, "*.pssm")))
print(f"Total PSSMs: {len(pssm_files)}, Labels: {len(labels)}")

Total PSSMs: 126, Labels: 126


In [13]:
# PSSM files and label strings are paired by position, so the counts must
# agree; otherwise samples would be dropped or the loop would fail part-way.
if len(pssm_files) != len(labels):
    raise ValueError(
        f"Found {len(pssm_files)} PSSM files but {len(labels)} label strings; "
        "they are paired by position and must match one-to-one"
    )

X_list, y_list, w_list = [], [], []

for fpath, label_seq in zip(pssm_files, labels):
    pssm_matrix, seq_len = load_pssm(fpath)

    # A length mismatch means this PSSM and this label string do not describe
    # the same sequence (for example because the file order differs from the
    # label order). Fail instead of silently building misaligned targets.
    expected_len = min(len(label_seq), MAX_LEN)
    if seq_len != expected_len:
        raise ValueError(
            f"{fpath}: PSSM has {seq_len} rows but its label string has "
            f"{len(label_seq)} characters"
        )

    X_list.append(pssm_matrix)
    y_list.append(encode_labels(label_seq))

    # Per-position mask: 1.0 for real positions, 0.0 for padding.
    mask = np.zeros(MAX_LEN)
    mask[:seq_len] = 1.0
    w_list.append(mask)

X = np.array(X_list)
y = np.array(y_list)
weights = np.array(w_list)

print("Shapes -> X:", X.shape, "y:", y.shape, "weights:", weights.shape)

Shapes -> X: (126, 754, 20) y: (126, 754, 3) weights: (126, 754)


In [14]:
# Save everything into a single compressed archive.
output_file = "data/pssm_dataset.npz"
# The path is relative to the current working directory; create the folder
# first because np.savez_compressed does not create missing directories.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
np.savez_compressed(output_file, X=X, y=y, weights=weights)
print("Arquivo salvo: pssm_dataset.npz")

Arquivo salvo: pssm_dataset.npz
