In [3]:
import os
import json
import random
from PIL import Image, ImageEnhance, ImageFilter
from skimage.feature import graycomatrix, graycoprops, local_binary_pattern
from skimage.io import imread
import numpy as np
import pandas as pd
import mahotas
from tqdm import tqdm
from skimage.filters import threshold_otsu
from skimage.measure import regionprops, label
from skimage import morphology
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import MinMaxScaler

# ========== PATHS ==========
# Root of the source images plus one output root per split (train,
# validation, test). Alternative root for the same layout:
# /Users/xr4good/Documents/Ingrid/datasets/imagens
base_dir = "E:/datasets/imagens/base"
json_path = os.path.join(base_dir, "classifications_3classes.json")
output_dir_treino = "E:/datasets/imagens/treino/treino/3classes/"
output_dir_val = "E:/datasets/imagens/validacao/validacao/3classes/"
output_dir_teste = "E:/datasets/imagens/teste/teste/3classes/"

# ========== CREATE DIRECTORIES ==========
# Grayscale crop folders, one per (split, class).
train_pos_dir = os.path.join(output_dir_treino, "treino-dir-positivo")
train_neg_dir = os.path.join(output_dir_treino, "treino-dir-negativo")
train_lim_dir = os.path.join(output_dir_treino, "treino-dir-limitrofe")
val_pos_dir = os.path.join(output_dir_val, "validacao-dir-positivo")
val_neg_dir = os.path.join(output_dir_val, "validacao-dir-negativo")
val_lim_dir = os.path.join(output_dir_val, "validacao-dir-limitrofe")
test_pos_dir = os.path.join(output_dir_teste, "teste-dir-positivo")
test_neg_dir = os.path.join(output_dir_teste, "teste-dir-negativo")
test_lim_dir = os.path.join(output_dir_teste, "teste-dir-limitrofe")

# RGB crop folders, same layout with an "-rgb" suffix.
train_pos_dir_rgb = os.path.join(output_dir_treino, "treino-dir-positivo-rgb")
train_neg_dir_rgb = os.path.join(output_dir_treino, "treino-dir-negativo-rgb")
train_lim_dir_rgb = os.path.join(output_dir_treino, "treino-dir-limitrofe-rgb")
val_pos_dir_rgb = os.path.join(output_dir_val, "validacao-dir-positivo-rgb")
val_neg_dir_rgb = os.path.join(output_dir_val, "validacao-dir-negativo-rgb")
val_lim_dir_rgb = os.path.join(output_dir_val, "validacao-dir-limitrofe-rgb")
test_pos_dir_rgb = os.path.join(output_dir_teste, "teste-dir-positivo-rgb")
test_neg_dir_rgb = os.path.join(output_dir_teste, "teste-dir-negativo-rgb")
test_lim_dir_rgb = os.path.join(output_dir_teste, "teste-dir-limitrofe-rgb")

# Create every folder up front (grayscale first, then RGB); existing folders
# are left as they are.
for d in (
    train_neg_dir, train_pos_dir, val_pos_dir, val_neg_dir, test_pos_dir,
    test_neg_dir, train_lim_dir, val_lim_dir, test_lim_dir,
    train_neg_dir_rgb, train_pos_dir_rgb, val_pos_dir_rgb, val_neg_dir_rgb,
    test_pos_dir_rgb, test_neg_dir_rgb, train_lim_dir_rgb, val_lim_dir_rgb,
    test_lim_dir_rgb,
):
    os.makedirs(d, exist_ok=True)

# ========== LOAD JSON ==========
# The encoding is stated explicitly so decoding does not depend on the
# platform's default text encoding (JSON interchange files are UTF-8).
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# ========== CELL EXTRACTION ==========
# One flat record per annotated cell, tagged with the image it comes from.
all_cells = []
for entry in data:
    source_image = entry["image_name"]
    all_cells.extend(
        {
            "image_name": source_image,
            "cell_id": annotation["cell_id"],
            "x": annotation["nucleus_x"],
            "y": annotation["nucleus_y"],
            "label": annotation["bethesda_system"],
        }
        for annotation in entry["classifications"]
    )


# ========== PARTITION BY CLASS ==========
def _cells_with_label(wanted):
    """Return the records of ``all_cells`` whose "label" equals *wanted*."""
    return [record for record in all_cells if record["label"] == wanted]


positive_cells = _cells_with_label("POSITIVE")
negative_cells = _cells_with_label("NEGATIVE")
limitrofe_cells = _cells_with_label("LIMITROFE")

# ========== SHUFFLE BEFORE THE TRAIN/VAL/TEST SPLIT ==========
# Fixed seed for a repeatable order. The shuffle works on individual cells,
# not on whole images, so cells of one image can land in different splits.
random.seed(42)
for class_cells in (positive_cells, negative_cells, limitrofe_cells):
    random.shuffle(class_cells)

# Split por classe individualmente
def split_data(cells, train_frac=0.7, val_frac=0.15):
    """Cut a sequence into consecutive train / validation / test slices.

    The input is sliced in its current order; shuffle it beforehand if a
    random split is wanted.

    Args:
        cells: Sequence supporting ``len`` and slicing.
        train_frac: Fraction of the items assigned to the training slice.
        val_frac: Fraction of the items assigned to the validation slice.

    Returns:
        Tuple ``(train, val, test)``. The train and validation sizes are
        truncated toward zero; every remaining item goes to ``test``.

    Raises:
        ValueError: If a fraction is negative or the two fractions add up
            to more than 1.
    """
    if train_frac < 0 or val_frac < 0 or train_frac + val_frac > 1:
        raise ValueError(
            "train_frac and val_frac must be non-negative and sum to at most 1"
        )

    total = len(cells)
    n_train = int(train_frac * total)
    val_end = n_train + int(val_frac * total)
    return cells[:n_train], cells[n_train:val_end], cells[val_end:]

# Split each class on its own so every split keeps all three classes.
_class_splits = [
    split_data(group)
    for group in (positive_cells, negative_cells, limitrofe_cells)
]
(
    (train_pos, val_pos, test_pos),
    (train_neg, val_neg, test_neg),
    (train_lim, val_lim, test_lim),
) = _class_splits


# ========== SAVE CROPS ==========
# Bookkeeping: crops discarded overall, and crops written per split/class key.
descartadas = 0
usadas = dict.fromkeys(
    (
        "train_pos", "train_neg", "val_pos", "val_neg", "test_pos", "test_neg",
        "train_lim", "val_lim", "test_lim",
    ),
    0,
)

# Destination folder of the grayscale crop for each split/class key.
gray_dirs = dict(
    train_pos=train_pos_dir,
    train_neg=train_neg_dir,
    val_pos=val_pos_dir,
    val_neg=val_neg_dir,
    test_pos=test_pos_dir,
    test_neg=test_neg_dir,
    train_lim=train_lim_dir,
    val_lim=val_lim_dir,
    test_lim=test_lim_dir,
)

# Destination folder of the RGB crop for each split/class key.
rgb_dirs = dict(
    train_pos=train_pos_dir_rgb,
    train_neg=train_neg_dir_rgb,
    val_pos=val_pos_dir_rgb,
    val_neg=val_neg_dir_rgb,
    test_pos=test_pos_dir_rgb,
    test_neg=test_neg_dir_rgb,
    train_lim=train_lim_dir_rgb,
    val_lim=val_lim_dir_rgb,
    test_lim=test_lim_dir_rgb,
)

def save_cropped_dual(cell, image_dir, key):
    """Save the crop around one cell nucleus as an RGB and a grayscale PNG.

    The crop is the square window of ``2 * half_crop`` pixels centred on
    ``(cell["x"], cell["y"])``. A cell is discarded (and the module-level
    ``descartadas`` counter incremented) when its image cannot be opened or
    when the window would cross the image border. On success both files are
    written under the same name and ``usadas[key]`` is incremented.

    Args:
        cell: Mapping with the keys "image_name", "cell_id", "x" and "y".
        image_dir: Directory that contains the source image.
        key: Split/class key used to look up the output folders in the
            module-level ``rgb_dirs`` and ``gray_dirs`` mappings.
    """
    global descartadas
    half_crop = 35
    image_path = os.path.join(image_dir, cell["image_name"])

    try:
        # The context manager releases the file handle even when decoding
        # fails. A missing file raises here too, so no separate existence
        # check is needed.
        with Image.open(image_path) as source:
            img_rgb = source.convert("RGB")
    except Exception:
        # Best effort: any unreadable image just counts as discarded, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        descartadas += 1
        return

    x, y = cell["x"], cell["y"]
    left, top = x - half_crop, y - half_crop
    right, bottom = x + half_crop, y + half_crop
    if left < 0 or top < 0 or right > img_rgb.width or bottom > img_rgb.height:
        descartadas += 1
        return

    crop_rgb = img_rgb.crop((left, top, right, bottom))
    # RGB -> "L" is a per-pixel transform, so converting only the crop gives
    # the same pixels as cropping a fully converted image, at lower cost.
    crop_gray = crop_rgb.convert("L")

    name = f"{os.path.splitext(cell['image_name'])[0]}_celula_{cell['cell_id']}.png"

    crop_rgb.save(os.path.join(rgb_dirs[key], name))
    crop_gray.save(os.path.join(gray_dirs[key], name))
    usadas[key] += 1

# Write the crops of every split/class list. Positive and negative lists of
# each split are processed first, then the three "limitrofe" lists.
for split_cells, split_key in (
    (train_pos, "train_pos"),
    (train_neg, "train_neg"),
    (val_pos, "val_pos"),
    (val_neg, "val_neg"),
    (test_pos, "test_pos"),
    (test_neg, "test_neg"),
    (train_lim, "train_lim"),
    (val_lim, "val_lim"),
    (test_lim, "test_lim"),
):
    for c in split_cells:
        save_cropped_dual(c, base_dir, split_key)



# ========== TRANSFORMAÇÕES ==========
def apply_augmentations(img):
    """Return ten augmented variants of a PIL image, always in the same order.

    Order of the returned list: rotations by 90, 180 and 270 degrees,
    left-right flip, top-bottom flip, contrast enhanced by 1.5, sharpness
    enhanced by 2, Gaussian blur with radius 1, median filter of size 3,
    rotation by 15 degrees.
    """
    variants = [img.rotate(angle) for angle in (90, 180, 270)]
    variants.append(img.transpose(Image.FLIP_LEFT_RIGHT))
    variants.append(img.transpose(Image.FLIP_TOP_BOTTOM))
    variants.append(ImageEnhance.Contrast(img).enhance(1.5))
    variants.append(ImageEnhance.Sharpness(img).enhance(2))
    variants.append(img.filter(ImageFilter.GaussianBlur(radius=1)))
    variants.append(img.filter(ImageFilter.MedianFilter(size=3)))
    variants.append(img.rotate(15))
    return variants
def _listar_png(diretorio):
    """Return the ".png" file names in *diretorio* in sorted order."""
    # os.listdir gives no ordering guarantee; sorting makes the choice of
    # files that get augmented repeatable between runs and machines.
    return sorted(nome for nome in os.listdir(diretorio) if nome.endswith(".png"))


def balancear_treinamento_automaticamente(positivos_dir, negativos_dir, limitrofes_dir):
    """Grow the smaller classes with augmented copies up to the largest class.

    The ".png" files of each directory are counted and the largest count is
    the target. For every class below the target, files are visited in
    sorted order and the variants returned by ``apply_augmentations`` are
    saved next to the source as "<stem>_aug<k>.png" (grayscale) until the
    deficit is covered. Each file contributes all its variants before the
    next file is touched, so a small deficit is filled from the first files
    only. Progress and per-class results are printed.

    Args:
        positivos_dir: Directory with the POSITIVE class images.
        negativos_dir: Directory with the NEGATIVE class images.
        limitrofes_dir: Directory with the LIMITROFE class images.
    """
    classes = [
        ("POSITIVE", positivos_dir, _listar_png(positivos_dir)),
        ("NEGATIVE", negativos_dir, _listar_png(negativos_dir)),
        ("LIMITROFE", limitrofes_dir, _listar_png(limitrofes_dir)),
    ]
    max_qtd = max(len(arquivos) for _, _, arquivos in classes)

    for classe, classe_dir, arquivos in classes:
        deficit = max_qtd - len(arquivos)
        if deficit <= 0:
            print(f"Classe {classe} já está balanceada.")
            continue

        print(f"Aumentando classe {classe} com {deficit} imagens...")

        contador = 0
        for f in tqdm(arquivos):
            if contador >= deficit:
                break
            path = os.path.join(classe_dir, f)
            try:
                # Close the source file as soon as the pixels are loaded.
                with Image.open(path) as original:
                    img = original.convert("L")
                for i, aug in enumerate(apply_augmentations(img)):
                    if contador >= deficit:
                        break
                    out_name = f"{os.path.splitext(f)[0]}_aug{i+1}.png"
                    aug.save(os.path.join(classe_dir, out_name))
                    contador += 1
            except Exception as e:
                # Best effort: a broken file is reported and skipped.
                print(f"Erro ao processar {f}: {e}")
                continue

        print(f"Aumento de dados concluído para classe {classe}. Total gerado: {contador}")
        if contador < deficit:
            # Every source file was used and the class is still short.
            print(
                f"Aviso: classe {classe} ainda tem {deficit - contador} "
                "imagens a menos que a maior classe."
            )
            
# Balance the three grayscale training folders only; the RGB folders and the
# validation/test folders are not passed to this call.
balancear_treinamento_automaticamente(train_pos_dir, train_neg_dir, train_lim_dir)
# ========== EXTRAÇÃO DE ATRIBUTOS ==========
def extrair_atributos(p):
    """Extract a flat feature vector from one image file.

    Args:
        p: Path of the image to read.

    Returns:
        1-D numpy array with, in this order: area, perimeter, eccentricity,
        circularity and axis ratio of the first labelled region of the Otsu
        mask (all zero when no region is found or the step fails); mean,
        standard deviation, skewness, kurtosis and an entropy-like sum of
        the intensities; GLCM contrast, correlation, energy and homogeneity;
        a 10-bin histogram of uniform LBP codes; mean Haralick features;
        TAS features; Zernike moments; mean, standard deviation, energy and
        entropy of the normalised Fourier magnitude spectrum.
    """
    raw = imread(p, as_gray=True)

    # as_gray=True only converts colour input to floats in [0, 1]; an image
    # that is already single-channel is returned in its stored integer dtype.
    # Normalise explicitly, otherwise "img * 255" on uint8 data wraps around
    # modulo 256 and the intensity statistics are on a 0..255 scale.
    if np.issubdtype(raw.dtype, np.integer):
        img = raw.astype(np.float64) / np.iinfo(raw.dtype).max
    else:
        img = raw.astype(np.float64)
    img_u8 = np.clip(np.rint(img * 255), 0, 255).astype(np.uint8)

    # ===== SHAPE =====
    try:
        # Thresholding the 8-bit image gives the same mask as before for
        # crops stored as 8-bit grayscale.
        mascara = morphology.remove_small_objects(img_u8 > threshold_otsu(img_u8), 30)
        regioes = regionprops(label(mascara))
        if regioes:
            regiao = regioes[0]
            area, perim = regiao.area, regiao.perimeter
            ecc = regiao.eccentricity
            circ = 4 * np.pi * area / (perim ** 2) if perim > 0 else 0
            elip = (
                regiao.major_axis_length / regiao.minor_axis_length
                if regiao.minor_axis_length > 0 else 0
            )
        else:
            area = perim = ecc = circ = elip = 0
    except Exception:
        # Best effort: a failed segmentation yields all-zero shape features.
        area = perim = ecc = circ = elip = 0

    # ===== INTENSITY STATISTICS =====
    pixels = img.ravel()
    mean, std, skw, krt = img.mean(), img.std(), skew(pixels), kurtosis(pixels)
    # Sum of -v * log2(v) over the pixel intensities themselves (not over a
    # histogram); the small constant avoids log2(0).
    ent = -np.sum(img * np.log2(img + 1e-10))

    # ===== TEXTURE =====
    glcm = graycomatrix(img_u8, [1], [0], symmetric=True, normed=True)
    contrast = graycoprops(glcm, 'contrast')[0, 0]
    corr = graycoprops(glcm, 'correlation')[0, 0]
    energy = graycoprops(glcm, 'energy')[0, 0]
    homog = graycoprops(glcm, 'homogeneity')[0, 0]

    # LBP only compares neighbouring pixels, so it is computed on the 8-bit
    # image rather than on floats.
    lbp = local_binary_pattern(img_u8, 8, 1, method='uniform')
    lbp_hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, 11), density=True)

    hrlk = mahotas.features.haralick(img_u8).mean(axis=0)
    tas = mahotas.features.tas(img_u8)
    zern = mahotas.features.zernike_moments(img_u8, radius=min(img.shape)//2, degree=8)

    # ===== FOURIER DESCRIPTORS =====
    fft = np.fft.fft2(img)
    fft_shift = np.fft.fftshift(fft)
    magnitude_spectrum = np.abs(fft_shift)

    # Scale the spectrum by its maximum so the statistics below do not depend
    # on the overall intensity scale.
    magnitude_spectrum /= (magnitude_spectrum.max() + 1e-10)

    fft_mean = magnitude_spectrum.mean()
    fft_std = magnitude_spectrum.std()
    fft_energy = np.sum(magnitude_spectrum**2)
    fft_entropy = -np.sum(magnitude_spectrum * np.log2(magnitude_spectrum + 1e-10))

    return np.hstack([
        area, perim, ecc, circ, elip,
        mean, std, skw, krt, ent,
        contrast, corr, energy, homog,
        lbp_hist, hrlk, tas, zern,  fft_mean, fft_std, fft_energy, fft_entropy
    ])

# ========== CSV COM NORMALIZAÇÃO BASEADA NO TREINO ==========
def gerar_df_csv(diretorio, label):
    """Build a DataFrame with one feature row per ".png" file of a directory.

    Args:
        diretorio: Directory whose ".png" files are processed (no recursion).
        label: Value stored in the "label" column of every row.

    Returns:
        DataFrame with the columns "image_name", "cell_id",
        "feat_0" ... "feat_{n-1}" and "label", one row per file.

    Raises:
        ValueError: If the directory contains no ".png" file.
    """
    # Sorted so the row order, and with it any seeded shuffle applied later,
    # is reproducible; os.listdir gives no ordering guarantee.
    arquivos = sorted(arq for arq in os.listdir(diretorio) if arq.endswith(".png"))
    if not arquivos:
        # Fail with a clear message instead of an IndexError further down.
        raise ValueError(f"Nenhum arquivo .png encontrado em {diretorio}")

    linhas = []
    for arq in arquivos:
        path = os.path.normpath(os.path.join(diretorio, arq))
        feat = extrair_atributos(path)
        # Names look like "<image>_celula_<cell id>.png"; the text after the
        # marker becomes cell_id, "NA" when the marker is absent.
        base = os.path.splitext(arq)[0].split("_celula_")
        linhas.append([base[0], base[1] if len(base) > 1 else "NA"] + list(feat) + [label])

    # Each row holds two identifiers, the features, then the label.
    n_feat = len(linhas[0]) - 3
    colunas = ["image_name", "cell_id"] + [f"feat_{i}" for i in range(n_feat)] + ["label"]
    return pd.DataFrame(linhas, columns=colunas)

def normalizar_e_salvar(df_train, df_val, df_test):
    """Min-max scale the feature columns and write the three splits to CSV.

    The feature columns are all columns except the first two and the last
    one. The scaler is fitted on ``df_train`` only and then applied to the
    three frames in place, so validation and test values are scaled with the
    training statistics. The frames are written, without the index, to
    "train_3classes.csv", "val_3classes.csv" and "test_3classes.csv"
    relative to the current working directory.
    """
    feature_cols = df_train.columns[2:-1]
    scaler = MinMaxScaler()
    scaler.fit(df_train[feature_cols])

    destinos = (
        (df_train, "train_3classes.csv"),
        (df_val, "val_3classes.csv"),
        (df_test, "test_3classes.csv"),
    )
    # Scale every frame before writing anything.
    for frame, _ in destinos:
        frame[feature_cols] = scaler.transform(frame[feature_cols])
    for frame, csv_name in destinos:
        frame.to_csv(csv_name, index=False)

def _montar_split(lim_dir, pos_dir, neg_dir):
    """Stack the class frames of one split and shuffle the rows.

    Labels are 2 for *lim_dir*, 1 for *pos_dir* and 0 for *neg_dir*; the
    shuffle uses a fixed seed.
    """
    partes = [
        gerar_df_csv(lim_dir, 2),
        gerar_df_csv(pos_dir, 1),
        gerar_df_csv(neg_dir, 0),
    ]
    empilhado = pd.concat(partes, ignore_index=True)
    return empilhado.sample(frac=1, random_state=42)


df_train = _montar_split(train_lim_dir, train_pos_dir, train_neg_dir)
df_val = _montar_split(val_lim_dir, val_pos_dir, val_neg_dir)
df_test = _montar_split(test_lim_dir, test_pos_dir, test_neg_dir)

# Scale with the training statistics and write the three CSV files.
normalizar_e_salvar(df_train, df_val, df_test)

print(" Processamento completo com balanceamento, augmentations e normalização segura.")


Aumentando classe POSITIVE com 2238 imagens...


 10%|███████▉                                                                       | 224/2234 [00:02<00:21, 94.67it/s]


Aumento de dados concluído para classe POSITIVE. Total gerado: 2238
Classe NEGATIVE já está balanceada.
Aumentando classe LIMITROFE com 3410 imagens...


 32%|█████████████████████████▎                                                     | 341/1062 [00:03<00:07, 99.09it/s]


Aumento de dados concluído para classe LIMITROFE. Total gerado: 3410
 Processamento completo com balanceamento, augmentations e normalização segura.
