In [18]:
import os
import json
import random
from PIL import Image
from skimage.feature import graycomatrix, graycoprops, local_binary_pattern
from skimage.io import imread
import numpy as np
import pandas as pd
import mahotas
from tqdm import tqdm
from skimage.filters import threshold_otsu
from skimage.measure import regionprops, label
from skimage import morphology
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import MinMaxScaler


# Paths: source image folder, annotation JSON and the root folder of each split
base_dir = "/Users/xr4good/Documents/Ingrid/datasets/imagens/base"
json_path = os.path.join(base_dir, "classifications_2classes.json")
output_dir_treino = "/Users/xr4good/Documents/Ingrid/datasets/imagens/treino/treino"
output_dir_val = "/Users/xr4good/Documents/Ingrid/datasets/imagens/validacao/validacao"
output_dir_teste = "/Users/xr4good/Documents/Ingrid/datasets/imagens/testeboundingbox/teste"

# Output directories: one folder per split and per class
train_neg_dir = os.path.join(output_dir_treino, "treino-dir-negativo")
train_pos_dir = os.path.join(output_dir_treino, "treino-dir-positivo")
val_pos_dir = os.path.join(output_dir_val, "validacao-dir-positivo")
val_neg_dir = os.path.join(output_dir_val, "validacao-dir-negativo")
test_pos_dir = os.path.join(output_dir_teste, "teste-dir-positivo")
test_neg_dir = os.path.join(output_dir_teste, "teste-dir-negativo")

# exist_ok=True: folders (and whatever they already contain) are kept as-is
os.makedirs(train_neg_dir, exist_ok=True)
os.makedirs(train_pos_dir, exist_ok=True)
os.makedirs(val_pos_dir, exist_ok=True)
os.makedirs(val_neg_dir, exist_ok=True)
os.makedirs(test_pos_dir, exist_ok=True)
os.makedirs(test_neg_dir, exist_ok=True)

# Load the annotation JSON
with open(json_path, "r") as f:
    data = json.load(f)

# Flatten the annotations into one record per cell
all_cells = []

for img_data in data:
    image_name = img_data["image_name"]
    for cell in img_data["classifications"]:
        all_cells.append({
            "image_name": image_name,
            "cell_id": cell["cell_id"],
            "x": cell["nucleus_x"],
            "y": cell["nucleus_y"],
            "label": cell["bethesda_system"]
        })

print(f"Total de células: {len(all_cells)}")

# Split by class; records whose label is neither string are dropped here
positive_cells = [c for c in all_cells if c["label"] == "POSITIVE"]
negative_cells = [c for c in all_cells if c["label"] == "NEGATIVE"]

print(f"Total POSITIVAS: {len(positive_cells)}")
print(f"Total NEGATIVAS: {len(negative_cells)}")

# Shuffle each class with a fixed seed so the split below is repeatable
random.seed(42)
random.shuffle(positive_cells)
random.shuffle(negative_cells)
# 70/15/15 split
def split_data(cells):
    """Slice ``cells`` into consecutive train / validation / test portions.

    The first ``int(0.7 * len(cells))`` items form the training portion, the
    following ``int(0.15 * len(cells))`` items the validation portion, and
    everything that remains the test portion. The input order is kept; no
    shuffling happens here.

    Args:
        cells: Sequence supporting ``len`` and slicing.

    Returns:
        Tuple ``(train, val, test)`` of slices of ``cells``.
    """
    size = len(cells)
    train_end = int(0.7 * size)
    val_end = train_end + int(0.15 * size)
    return cells[:train_end], cells[train_end:val_end], cells[val_end:]

# Split each class independently so every split keeps both classes
train_pos, val_pos, test_pos = split_data(positive_cells)
train_neg, val_neg, test_neg = split_data(negative_cells)

# Final validation / test sets (both classes together)
val_cells = val_pos + val_neg
test_cells = test_pos + test_neg

# Shuffle the merged sets (continues the RNG stream seeded earlier)
random.shuffle(val_cells)
random.shuffle(test_cells)

# Summary of the split before any crop is discarded
print(f"\n[Resumo Inicial]")
print(f"Treino POSITIVO: {len(train_pos)}")
print(f"Treino NEGATIVO: {len(train_neg)}")
print(f"Validação: {len(val_cells)}  (Pos: {len(val_pos)}, Neg: {len(val_neg)})")
print(f"Teste: {len(test_cells)}  (Pos: {len(test_pos)}, Neg: {len(test_neg)})")

# Counters updated by save_cropped: discarded cells and saved crops per subset
descartadas = 0
usadas = {
    "train_pos": 0,
    "train_neg": 0,
    "val_pos": 0,
    "val_neg": 0,
    "test_pos": 0,
    "test_neg": 0
}

def save_cropped(cell, image_dir, dest_dir, key_contador):
    """Crop a 70x70 grayscale patch centred on a cell and save it as PNG.

    The cell is discarded (module-level ``descartadas`` is incremented and
    nothing is written) when the source image is missing, cannot be opened,
    or the crop window would extend past the image border. On success the
    counter ``usadas[key_contador]`` is incremented.

    Args:
        cell: Dict with the keys ``image_name``, ``cell_id``, ``x`` and ``y``.
        image_dir: Folder that contains the source image.
        dest_dir: Folder where the crop is written.
        key_contador: Key of ``usadas`` to increment when a crop is saved.
    """
    global descartadas
    image_path = os.path.join(image_dir, cell["image_name"])
    if not os.path.exists(image_path):
        print(f"Imagem não encontrada: {cell['image_name']}")
        descartadas += 1
        return
    try:
        # The context manager releases the file handle right away; this
        # function runs once per cell, so leaked handles would pile up.
        with Image.open(image_path) as source:
            image = source.convert("L")
    except Exception as e:
        print(f"Erro ao abrir {cell['image_name']}: {e}")
        descartadas += 1
        return

    x, y = cell["x"], cell["y"]
    crop_size = 70
    half_crop = crop_size // 2

    x1 = x - half_crop
    y1 = y - half_crop
    x2 = x + half_crop
    y2 = y + half_crop

    # Only keep crops that lie completely inside the image
    if x1 < 0 or y1 < 0 or x2 > image.width or y2 > image.height:
        descartadas += 1
        return

    cropped = image.crop((x1, y1, x2, y2))

    name_prefix = os.path.splitext(cell["image_name"])[0]
    out_name = f"{name_prefix}_celula_{cell['cell_id']}.png"
    save_path = os.path.join(dest_dir, out_name)
    cropped.save(save_path)

    usadas[key_contador] += 1

# ---------- Loops that save the crops of every subset -------------
for cell in train_pos:
    save_cropped(cell, base_dir, train_pos_dir, "train_pos")
for cell in train_neg:
    save_cropped(cell, base_dir, train_neg_dir, "train_neg")
for cell in val_pos:
    save_cropped(cell, base_dir, val_pos_dir, "val_pos")
for cell in val_neg:
    save_cropped(cell, base_dir, val_neg_dir, "val_neg")
for cell in test_pos:
    save_cropped(cell, base_dir, test_pos_dir, "test_pos")
for cell in test_neg:
    save_cropped(cell, base_dir, test_neg_dir, "test_neg")

# Print the total number of discarded cells.
# NOTE: save_cropped also counts missing and unreadable images in this
# total, not only crops that fall outside the image bounds.
print(f"\nTotal de imagens descartadas por estarem fora dos limites: {descartadas}")

# Final summary: crops actually written per subset after discards
print("\n[Resumo Final - Células Consideradas Após Descarte]")
print(f"Treino POSITIVO: {usadas['train_pos']}")
print(f"Treino NEGATIVO: {usadas['train_neg']}")
print(f"Validação POSITIVO: {usadas['val_pos']}")
print(f"Validação NEGATIVO: {usadas['val_neg']}")
print(f"Teste POSITIVO: {usadas['test_pos']}")
print(f"Teste NEGATIVO: {usadas['test_neg']}")


# ---------------------- ATTRIBUTES + CSV ----------------------
def extrair_atributos(image_path):
    """Compute a flat numeric feature vector for one cell crop.

    The vector concatenates, in this order: 5 shape descriptors of the
    largest Otsu-segmented region (area, perimeter, eccentricity,
    circularity, major/minor axis ratio), 5 intensity statistics, 4 GLCM
    properties (distance 1, angle 0), a 10-bin uniform-LBP histogram, the
    Haralick features averaged over axis 0 of mahotas' output, the TAS
    features and the Zernike moments of degree 8.

    Args:
        image_path: Path of the image file to read.

    Returns:
        1-D ``numpy`` array with the concatenated features.
    """
    img = imread(image_path, as_gray=True)
    # imread(as_gray=True) only converts colour images; a file that is
    # already single-channel comes back with its stored integer dtype.
    # Multiplying such a uint8 array by 255 wraps around, so bring integer
    # images to floats in [0, 1] first.
    if np.issubdtype(img.dtype, np.integer):
        img = img.astype(np.float64) / np.iinfo(img.dtype).max
    img_uint8 = np.clip(np.rint(img * 255), 0, 255).astype(np.uint8)

    # --- Segmentation for the morphological attributes ---
    area = perimeter = eccentricity = circularity = elipticidade = 0
    try:
        thresh = threshold_otsu(img)
        binary = morphology.remove_small_objects(img > thresh, 30)
        props = regionprops(label(binary))
        if props:
            # Regions come in label order, so index 0 is just the first
            # blob met; describe the largest one instead.
            regiao = max(props, key=lambda r: r.area)
            area = regiao.area
            perimeter = regiao.perimeter
            eccentricity = regiao.eccentricity
            circularity = 4 * np.pi * area / (perimeter**2) if perimeter > 0 else 0
            elipticidade = (
                regiao.major_axis_length / regiao.minor_axis_length
                if regiao.minor_axis_length > 0 else 0
            )
    except Exception:
        # Best effort: a failed segmentation yields all-zero shape features
        area = perimeter = eccentricity = circularity = elipticidade = 0

    # --- Intensity ---
    pixels = img.ravel()
    mean_intensity = img.mean()
    std_intensity = img.std()
    skewness = skew(pixels)
    kurt = kurtosis(pixels)
    # NOTE(review): sums p*log2(p) over pixel intensities, not over a
    # grey-level histogram; kept as written - confirm this is the intended
    # "entropy" feature.
    entropy_val = -np.sum(img * np.log2(img + 1e-10))

    # --- GLCM ---
    glcm = graycomatrix(img_uint8, [1], [0], symmetric=True, normed=True)
    contrast = graycoprops(glcm, 'contrast')[0, 0]
    correlation = graycoprops(glcm, 'correlation')[0, 0]
    energy = graycoprops(glcm, 'energy')[0, 0]
    homogeneity = graycoprops(glcm, 'homogeneity')[0, 0]

    # --- LBP (on the 8-bit image; 'uniform' with P=8 gives codes 0..9) ---
    lbp = local_binary_pattern(img_uint8, P=8, R=1, method='uniform')
    lbp_hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, 11), density=True)

    # --- Haralick ---
    haralick = mahotas.features.haralick(img_uint8, ignore_zeros=False)
    haralick_mean = haralick.mean(axis=0)

    # --- TAS (Threshold Adjacency Statistics) ---
    try:
        tas = mahotas.features.tas(img_uint8)
    except Exception:
        # mahotas' TAS vector has 54 entries; the fallback must match so
        # that every row of the feature table has the same length.
        tas = [0] * 54

    # --- Zernike moments ---
    radius = min(img.shape) // 2
    try:
        zernike_moments = mahotas.features.zernike_moments(img_uint8, radius=radius, degree=8)
    except Exception:
        zernike_moments = [0] * 25  # degree 8 -> 25 moments

    return np.hstack([
        area, perimeter, eccentricity, circularity, elipticidade,
        mean_intensity, std_intensity, skewness, kurt, entropy_val,
        contrast, correlation, energy, homogeneity,
        lbp_hist,
        haralick_mean,
        tas,
        zernike_moments
    ])

def gerar_csv_de_atributos(diretorio_imagens, label_binaria, output_csv):
    """Extract features for every PNG in a folder and write them to a CSV.

    Each row holds ``image_name``, ``cell_id`` (both parsed from the file
    name, split on ``"_celula_"``), the ``feat_i`` columns returned by
    ``extrair_atributos`` and the constant ``label``. The feature columns
    are min-max normalised before writing.

    Args:
        diretorio_imagens: Folder scanned (non-recursively) for ``.png`` files.
        label_binaria: Value stored in the ``label`` column of every row.
        output_csv: Destination CSV path.

    Raises:
        ValueError: If the folder contains no ``.png`` file.
    """
    linhas = []

    # sorted(): os.listdir returns names in arbitrary order, which would make
    # the row order (and the later seeded shuffle) differ between runs.
    for nome_arquivo in sorted(os.listdir(diretorio_imagens)):
        if not nome_arquivo.endswith(".png"):
            continue
        caminho_img = os.path.join(diretorio_imagens, nome_arquivo)
        atributos = extrair_atributos(caminho_img)

        nome_base = os.path.splitext(nome_arquivo)[0]
        partes = nome_base.split("_celula_")
        image_name = partes[0]
        cell_id = partes[1] if len(partes) > 1 else "NA"

        linhas.append([image_name, cell_id] + list(atributos) + [label_binaria])

    if not linhas:
        # Without this guard the column naming below fails with an opaque
        # IndexError on linhas[0].
        raise ValueError(f"Nenhuma imagem .png encontrada em: {diretorio_imagens}")

    # Every row is [image_name, cell_id, *features, label]
    n_atributos = len(linhas[0]) - 3
    df = pd.DataFrame(linhas)
    df.columns = ["image_name", "cell_id"] + [f'feat_{i}' for i in range(n_atributos)] + ['label']

    # Min-max normalisation of the feature columns only (names and label are
    # skipped).
    # NOTE(review): the scaler is fitted only on the rows gathered from this
    # one directory, so every CSV produced by this function is scaled with
    # its own min/max - confirm this is intended before merging CSVs.
    col_atributos = df.columns[2:-1]
    scaler = MinMaxScaler()
    df[col_atributos] = scaler.fit_transform(df[col_atributos])

    df.to_csv(output_csv, index=False)


def juntar_csvs(csv1, csv2, output_final):
    """Merge two CSV files, shuffle the rows and write the result.

    The rows of ``csv1`` are followed by those of ``csv2``, the combined
    table is shuffled with a fixed ``random_state`` of 42 and written to
    ``output_final`` without the index column.
    """
    tabelas = [pd.read_csv(caminho) for caminho in (csv1, csv2)]
    combinado = pd.concat(tabelas, ignore_index=True)
    embaralhado = combinado.sample(frac=1, random_state=42)
    embaralhado.to_csv(output_final, index=False)

# Generate the feature CSVs for training (label 1 = positive, 0 = negative)
gerar_csv_de_atributos(train_pos_dir, 1, "train_pos.csv")
gerar_csv_de_atributos(train_neg_dir, 0, "train_neg.csv")
juntar_csvs("train_pos.csv", "train_neg.csv", "train.csv")

# Validation
gerar_csv_de_atributos(val_pos_dir, 1, "val_pos.csv")
gerar_csv_de_atributos(val_neg_dir, 0, "val_neg.csv")
juntar_csvs("val_pos.csv", "val_neg.csv", "val.csv")

# Test
gerar_csv_de_atributos(test_pos_dir, 1, "test_pos.csv")
gerar_csv_de_atributos(test_neg_dir, 0, "test_neg.csv")
juntar_csvs("test_pos.csv", "test_neg.csv", "test.csv")

print("\nProcessamento completo: imagens salvas e atributos exportados.")




Total de células: 11534
Total POSITIVAS: 4755
Total NEGATIVAS: 6779

[Resumo Inicial]
Treino POSITIVO: 3328
Treino NEGATIVO: 4745
Validação: 1729  (Pos: 713, Neg: 1016)
Teste: 1732  (Pos: 714, Neg: 1018)

Total de imagens descartadas por estarem fora dos limites: 425

[Resumo Final - Células Consideradas Após Descarte]
Treino POSITIVO: 3300
Treino NEGATIVO: 4505
Validação POSITIVO: 709
Validação NEGATIVO: 942
Teste POSITIVO: 704
Teste NEGATIVO: 949


KeyboardInterrupt: 

In [7]:
import os
import json
import random
from PIL import Image, ImageEnhance, ImageFilter
from skimage.feature import graycomatrix, graycoprops, local_binary_pattern
from skimage.io import imread
import numpy as np
import pandas as pd
import mahotas
from tqdm import tqdm
from skimage.filters import threshold_otsu
from skimage.measure import regionprops, label
from skimage import morphology
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import MinMaxScaler

# Paths: source image folder, annotation JSON and the root folder of each split
base_dir = "/Users/xr4good/Documents/Ingrid/datasets/imagens/base"
json_path = os.path.join(base_dir, "classifications.json")
output_dir_treino = "/Users/xr4good/Documents/Ingrid/datasets/imagens/treino/treino"
output_dir_val = "/Users/xr4good/Documents/Ingrid/datasets/imagens/validacao/validacao"
output_dir_teste = "/Users/xr4good/Documents/Ingrid/datasets/imagens/testeboundingbox/teste"

# Output directories: one folder per split and per class
train_neg_dir = os.path.join(output_dir_treino, "treino-dir-negativo")
train_pos_dir = os.path.join(output_dir_treino, "treino-dir-positivo")
val_pos_dir = os.path.join(output_dir_val, "validacao-dir-positivo")
val_neg_dir = os.path.join(output_dir_val, "validacao-dir-negativo")
test_pos_dir = os.path.join(output_dir_teste, "teste-dir-positivo")
test_neg_dir = os.path.join(output_dir_teste, "teste-dir-negativo")

# exist_ok=True: folders (and whatever they already contain) are kept as-is
os.makedirs(train_neg_dir, exist_ok=True)
os.makedirs(train_pos_dir, exist_ok=True)
os.makedirs(val_pos_dir, exist_ok=True)
os.makedirs(val_neg_dir, exist_ok=True)
os.makedirs(test_pos_dir, exist_ok=True)
os.makedirs(test_neg_dir, exist_ok=True)

# Load the annotation JSON
with open(json_path, "r") as f:
    data = json.load(f)

# Flatten the annotations into one record per cell
all_cells = []
for img_data in data:
    image_name = img_data["image_name"]
    for cell in img_data["classifications"]:
        all_cells.append({
            "image_name": image_name,
            "cell_id": cell["cell_id"],
            "x": cell["nucleus_x"],
            "y": cell["nucleus_y"],
            "label": cell["bethesda_system"]
        })

print(f"Total de células: {len(all_cells)}")

# Split by class: every label other than "POSITIVE" counts as negative.
# NOTE(review): these two lists are not referenced again in the visible part
# of this cell; the per-split lists built further down are used instead.
positive_cells = [c for c in all_cells if c["label"] == "POSITIVE"]
negative_cells = [c for c in all_cells if c["label"] != "POSITIVE"]

# Fixed seed for the image-level shuffle below
random.seed(42)

# Group all cells by the image they come from
imagem_to_celulas = {}
for cell in all_cells:
    imagem_to_celulas.setdefault(cell["image_name"], []).append(cell)

# List of unique images, shuffled
imagens_unicas = list(imagem_to_celulas.keys())
random.shuffle(imagens_unicas)

# 70/15/15 split at image level, so all cells of an image share one split
n_total = len(imagens_unicas)
n_train = int(0.7 * n_total)
n_val = int(0.15 * n_total)

train_imgs = imagens_unicas[:n_train]
val_imgs = imagens_unicas[n_train:n_train + n_val]
test_imgs = imagens_unicas[n_train + n_val:]

# Assign each cell to the split of its image (sum(..., []) concatenates lists)
train_cells = sum([imagem_to_celulas[img] for img in train_imgs], [])
val_cells = sum([imagem_to_celulas[img] for img in val_imgs], [])
test_cells = sum([imagem_to_celulas[img] for img in test_imgs], [])

# Now split every subset by class
train_pos = [c for c in train_cells if c["label"] == "POSITIVE"]
train_neg = [c for c in train_cells if c["label"] != "POSITIVE"]
val_pos = [c for c in val_cells if c["label"] == "POSITIVE"]
val_neg = [c for c in val_cells if c["label"] != "POSITIVE"]
test_pos = [c for c in test_cells if c["label"] == "POSITIVE"]
test_neg = [c for c in test_cells if c["label"] != "POSITIVE"]

# Counters updated by save_cropped: saved crops per subset and discarded cells
usadas = {k: 0 for k in ["train_pos", "train_neg", "val_pos", "val_neg", "test_pos", "test_neg"]}
descartadas = 0

def save_cropped(cell, image_dir, dest_dir, key):
    """Crop a 70x70 grayscale patch centred on a cell and save it as PNG.

    The cell is discarded (module-level ``descartadas`` is incremented and
    nothing is written) when the source image is missing, cannot be opened,
    or the crop window would extend past the image border. On success
    ``usadas[key]`` is incremented.

    Args:
        cell: Dict with the keys ``image_name``, ``cell_id``, ``x`` and ``y``.
        image_dir: Folder that contains the source image.
        dest_dir: Folder where the crop is written.
        key: Key of ``usadas`` to increment when a crop is saved.
    """
    global descartadas
    image_path = os.path.join(image_dir, cell["image_name"])
    if not os.path.exists(image_path):
        descartadas += 1
        return
    try:
        # Context manager: close the file handle as soon as the pixels are
        # converted; this function runs once per cell.
        with Image.open(image_path) as origem:
            img = origem.convert("L")
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # made the loop impossible to stop cleanly.
        descartadas += 1
        return

    x, y = cell["x"], cell["y"]
    half_crop = 35
    left, top = x - half_crop, y - half_crop
    right, bottom = x + half_crop, y + half_crop
    # Only keep crops that lie completely inside the image
    if left < 0 or top < 0 or right > img.width or bottom > img.height:
        descartadas += 1
        return

    crop = img.crop((left, top, right, bottom))
    name = f"{os.path.splitext(cell['image_name'])[0]}_celula_{cell['cell_id']}.png"
    crop.save(os.path.join(dest_dir, name))
    usadas[key] += 1

# Save the crops of every subset into its own folder; the last argument is
# the key of the `usadas` counter that save_cropped increments.
for c in train_pos: save_cropped(c, base_dir, train_pos_dir, "train_pos")
for c in train_neg: save_cropped(c, base_dir, train_neg_dir, "train_neg")
for c in val_pos: save_cropped(c, base_dir, val_pos_dir, "val_pos")
for c in val_neg: save_cropped(c, base_dir, val_neg_dir, "val_neg")
for c in test_pos: save_cropped(c, base_dir, test_pos_dir, "test_pos")
for c in test_neg: save_cropped(c, base_dir, test_neg_dir, "test_neg")

# Data augmentation - positive training crops only
def augment_and_save(image_path, name_base, dest_dir):
    """Write five augmented variants of one image.

    The variants are, in order: rotation by +15 degrees, rotation by -15
    degrees, horizontal flip, Gaussian blur (radius 1) and contrast enhanced
    by a factor of 1.5. They are saved as ``{name_base}_aug1.png`` to
    ``{name_base}_aug5.png`` inside ``dest_dir``.

    Failures are reported on stdout and otherwise ignored, so one bad file
    does not stop a batch.

    Args:
        image_path: Path of the image to augment.
        name_base: File-name stem used for the output files.
        dest_dir: Folder where the variants are written.
    """
    try:
        # Context manager: release the file handle once the pixels are loaded
        with Image.open(image_path) as origem:
            img = origem.convert("L")
        variants = [
            img.rotate(15), img.rotate(-15),
            img.transpose(Image.FLIP_LEFT_RIGHT),
            img.filter(ImageFilter.GaussianBlur(radius=1)),
            ImageEnhance.Contrast(img).enhance(1.5)
        ]
        for i, aug in enumerate(variants, start=1):
            output_path = os.path.normpath(os.path.join(dest_dir, f"{name_base}_aug{i}.png"))
            aug.save(output_path)
    except Exception as e:
        # Was ``except: pass``: it hid every failure and also swallowed
        # KeyboardInterrupt. Still best effort, but no longer silent.
        print(f"Erro ao aumentar {image_path}: {e}")

# Augment every original positive training crop. Files that already carry an
# "_aug<N>" suffix are outputs of an earlier run of this cell; skipping them
# keeps re-runs from augmenting augmentations and growing the folder each time.
for f in os.listdir(train_pos_dir):
    if not f.endswith(".png"):
        continue
    nome_base = os.path.splitext(f)[0]
    if "_aug" in nome_base and nome_base.rpartition("_aug")[2].isdigit():
        continue
    augment_and_save(os.path.join(train_pos_dir, f), nome_base, train_pos_dir)

# Feature extraction
def extrair_atributos(p):
    """Compute a flat numeric feature vector for one cell crop.

    The vector concatenates, in this order: 5 shape descriptors of the
    largest Otsu-segmented region (area, perimeter, eccentricity,
    circularity, major/minor axis ratio), 5 intensity statistics, 4 GLCM
    properties (distance 1, angle 0), a 10-bin uniform-LBP histogram, the
    Haralick features averaged over axis 0 of mahotas' output, the TAS
    features and the Zernike moments of degree 8.

    Args:
        p: Path of the image file to read.

    Returns:
        1-D ``numpy`` array with the concatenated features.
    """
    img = imread(p, as_gray=True)
    # imread(as_gray=True) only converts colour images; a file that is
    # already single-channel comes back with its stored integer dtype.
    # Multiplying such a uint8 array by 255 wraps around, so bring integer
    # images to floats in [0, 1] first.
    if np.issubdtype(img.dtype, np.integer):
        img = img.astype(np.float64) / np.iinfo(img.dtype).max
    img_u8 = np.clip(np.rint(img * 255), 0, 255).astype(np.uint8)

    # Shape descriptors from an Otsu segmentation (all zero when it fails)
    area = perim = ecc = circ = elip = 0
    try:
        mascara = morphology.remove_small_objects(img > threshold_otsu(img), 30)
        props = regionprops(label(mascara))
        if props:
            # Regions come in label order, so index 0 is just the first
            # blob met; describe the largest one instead.
            regiao = max(props, key=lambda r: r.area)
            area, perim = regiao.area, regiao.perimeter
            ecc = regiao.eccentricity
            circ = 4 * np.pi * area / (perim**2) if perim > 0 else 0
            elip = (regiao.major_axis_length / regiao.minor_axis_length
                    if regiao.minor_axis_length > 0 else 0)
    except Exception:
        area = perim = ecc = circ = elip = 0

    # Intensity statistics
    pixels = img.ravel()
    mean, std, skw, krt = img.mean(), img.std(), skew(pixels), kurtosis(pixels)
    # NOTE(review): sums p*log2(p) over pixel intensities, not over a
    # grey-level histogram; kept as written - confirm this is the intended
    # "entropy" feature.
    ent = -np.sum(img * np.log2(img + 1e-10))

    # GLCM texture properties
    glcm = graycomatrix(img_u8, [1], [0], symmetric=True, normed=True)
    contrast = graycoprops(glcm, 'contrast')[0, 0]
    corr = graycoprops(glcm, 'correlation')[0, 0]
    energy = graycoprops(glcm, 'energy')[0, 0]
    homog = graycoprops(glcm, 'homogeneity')[0, 0]

    # Uniform LBP on the 8-bit image; with P=8 the codes are 0..9
    lbp = local_binary_pattern(img_u8, 8, 1, method='uniform')
    lbp_hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, 11), density=True)

    hrlk = mahotas.features.haralick(img_u8).mean(axis=0)

    try:
        tas = mahotas.features.tas(img_u8)
    except Exception:
        # mahotas' TAS vector has 54 entries; the fallback must match so
        # that every row of the feature table has the same length.
        tas = [0] * 54

    try:
        zern = mahotas.features.zernike_moments(img_u8, radius=min(img.shape) // 2, degree=8)
    except Exception:
        zern = [0] * 25  # degree 8 -> 25 moments

    return np.hstack([area, perim, ecc, circ, elip, mean, std, skw, krt, ent,
                      contrast, corr, energy, homog, lbp_hist, hrlk, tas, zern])
    
def gerar_df_features(diretorio, label):
    """Build a DataFrame with the features of every PNG in a folder.

    Each row holds ``image_name``, ``cell_id`` (both parsed from the file
    name, split on ``"_celula_"``), the ``feat_i`` columns returned by
    ``extrair_atributos`` and the constant ``label``. No normalisation is
    applied.

    Args:
        diretorio: Folder scanned (non-recursively) for ``.png`` files.
        label: Value stored in the ``label`` column of every row. (The name
            shadows skimage's ``label`` inside this function only.)

    Returns:
        The feature DataFrame; when the folder has no ``.png`` file, an empty
        frame with the columns ``image_name``, ``cell_id`` and ``label``.
    """
    linhas = []
    # sorted(): os.listdir order is arbitrary; sort for reproducible rows
    for arq in sorted(os.listdir(diretorio)):
        if not arq.endswith(".png"):
            continue
        path = os.path.normpath(os.path.join(diretorio, arq))
        feat = extrair_atributos(path)
        base = os.path.splitext(arq)[0].split("_celula_")
        linhas.append([base[0], base[1] if len(base) > 1 else "NA"] + list(feat) + [label])

    if not linhas:
        # Previously this case crashed with IndexError on linhas[0]
        return pd.DataFrame(columns=["image_name", "cell_id", "label"])

    # Every row is [image_name, cell_id, *features, label]
    n_feats = len(linhas[0]) - 3
    df = pd.DataFrame(linhas)
    df.columns = ["image_name", "cell_id"] + [f"feat_{i}" for i in range(n_feats)] + ["label"]
    return df


def gerar_csv(diretorio, label, output):
    """Extract features for every PNG in a folder and write them to a CSV.

    Each row holds ``image_name``, ``cell_id`` (both parsed from the file
    name, split on ``"_celula_"``), the ``feat_i`` columns returned by
    ``extrair_atributos`` and the constant ``label``. The feature columns
    are min-max normalised before writing.

    Args:
        diretorio: Folder scanned (non-recursively) for ``.png`` files.
        label: Value stored in the ``label`` column of every row.
        output: Destination CSV path.

    Raises:
        ValueError: If the folder contains no ``.png`` file.
    """
    linhas = []
    # sorted(): os.listdir order is arbitrary; sort for reproducible rows
    for arq in sorted(os.listdir(diretorio)):
        if not arq.endswith(".png"):
            continue
        path = os.path.normpath(os.path.join(diretorio, arq))
        feat = extrair_atributos(path)
        base = os.path.splitext(arq)[0].split("_celula_")
        linhas.append([base[0], base[1] if len(base) > 1 else "NA"] + list(feat) + [label])

    if not linhas:
        # Without this guard the column naming below fails with an opaque
        # IndexError on linhas[0].
        raise ValueError(f"Nenhuma imagem .png encontrada em: {diretorio}")

    # Every row is [image_name, cell_id, *features, label]
    n_feats = len(linhas[0]) - 3
    df = pd.DataFrame(linhas)
    df.columns = ["image_name", "cell_id"] + [f"feat_{i}" for i in range(n_feats)] + ["label"]

    # NOTE(review): the scaler is fitted only on the rows gathered from this
    # one directory, so every CSV produced by this function is scaled with
    # its own min/max - confirm this is intended before merging CSVs.
    col_attr = df.columns[2:-1]
    df[col_attr] = MinMaxScaler().fit_transform(df[col_attr])
    df.to_csv(output, index=False)

def juntar_csvs(csv1, csv2, out):
    """Merge two CSV files, shuffle the rows and write the result to ``out``.

    Rows of ``csv1`` come first, then those of ``csv2``; the combined table
    is shuffled with a fixed ``random_state`` of 42 and saved without the
    index column.
    """
    partes = [pd.read_csv(caminho) for caminho in (csv1, csv2)]
    combinado = pd.concat(partes, ignore_index=True)
    combinado.sample(frac=1, random_state=42).to_csv(out, index=False)

# Generate the final files (label 1 = positive, 0 = negative)
gerar_csv(train_pos_dir, 1, "train_pos.csv")
gerar_csv(train_neg_dir, 0, "train_neg.csv")
juntar_csvs("train_pos.csv", "train_neg.csv", "train.csv")

gerar_csv(val_pos_dir, 1, "val_pos.csv")
gerar_csv(val_neg_dir, 0, "val_neg.csv")
juntar_csvs("val_pos.csv", "val_neg.csv", "val.csv")

gerar_csv(test_pos_dir, 1, "test_pos.csv")
gerar_csv(test_neg_dir, 0, "test_neg.csv")
juntar_csvs("test_pos.csv", "test_neg.csv", "test.csv")

# Check that no source image appears in more than one split.
# These assignments rebind train_imgs / val_imgs / test_imgs from the image
# lists built earlier to sets derived from the per-class cell lists.
train_imgs = set([c['image_name'] for c in train_pos + train_neg])
val_imgs = set([c['image_name'] for c in val_pos + val_neg])
test_imgs = set([c['image_name'] for c in test_pos + test_neg])

# Each intersection is expected to print as an empty set
print("Intersecção treino/validação:", train_imgs & val_imgs)
print("Intersecção treino/teste:", train_imgs & test_imgs)
print("Intersecção validação/teste:", val_imgs & test_imgs)


print("\nProcessamento completo: imagens salvas, aumentadas e atributos exportados.")


Total de células: 11534
Intersecção treino/validação: set()
Intersecção treino/teste: set()
Intersecção validação/teste: set()

Processamento completo: imagens salvas, aumentadas e atributos exportados.


In [12]:
import os
import json
import random
from PIL import Image, ImageEnhance, ImageFilter
from skimage.feature import graycomatrix, graycoprops, local_binary_pattern
from skimage.io import imread
import numpy as np
import pandas as pd
import mahotas
from skimage.filters import threshold_otsu
from skimage.measure import regionprops, label
from skimage import morphology
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import MinMaxScaler

# --- Directory configuration ---
# Source image folder, annotation JSON and the root folder of each split
base_dir = "/Users/xr4good/Documents/Ingrid/datasets/imagens/base"
json_path = os.path.join(base_dir, "classifications.json")
output_dir_treino = "/Users/xr4good/Documents/Ingrid/datasets/imagens/treino/treino"
output_dir_val = "/Users/xr4good/Documents/Ingrid/datasets/imagens/validacao/validacao"
output_dir_teste = "/Users/xr4good/Documents/Ingrid/datasets/imagens/testeboundingbox/teste"

# One output folder per split and per class
train_neg_dir = os.path.join(output_dir_treino, "treino-dir-negativo")
train_pos_dir = os.path.join(output_dir_treino, "treino-dir-positivo")
val_pos_dir = os.path.join(output_dir_val, "validacao-dir-positivo")
val_neg_dir = os.path.join(output_dir_val, "validacao-dir-negativo")
test_pos_dir = os.path.join(output_dir_teste, "teste-dir-positivo")
test_neg_dir = os.path.join(output_dir_teste, "teste-dir-negativo")

# --- Funções ---

def criar_diretorios():
    """Create the six per-split, per-class output folders if they are missing.

    Folders that already exist are left untouched, including their contents.
    """
    destinos = (
        train_neg_dir, train_pos_dir,
        val_pos_dir, val_neg_dir,
        test_pos_dir, test_neg_dir,
    )
    for destino in destinos:
        os.makedirs(destino, exist_ok=True)

def carregar_json(path):
    """Read the JSON file at ``path`` and return the decoded object.

    The file is decoded as UTF-8 explicitly; without an ``encoding`` argument
    ``open`` falls back to the platform default, which can garble non-ASCII
    text on systems whose default is not UTF-8.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def extrair_celulas(data):
    """Flatten the annotation data into one record per annotated cell.

    Args:
        data: Iterable of dicts, each with an ``image_name`` and a
            ``classifications`` list whose items carry ``cell_id``,
            ``nucleus_x``, ``nucleus_y`` and ``bethesda_system``.

    Returns:
        List of dicts with the keys ``image_name``, ``cell_id``, ``x``, ``y``
        and ``label``, in input order.
    """
    registros = []
    for entrada in data:
        nome = entrada["image_name"]
        registros.extend(
            {
                "image_name": nome,
                "cell_id": celula["cell_id"],
                "x": celula["nucleus_x"],
                "y": celula["nucleus_y"],
                "label": celula["bethesda_system"],
            }
            for celula in entrada["classifications"]
        )
    return registros

def dividir_dados_por_imagem(all_cells, seed=42):
    """Split cells into train / validation / test at image level (70/15/15).

    Cells are grouped by ``image_name``, the list of distinct images is
    shuffled and cut into three parts, and every cell follows its image, so
    no image contributes to more than one split.

    Note that this reseeds the module-level ``random`` generator, so later
    ``random`` calls in the same process continue from this seed.

    Args:
        all_cells: Iterable of cell dicts that have an ``image_name`` key.
        seed: Seed passed to ``random.seed`` before shuffling the images.

    Returns:
        Tuple ``(train_cells, val_cells, test_cells)`` of lists.
    """
    random.seed(seed)
    imagem_to_celulas = {}
    for cell in all_cells:
        imagem_to_celulas.setdefault(cell["image_name"], []).append(cell)
    imagens_unicas = list(imagem_to_celulas)
    random.shuffle(imagens_unicas)

    n_total = len(imagens_unicas)
    n_train = round(0.7 * n_total)
    n_val = round(0.15 * n_total)

    def _celulas_de(imagens):
        # Single flattening pass; sum(lists, []) re-copies the accumulated
        # list on every addition and is quadratic in the number of cells.
        return [cell for img in imagens for cell in imagem_to_celulas[img]]

    train_cells = _celulas_de(imagens_unicas[:n_train])
    val_cells = _celulas_de(imagens_unicas[n_train:n_train + n_val])
    # Everything left after train and validation goes to test
    test_cells = _celulas_de(imagens_unicas[n_train + n_val:])
    return train_cells, val_cells, test_cells


def dividir_por_classe(cells):
    """Partition cells into ``(positive, negative)`` lists by their label.

    A cell is positive when its ``label`` equals ``"POSITIVE"``; every other
    label value goes to the negative list. Input order is preserved.
    """
    pos, neg = [], []
    for celula in cells:
        destino = pos if celula["label"] == "POSITIVE" else neg
        destino.append(celula)
    return pos, neg

# Module-level counters updated by save_cropped:
# cells skipped (missing/unreadable image or crop outside the image) ...
descartadas = 0
# ... and crops written, per subset key
usadas = {k: 0 for k in ["train_pos", "train_neg", "val_pos", "val_neg", "test_pos", "test_neg"]}

def save_cropped(cell, image_dir, dest_dir, key):
    """Crop a 70x70 grayscale patch centred on a cell and save it as PNG.

    The cell is discarded (module-level ``descartadas`` is incremented and
    nothing is written) when the source image is missing, cannot be opened,
    or the crop window would extend past the image border. On success
    ``usadas[key]`` is incremented.

    Args:
        cell: Dict with the keys ``image_name``, ``cell_id``, ``x`` and ``y``.
        image_dir: Folder that contains the source image.
        dest_dir: Folder where the crop is written.
        key: Key of ``usadas`` to increment when a crop is saved.
    """
    global descartadas
    image_path = os.path.join(image_dir, cell["image_name"])
    if not os.path.exists(image_path):
        descartadas += 1
        return
    try:
        # Context manager: close the file handle as soon as the pixels are
        # converted; this function runs once per cell.
        with Image.open(image_path) as origem:
            img = origem.convert("L")
    except Exception:
        # Was a bare ``except:``, which also swallowed KeyboardInterrupt and
        # made the loop impossible to stop cleanly.
        descartadas += 1
        return

    x, y = cell["x"], cell["y"]
    half_crop = 35
    left, top = x - half_crop, y - half_crop
    right, bottom = x + half_crop, y + half_crop
    # Only keep crops that lie completely inside the image
    if left < 0 or top < 0 or right > img.width or bottom > img.height:
        descartadas += 1
        return

    crop = img.crop((left, top, right, bottom))
    name = f"{os.path.splitext(cell['image_name'])[0]}_celula_{cell['cell_id']}.png"
    crop.save(os.path.join(dest_dir, name))
    usadas[key] += 1

def salvar_imagens_por_conjunto(train_pos, train_neg, val_pos, val_neg, test_pos, test_neg):
    """Save the crop of every cell into the folder of its split and class.

    Subsets are processed in the order train_pos, train_neg, val_pos,
    val_neg, test_pos, test_neg; each cell is handed to ``save_cropped``
    together with the matching destination folder and counter key.
    """
    plano = (
        (train_pos, train_pos_dir, "train_pos"),
        (train_neg, train_neg_dir, "train_neg"),
        (val_pos, val_pos_dir, "val_pos"),
        (val_neg, val_neg_dir, "val_neg"),
        (test_pos, test_pos_dir, "test_pos"),
        (test_neg, test_neg_dir, "test_neg"),
    )
    for celulas, destino, chave in plano:
        for celula in celulas:
            save_cropped(celula, base_dir, destino, chave)

def augment_and_save(image_path, name_base, dest_dir):
    """Write five augmented variants of one image.

    The variants are, in order: rotation by +15 degrees, rotation by -15
    degrees, horizontal flip, Gaussian blur (radius 1) and contrast enhanced
    by a factor of 1.5. They are saved as ``{name_base}_aug1.png`` to
    ``{name_base}_aug5.png`` inside ``dest_dir``.

    Failures are reported on stdout and otherwise ignored, so one bad file
    does not stop a batch.

    Args:
        image_path: Path of the image to augment.
        name_base: File-name stem used for the output files.
        dest_dir: Folder where the variants are written.
    """
    try:
        # Context manager: release the file handle once the pixels are loaded
        with Image.open(image_path) as origem:
            img = origem.convert("L")
        variants = [
            img.rotate(15), img.rotate(-15),
            img.transpose(Image.FLIP_LEFT_RIGHT),
            img.filter(ImageFilter.GaussianBlur(radius=1)),
            ImageEnhance.Contrast(img).enhance(1.5)
        ]
        for i, aug in enumerate(variants, start=1):
            output_path = os.path.normpath(os.path.join(dest_dir, f"{name_base}_aug{i}.png"))
            aug.save(output_path)
    except Exception as e:
        # Was ``except: pass``: it hid every failure and also swallowed
        # KeyboardInterrupt. Still best effort, but no longer silent.
        print(f"Erro ao aumentar {image_path}: {e}")

def augmentar_imagens_treino_positivo():
    """Augment every original PNG crop in the positive training folder.

    Files whose stem already ends in ``_aug<N>`` are outputs of an earlier
    run; they are skipped so that running the pipeline again does not
    augment augmentations and grow the folder each time.
    """
    for f in os.listdir(train_pos_dir):
        if not f.endswith(".png"):
            continue
        nome_base = os.path.splitext(f)[0]
        if "_aug" in nome_base and nome_base.rpartition("_aug")[2].isdigit():
            continue
        augment_and_save(os.path.join(train_pos_dir, f), nome_base, train_pos_dir)

def extrair_atributos(p):
    """Compute a flat numeric feature vector for one cell crop.

    The vector concatenates, in this order: 5 shape descriptors of the
    largest Otsu-segmented region (area, perimeter, eccentricity,
    circularity, major/minor axis ratio), 5 intensity statistics, 4 GLCM
    properties (distance 1, angle 0), a 10-bin uniform-LBP histogram, the
    Haralick features averaged over axis 0 of mahotas' output, the TAS
    features and the Zernike moments of degree 8.

    Args:
        p: Path of the image file to read.

    Returns:
        1-D ``numpy`` array with the concatenated features.
    """
    img = imread(p, as_gray=True)
    # imread(as_gray=True) only converts colour images; a file that is
    # already single-channel comes back with its stored integer dtype.
    # Multiplying such a uint8 array by 255 wraps around, so bring integer
    # images to floats in [0, 1] first.
    if np.issubdtype(img.dtype, np.integer):
        img = img.astype(np.float64) / np.iinfo(img.dtype).max
    img_u8 = np.clip(np.rint(img * 255), 0, 255).astype(np.uint8)

    # Shape descriptors from an Otsu segmentation (all zero when it fails)
    area = perim = ecc = circ = elip = 0
    try:
        mascara = morphology.remove_small_objects(img > threshold_otsu(img), 30)
        props = regionprops(label(mascara))
        if props:
            # Regions come in label order, so index 0 is just the first
            # blob met; describe the largest one instead.
            regiao = max(props, key=lambda r: r.area)
            area, perim = regiao.area, regiao.perimeter
            ecc = regiao.eccentricity
            circ = 4 * np.pi * area / (perim**2) if perim > 0 else 0
            elip = (regiao.major_axis_length / regiao.minor_axis_length
                    if regiao.minor_axis_length > 0 else 0)
    except Exception:
        area = perim = ecc = circ = elip = 0

    # Intensity statistics
    pixels = img.ravel()
    mean, std, skw, krt = img.mean(), img.std(), skew(pixels), kurtosis(pixels)
    # NOTE(review): sums p*log2(p) over pixel intensities, not over a
    # grey-level histogram; kept as written - confirm this is the intended
    # "entropy" feature.
    ent = -np.sum(img * np.log2(img + 1e-10))

    # GLCM texture properties
    glcm = graycomatrix(img_u8, [1], [0], symmetric=True, normed=True)
    contrast = graycoprops(glcm, 'contrast')[0, 0]
    corr = graycoprops(glcm, 'correlation')[0, 0]
    energy = graycoprops(glcm, 'energy')[0, 0]
    homog = graycoprops(glcm, 'homogeneity')[0, 0]

    # Uniform LBP on the 8-bit image; with P=8 the codes are 0..9
    lbp = local_binary_pattern(img_u8, 8, 1, method='uniform')
    lbp_hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, 11), density=True)

    hrlk = mahotas.features.haralick(img_u8).mean(axis=0)

    try:
        tas = mahotas.features.tas(img_u8)
    except Exception:
        # mahotas' TAS vector has 54 entries; the fallback must match so
        # that every row of the feature table has the same length.
        tas = [0] * 54

    try:
        zern = mahotas.features.zernike_moments(img_u8, radius=min(img.shape) // 2, degree=8)
    except Exception:
        zern = [0] * 25  # degree 8 -> 25 moments

    return np.hstack([area, perim, ecc, circ, elip, mean, std, skw, krt, ent,
                      contrast, corr, energy, homog, lbp_hist, hrlk, tas, zern])

def gerar_csv(diretorio, label, output):
    """Extract features for every PNG in a folder and write them to a CSV.

    Each row holds ``image_name``, ``cell_id`` (both parsed from the file
    name, split on ``"_celula_"``), the ``feat_i`` columns returned by
    ``extrair_atributos`` and the constant ``label``. The feature columns
    are min-max normalised before writing.

    Args:
        diretorio: Folder scanned (non-recursively) for ``.png`` files.
        label: Value stored in the ``label`` column of every row.
        output: Destination CSV path.

    Raises:
        ValueError: If the folder contains no ``.png`` file.
    """
    linhas = []
    # sorted(): os.listdir order is arbitrary; sort for reproducible rows
    for arq in sorted(os.listdir(diretorio)):
        if not arq.endswith(".png"):
            continue
        path = os.path.normpath(os.path.join(diretorio, arq))
        feat = extrair_atributos(path)
        base = os.path.splitext(arq)[0].split("_celula_")
        linhas.append([base[0], base[1] if len(base) > 1 else "NA"] + list(feat) + [label])

    if not linhas:
        # Without this guard the column naming below fails with an opaque
        # IndexError on linhas[0].
        raise ValueError(f"Nenhuma imagem .png encontrada em: {diretorio}")

    # Every row is [image_name, cell_id, *features, label]
    n_feats = len(linhas[0]) - 3
    df = pd.DataFrame(linhas)
    df.columns = ["image_name", "cell_id"] + [f"feat_{i}" for i in range(n_feats)] + ["label"]

    # NOTE(review): the scaler is fitted only on the rows gathered from this
    # one directory, so every CSV produced by this function is scaled with
    # its own min/max - confirm this is intended before merging CSVs.
    col_attr = df.columns[2:-1]
    df[col_attr] = MinMaxScaler().fit_transform(df[col_attr])
    df.to_csv(output, index=False)

def juntar_csvs(csv1, csv2, out):
    """Merge two CSV files, shuffle the rows and write the result to ``out``.

    Rows of ``csv1`` come first, then those of ``csv2``; the combined table
    is shuffled with a fixed ``random_state`` of 42 and saved without the
    index column.
    """
    primeiro = pd.read_csv(csv1)
    segundo = pd.read_csv(csv2)
    embaralhado = pd.concat([primeiro, segundo], ignore_index=True).sample(
        frac=1, random_state=42
    )
    embaralhado.to_csv(out, index=False)

def verificar_interseccao_conjuntos(train_pos, train_neg, val_pos, val_neg, test_pos, test_neg):
    """Print the image names shared between each pair of splits.

    For every split the ``image_name`` values of its positive and negative
    cells are collected into a set; the three pairwise intersections are
    then printed. An empty set means the two splits share no image.
    """
    def _nomes(positivas, negativas):
        return {c['image_name'] for c in positivas + negativas}

    train_imgs = _nomes(train_pos, train_neg)
    val_imgs = _nomes(val_pos, val_neg)
    test_imgs = _nomes(test_pos, test_neg)

    pares = (
        ("Intersecção treino/validação:", train_imgs, val_imgs),
        ("Intersecção treino/teste:", train_imgs, test_imgs),
        ("Intersecção validação/teste:", val_imgs, test_imgs),
    )
    for rotulo, esquerda, direita in pares:
        print(rotulo, esquerda & direita)

def main():
    """Run the pipeline: split, crop, augment, extract features, export CSVs.

    Works entirely through side effects: crops and augmented images are
    written to the configured output folders and the CSV files to the
    current working directory.
    """
    criar_diretorios()
    data = carregar_json(json_path)
    all_cells = extrair_celulas(data)
    train_cells, val_cells, test_cells = dividir_dados_por_imagem(all_cells)
    train_pos, train_neg = dividir_por_classe(train_cells)
    val_pos, val_neg = dividir_por_classe(val_cells)
    test_pos, test_neg = dividir_por_classe(test_cells)

    # Balance validation: sample the same number of cells from each class.
    # These draws use the global RNG seeded inside dividir_dados_por_imagem.
    # NOTE(review): balancing happens before save_cropped discards cells, so
    # the number of crops actually saved per class can still differ.
    n_val = min(len(val_pos), len(val_neg))
    val_pos = random.sample(val_pos, n_val)
    val_neg = random.sample(val_neg, n_val)
    
    # Balance test the same way
    n_test = min(len(test_pos), len(test_neg))
    test_pos = random.sample(test_pos, n_test)
    test_neg = random.sample(test_neg, n_test)
    
    
    print(f"Contagem inicial - Descartadas: {descartadas}")
    
    salvar_imagens_por_conjunto(train_pos, train_neg, val_pos, val_neg, test_pos, test_neg)
    
    print(f"Contagem após salvar imagens - Descartadas: {descartadas}")
    print("Usadas por conjunto:", usadas)
    
    # Data augmentation for the positive training crops only
    aumentar = True
    if aumentar:
        augmentar_imagens_treino_positivo()
    
    # Generate one CSV per folder (label 1 = positive, 0 = negative).
    # NOTE(review): gerar_csv reads whatever .png files are in each folder,
    # including files left there by earlier runs - confirm the folders are
    # clean before relying on the split.
    gerar_csv(train_pos_dir, 1, "train_pos.csv")
    gerar_csv(train_neg_dir, 0, "train_neg.csv")
    gerar_csv(val_pos_dir, 1, "val_pos.csv")
    gerar_csv(val_neg_dir, 0, "val_neg.csv")
    gerar_csv(test_pos_dir, 1, "test_pos.csv")
    gerar_csv(test_neg_dir, 0, "test_neg.csv")
    
    # Merge the positive and negative CSVs of each split
    juntar_csvs("train_pos.csv", "train_neg.csv", "train.csv")
    juntar_csvs("val_pos.csv", "val_neg.csv", "val.csv")
    juntar_csvs("test_pos.csv", "test_neg.csv", "test.csv")
    
    # Check overlaps (no image should appear in more than one split)
    verificar_interseccao_conjuntos(train_pos, train_neg, val_pos, val_neg, test_pos, test_neg)
    
    print("Processamento finalizado.")

# Run the pipeline only when executed as a script / notebook cell
if __name__ == "__main__":
    main()


Contagem inicial - Descartadas: 0
Contagem após salvar imagens - Descartadas: 398
Usadas por conjunto: {'train_pos': 3329, 'train_neg': 4447, 'val_pos': 545, 'val_neg': 512, 'test_pos': 839, 'test_neg': 805}


KeyboardInterrupt: 

In [14]:
import os
import json
import random
from PIL import Image, ImageEnhance, ImageFilter
from skimage.feature import graycomatrix, graycoprops, local_binary_pattern
from skimage.io import imread
import numpy as np
import pandas as pd
import mahotas
from skimage.filters import threshold_otsu
from skimage.measure import regionprops, label
from skimage import morphology
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import MinMaxScaler

# --- Directory configuration ---
# Source image folder, annotation JSON and the root folder of each split
base_dir = "/Users/xr4good/Documents/Ingrid/datasets/imagens/base"
json_path = os.path.join(base_dir, "classifications.json")
output_dir_treino = "/Users/xr4good/Documents/Ingrid/datasets/imagens/treino/treino"
output_dir_val = "/Users/xr4good/Documents/Ingrid/datasets/imagens/validacao/validacao"
output_dir_teste = "/Users/xr4good/Documents/Ingrid/datasets/imagens/testeboundingbox/teste"

# One output folder per split and per class
train_neg_dir = os.path.join(output_dir_treino, "treino-dir-negativo")
train_pos_dir = os.path.join(output_dir_treino, "treino-dir-positivo")
val_pos_dir = os.path.join(output_dir_val, "validacao-dir-positivo")
val_neg_dir = os.path.join(output_dir_val, "validacao-dir-negativo")
test_pos_dir = os.path.join(output_dir_teste, "teste-dir-positivo")
test_neg_dir = os.path.join(output_dir_teste, "teste-dir-negativo")

# --- Funções ---

def criar_diretorios():
    """Create the six per-split, per-class output folders if they are missing.

    Folders that already exist are left untouched, including their contents.
    """
    destinos = (
        train_neg_dir, train_pos_dir,
        val_pos_dir, val_neg_dir,
        test_pos_dir, test_neg_dir,
    )
    for destino in destinos:
        os.makedirs(destino, exist_ok=True)

def carregar_json(path):
    """Read the JSON file at ``path`` and return the decoded object.

    The file is decoded as UTF-8 explicitly; without an ``encoding`` argument
    ``open`` falls back to the platform default, which can garble non-ASCII
    text on systems whose default is not UTF-8.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def extrair_celulas(data):
    """Flatten the per-image annotations into a single list of cell records.

    Args:
        data: Iterable of dicts, each with an "image_name" and a "classifications"
            list whose items carry "cell_id", "nucleus_x", "nucleus_y" and
            "bethesda_system".

    Returns:
        A list with one dict per annotated cell, with the keys "image_name",
        "cell_id", "x", "y" and "label".
    """
    celulas = []
    for registro in data:
        nome_imagem = registro["image_name"]
        celulas.extend(
            {
                "image_name": nome_imagem,
                "cell_id": anotacao["cell_id"],
                "x": anotacao["nucleus_x"],
                "y": anotacao["nucleus_y"],
                "label": anotacao["bethesda_system"],
            }
            for anotacao in registro["classifications"]
        )
    return celulas

def dividir_dados_por_imagem(all_cells, seed=42):
    """Split cells into train/validation/test so that no image is shared between subsets.

    Cells are grouped by "image_name", the image names are shuffled with the given
    seed, and the shuffled names are cut 70% / 15% / remainder. Every cell follows
    its image into the corresponding subset.

    Args:
        all_cells: List of cell dicts, each with an "image_name" key.
        seed: Seed applied to the global ``random`` generator before shuffling.

    Returns:
        A tuple ``(train_cells, val_cells, test_cells)`` of lists of cell dicts.
    """
    random.seed(seed)
    imagem_to_celulas = {}
    for cell in all_cells:
        imagem_to_celulas.setdefault(cell["image_name"], []).append(cell)

    imagens_unicas = list(imagem_to_celulas)
    random.shuffle(imagens_unicas)

    n_total = len(imagens_unicas)
    n_train = round(0.7 * n_total)
    n_val = round(0.15 * n_total)

    train_imgs = imagens_unicas[:n_train]
    val_imgs = imagens_unicas[n_train:n_train + n_val]
    # Everything left after train and validation goes to the test subset.
    test_imgs = imagens_unicas[n_train + n_val:]

    def _celulas_de(imagens):
        # Flatten in one pass; sum(list_of_lists, []) copies the accumulator on every
        # step and is quadratic in the number of cells.
        return [cell for img in imagens for cell in imagem_to_celulas[img]]

    return _celulas_de(train_imgs), _celulas_de(val_imgs), _celulas_de(test_imgs)

def dividir_por_classe(cells):
    """Partition cells into (positives, negatives) by their "label" value.

    A cell is positive only when its label equals "POSITIVE"; every other label
    is treated as negative. Input order is preserved within each list.
    """
    def eh_positiva(celula):
        return celula["label"] == "POSITIVE"

    positivas = list(filter(eh_positiva, cells))
    negativas = [celula for celula in cells if not eh_positiva(celula)]
    return positivas, negativas

def balancear_positivo_negativo(pos_list, neg_list, seed=42):
    """Undersample both classes to the size of the smaller one.

    The global ``random`` generator is re-seeded with ``seed``; positives are
    sampled first, then negatives, each without replacement.

    Returns:
        A tuple ``(positives, negatives)`` of equally sized lists.
    """
    random.seed(seed)
    qtd_pos, qtd_neg = len(pos_list), len(neg_list)
    tamanho = qtd_pos if qtd_pos <= qtd_neg else qtd_neg
    amostra_pos = random.sample(pos_list, tamanho)
    amostra_neg = random.sample(neg_list, tamanho)
    return amostra_pos, amostra_neg

# Module-level tallies updated by save_cropped: cells skipped, and crops written per subset.
descartadas = 0
usadas = dict.fromkeys(
    ("train_pos", "train_neg", "val_pos", "val_neg", "test_pos", "test_neg"), 0
)

def save_cropped(cell, image_dir, dest_dir, key):
    """Save a 70x70 grayscale crop centred on the cell's (x, y) position.

    The cell is counted in the global ``descartadas`` and nothing is written when
    the source image is missing, cannot be opened, or the crop window would extend
    past the image border. Otherwise the crop is written to ``dest_dir`` as
    ``<image stem>_celula_<cell_id>.png`` and ``usadas[key]`` is incremented.

    Args:
        cell: Dict with "image_name", "cell_id", "x" and "y".
        image_dir: Directory containing the source image.
        dest_dir: Directory where the crop is saved.
        key: Entry of the global ``usadas`` counter to increment.
    """
    global descartadas
    image_path = os.path.join(image_dir, cell["image_name"])
    if not os.path.exists(image_path):
        descartadas += 1
        return
    try:
        img = Image.open(image_path).convert("L")
    except Exception:
        # Unreadable or corrupt image: count it as discarded. Only Exception is caught
        # so that KeyboardInterrupt/SystemExit still stop a long run instead of being
        # silently tallied as a discarded cell.
        descartadas += 1
        return
    x, y = cell["x"], cell["y"]
    half_crop = 35
    # Reject windows that are not fully inside the image.
    if x - half_crop < 0 or y - half_crop < 0 or x + half_crop > img.width or y + half_crop > img.height:
        descartadas += 1
        return
    crop = img.crop((x - half_crop, y - half_crop, x + half_crop, y + half_crop))
    name = f"{os.path.splitext(cell['image_name'])[0]}_celula_{cell['cell_id']}.png"
    crop.save(os.path.join(dest_dir, name))
    usadas[key] += 1

def salvar_imagens_por_conjunto(train_pos, train_neg, val_pos, val_neg, test_pos, test_neg):
    """Crop and save every cell of the six (subset, class) groups into its folder.

    Groups are processed in the order train_pos, train_neg, val_pos, val_neg,
    test_pos, test_neg; source images are read from ``base_dir``.
    """
    plano = (
        (train_pos, train_pos_dir, "train_pos"),
        (train_neg, train_neg_dir, "train_neg"),
        (val_pos, val_pos_dir, "val_pos"),
        (val_neg, val_neg_dir, "val_neg"),
        (test_pos, test_pos_dir, "test_pos"),
        (test_neg, test_neg_dir, "test_neg"),
    )
    for celulas, destino, chave in plano:
        for celula in celulas:
            save_cropped(celula, base_dir, destino, chave)

def augment_and_save(image_path, name_base, dest_dir):
    """Write five augmented grayscale variants of one image into ``dest_dir``.

    The variants are, in order: rotation by +15 degrees, rotation by -15 degrees,
    horizontal flip, Gaussian blur (radius 1) and contrast enhancement (factor 1.5).
    They are saved as ``<name_base>_aug1.png`` ... ``<name_base>_aug5.png``.

    This is best-effort: a failure on one image is reported and the function
    returns without raising, so a batch run can continue.

    Args:
        image_path: Path of the image to augment.
        name_base: File-name stem used for the generated files.
        dest_dir: Directory where the variants are saved.
    """
    try:
        img = Image.open(image_path).convert("L")
        variants = [
            img.rotate(15), img.rotate(-15),
            img.transpose(Image.FLIP_LEFT_RIGHT),
            img.filter(ImageFilter.GaussianBlur(radius=1)),
            ImageEnhance.Contrast(img).enhance(1.5)
        ]
        for i, aug in enumerate(variants, start=1):
            output_path = os.path.normpath(os.path.join(dest_dir, f"{name_base}_aug{i}.png"))
            aug.save(output_path)
    except Exception as exc:
        # Report instead of silently swallowing; catching only Exception lets
        # KeyboardInterrupt/SystemExit propagate.
        print(f"[WARN] Falha ao aumentar {image_path}: {exc}")

def augmentar_imagens_treino_positivo():
    """Augment every original PNG crop in the positive training folder, in place.

    Files whose stem already ends in ``_aug<number>`` are skipped, so running this
    step again does not generate augmentations of augmentations.
    """
    for f in os.listdir(train_pos_dir):
        if not f.endswith(".png"):
            continue
        stem = os.path.splitext(f)[0]
        _, marcador, sufixo = stem.rpartition("_aug")
        if marcador and sufixo.isdigit():
            # Output of a previous augmentation run; not a source image.
            continue
        augment_and_save(os.path.join(train_pos_dir, f), stem, train_pos_dir)

def extrair_atributos(p):
    """Compute the hand-crafted feature vector for the image at path ``p``.

    Features, in output order: region shape (area, perimeter, eccentricity,
    circularity, major/minor axis ratio), intensity statistics (mean, std, skewness,
    kurtosis, sum of -I*log2(I)), GLCM properties (contrast, correlation, energy,
    homogeneity), a 10-bin uniform LBP histogram, mean Haralick features, threshold
    adjacency statistics and Zernike moments.

    Args:
        p: Path of the image file.

    Returns:
        A 1-D NumPy array with all features concatenated.
    """
    bruto = imread(p, as_gray=True)
    # imread(as_gray=True) only converts colour images to floats in [0, 1]; an image
    # that is already grayscale keeps its stored integer dtype. Multiplying a uint8
    # array by 255 wraps around modulo 256, so build the float and 8-bit versions
    # explicitly from the loaded dtype.
    if np.issubdtype(bruto.dtype, np.integer):
        img = bruto.astype(np.float64) / float(np.iinfo(bruto.dtype).max)
        img_u8 = bruto if bruto.dtype == np.uint8 else (img * 255).astype(np.uint8)
    else:
        img = bruto.astype(np.float64)
        img_u8 = (img * 255).astype(np.uint8)

    try:
        mascara = morphology.remove_small_objects(img > threshold_otsu(img), 30)
        regioes = regionprops(label(mascara))
        if regioes:
            # NOTE(review): this is the first labelled region, not necessarily the largest.
            regiao = regioes[0]
            area, perim = regiao.area, regiao.perimeter
            ecc = regiao.eccentricity
            circ = 4*np.pi*area/(perim**2) if perim > 0 else 0
            elip = regiao.major_axis_length/regiao.minor_axis_length if regiao.minor_axis_length > 0 else 0
        else:
            area = perim = ecc = circ = elip = 0
    except Exception:
        # Segmentation failed: fall back to zeroed shape features.
        area = perim = ecc = circ = elip = 0

    mean, std, skw, krt = img.mean(), img.std(), skew(img.ravel()), kurtosis(img.ravel())
    ent = -np.sum(img * np.log2(img + 1e-10))

    glcm = graycomatrix(img_u8, [1], [0], symmetric=True, normed=True)
    contrast = graycoprops(glcm, 'contrast')[0,0]
    corr = graycoprops(glcm, 'correlation')[0,0]
    energy = graycoprops(glcm, 'energy')[0,0]
    homog = graycoprops(glcm, 'homogeneity')[0,0]

    # LBP is computed on the 8-bit image so tiny floating-point differences between
    # neighbouring pixels cannot change the pattern codes.
    lbp = local_binary_pattern(img_u8, 8, 1, method='uniform')
    lbp_hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0,11), density=True)

    hrlk = mahotas.features.haralick(img_u8).mean(axis=0)
    # NOTE(review): each fallback below must have the same length that mahotas returns
    # on success, otherwise rows end up with different widths -- confirm the 6 used for TAS.
    try:
        tas = mahotas.features.tas(img_u8)
    except Exception:
        tas = [0]*6
    try:
        zern = mahotas.features.zernike_moments(img_u8, radius=min(img.shape)//2, degree=8)
    except Exception:
        zern = [0]*25
    return np.hstack([area, perim, ecc, circ, elip, mean, std, skw, krt, ent,
                      contrast, corr, energy, homog, lbp_hist, hrlk, tas, zern])

def gerar_csv(diretorio, label, output):
    """Extract features from every PNG in ``diretorio`` and write them to a CSV.

    Each row holds the image name and cell id parsed from the file name
    (``<image>_celula_<cell_id>.png``; the cell id is "NA" when the marker is absent),
    the feature values min-max scaled to [0, 1], and ``label``.

    NOTE(review): the scaler is fitted on the rows of this call only, so every
    directory (class and subset) is scaled with its own min/max. Confirm this is
    intended before combining the resulting CSVs.

    Args:
        diretorio: Directory containing the PNG crops.
        label: Value stored in the "label" column of every row.
        output: Path of the CSV file to write.

    Raises:
        ValueError: If ``diretorio`` contains no ``.png`` file.
    """
    linhas = []
    for arq in os.listdir(diretorio):
        if arq.endswith(".png"):
            path = os.path.normpath(os.path.join(diretorio, arq))
            feat = extrair_atributos(path)
            base = os.path.splitext(arq)[0].split("_celula_")
            linhas.append([base[0], base[1] if len(base)>1 else "NA"] + list(feat) + [label])
    if not linhas:
        # Fail with a clear message instead of an IndexError on linhas[0].
        raise ValueError(f"Nenhuma imagem .png encontrada em {diretorio!r}")
    df = pd.DataFrame(linhas)
    # Every column except image name, cell id and label is a feature.
    df.columns = ["image_name", "cell_id"] + [f"feat_{i}" for i in range(len(linhas[0])-3)] + ["label"]
    col_attr = df.columns[2:-1]
    df[col_attr] = MinMaxScaler().fit_transform(df[col_attr])
    df.to_csv(output, index=False)

def juntar_csvs(csv1, csv2, out):
    """Concatenate two CSV files, shuffle the rows reproducibly and write the result.

    Args:
        csv1: Path of the first input CSV.
        csv2: Path of the second input CSV.
        out: Path of the combined CSV (written without the index column).
    """
    partes = [pd.read_csv(origem) for origem in (csv1, csv2)]
    combinado = pd.concat(partes, ignore_index=True)
    # frac=1 returns every row in shuffled order; the fixed seed keeps it repeatable.
    embaralhado = combinado.sample(frac=1, random_state=42)
    embaralhado.to_csv(out, index=False)

def verificar_interseccao_conjuntos(train_pos, train_neg, val_pos, val_neg, test_pos, test_neg):
    """Print the image names shared between each pair of subsets.

    An empty set on every line means no source image contributes cells to more
    than one of train / validation / test.
    """
    def nomes(positivas, negativas):
        return {celula['image_name'] for celula in positivas + negativas}

    treino = nomes(train_pos, train_neg)
    validacao = nomes(val_pos, val_neg)
    teste = nomes(test_pos, test_neg)

    print("Intersecção treino/validação:", treino & validacao)
    print("Intersecção treino/teste:", treino & teste)
    print("Intersecção validação/teste:", validacao & teste)

def main():
    """Run the split-and-crop stage: load annotations, split by image, save crops, augment.

    Validation and test are balanced by undersampling; the training subset is
    left as is and its positive crops are augmented afterwards.
    """
    criar_diretorios()
    celulas = extrair_celulas(carregar_json(json_path))
    treino, validacao, teste = dividir_dados_por_imagem(celulas)

    # Separate each subset into (positives, negatives); balance validation and test only.
    grupos_treino = dividir_por_classe(treino)
    grupos_val = balancear_positivo_negativo(*dividir_por_classe(validacao))
    grupos_teste = balancear_positivo_negativo(*dividir_por_classe(teste))

    # Order: train_pos, train_neg, val_pos, val_neg, test_pos, test_neg.
    conjuntos = (*grupos_treino, *grupos_val, *grupos_teste)
    verificar_interseccao_conjuntos(*conjuntos)
    salvar_imagens_por_conjunto(*conjuntos)
    augmentar_imagens_treino_positivo()

# Full pipeline for this cell: split, crop, augment, extract features and write CSVs.
if __name__ == "__main__":
    # 1. Create the output directories
    criar_diretorios()

    # 2. Load the annotation JSON
    dados_json = carregar_json(json_path)

    # 3. Extract the labelled cells
    todas_celulas = extrair_celulas(dados_json)

    # 4. Split by image (to avoid leakage between train/val/test)
    train_cells, val_cells, test_cells = dividir_dados_por_imagem(todas_celulas)

    # 5. Separate each subset by class
    train_pos, train_neg = dividir_por_classe(train_cells)
    val_pos, val_neg = dividir_por_classe(val_cells)
    test_pos, test_neg = dividir_por_classe(test_cells)

    # 6. Balance validation and test (positives = negatives)
    val_pos, val_neg = balancear_positivo_negativo(val_pos, val_neg)
    test_pos, test_neg = balancear_positivo_negativo(test_pos, test_neg)

    # 7. Save the cropped images
    salvar_imagens_por_conjunto(train_pos, train_neg, val_pos, val_neg, test_pos, test_neg)

    # 8. Augment only the POSITIVE training data
    augmentar_imagens_treino_positivo()

    # 9. Generate the CSVs
    # NOTE(review): gerar_csv fits its own MinMaxScaler on each call, so each of the
    # six files below is scaled independently before being merged in step 10.
    gerar_csv(train_pos_dir, "POSITIVE", "csv_treino_pos.csv")
    gerar_csv(train_neg_dir, "NEGATIVE", "csv_treino_neg.csv")
    gerar_csv(val_pos_dir, "POSITIVE", "csv_validacao_pos.csv")
    gerar_csv(val_neg_dir, "NEGATIVE", "csv_validacao_neg.csv")
    gerar_csv(test_pos_dir, "POSITIVE", "csv_teste_pos.csv")
    gerar_csv(test_neg_dir, "NEGATIVE", "csv_teste_neg.csv")

    # 10. Merge positives and negatives into complete sets
    juntar_csvs("csv_treino_pos.csv", "csv_treino_neg.csv", "csv_treino_completo.csv")
    juntar_csvs("csv_validacao_pos.csv", "csv_validacao_neg.csv", "csv_validacao_completo.csv")
    juntar_csvs("csv_teste_pos.csv", "csv_teste_neg.csv", "csv_teste_completo.csv")

    # 11. Show the counts
    print(f"\n[INFO] Total de células descartadas: {descartadas}")
    for k, v in usadas.items():
        print(f"[INFO] {k}: {v}")




[INFO] Total de células descartadas: 388
[INFO] train_pos: 3329
[INFO] train_neg: 4447
[INFO] val_pos: 545
[INFO] val_neg: 519
[INFO] test_pos: 839
[INFO] test_neg: 808


In [15]:
import os
import json
import random
from PIL import Image, ImageEnhance, ImageFilter
from skimage.feature import graycomatrix, graycoprops, local_binary_pattern
from skimage.io import imread
import numpy as np
import pandas as pd
import mahotas
from tqdm import tqdm
from skimage.filters import threshold_otsu
from skimage.measure import regionprops, label
from skimage import morphology
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import MinMaxScaler

# ========== PATHS ==========
# Previous path configuration (user home directory), kept commented out for reference.
#base_dir = "/Users/xr4good/Documents/Ingrid/datasets/imagens/base"
#json_path = os.path.join(base_dir, "classifications_2classes.json")
#output_dir_treino = "/Users/xr4good/Documents/Ingrid/datasets/imagens/treino/treino/2classes"
#output_dir_val = "/Users/xr4good/Documents/Ingrid/datasets/imagens/validacao/validacao/2classes/"
#output_dir_teste = "/Users/xr4good/Documents/Ingrid/datasets/imagens/teste/teste/2classes/"
# Active configuration: source images and the two-class annotation JSON live in base_dir.
base_dir = "E:/datasets/imagens/base"
json_path = os.path.join(base_dir, "classifications_2classes.json")
# Root output folders for the training, validation and test subsets.
output_dir_treino = "E:/datasets/imagens/treino/treino/2classes"
output_dir_val = "E:/datasets/imagens/validacao/validacao/2classes/"
output_dir_teste = "E:/datasets/imagens/teste/teste/2classes/"

# ========== CREATE DIRECTORIES ==========
# One folder per (subset, class) pair for the RGB crops...
train_neg_dir_rgb = os.path.join(output_dir_treino, "treino-dir-negativo-rgb")
train_pos_dir_rgb = os.path.join(output_dir_treino, "treino-dir-positivo-rgb")
val_pos_dir_rgb = os.path.join(output_dir_val, "validacao-dir-positivo-rgb")
val_neg_dir_rgb = os.path.join(output_dir_val, "validacao-dir-negativo-rgb")
test_pos_dir_rgb = os.path.join(output_dir_teste, "teste-dir-positivo-rgb")
test_neg_dir_rgb = os.path.join(output_dir_teste, "teste-dir-negativo-rgb")
# ...and one per pair for the grayscale crops.
train_neg_dir = os.path.join(output_dir_treino, "treino-dir-negativo")
train_pos_dir = os.path.join(output_dir_treino, "treino-dir-positivo")
val_pos_dir = os.path.join(output_dir_val, "validacao-dir-positivo")
val_neg_dir = os.path.join(output_dir_val, "validacao-dir-negativo")
test_pos_dir = os.path.join(output_dir_teste, "teste-dir-positivo")
test_neg_dir = os.path.join(output_dir_teste, "teste-dir-negativo")

# Create the grayscale directories (no error if they already exist)
for d in [train_neg_dir, train_pos_dir, val_pos_dir, val_neg_dir, test_pos_dir, test_neg_dir]:
    os.makedirs(d, exist_ok=True)
# Create the RGB directories
for d in [train_neg_dir_rgb, train_pos_dir_rgb, val_pos_dir_rgb, val_neg_dir_rgb, test_pos_dir_rgb, test_neg_dir_rgb]:
    os.makedirs(d, exist_ok=True)

# ========== LOAD JSON ==========
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# ========== CELL EXTRACTION ==========
# Flatten the per-image annotations into one record per cell, keeping the source
# image name, the cell id, the nucleus coordinates and the class label.
all_cells = []
for img_data in data:
    image_name = img_data["image_name"]
    for cell in img_data["classifications"]:
        all_cells.append({
            "image_name": image_name,
            "cell_id": cell["cell_id"],
            "x": cell["nucleus_x"],
            "y": cell["nucleus_y"],
            "label": cell["bethesda_system"]
        })

# ========== POSITIVE/NEGATIVE SPLIT ==========
# Any label other than "POSITIVE" is treated as negative.
positive_cells = [c for c in all_cells if c["label"] == "POSITIVE"]
negative_cells = [c for c in all_cells if c["label"] != "POSITIVE"]

# ========== SHUFFLE BEFORE THE TRAIN/VAL/TEST SPLIT ==========
# NOTE(review): the lists are shuffled (and later sliced by split_data) cell by cell,
# not image by image, so cells cropped from the same source image can end up in
# different subsets.
random.seed(42)
random.shuffle(positive_cells)
random.shuffle(negative_cells)

# Split a single class on its own
def split_data(cells):
    """Slice ``cells`` into (train, val, test) using 70% / 15% / remainder.

    The sizes of the first two parts are truncated with ``int``; whatever is left
    goes to the test part. The input order is kept.
    """
    total = len(cells)
    fim_treino = int(0.7 * total)
    fim_val = fim_treino + int(0.15 * total)
    return cells[:fim_treino], cells[fim_treino:fim_val], cells[fim_val:]

# Apply the split to each class separately, so each subset keeps the class proportions
train_pos, val_pos, test_pos = split_data(positive_cells)
train_neg, val_neg, test_neg = split_data(negative_cells)


# ========== SAVE CROPS ==========
# Tallies updated by save_cropped_dual: cells skipped, and crops written per subset.
descartadas = 0
_subset_keys = ("train_pos", "train_neg", "val_pos", "val_neg", "test_pos", "test_neg")
usadas = dict.fromkeys(_subset_keys, 0)

# Destination folder of the grayscale crops for each subset key.
gray_dirs = dict(zip(
    _subset_keys,
    (train_pos_dir, train_neg_dir, val_pos_dir, val_neg_dir, test_pos_dir, test_neg_dir),
))

# Destination folder of the RGB crops for each subset key.
rgb_dirs = dict(zip(
    _subset_keys,
    (train_pos_dir_rgb, train_neg_dir_rgb, val_pos_dir_rgb, val_neg_dir_rgb,
     test_pos_dir_rgb, test_neg_dir_rgb),
))

def save_cropped_dual(cell, image_dir, key):
    """Save matching 70x70 RGB and grayscale crops centred on the cell's (x, y) position.

    The cell is counted in the global ``descartadas`` and nothing is written when
    the source image is missing, cannot be opened, or the crop window would extend
    past the image border. Otherwise both crops are saved under the same file name,
    ``<image stem>_celula_<cell_id>.png``, in ``rgb_dirs[key]`` and ``gray_dirs[key]``,
    and ``usadas[key]`` is incremented.

    Args:
        cell: Dict with "image_name", "cell_id", "x" and "y".
        image_dir: Directory containing the source image.
        key: Subset key used to select the destination folders and the counter.
    """
    global descartadas
    image_path = os.path.join(image_dir, cell["image_name"])
    if not os.path.exists(image_path):
        descartadas += 1
        return
    try:
        # Open as RGB and derive the grayscale version from it
        img_rgb = Image.open(image_path).convert("RGB")
        img_gray = img_rgb.convert("L")
    except Exception:
        # Unreadable or corrupt image: count it as discarded. Only Exception is caught
        # so that KeyboardInterrupt/SystemExit still stop a long run instead of being
        # silently tallied as a discarded cell.
        descartadas += 1
        return
    x, y = cell["x"], cell["y"]
    half_crop = 35
    # Reject windows that are not fully inside the image.
    if x - half_crop < 0 or y - half_crop < 0 or x + half_crop > img_rgb.width or y + half_crop > img_rgb.height:
        descartadas += 1
        return

    # Crop both versions with the same window
    caixa = (x - half_crop, y - half_crop, x + half_crop, y + half_crop)
    crop_rgb = img_rgb.crop(caixa)
    crop_gray = img_gray.crop(caixa)

    name = f"{os.path.splitext(cell['image_name'])[0]}_celula_{cell['cell_id']}.png"

    # Save
    crop_rgb.save(os.path.join(rgb_dirs[key], name))
    crop_gray.save(os.path.join(gray_dirs[key], name))
    usadas[key] += 1

# Crop and save every cell, one (subset, class) group at a time.
for _chave, _celulas in (
    ("train_pos", train_pos),
    ("train_neg", train_neg),
    ("val_pos", val_pos),
    ("val_neg", val_neg),
    ("test_pos", test_pos),
    ("test_neg", test_neg),
):
    for c in _celulas:
        save_cropped_dual(c, base_dir, _chave)


# ========== TRANSFORMATIONS ==========
def apply_augmentations(img):
    """Return ten augmented variants of a PIL image, in a fixed order.

    Order: rotations by 15, 90, 180 and 270 degrees; horizontal and vertical flips;
    contrast (factor 1.5) and sharpness (factor 2) enhancement; Gaussian blur
    (radius 1) and median filter (size 3).
    """
    # Fixed-angle rotations
    rotacoes = [img.rotate(angulo) for angulo in (15, 90, 180, 270)]
    # Horizontal and vertical mirroring
    espelhos = [
        img.transpose(modo)
        for modo in (Image.FLIP_LEFT_RIGHT, Image.FLIP_TOP_BOTTOM)
    ]
    # Contrast and sharpness adjustments
    realces = [
        ImageEnhance.Contrast(img).enhance(1.5),
        ImageEnhance.Sharpness(img).enhance(2),
    ]
    # Blur filters: Gaussian and median
    filtros = [
        img.filter(ImageFilter.GaussianBlur(radius=1)),
        img.filter(ImageFilter.MedianFilter(size=3)),
    ]
    return rotacoes + espelhos + realces + filtros

def balancear_treinamento_automaticamente(positivos_dir, negativos_dir):
    """Augment the smaller class folder until both folders hold the same number of PNGs.

    The PNG files of both folders are counted; the class with fewer files is
    augmented in place with ``apply_augmentations`` (grayscale), writing
    ``<stem>_aug<k>.png`` files until the deficit is covered or the source files
    run out. A file that cannot be processed is reported and skipped.

    Args:
        positivos_dir: Folder with the positive crops.
        negativos_dir: Folder with the negative crops.
    """
    pos_files = [f for f in os.listdir(positivos_dir) if f.endswith(".png")]
    neg_files = [f for f in os.listdir(negativos_dir) if f.endswith(".png")]
    qtd_pos, qtd_neg = len(pos_files), len(neg_files)

    # Local names deliberately differ from the module-level base_dir to avoid shadowing it.
    if qtd_pos < qtd_neg:
        deficit = qtd_neg - qtd_pos
        alvo_dir = positivos_dir
        alvo_files = pos_files
        classe = "POSITIVE"
    elif qtd_neg < qtd_pos:
        deficit = qtd_pos - qtd_neg
        alvo_dir = negativos_dir
        alvo_files = neg_files
        classe = "NEGATIVE"
    else:
        print("Classes já estão balanceadas.")
        return

    print(f"Aumentando classe {classe} com {deficit} imagens...")

    contador = 0
    for f in tqdm(alvo_files):
        if contador >= deficit:
            break
        path = os.path.join(alvo_dir, f)
        try:
            img = Image.open(path).convert("L")
            for i, aug in enumerate(apply_augmentations(img)):
                if contador >= deficit:
                    break
                out_name = f"{os.path.splitext(f)[0]}_aug{i+1}.png"
                aug.save(os.path.join(alvo_dir, out_name))
                contador += 1
        except Exception as exc:
            # Report and move on; catching only Exception lets KeyboardInterrupt and
            # SystemExit stop the loop. tqdm.write keeps the progress bar intact.
            tqdm.write(f"[WARN] Falha ao aumentar {path}: {exc}")
            continue

    print(f"Aumento de dados concluído. Total gerado: {contador}")

# Balance the grayscale training folders by augmenting whichever class has fewer crops.
balancear_treinamento_automaticamente(train_pos_dir, train_neg_dir)

# ========== FEATURE EXTRACTION ==========
def extrair_atributos(p):
    """Compute the hand-crafted feature vector for the image at path ``p``.

    Features, in output order: region shape (area, perimeter, eccentricity,
    circularity, major/minor axis ratio), intensity statistics (mean, std, skewness,
    kurtosis, sum of -I*log2(I)), GLCM properties (contrast, correlation, energy,
    homogeneity), a 10-bin uniform LBP histogram, mean Haralick features, threshold
    adjacency statistics, Zernike moments, and four statistics of the normalised
    FFT magnitude spectrum (mean, std, energy, entropy-like sum).

    Args:
        p: Path of the image file.

    Returns:
        A 1-D NumPy array with all features concatenated.
    """
    bruto = imread(p, as_gray=True)
    # imread(as_gray=True) only converts colour images to floats in [0, 1]; an image
    # that is already grayscale keeps its stored integer dtype. Multiplying a uint8
    # array by 255 wraps around modulo 256, so build the float and 8-bit versions
    # explicitly from the loaded dtype.
    if np.issubdtype(bruto.dtype, np.integer):
        img = bruto.astype(np.float64) / float(np.iinfo(bruto.dtype).max)
        img_u8 = bruto if bruto.dtype == np.uint8 else (img * 255).astype(np.uint8)
    else:
        img = bruto.astype(np.float64)
        img_u8 = (img * 255).astype(np.uint8)

    try:
        mascara = morphology.remove_small_objects(img > threshold_otsu(img), 30)
        regioes = regionprops(label(mascara))
        if regioes:
            # NOTE(review): this is the first labelled region, not necessarily the largest.
            regiao = regioes[0]
            area, perim = regiao.area, regiao.perimeter
            ecc = regiao.eccentricity
            circ = 4*np.pi*area/(perim**2) if perim > 0 else 0
            elip = regiao.major_axis_length/regiao.minor_axis_length if regiao.minor_axis_length > 0 else 0
        else:
            area = perim = ecc = circ = elip = 0
    except Exception:
        # Segmentation failed: fall back to zeroed shape features.
        area = perim = ecc = circ = elip = 0

    mean, std, skw, krt = img.mean(), img.std(), skew(img.ravel()), kurtosis(img.ravel())
    ent = -np.sum(img * np.log2(img + 1e-10))

    glcm = graycomatrix(img_u8, [1], [0], symmetric=True, normed=True)
    contrast = graycoprops(glcm, 'contrast')[0, 0]
    corr = graycoprops(glcm, 'correlation')[0, 0]
    energy = graycoprops(glcm, 'energy')[0, 0]
    homog = graycoprops(glcm, 'homogeneity')[0, 0]

    # LBP is computed on the 8-bit image so tiny floating-point differences between
    # neighbouring pixels cannot change the pattern codes.
    lbp = local_binary_pattern(img_u8, 8, 1, method='uniform')
    lbp_hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, 11), density=True)

    hrlk = mahotas.features.haralick(img_u8).mean(axis=0)
    tas = mahotas.features.tas(img_u8)
    zern = mahotas.features.zernike_moments(img_u8, radius=min(img.shape)//2, degree=8)

    # ===== FOURIER DESCRIPTORS =====
    fft = np.fft.fft2(img)
    fft_shift = np.fft.fftshift(fft)
    magnitude_spectrum = np.abs(fft_shift)

    # Scale by the peak magnitude so the statistics below are on a [0, 1] spectrum
    magnitude_spectrum /= (magnitude_spectrum.max() + 1e-10)

    # Spectrum statistics
    fft_mean = magnitude_spectrum.mean()
    fft_std = magnitude_spectrum.std()
    fft_energy = np.sum(magnitude_spectrum**2)
    fft_entropy = -np.sum(magnitude_spectrum * np.log2(magnitude_spectrum + 1e-10))

    return np.hstack([
        area, perim, ecc, circ, elip,
        mean, std, skw, krt, ent,
        contrast, corr, energy, homog,
        lbp_hist, hrlk, tas, zern,  fft_mean, fft_std, fft_energy, fft_entropy
    ])

# ========== CSV WITH TRAIN-BASED NORMALISATION ==========
def gerar_df_csv(diretorio, label):
    """Build a DataFrame of (unscaled) features for every PNG in ``diretorio``.

    Each row holds the image name and cell id parsed from the file name
    (``<image>_celula_<cell_id>.png``; the cell id is "NA" when the marker is absent),
    the feature values returned by ``extrair_atributos`` and ``label``.

    Args:
        diretorio: Directory containing the PNG crops.
        label: Value stored in the "label" column of every row.

    Returns:
        A DataFrame with columns "image_name", "cell_id", "feat_0".."feat_N", "label".

    Raises:
        ValueError: If ``diretorio`` contains no ``.png`` file.
    """
    linhas = []
    for arq in os.listdir(diretorio):
        if arq.endswith(".png"):
            path = os.path.normpath(os.path.join(diretorio, arq))
            feat = extrair_atributos(path)
            base = os.path.splitext(arq)[0].split("_celula_")
            linhas.append([base[0], base[1] if len(base)>1 else "NA"] + list(feat) + [label])
    if not linhas:
        # Fail with a clear message instead of an IndexError on linhas[0].
        raise ValueError(f"Nenhuma imagem .png encontrada em {diretorio!r}")
    df = pd.DataFrame(linhas)
    # Every column except image name, cell id and label is a feature.
    df.columns = ["image_name", "cell_id"] + [f"feat_{i}" for i in range(len(linhas[0])-3)] + ["label"]
    return df

def normalizar_e_salvar(df_train, df_val, df_test):
    """Min-max scale the feature columns using the training set only, then save all three.

    The scaler is fitted on ``df_train`` and applied to the three frames, which are
    modified in place. The feature columns are every column except the first two and
    the last one. Output files: "train_2classes.csv", "val_2classes.csv" and
    "test_2classes.csv" in the current working directory.
    """
    col_attr = df_train.columns[2:-1]
    scaler = MinMaxScaler()
    scaler.fit(df_train[col_attr])

    destinos = (
        (df_train, "train_2classes.csv"),
        (df_val, "val_2classes.csv"),
        (df_test, "test_2classes.csv"),
    )
    # Scale all three frames first, then write them out.
    for quadro, _ in destinos:
        quadro[col_attr] = scaler.transform(quadro[col_attr])
    for quadro, arquivo in destinos:
        quadro.to_csv(arquivo, index=False)

# Build one feature table per subset: positives labelled 1, negatives labelled 0,
# concatenated and shuffled with a fixed seed.
df_train = pd.concat([
    gerar_df_csv(train_pos_dir, 1),
    gerar_df_csv(train_neg_dir, 0)
], ignore_index=True).sample(frac=1, random_state=42)

df_val = pd.concat([
    gerar_df_csv(val_pos_dir, 1),
    gerar_df_csv(val_neg_dir, 0)
], ignore_index=True).sample(frac=1, random_state=42)

df_test = pd.concat([
    gerar_df_csv(test_pos_dir, 1),
    gerar_df_csv(test_neg_dir, 0)
], ignore_index=True).sample(frac=1, random_state=42)

# Scale with statistics from the training set only and write the three CSV files.
normalizar_e_salvar(df_train, df_val, df_test)

print(" Processamento completo com balanceamento, augmentations e normalização segura.")


Aumentando classe POSITIVE com 1205 imagens...


  4%|██▉                                                                            | 121/3300 [00:01<00:32, 99.31it/s]


Aumento de dados concluído. Total gerado: 1205
 Processamento completo com balanceamento, augmentations e normalização segura.
