In [11]:
import os
import json
import random
from PIL import Image, ImageEnhance, ImageFilter
from skimage.feature import graycomatrix, graycoprops, local_binary_pattern
from skimage.io import imread
import numpy as np
import pandas as pd
import mahotas
from tqdm import tqdm
from skimage.filters import threshold_otsu
from skimage.measure import regionprops, label
from skimage import morphology
from scipy.stats import skew, kurtosis
from sklearn.preprocessing import MinMaxScaler

# ========== PATHS ==========
base_dir = "E:/datasets/imagens/base"
json_path = os.path.join(base_dir, "classifications_6classes.json")
output_dir_treino = "E:/datasets/imagens/treino/treino/6classes/"
output_dir_val = "E:/datasets/imagens/validacao/validacao/6classes/"
output_dir_teste = "E:/datasets/imagens/teste/teste/6classes/"

# ========== OUTPUT DIRECTORIES, ONE PER BETHESDA CLASS ==========

# Training directories (RGB crops)
train_negative_dir_rgb = os.path.join(output_dir_treino, "treino-dir-NEGATIVE-rgb")
train_asch_dir_rgb = os.path.join(output_dir_treino, "treino-dir-ASC-H-rgb")
train_ascus_dir_rgb = os.path.join(output_dir_treino, "treino-dir-ASC-US-rgb")
train_lsil_dir_rgb = os.path.join(output_dir_treino, "treino-dir-LSIL-rgb")
train_hsil_dir_rgb = os.path.join(output_dir_treino, "treino-dir-HSIL-rgb")
train_scc_dir_rgb = os.path.join(output_dir_treino, "treino-dir-SCC-rgb")

# Validation directories (RGB crops)
val_negative_dir_rgb = os.path.join(output_dir_val, "validacao-dir-NEGATIVE-rgb")
val_asch_dir_rgb = os.path.join(output_dir_val, "validacao-dir-ASC-H-rgb")
val_ascus_dir_rgb = os.path.join(output_dir_val, "validacao-dir-ASC-US-rgb")
val_lsil_dir_rgb = os.path.join(output_dir_val, "validacao-dir-LSIL-rgb")
val_hsil_dir_rgb = os.path.join(output_dir_val, "validacao-dir-HSIL-rgb")
val_scc_dir_rgb = os.path.join(output_dir_val, "validacao-dir-SCC-rgb")

# Test directories (RGB crops)
test_negative_dir_rgb = os.path.join(output_dir_teste, "teste-dir-NEGATIVE-rgb")
test_asch_dir_rgb = os.path.join(output_dir_teste, "teste-dir-ASC-H-rgb")
test_ascus_dir_rgb = os.path.join(output_dir_teste, "teste-dir-ASC-US-rgb")
test_lsil_dir_rgb = os.path.join(output_dir_teste, "teste-dir-LSIL-rgb")
test_hsil_dir_rgb = os.path.join(output_dir_teste, "teste-dir-HSIL-rgb")
test_scc_dir_rgb = os.path.join(output_dir_teste, "teste-dir-SCC-rgb")

# Training directories (grayscale crops)
train_negative_dir = os.path.join(output_dir_treino, "treino-dir-NEGATIVE")
train_asch_dir = os.path.join(output_dir_treino, "treino-dir-ASC-H")
train_ascus_dir = os.path.join(output_dir_treino, "treino-dir-ASC-US")
train_lsil_dir = os.path.join(output_dir_treino, "treino-dir-LSIL")
train_hsil_dir = os.path.join(output_dir_treino, "treino-dir-HSIL")
train_scc_dir = os.path.join(output_dir_treino, "treino-dir-SCC")

# Validation directories (grayscale crops)
val_negative_dir = os.path.join(output_dir_val, "validacao-dir-NEGATIVE")
val_asch_dir = os.path.join(output_dir_val, "validacao-dir-ASC-H")
val_ascus_dir = os.path.join(output_dir_val, "validacao-dir-ASC-US")
val_lsil_dir = os.path.join(output_dir_val, "validacao-dir-LSIL")
val_hsil_dir = os.path.join(output_dir_val, "validacao-dir-HSIL")
val_scc_dir = os.path.join(output_dir_val, "validacao-dir-SCC")

# Test directories (grayscale crops)
test_negative_dir = os.path.join(output_dir_teste, "teste-dir-NEGATIVE")
test_asch_dir = os.path.join(output_dir_teste, "teste-dir-ASC-H")
test_ascus_dir = os.path.join(output_dir_teste, "teste-dir-ASC-US")
test_lsil_dir = os.path.join(output_dir_teste, "teste-dir-LSIL")
test_hsil_dir = os.path.join(output_dir_teste, "teste-dir-HSIL")
test_scc_dir = os.path.join(output_dir_teste, "teste-dir-SCC")

# ========== CREATE THE DIRECTORIES ==========
# Every output directory is created up front (existing ones are left alone),
# in the same order the paths were declared above.
for _out_dir in (
    train_negative_dir_rgb, train_asch_dir_rgb, train_ascus_dir_rgb,
    train_lsil_dir_rgb, train_hsil_dir_rgb, train_scc_dir_rgb,
    val_negative_dir_rgb, val_asch_dir_rgb, val_ascus_dir_rgb,
    val_lsil_dir_rgb, val_hsil_dir_rgb, val_scc_dir_rgb,
    test_negative_dir_rgb, test_asch_dir_rgb, test_ascus_dir_rgb,
    test_lsil_dir_rgb, test_hsil_dir_rgb, test_scc_dir_rgb,
    train_negative_dir, train_asch_dir, train_ascus_dir,
    train_lsil_dir, train_hsil_dir, train_scc_dir,
    val_negative_dir, val_asch_dir, val_ascus_dir,
    val_lsil_dir, val_hsil_dir, val_scc_dir,
    test_negative_dir, test_asch_dir, test_ascus_dir,
    test_lsil_dir, test_hsil_dir, test_scc_dir,
):
    os.makedirs(_out_dir, exist_ok=True)


# ========== LOAD JSON ==========
# Decode explicitly as UTF-8 (the JSON interchange encoding) instead of
# relying on the platform's locale encoding, which differs between machines.
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# ========== CELL EXTRACTION ==========
# Flatten the per-image annotation records into one list with one entry per
# annotated cell, keeping only the fields the rest of the script reads.
all_cells = [
    {
        "image_name": img_data["image_name"],
        "cell_id": cell["cell_id"],
        "x": cell["nucleus_x"],
        "y": cell["nucleus_y"],
        "label": cell["bethesda_system"],
    }
    for img_data in data
    for cell in img_data["classifications"]
]

# ========== SPLIT INTO THE 6 CLASSES ==========
negative_cells = [c for c in all_cells if c["label"] == "NEGATIVE"]
asch_cells = [c for c in all_cells if c["label"] == "ASC-H"]
ascus_cells = [c for c in all_cells if c["label"] == "ASC-US"]
lsil_cells = [c for c in all_cells if c["label"] == "LSIL"]
hsil_cells = [c for c in all_cells if c["label"] == "HSIL"]
scc_cells = [c for c in all_cells if c["label"] == "SCC"]

# Seed once, then shuffle each class in a fixed order so the random stream is
# consumed identically on every run and the splits are reproducible.
random.seed(42)
for _class_cells in (
    negative_cells, asch_cells, ascus_cells, lsil_cells, hsil_cells, scc_cells
):
    random.shuffle(_class_cells)

def split_data(cells, train_ratio=0.7, val_ratio=0.15):
    """Slice *cells* into consecutive train / validation / test parts.

    The training part holds the first ``int(len(cells) * train_ratio)`` items,
    the validation part the next ``int(len(cells) * val_ratio)`` items, and
    the test part receives everything that is left. No shuffling happens
    here; the caller is expected to pass an already-shuffled sequence.

    Returns:
        A ``(train, val, test)`` tuple of slices of *cells*.
    """
    total = len(cells)
    n_train = int(total * train_ratio)
    n_val = int(total * val_ratio)

    first_cut = n_train
    second_cut = n_train + n_val

    train_part = cells[:first_cut]
    val_part = cells[first_cut:second_cut]
    test_part = cells[second_cut:]
    return train_part, val_part, test_part

# Counter of cells that could not be cropped (updated by save_cropped_dual)
descartadas = 0

# Split every class into train / validation / test
(train_negative_cells, val_negative_cells, test_negative_cells) = split_data(negative_cells)
(train_asch_cells, val_asch_cells, test_asch_cells) = split_data(asch_cells)
(train_ascus_cells, val_ascus_cells, test_ascus_cells) = split_data(ascus_cells)
(train_lsil_cells, val_lsil_cells, test_lsil_cells) = split_data(lsil_cells)
(train_hsil_cells, val_hsil_cells, test_hsil_cells) = split_data(hsil_cells)
(train_scc_cells, val_scc_cells, test_scc_cells) = split_data(scc_cells)

# ========== "USED" COUNTERS AND DIRECTORY LOOKUP TABLES ==========
# Keys are "<split>_<class>", ordered class by class with train/val/test
# inside each class.
_split_class_keys = [
    f"{_split}_{_cls}"
    for _cls in ("negative", "asch", "ascus", "lsil", "hsil", "scc")
    for _split in ("train", "val", "test")
]

# Number of crops actually written for each split/class pair
usadas = dict.fromkeys(_split_class_keys, 0)

# Destination of the grayscale crops for each split/class pair
gray_dirs = dict(zip(_split_class_keys, [
    train_negative_dir, val_negative_dir, test_negative_dir,
    train_asch_dir, val_asch_dir, test_asch_dir,
    train_ascus_dir, val_ascus_dir, test_ascus_dir,
    train_lsil_dir, val_lsil_dir, test_lsil_dir,
    train_hsil_dir, val_hsil_dir, test_hsil_dir,
    train_scc_dir, val_scc_dir, test_scc_dir,
]))

# Destination of the RGB crops for each split/class pair
rgb_dirs = dict(zip(_split_class_keys, [
    train_negative_dir_rgb, val_negative_dir_rgb, test_negative_dir_rgb,
    train_asch_dir_rgb, val_asch_dir_rgb, test_asch_dir_rgb,
    train_ascus_dir_rgb, val_ascus_dir_rgb, test_ascus_dir_rgb,
    train_lsil_dir_rgb, val_lsil_dir_rgb, test_lsil_dir_rgb,
    train_hsil_dir_rgb, val_hsil_dir_rgb, test_hsil_dir_rgb,
    train_scc_dir_rgb, val_scc_dir_rgb, test_scc_dir_rgb,
]))



def save_cropped_dual(cell, image_dir, key, half_crop=35):
    """Crop a square window around a cell nucleus and save RGB and grayscale copies.

    The window is ``2 * half_crop`` pixels wide and centred on
    ``(cell["x"], cell["y"])``. The RGB crop goes to ``rgb_dirs[key]`` and the
    grayscale crop to ``gray_dirs[key]``, both named
    ``<image stem>_celula_<cell_id>.png``.

    A cell is skipped, and the module-level ``descartadas`` counter is
    incremented, when the source image is missing, cannot be opened, or the
    window would extend past the image border. On success ``usadas[key]`` is
    incremented.

    Args:
        cell: Dict with the keys ``image_name``, ``cell_id``, ``x`` and ``y``.
        image_dir: Directory that contains the source image.
        key: ``"<split>_<class>"`` key into ``rgb_dirs``, ``gray_dirs`` and
            ``usadas``.
        half_crop: Half of the crop side length in pixels.
    """
    global descartadas
    image_path = os.path.join(image_dir, cell["image_name"])
    if not os.path.exists(image_path):
        descartadas += 1
        return
    try:
        # The context manager closes the file handle as soon as the pixel data
        # has been copied into the converted image.
        with Image.open(image_path) as source:
            img_rgb = source.convert("RGB")
    except Exception:
        # Best effort: an unreadable or corrupt image only discards this cell.
        # Unlike a bare ``except`` this lets KeyboardInterrupt/SystemExit
        # propagate.
        descartadas += 1
        return

    x, y = cell["x"], cell["y"]
    if (
        x - half_crop < 0
        or y - half_crop < 0
        or x + half_crop > img_rgb.width
        or y + half_crop > img_rgb.height
    ):
        descartadas += 1
        return

    box = (x - half_crop, y - half_crop, x + half_crop, y + half_crop)
    crop_rgb = img_rgb.crop(box)
    # Grayscale conversion is per pixel, so converting the crop gives the same
    # result as cropping a converted full image while touching far fewer pixels.
    crop_gray = crop_rgb.convert("L")

    name = f"{os.path.splitext(cell['image_name'])[0]}_celula_{cell['cell_id']}.png"

    crop_rgb.save(os.path.join(rgb_dirs[key], name))
    crop_gray.save(os.path.join(gray_dirs[key], name))
    usadas[key] += 1

# Save the cropped images: one pass per split/class pair, class by class with
# train / validation / test inside each class.
for key, cells in (
    # NEGATIVE
    ("train_negative", train_negative_cells),
    ("val_negative", val_negative_cells),
    ("test_negative", test_negative_cells),
    # ASC-H
    ("train_asch", train_asch_cells),
    ("val_asch", val_asch_cells),
    ("test_asch", test_asch_cells),
    # ASC-US
    ("train_ascus", train_ascus_cells),
    ("val_ascus", val_ascus_cells),
    ("test_ascus", test_ascus_cells),
    # LSIL
    ("train_lsil", train_lsil_cells),
    ("val_lsil", val_lsil_cells),
    ("test_lsil", test_lsil_cells),
    # HSIL
    ("train_hsil", train_hsil_cells),
    ("val_hsil", val_hsil_cells),
    ("test_hsil", test_hsil_cells),
    # SCC
    ("train_scc", train_scc_cells),
    ("val_scc", val_scc_cells),
    ("test_scc", test_scc_cells),
):
    for c in cells:
        save_cropped_dual(c, base_dir, key)

print(f"Células descartadas: {descartadas}")


# ========== TRANSFORMAÇÕES ==========
def apply_augmentations(img):
    """Return ten augmented variants of the PIL image *img*.

    The variants are, in order: rotations by 90, 180 and 270 degrees, a
    left-right flip, a top-bottom flip, a contrast enhancement (factor 1.5),
    a sharpness enhancement (factor 2), a Gaussian blur (radius 1), a median
    filter (size 3) and a rotation by 15 degrees. *img* itself is not
    included in the result.
    """
    variants = [img.rotate(angle) for angle in (90, 180, 270)]

    for flip in (Image.FLIP_LEFT_RIGHT, Image.FLIP_TOP_BOTTOM):
        variants.append(img.transpose(flip))

    contrast_boosted = ImageEnhance.Contrast(img).enhance(1.5)
    sharpened = ImageEnhance.Sharpness(img).enhance(2)
    variants += [contrast_boosted, sharpened]

    blurred = img.filter(ImageFilter.GaussianBlur(radius=1))
    median_filtered = img.filter(ImageFilter.MedianFilter(size=3))
    variants += [blurred, median_filtered]

    variants.append(img.rotate(15))
    return variants

def balancear_treinamento_automaticamente(
    negative_dir, asch_dir, ascus_dir, lsil_dir, hsil_dir, scc_dir
):
    """Augment the smaller classes towards the size of the largest one.

    The ``.png`` files of each of the six class directories are counted. For
    every class with fewer files than the largest class, augmented copies
    produced by ``apply_augmentations`` are written next to the originals as
    ``<stem>_aug<i>.png`` until the deficit is covered or the files run out.

    A class can gain at most as many images per existing file as
    ``apply_augmentations`` returns, so a very small class may stay short.
    That case is reported explicitly instead of being announced as complete.

    Output files that already exist (for example from a previous run) are
    left untouched and are not counted as newly generated. Files are visited
    in sorted name order so repeated runs pick the same images.

    Args:
        negative_dir, asch_dir, ascus_dir, lsil_dir, hsil_dir, scc_dir:
            Directories holding the grayscale training crops of each class.
    """
    class_dirs = [
        ("NEGATIVE", negative_dir),
        ("ASC-H", asch_dir),
        ("ASC-US", ascus_dir),
        ("LSIL", lsil_dir),
        ("HSIL", hsil_dir),
        ("SCC", scc_dir),
    ]

    # List every directory before writing anything, so the target size is
    # based on the state of the directories at call time.
    classes = []
    for classe, class_dir in class_dirs:
        # os.listdir returns entries in arbitrary order; sort for determinism.
        files = sorted(f for f in os.listdir(class_dir) if f.endswith(".png"))
        classes.append((classe, class_dir, files))

    # Size of the largest class: the target for all the others
    max_qtd = max(len(files) for _, _, files in classes)

    for classe, class_dir, base_files in classes:
        qtd = len(base_files)
        if qtd >= max_qtd:
            print(f"Classe {classe} já está balanceada.")
            continue

        deficit = max_qtd - qtd
        print(f"Aumentando classe {classe} com {deficit} imagens...")

        contador = 0
        for f in tqdm(base_files):
            if contador >= deficit:
                break
            path = os.path.join(class_dir, f)
            try:
                # Close the file handle once the pixels have been converted.
                with Image.open(path) as source:
                    img = source.convert("L")
                for i, aug in enumerate(apply_augmentations(img)):
                    if contador >= deficit:
                        break
                    out_name = f"{os.path.splitext(f)[0]}_aug{i+1}.png"
                    out_path = os.path.join(class_dir, out_name)
                    if os.path.exists(out_path):
                        # Already on disk and already part of `qtd`; rewriting
                        # it would not bring the class closer to the target.
                        continue
                    aug.save(out_path)
                    contador += 1
            except Exception as e:
                print(f"Erro ao processar {f}: {e}")
                continue

        print(f"Aumento de dados concluído para classe {classe}. Total gerado: {contador}")
        if contador < deficit:
            # Make the shortfall visible: the class is still unbalanced.
            print(
                f"Aviso: classe {classe} continua desbalanceada; "
                f"faltam {deficit - contador} imagens."
            )


# Balance the six grayscale training class directories
balancear_treinamento_automaticamente(
    negative_dir=train_negative_dir,
    asch_dir=train_asch_dir,
    ascus_dir=train_ascus_dir,
    lsil_dir=train_lsil_dir,
    hsil_dir=train_hsil_dir,
    scc_dir=train_scc_dir,
)
# ========== EXTRAÇÃO DE ATRIBUTOS ==========
def extrair_atributos(p):
    """Extract a flat feature vector from the image stored at path *p*.

    The vector concatenates, in this order:

    * shape descriptors of one Otsu-thresholded region: area, perimeter,
      eccentricity, circularity, axis ratio (all 0 when no region is found
      or the segmentation fails);
    * intensity statistics: mean, standard deviation, skewness, kurtosis and
      an entropy-like sum ``-sum(I * log2(I))`` over pixel intensities;
    * GLCM contrast, correlation, energy and homogeneity (distance 1,
      angle 0);
    * a 10-bin histogram of uniform LBP codes (8 neighbours, radius 1);
    * mahotas Haralick features averaged over directions, TAS features and
      Zernike moments (degree 8);
    * mean, standard deviation, energy and entropy-like sum of the
      max-normalised FFT magnitude spectrum.

    Args:
        p: Path of the image file.

    Returns:
        A 1-D ``numpy`` array with all features.
    """
    raw = imread(p, as_gray=True)

    # `as_gray=True` only converts colour images to floats; files that are
    # already grayscale come back in their stored integer dtype (uint8 for
    # the crops written by this script). Bring everything to float in [0, 1]
    # so the `* 255` scaling below cannot wrap around in uint8 arithmetic and
    # the statistics are computed on one consistent scale.
    if np.issubdtype(raw.dtype, np.integer):
        img = raw.astype(np.float64) / np.iinfo(raw.dtype).max
    else:
        img = raw.astype(np.float64)
    img_u8 = np.clip(np.rint(img * 255), 0, 255).astype(np.uint8)

    # ===== SHAPE DESCRIPTORS =====
    area = perim = ecc = circ = elip = 0
    try:
        mask = morphology.remove_small_objects(img > threshold_otsu(img), 30)
        regions = regionprops(label(mask))
        if regions:
            # NOTE(review): this is the first labelled region, not necessarily
            # the largest one - confirm that is the intended object.
            region = regions[0]
            area, perim = region.area, region.perimeter
            ecc = region.eccentricity
            circ = 4 * np.pi * area / (perim ** 2) if perim > 0 else 0
            elip = (
                region.major_axis_length / region.minor_axis_length
                if region.minor_axis_length > 0
                else 0
            )
    except Exception:
        # Best effort: a failed segmentation yields all-zero shape features.
        area = perim = ecc = circ = elip = 0

    # ===== INTENSITY STATISTICS =====
    flat = img.ravel()
    mean, std, skw, krt = img.mean(), img.std(), skew(flat), kurtosis(flat)
    # NOTE(review): computed on raw intensities rather than on a normalised
    # histogram, so this is not a Shannon entropy in the strict sense.
    ent = -np.sum(img * np.log2(img + 1e-10))

    # ===== GLCM TEXTURE =====
    glcm = graycomatrix(img_u8, [1], [0], symmetric=True, normed=True)
    contrast = graycoprops(glcm, 'contrast')[0, 0]
    corr = graycoprops(glcm, 'correlation')[0, 0]
    energy = graycoprops(glcm, 'energy')[0, 0]
    homog = graycoprops(glcm, 'homogeneity')[0, 0]

    # ===== LOCAL BINARY PATTERNS =====
    # Use the 8-bit image so neighbour comparisons are done on integers
    # instead of floats that differ only by rounding noise.
    lbp = local_binary_pattern(img_u8, 8, 1, method='uniform')
    lbp_hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, 11), density=True)

    # ===== MAHOTAS DESCRIPTORS =====
    hrlk = mahotas.features.haralick(img_u8).mean(axis=0)
    tas = mahotas.features.tas(img_u8)
    zern = mahotas.features.zernike_moments(img_u8, radius=min(img.shape)//2, degree=8)

    # ===== FOURIER DESCRIPTORS =====
    fft = np.fft.fft2(img)
    fft_shift = np.fft.fftshift(fft)
    magnitude_spectrum = np.abs(fft_shift)
    magnitude_spectrum /= (magnitude_spectrum.max() + 1e-10)  # normalise to [0, 1]

    fft_mean = magnitude_spectrum.mean()
    fft_std = magnitude_spectrum.std()
    fft_energy = np.sum(magnitude_spectrum**2)
    fft_entropy = -np.sum(magnitude_spectrum * np.log2(magnitude_spectrum + 1e-10))

    return np.hstack([
        area, perim, ecc, circ, elip,
        mean, std, skw, krt, ent,
        contrast, corr, energy, homog,
        lbp_hist, hrlk, tas, zern, fft_mean, fft_std, fft_energy, fft_entropy
    ])

# ========== CSV COM NORMALIZAÇÃO BASEADA NO TREINO ==========
def gerar_df_csv(diretorio, label):
    """Build a feature table for every ``.png`` file in *diretorio*.

    Each row holds the image name and cell id parsed from the file name
    (``<image_name>_celula_<cell_id>.png``; the cell id is ``"NA"`` when the
    separator is absent), the features returned by ``extrair_atributos`` as
    columns ``feat_0 .. feat_{n-1}``, and *label* in the last column.

    Files are processed in sorted name order so the row order, and therefore
    any seeded shuffle applied afterwards, is reproducible.

    Args:
        diretorio: Directory to scan (not recursive).
        label: Class label stored in the ``label`` column of every row.

    Returns:
        A ``pandas.DataFrame`` with one row per image. When the directory
        contains no ``.png`` file an empty DataFrame (no rows, no columns) is
        returned instead of failing.
    """
    linhas = []
    for arq in sorted(os.listdir(diretorio)):
        if not arq.endswith(".png"):
            continue
        path = os.path.normpath(os.path.join(diretorio, arq))
        feat = extrair_atributos(path)
        base = os.path.splitext(arq)[0].split("_celula_")
        cell_id = base[1] if len(base) > 1 else "NA"
        linhas.append([base[0], cell_id] + list(feat) + [label])

    if not linhas:
        # Nothing to describe: the feature count is unknown, so return a
        # column-less frame that concatenates cleanly with the other classes.
        return pd.DataFrame()

    # Every row is [image_name, cell_id, *features, label]
    n_feat = len(linhas[0]) - 3
    columns = ["image_name", "cell_id"] + [f"feat_{i}" for i in range(n_feat)] + ["label"]
    return pd.DataFrame(linhas, columns=columns)

def normalizar_e_salvar(df_train, df_val, df_test):
    """Min-max scale the feature columns and write the three splits to CSV.

    The feature columns are every column except the first two and the last
    one. The scaler is fitted on *df_train* only and then applied to all
    three frames, which are modified in place. The results are written to
    ``train_6classes.csv``, ``val_6classes.csv`` and ``test_6classes.csv`` in
    the current working directory, without the index.
    """
    feature_cols = df_train.columns[2:-1]

    scaler = MinMaxScaler()
    scaler.fit(df_train[feature_cols])

    frames = (df_train, df_val, df_test)
    csv_names = ("train_6classes.csv", "val_6classes.csv", "test_6classes.csv")

    # Scale every split before writing any file
    for frame in frames:
        frame[feature_cols] = scaler.transform(frame[feature_cols])

    for frame, csv_name in zip(frames, csv_names):
        frame.to_csv(csv_name, index=False)

# === Build one dataframe per split by stacking the 6 classes ===
# The position of a directory in each tuple is its numeric class label:
# 0 NEGATIVE, 1 ASC-H, 2 ASC-US, 3 LSIL, 4 HSIL, 5 SCC.
df_train = pd.concat(
    [
        gerar_df_csv(class_dir, class_label)
        for class_label, class_dir in enumerate((
            train_negative_dir, train_asch_dir, train_ascus_dir,
            train_lsil_dir, train_hsil_dir, train_scc_dir,
        ))
    ],
    ignore_index=True,
).sample(frac=1, random_state=42)

df_val = pd.concat(
    [
        gerar_df_csv(class_dir, class_label)
        for class_label, class_dir in enumerate((
            val_negative_dir, val_asch_dir, val_ascus_dir,
            val_lsil_dir, val_hsil_dir, val_scc_dir,
        ))
    ],
    ignore_index=True,
).sample(frac=1, random_state=42)

df_test = pd.concat(
    [
        gerar_df_csv(class_dir, class_label)
        for class_label, class_dir in enumerate((
            test_negative_dir, test_asch_dir, test_ascus_dir,
            test_lsil_dir, test_hsil_dir, test_scc_dir,
        ))
    ],
    ignore_index=True,
).sample(frac=1, random_state=42)

# Scale with statistics from the training split and write the CSV files
normalizar_e_salvar(df_train, df_val, df_test)

print("Processamento completo com extração, normalização e salvamento para 6 classes.")


Células descartadas: 425
Classe NEGATIVE já está balanceada.
Aumentando classe ASC-H com 3838 imagens...


 60%|███████████████████████████████████████████████▋                                | 384/644 [00:04<00:02, 86.70it/s]


Aumento de dados concluído para classe ASC-H. Total gerado: 3838
Aumentando classe ASC-US com 4065 imagens...


 98%|██████████████████████████████████████████████████████████████████████████████  | 407/417 [00:05<00:00, 78.48it/s]


Aumento de dados concluído para classe ASC-US. Total gerado: 4065
Aumentando classe LSIL com 3543 imagens...


 38%|██████████████████████████████▏                                                 | 355/939 [00:04<00:07, 80.03it/s]


Aumento de dados concluído para classe LSIL. Total gerado: 3543
Aumentando classe HSIL com 3300 imagens...


 28%|██████████████████████                                                         | 330/1182 [00:04<00:12, 70.13it/s]


Aumento de dados concluído para classe HSIL. Total gerado: 3300
Aumentando classe SCC com 4370 imagens...


100%|████████████████████████████████████████████████████████████████████████████████| 112/112 [00:01<00:00, 60.82it/s]


Aumento de dados concluído para classe SCC. Total gerado: 1120
Processamento completo com extração, normalização e salvamento para 6 classes.
