# 001_preprocess_raw.ipynb
Notebook para padronizar datasets do UCI em CSV, com alvo numérico e normalização.

Este notebook:
1. Lê os arquivos brutos em `data/raw/` para **banknote_authentication**, **breast_cancer (WDBC)** e **sonar**.
2. Converte o alvo categórico para numérico.
3. Normaliza os atributos (MinMaxScaler ou StandardScaler).
4. Salva CSVs padronizados em `data/processed/<dataset>/`:
   - `dataset_clean.csv` com valores originais e `target` numérico
   - `dataset_normalized.csv` com atributos normalizados e `target` numérico
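5. Salva um `label_map.json` com o mapeamento do alvo.
6. Exporta os índices de 10 folds estratificados (treino/teste) em `data/splits/<dataset>/`.
7. Grava um resumo dos datasets processados em `data/processed/_summary.json`.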


In [2]:
import os
import json
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

def find_repo_root(start=None):
    """Walk up the directory tree until a project marker is found.

    A directory counts as the project root when it contains any of
    ``pyproject.toml``, ``.git`` or ``configs``.

    Args:
        start: Directory to start searching from. Defaults to the current
            working directory when omitted or empty.

    Returns:
        Absolute path of the first ancestor (including ``start`` itself) that
        contains a marker. If the filesystem root is reached without finding
        one, the parent of the starting directory is returned as a fallback
        (useful when the notebook runs from ``notebooks/``).
    """
    origin = os.path.abspath(start or os.getcwd())
    markers = ("pyproject.toml", ".git", "configs")
    cur = origin
    while True:
        if any(os.path.exists(os.path.join(cur, m)) for m in markers):
            return cur
        parent = os.path.dirname(cur)
        if parent == cur:  # reached the filesystem/drive root
            # Fall back relative to where the search began, so an explicit
            # `start` is honoured instead of silently using the CWD.
            return os.path.abspath(os.path.join(origin, ".."))
        cur = parent

# Resolve the project root once and derive the raw/processed data folders from it.
BASE = find_repo_root()
RAW  = os.path.join(BASE, "data", "raw")
PROC = os.path.join(BASE, "data", "processed")

SCALER_KIND = "minmax"  # change to "standard" if you prefer

# Make sure the output folder exists
os.makedirs(PROC, exist_ok=True)

# Optional: align the CWD with the project root to avoid surprises
os.chdir(BASE)

# Echo the resolved paths and whether the data folders exist on disk.
print("BASE:", BASE)
print("RAW :", RAW, "exists?", os.path.exists(RAW))
print("PROC:", PROC, "exists?", os.path.exists(PROC))


BASE: c:\Users\hopper\Documents\Mestrado\Quantica\qml-ga-project
RAW : c:\Users\hopper\Documents\Mestrado\Quantica\qml-ga-project\data\raw exists? True
PROC: c:\Users\hopper\Documents\Mestrado\Quantica\qml-ga-project\data\processed exists? True


In [3]:
def _normalize_features(df: pd.DataFrame, target_col: str, kind: str = "minmax") -> pd.DataFrame:
    X = df.drop(columns=[target_col])
    y = df[target_col].copy()
    scaler = StandardScaler() if kind == "standard" else MinMaxScaler()
    Xs = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
    out = Xs.assign(**{target_col: y.values})
    return out

def _save_outputs(dataset_name: str, df_clean: pd.DataFrame, df_norm: pd.DataFrame, label_map: dict):
    """Write one dataset's clean CSV, normalized CSV and label map to disk.

    The files are placed in ``<PROC>/<dataset_name>/`` (created if missing) as
    ``dataset_clean.csv``, ``dataset_normalized.csv`` and ``label_map.json``.
    A one-line confirmation is printed when done; nothing is returned.
    """
    out_dir = os.path.join(PROC, dataset_name)
    os.makedirs(out_dir, exist_ok=True)

    # Both frames are written without the index column.
    csv_outputs = (
        ("dataset_clean.csv", df_clean),
        ("dataset_normalized.csv", df_norm),
    )
    for filename, frame in csv_outputs:
        frame.to_csv(os.path.join(out_dir, filename), index=False)

    map_path = os.path.join(out_dir, "label_map.json")
    with open(map_path, "w", encoding="utf-8") as handle:
        json.dump(label_map, handle, ensure_ascii=False, indent=2)

    print(f"[OK] {dataset_name}: {len(df_clean)} linhas | salvo em {out_dir}")


## Banknote Authentication

In [4]:
def load_banknote():
    """Read the raw banknote authentication file into a labelled frame.

    Returns:
        ``(df, label_map)`` where ``df`` has feature columns ``f1..f4`` plus
        ``target``, and ``label_map`` records that the file already stores the
        target as 0/1.

    Raises:
        ValueError: If the file does not have 5 columns, or if the last column
            holds anything other than 0 and 1.
    """
    path = os.path.join(RAW, "banknote_authentication", "data_banknote_authentication.txt")
    df = pd.read_csv(path, header=None)
    # Same guard the other loaders use: a wrong delimiter or a truncated file
    # would otherwise produce a silently mislabelled frame.
    if df.shape[1] != 5:
        raise ValueError(f"Esperado 5 colunas (4 features + 1 alvo), encontrado {df.shape[1]}")
    df.columns = [f"f{i}" for i in range(1, df.shape[1])] + ["target"]

    # The label map below claims the target is already 0/1; verify that claim.
    unexpected = set(df["target"].unique()) - {0, 1}
    if unexpected:
        raise ValueError(f"Valores inesperados em 'target': {unexpected}")

    label_map = {"neg": 0, "pos": 1, "from_file": "already_numeric_0_1"}
    return df, label_map

# Load, normalise and persist the banknote dataset, then preview the clean frame.
df_bank, map_bank = load_banknote()
df_bank_norm = _normalize_features(df_bank, "target", SCALER_KIND)
# _save_outputs returns None; binding the result to `_` would only clobber
# IPython's last-output variable, so the call stands on its own.
_save_outputs("banknote_authentication", df_bank, df_bank_norm, map_bank)
df_bank.head()


[OK] banknote_authentication: 1372 linhas | salvo em c:\Users\hopper\Documents\Mestrado\Quantica\qml-ga-project\data\processed\banknote_authentication


Unnamed: 0,f1,f2,f3,f4,target
0,3.6216,8.6661,-2.8073,-0.44699,0
1,4.5459,8.1674,-2.4586,-1.4621,0
2,3.866,-2.6383,1.9242,0.10645,0
3,3.4566,9.5228,-4.0112,-3.5944,0
4,0.32924,-4.4552,4.5718,-0.9888,0


## Breast Cancer Wisconsin (Diagnostic) — WDBC

In [5]:
def load_wdbc():
    """Read the raw WDBC file and encode the diagnosis as a numeric target.

    Returns:
        ``(df, label_map)`` where ``df`` has feature columns ``f1..f30`` plus an
        integer ``target`` (``B`` -> 0, ``M`` -> 1); the ``id`` and
        ``diagnosis`` columns are dropped.

    Raises:
        ValueError: If the file does not have 32 columns, or if the diagnosis
            column contains a label that is not in ``label_map``.
    """
    path = os.path.join(RAW, "breast_cancer", "wdbc.data")
    df = pd.read_csv(path, header=None)
    cols = ["id", "diagnosis"] + [f"f{i}" for i in range(1, 31)]
    if df.shape[1] != len(cols):
        raise ValueError(f"Esperado 32 colunas, encontrado {df.shape[1]}")
    df.columns = cols

    label_map = {"B": 0, "M": 1}
    target = df["diagnosis"].map(label_map)
    # Series.map yields NaN for labels missing from the mapping; report them
    # explicitly instead of letting astype(int) fail with an opaque cast error.
    unmapped = target.isna()
    if unmapped.any():
        unknown = sorted(df.loc[unmapped, "diagnosis"].astype(str).unique())
        raise ValueError(f"Rótulos desconhecidos em 'diagnosis': {unknown}")

    df["target"] = target.astype(int)
    df = df.drop(columns=["id", "diagnosis"])
    return df, label_map

# Load, normalise and persist the WDBC dataset, then preview the clean frame.
df_wdbc, map_wdbc = load_wdbc()
df_wdbc_norm = _normalize_features(df_wdbc, "target", SCALER_KIND)
# _save_outputs returns None; binding the result to `_` would only clobber
# IPython's last-output variable, so the call stands on its own.
_save_outputs("breast_cancer_wdbc", df_wdbc, df_wdbc_norm, map_wdbc)
df_wdbc.head()


[OK] breast_cancer_wdbc: 569 linhas | salvo em c:\Users\hopper\Documents\Mestrado\Quantica\qml-ga-project\data\processed\breast_cancer_wdbc


Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f22,f23,f24,f25,f26,f27,f28,f29,f30,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,1
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,1
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,1
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,1
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,1


## Sonar — Mines vs Rocks

In [6]:
def load_sonar():
    """Read the raw sonar file and encode the rock/mine label as a numeric target.

    Returns:
        ``(df, label_map)`` where ``df`` has feature columns ``f1..f60`` plus an
        integer ``target`` (``R`` -> 0, ``M`` -> 1); the ``label`` column is
        dropped.

    Raises:
        ValueError: If the file does not have 61 columns, or if the label
            column contains a value that is not in ``label_map``.
    """
    path = os.path.join(RAW, "sonar", "sonar.all-data")
    df = pd.read_csv(path, header=None)
    if df.shape[1] != 61:
        raise ValueError(f"Esperado 61 colunas 60 features + 1 alvo, encontrado {df.shape[1]}")
    df.columns = [f"f{i}" for i in range(1, 61)] + ["label"]

    label_map = {"R": 0, "M": 1}
    target = df["label"].map(label_map)
    # Series.map yields NaN for labels missing from the mapping; report them
    # explicitly instead of letting astype(int) fail with an opaque cast error.
    unmapped = target.isna()
    if unmapped.any():
        unknown = sorted(df.loc[unmapped, "label"].astype(str).unique())
        raise ValueError(f"Rótulos desconhecidos em 'label': {unknown}")

    df["target"] = target.astype(int)
    df = df.drop(columns=["label"])
    return df, label_map

# Load, normalise and persist the sonar dataset, then preview the clean frame.
df_sonar, map_sonar = load_sonar()
df_sonar_norm = _normalize_features(df_sonar, "target", SCALER_KIND)
# _save_outputs returns None; binding the result to `_` would only clobber
# IPython's last-output variable, so the call stands on its own.
_save_outputs("sonar", df_sonar, df_sonar_norm, map_sonar)
df_sonar.head()


[OK] sonar: 208 linhas | salvo em c:\Users\hopper\Documents\Mestrado\Quantica\qml-ga-project\data\processed\sonar


Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f52,f53,f54,f55,f56,f57,f58,f59,f60,target
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,0
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,0
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,0
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,0
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,0


In [7]:
# Absolute paths of the clean and normalized CSVs, keyed by dataset name.
summary = {
    dataset: {
        f"{variant}_path": os.path.abspath(os.path.join(PROC, dataset, f"dataset_{variant}.csv"))
        for variant in ("clean", "normalized")
    }
    for dataset in ("banknote_authentication", "breast_cancer_wdbc", "sonar")
}
summary


{'banknote_authentication': {'clean_path': 'c:\\Users\\hopper\\Documents\\Mestrado\\Quantica\\qml-ga-project\\data\\processed\\banknote_authentication\\dataset_clean.csv',
  'normalized_path': 'c:\\Users\\hopper\\Documents\\Mestrado\\Quantica\\qml-ga-project\\data\\processed\\banknote_authentication\\dataset_normalized.csv'},
 'breast_cancer_wdbc': {'clean_path': 'c:\\Users\\hopper\\Documents\\Mestrado\\Quantica\\qml-ga-project\\data\\processed\\breast_cancer_wdbc\\dataset_clean.csv',
  'normalized_path': 'c:\\Users\\hopper\\Documents\\Mestrado\\Quantica\\qml-ga-project\\data\\processed\\breast_cancer_wdbc\\dataset_normalized.csv'},
 'sonar': {'clean_path': 'c:\\Users\\hopper\\Documents\\Mestrado\\Quantica\\qml-ga-project\\data\\processed\\sonar\\dataset_clean.csv',
  'normalized_path': 'c:\\Users\\hopper\\Documents\\Mestrado\\Quantica\\qml-ga-project\\data\\processed\\sonar\\dataset_normalized.csv'}}

In [8]:
import numpy as np

def sanity_report(name, df_clean, df_norm, target="target"):
    """Print a quick before/after comparison of a dataset's clean and normalized frames.

    Shows the class proportions of both frames, asserts that the target column
    is element-wise identical in both, and prints the mean and standard
    deviation of the first three feature columns of each frame.

    Raises:
        AssertionError: If the target values differ between the two frames.
    """
    print(f"\n=== {name} ===")

    # Class distribution: both are computed before anything is printed.
    dist_clean = df_clean[target].value_counts(normalize=True).sort_index()
    dist_norm = df_norm[target].value_counts(normalize=True).sort_index()
    for heading, dist in (
        ("Distribuição de classes (clean):", dist_clean),
        ("Distribuição de classes (normalized):", dist_norm),
    ):
        print(heading)
        print(dist)

    # Normalization must never touch the target column.
    same_target = np.array_equal(df_clean[target].values, df_norm[target].values)
    assert same_target, "target mudou após normalização"

    # Feature statistics, limited to the first three feature columns.
    head_clean = df_clean.drop(columns=[target]).iloc[:, :3]
    head_norm = df_norm.drop(columns=[target]).iloc[:, :3]
    for heading, feats in (
        ("Clean  mean/std (primeiras 3 colunas):", head_clean),
        ("Normed mean/std (primeiras 3 colunas):", head_norm),
    ):
        print(heading, feats.mean().round(3).tolist(), feats.std().round(3).tolist())

# Run the sanity report for each dataset, in the order they were processed.
for _name, _clean, _norm in (
    ("banknote_authentication", df_bank, df_bank_norm),
    ("breast_cancer_wdbc", df_wdbc, df_wdbc_norm),
    ("sonar", df_sonar, df_sonar_norm),
):
    sanity_report(_name, _clean, _norm)
print("\n[OK] Sanity checks concluídos.")



=== banknote_authentication ===
Distribuição de classes (clean):
target
0    0.555394
1    0.444606
Name: proportion, dtype: float64
Distribuição de classes (normalized):
target
0    0.555394
1    0.444606
Name: proportion, dtype: float64
Clean  mean/std (primeiras 3 colunas): [0.434, 1.922, 1.398] [2.843, 5.869, 4.31]
Normed mean/std (primeiras 3 colunas): [0.539, 0.587, 0.288] [0.205, 0.22, 0.186]

=== breast_cancer_wdbc ===
Distribuição de classes (clean):
target
0    0.627417
1    0.372583
Name: proportion, dtype: float64
Distribuição de classes (normalized):
target
0    0.627417
1    0.372583
Name: proportion, dtype: float64
Clean  mean/std (primeiras 3 colunas): [14.127, 19.29, 91.969] [3.524, 4.301, 24.299]
Normed mean/std (primeiras 3 colunas): [0.338, 0.324, 0.333] [0.167, 0.145, 0.168]

=== sonar ===
Distribuição de classes (clean):
target
0    0.466346
1    0.533654
Name: proportion, dtype: float64
Distribuição de classes (normalized):
target
0    0.466346
1    0.533654
Nam

In [9]:
import os
import pandas as pd
from sklearn.model_selection import StratifiedKFold

# Root folder for the exported fold index files; created up front if missing.
SPLITS_DIR = os.path.join(BASE, "data", "splits")
os.makedirs(SPLITS_DIR, exist_ok=True)

def export_stratified_folds(name, df_clean, target="target", n_splits=10, seed=42):
    """Write the train/test index arrays of a shuffled stratified K-fold split.

    For each fold ``i`` (numbered from 1), the index arrays yielded by
    ``StratifiedKFold.split`` are saved under ``<SPLITS_DIR>/<name>/`` as
    ``fold_XX_train.csv`` and ``fold_XX_test.csv``, each holding a single
    ``idx`` column.

    Args:
        name: Dataset name, used as the output sub-folder.
        df_clean: Frame to split; stratification uses its ``target`` column.
        target: Name of the label column.
        n_splits: Number of folds.
        seed: Random state passed to the splitter.
    """
    out_dir = os.path.join(SPLITS_DIR, name)
    os.makedirs(out_dir, exist_ok=True)

    labels = df_clean[target].values
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    fold_no = 0
    for train_idx, test_idx in splitter.split(df_clean, labels):
        fold_no += 1
        # Train file first, then test file, for every fold.
        for part, rows in (("train", train_idx), ("test", test_idx)):
            fold_path = os.path.join(out_dir, f"fold_{fold_no:02d}_{part}.csv")
            pd.DataFrame({"idx": rows}).to_csv(fold_path, index=False)

    print(f"[OK] {name}: {n_splits} folds estratificados salvos em {out_dir}")

# Export the stratified folds for each dataset, in the order they were processed.
for _name, _frame in (
    ("banknote_authentication", df_bank),
    ("breast_cancer_wdbc", df_wdbc),
    ("sonar", df_sonar),
):
    export_stratified_folds(_name, _frame)


[OK] banknote_authentication: 10 folds estratificados salvos em c:\Users\hopper\Documents\Mestrado\Quantica\qml-ga-project\data\splits\banknote_authentication
[OK] breast_cancer_wdbc: 10 folds estratificados salvos em c:\Users\hopper\Documents\Mestrado\Quantica\qml-ga-project\data\splits\breast_cancer_wdbc
[OK] sonar: 10 folds estratificados salvos em c:\Users\hopper\Documents\Mestrado\Quantica\qml-ga-project\data\splits\sonar


In [10]:
import json
# Per-dataset summary: row count, feature count (every column except the
# target), number of distinct classes, and the CSV paths recorded in `summary`.
meta = {
    dataset: {
        "n_rows": len(frame),
        "n_features": frame.shape[1] - 1,
        "n_classes": int(frame["target"].nunique()),
        "clean_csv": summary[dataset]["clean_path"],
        "normalized_csv": summary[dataset]["normalized_path"],
    }
    for dataset, frame in (
        ("banknote_authentication", df_bank),
        ("breast_cancer_wdbc", df_wdbc),
        ("sonar", df_sonar),
    )
}

# Persist the summary next to the processed datasets.
out_meta = os.path.join(PROC, "_summary.json")
with open(out_meta, "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)
print(f"[OK] summary salvo em {out_meta}")
meta


[OK] summary salvo em c:\Users\hopper\Documents\Mestrado\Quantica\qml-ga-project\data\processed\_summary.json


{'banknote_authentication': {'n_rows': 1372,
  'n_features': 4,
  'n_classes': 2,
  'clean_csv': 'c:\\Users\\hopper\\Documents\\Mestrado\\Quantica\\qml-ga-project\\data\\processed\\banknote_authentication\\dataset_clean.csv',
  'normalized_csv': 'c:\\Users\\hopper\\Documents\\Mestrado\\Quantica\\qml-ga-project\\data\\processed\\banknote_authentication\\dataset_normalized.csv'},
 'breast_cancer_wdbc': {'n_rows': 569,
  'n_features': 30,
  'n_classes': 2,
  'clean_csv': 'c:\\Users\\hopper\\Documents\\Mestrado\\Quantica\\qml-ga-project\\data\\processed\\breast_cancer_wdbc\\dataset_clean.csv',
  'normalized_csv': 'c:\\Users\\hopper\\Documents\\Mestrado\\Quantica\\qml-ga-project\\data\\processed\\breast_cancer_wdbc\\dataset_normalized.csv'},
 'sonar': {'n_rows': 208,
  'n_features': 60,
  'n_classes': 2,
  'clean_csv': 'c:\\Users\\hopper\\Documents\\Mestrado\\Quantica\\qml-ga-project\\data\\processed\\sonar\\dataset_clean.csv',
  'normalized_csv': 'c:\\Users\\hopper\\Documents\\Mestrado\\Qu