# 🧱 11 — DIM_IES (Gold)

Este notebook:
- Lê `silver/2018_anonimizado.xlsx` e `silver/2019_anonimizado.xlsx`
- Consolida as bases (2018+2019)
- Constrói a dimensão **DIM_IES**
  - Preferindo `IES_ID_FAKE` / `IES_NOME_FAKE` quando existirem
  - Criando atributos derivados: **PÚBLICA/PRIVADA** e **ÂMBITO** (FEDERAL/ESTADUAL/MUNICIPAL)
- Exporta em `gold/output/dim_ies.csv`


## 0) Imports

In [None]:
import pandas as pd
import numpy as np


## 1) Paths robustos

In [None]:
from pathlib import Path

# ------------------------------------------------------
# Robust paths (works from VS Code / Jupyter / OneDrive):
# the project root is located by searching for the
# 'silver' folder, starting at the current directory.
# ------------------------------------------------------
CWD = Path.cwd().resolve()

def find_project_root(start: Path, marker: str = "silver", max_depth: int = 12) -> Path:
    """Walk up from *start* until a directory containing *marker* is found.

    Args:
        start: Directory where the search begins (it is checked first).
        marker: Name of the entry whose presence identifies the project
            root. Defaults to ``"silver"``, the original hard-coded value.
        max_depth: Maximum number of directories inspected, counting
            *start* itself. Defaults to 12, the original hard-coded limit.

    Returns:
        The first directory, from *start* upwards, that contains *marker*.

    Raises:
        FileNotFoundError: If no inspected directory contains *marker*.
    """
    # ``parents`` already ends at the filesystem root, so no explicit
    # "parent == self" stop condition is needed.
    candidates = (start, *start.parents)
    for candidate in candidates[:max_depth]:
        if (candidate / marker).exists():
            return candidate
    raise FileNotFoundError(
        f"Não encontrei a pasta '{marker}' subindo a árvore. Rode o notebook dentro do repo."
    )

# Anchor every directory on the detected project root so the notebook does
# not depend on the directory it was launched from.
PROJECT_ROOT = find_project_root(CWD)
SILVER_DIR = PROJECT_ROOT / "silver"
GOLD_DIR = PROJECT_ROOT / "gold"
OUT_DIR = GOLD_DIR / "output"
# Create the output folder (and any missing parents); no error if it exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations for a quick visual check.
print("📌 CWD:", CWD)
print("📌 PROJECT_ROOT:", PROJECT_ROOT)
print("📌 SILVER_DIR:", SILVER_DIR)
print("📌 OUT_DIR:", OUT_DIR)


## 2) Ler Silver (2018 + 2019)

In [None]:
import pandas as pd
import numpy as np

# Silver workbooks that are stacked into a single frame.
INPUT_FILES = [
    SILVER_DIR / "2018_anonimizado.xlsx",
    SILVER_DIR / "2019_anonimizado.xlsx",
]

print("📥 INPUT_FILES:")
for f in INPUT_FILES:
    print(" -", f, "| existe?", f.exists())

# Fail fast: check every input before parsing any workbook, so a missing
# file is reported without first paying for an Excel read of the others.
missing_files = [f for f in INPUT_FILES if not f.exists()]
if missing_files:
    raise FileNotFoundError(f"Arquivo não encontrado: {missing_files[0]}")

dfs = []
for f in INPUT_FILES:
    # dtype=str asks pandas to read every column as text.
    tmp = pd.read_excel(f, dtype=str)
    # Record which workbook each row came from.
    tmp["fonte_arquivo"] = f.name
    dfs.append(tmp)

df = pd.concat(dfs, ignore_index=True)
print("✅ Linhas/Colunas consolidadas:", df.shape)
df.head()


## 3) Funções auxiliares

In [None]:
import pandas as pd
import numpy as np

def norm_missing(s: pd.Series) -> pd.Series:
    """Strip whitespace and turn placeholder text into real missing values.

    Values are converted to text and stripped. An entry becomes ``NaN`` when
    it was already missing in *s*, or when its stripped text is empty or is
    one of ``nan`` / ``none`` / ``<na>`` in any letter case.

    Args:
        s: Series to normalise; it is not modified.

    Returns:
        A new Series with the same index holding the stripped text, with
        ``NaN`` in place of missing or placeholder entries.
    """
    # Capture real missing values *before* the string cast: how astype(str)
    # spells them ("nan", "None", "<NA>") should not decide the outcome.
    was_missing = s.isna()
    text = s.astype(str).str.strip()
    is_placeholder = text.str.lower().isin(["", "nan", "none", "<na>"])
    return text.mask(was_missing | is_placeholder, np.nan)

def dedup_most_complete(df_in: pd.DataFrame, key: str) -> pd.DataFrame:
    """Keep one row per *key* value: the one with the most non-null cells.

    Args:
        df_in: Frame that may contain several rows per key; not modified.
        key: Name of the column that identifies duplicates.

    Returns:
        A frame ordered by *key* with a single row per key value and the
        same columns as *df_in*.
    """
    # Completeness of a row = how many of its cells are not null.
    completeness = df_in.notna().sum(axis=1)
    scored = df_in.assign(_score=completeness)
    # Within each key, the most complete row comes first.
    ranked = scored.sort_values(by=[key, "_score"], ascending=[True, False])
    winners = ranked.drop_duplicates(subset=[key], keep="first")
    return winners.drop(columns=["_score"])

def pick_first_existing(candidates, df_):
    """Return the first name in *candidates* that is a column of *df_*.

    Candidates are tried in the order given; ``None`` is returned when none
    of them is present.
    """
    available = df_.columns
    for name in candidates:
        if name in available:
            return name
    return None


## 4) Construir DIM_IES

In [None]:
# Candidates are tried in order, so the *_FAKE columns win when present.
ies_id_col = pick_first_existing(["IES_ID_FAKE", "CODIGO_DA_IES"], df)
ies_name_col = pick_first_existing(["IES_NOME_FAKE", "NOME_DA_IES"], df)

# The id is the key of the dimension; the name is optional.
if ies_id_col is None:
    raise KeyError("Não encontrei coluna de id de IES (IES_ID_FAKE ou CODIGO_DA_IES).")

cols = [ies_id_col]
if ies_name_col is not None:
    cols.append(ies_name_col)

# Descriptive attributes are carried over only when the source has them.
cols += [
    c
    for c in (
        "ORGANIZACAO_ACADEMICA",
        "SISTEMA_DE_ENSINO",
        "CATEGORIA_ADMINISTRATIVA",
        "SITUACAO_DA_IES",
        "UF_PROCESSO",
        "UF_CADASTRO",
        "MUNICIPIO_PROCESSO",
        "MUNICIPIO_CADASTRO",
    )
    if c in df.columns and c not in cols
]

# Work on a copy so the consolidated frame is left untouched.
dim_ies = df[cols].copy()

# Rows without an id cannot belong to the dimension.
dim_ies[ies_id_col] = norm_missing(dim_ies[ies_id_col])
dim_ies = dim_ies.dropna(subset=[ies_id_col])

if ies_name_col is not None:
    dim_ies[ies_name_col] = norm_missing(dim_ies[ies_name_col])

# Categorical attributes are upper-cased so later text matching is
# case-insensitive.
for c in ["ORGANIZACAO_ACADEMICA","SISTEMA_DE_ENSINO","CATEGORIA_ADMINISTRATIVA","SITUACAO_DA_IES"]:
    if c in dim_ies.columns:
        # astype(object): an all-missing column may come back with a
        # non-object dtype, on which the .str accessor would raise.
        dim_ies[c] = norm_missing(dim_ies[c]).astype(object).str.upper()

# Placeholder for an absent UF column. It must carry dim_ies' own index: the
# dropna above leaves gaps, and a fresh 0..n-1 index would misalign (and
# drop) values when combined with the other column and assigned back.
uf_absent = pd.Series([pd.NA] * len(dim_ies), index=dim_ies.index, dtype=object)
uf_proc = norm_missing(dim_ies["UF_PROCESSO"]) if "UF_PROCESSO" in dim_ies.columns else uf_absent
uf_cad = norm_missing(dim_ies["UF_CADASTRO"]) if "UF_CADASTRO" in dim_ies.columns else uf_absent
# UF_PROCESSO is preferred; UF_CADASTRO only fills the gaps.
dim_ies["UF"] = uf_proc.fillna(uf_cad)

# PUBLICA_PRIVADA: derived from the administrative category.
if "CATEGORIA_ADMINISTRATIVA" in dim_ies.columns:
    # Flag missing values before the string cast: astype(str) would turn
    # them into the text "NAN", which would then be labelled "PRIVADA".
    cat_missing = dim_ies["CATEGORIA_ADMINISTRATIVA"].isna()
    cat = dim_ies["CATEGORIA_ADMINISTRATIVA"].astype(str).str.upper()
    dim_ies["PUBLICA_PRIVADA"] = np.select(
        # Matches both the accented and the unaccented spelling.
        [cat_missing, cat.str.contains("PÚBLIC|PUBLIC", na=False)],
        ["DESCONHECIDO", "PÚBLICA"],
        default="PRIVADA",
    )
else:
    dim_ies["PUBLICA_PRIVADA"] = "DESCONHECIDO"

# AMBITO_ADMINISTRATIVO: derived from the education system.
if "SISTEMA_DE_ENSINO" in dim_ies.columns:
    # Same reasoning as above: a missing value is unknown, not "OUTROS".
    sist_missing = dim_ies["SISTEMA_DE_ENSINO"].isna()
    sist = dim_ies["SISTEMA_DE_ENSINO"].astype(str).str.upper()
    dim_ies["AMBITO_ADMINISTRATIVO"] = np.select(
        # np.select takes the first condition that holds, so order matters.
        [sist_missing,
         sist.str.contains("FEDERAL", na=False),
         sist.str.contains("ESTADUAL", na=False),
         sist.str.contains("MUNICIPAL", na=False)],
        ["DESCONHECIDO", "FEDERAL", "ESTADUAL", "MUNICIPAL"],
        default="OUTROS"
    )
else:
    dim_ies["AMBITO_ADMINISTRATIVO"] = "DESCONHECIDO"

# Source column -> output column name.
rename_map = {
    ies_id_col: "id_ies",
    "ORGANIZACAO_ACADEMICA": "organizacao_academica",
    "SISTEMA_DE_ENSINO": "sistema_de_ensino",
    "CATEGORIA_ADMINISTRATIVA": "categoria_administrativa",
    "SITUACAO_DA_IES": "situacao_da_ies",
    "UF": "uf",
    "MUNICIPIO_PROCESSO": "municipio_processo",
    "MUNICIPIO_CADASTRO": "municipio_cadastro",
}
# The name column is optional; add it only when one was found, rather than
# inserting a None key and filtering it out afterwards.
if ies_name_col is not None:
    rename_map[ies_name_col] = "nome_ies"
# Keep only the entries whose source column is actually present.
rename_map = {k: v for k, v in rename_map.items() if k in dim_ies.columns}
dim_ies = dim_ies.rename(columns=rename_map)

# One row per institution: the most complete record for each id wins.
dim_ies = dedup_most_complete(dim_ies, "id_ies").reset_index(drop=True)

print("✅ DIM_IES pronta:", dim_ies.shape)
dim_ies.head(10)


## 5) Exportar

In [None]:
# Write the finished dimension into OUT_DIR as a UTF-8 CSV, without the
# DataFrame index column.
out_file = OUT_DIR / "dim_ies.csv"
dim_ies.to_csv(out_file, index=False, encoding="utf-8")
print("✅ Salvo em:", out_file)
