In [6]:
import os, re, glob, zipfile, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

from flaml import AutoML
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Path settings (use "." when the CSV files sit in the same folder as the notebook)
DATA_DIR = "."
OUT_DIR  = "./outputs_flaml_shap"
os.makedirs(OUT_DIR, exist_ok=True)  # create the output folder up front; no error if it already exists

# FLAML settings
TIME_BUDGET_SEC = 180   # time budget (seconds) for each LOCO training run (can be raised: 300-900)
SEED = 42
N_JOBS = -1

# Plotting
plt.rcParams["figure.dpi"] = 120


In [7]:
import re

def _norm(s: str) -> str:
    """Küçük harfe çevir + harf/rakam dışını sil (space/_/-)"""
    return re.sub(r"[^a-z0-9]", "", str(s).strip().lower())

# Accepted header spellings for each canonical column. Every spelling is run
# through _norm() while the table is built, so variants that only differ in
# case or separators ("mean_temperature", "mean temperature", ...) collapse
# into a single normalized alias.
TARGET_ALIASES = {
    role: {_norm(spelling) for spelling in spellings}
    for role, spellings in {
        "soh": ("soh", "stateofhealth", "y", "health"),
    }.items()
}
FEATURE_ALIASES = {
    role: {_norm(spelling) for spelling in spellings}
    for role, spellings in {
        "cycle": ("cycle", "cycles", "efc", "equivalentfullcycles", "ncycle", "cyclecount"),
        "peak": ("peak", "pmax", "icpeak", "normalizedpeak", "normalizedpmax",
                 "normalizedicpeak", "pmaxnorm", "pmaxnormalized", "p_max"),
        "meanT": ("meant", "temperature", "temp", "meantemp", "avgtemp", "meantemperature",
                  "averagetemperature", "tmean",
                  "mean_temperature", "mean-temperature", "mean temperature"),
    }.items()
}

def _find_index(colnames, alias_set, min_partial_len=3):
    """Return the position of the first column that matches one of the aliases.

    Column names are normalized with ``_norm`` and matched in two passes:

    1. Exact: the first column whose normalized name is a member of ``alias_set``.
    2. Partial fallback: the first column whose normalized name *contains* an
       alias as a substring (e.g. "temperature" inside "meantemperature").

    Aliases shorter than ``min_partial_len`` characters (and empty aliases) take
    part in the exact pass only. Without that restriction a one-letter alias
    such as "y" would substring-match unrelated headers like "cycle" and the
    wrong column would be selected silently.

    Args:
        colnames: iterable of column labels.
        alias_set: collection of already-normalized alias strings.
        min_partial_len: minimum alias length allowed in the substring fallback.

    Returns:
        The zero-based column position, or ``None`` when nothing matches.
    """
    norm_cols = [_norm(c) for c in colnames]

    # Pass 1: exact match on the normalized header.
    for i, n in enumerate(norm_cols):
        if n in alias_set:
            return i

    # Pass 2: substring match, restricted to aliases long enough to be meaningful.
    partial_aliases = [a for a in alias_set if a and len(a) >= min_partial_len]
    for i, n in enumerate(norm_cols):
        if any(a in n for a in partial_aliases):
            return i
    return None

def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Reduce ``df`` to the four canonical numeric columns used by the notebook.

    The SoH target and the ``cycle`` / ``peak`` / ``meanT`` features are located
    through the alias tables, coerced to numeric (unparseable values become NaN)
    and returned as a new frame with columns ``cycle, peak, meanT, soh``. Rows
    with a NaN in any of the four columns are dropped.

    Raises:
        ValueError: when the SoH column, or any of the feature columns, cannot
            be found among the headers of ``df``.
    """
    cols = list(df.columns)

    soh_idx = _find_index(cols, TARGET_ALIASES["soh"])
    feature_idx = {
        name: _find_index(cols, FEATURE_ALIASES[name])
        for name in ("cycle", "peak", "meanT")
    }

    # The target is reported on its own; missing features are reported together.
    if soh_idx is None:
        raise ValueError(f"Hedef (SoH) kolonu bulunamadı. Mevcut: {cols}")
    missing = [name for name, idx in feature_idx.items() if idx is None]
    if missing:
        raise ValueError(f"Zorunlu feature kolon(lar) bulunamadı: {missing}\nMevcut kolonlar: {cols}")

    # Insertion order fixes the output column order: cycle, peak, meanT, soh.
    positions = {**feature_idx, "soh": soh_idx}
    numeric_columns = {
        name: pd.to_numeric(df.iloc[:, pos], errors="coerce")
        for name, pos in positions.items()
    }
    return pd.DataFrame(numeric_columns).dropna()

def infer_cell_id_from_path(path: str) -> str:
    """Derive a cell identifier from a file path.

    The file name is searched (case-insensitively) for ``Cell`` followed by
    optional whitespace and digits. On a match the matched text is returned
    with all whitespace removed (spaces, tabs, ...), so ``"Cell 3"`` becomes
    ``"Cell3"``; the letter case of the file name is kept. When there is no
    match, the file name without its extension is returned.

    Args:
        path: path to a data file; only its base name is inspected.

    Returns:
        The inferred cell id string.
    """
    base = os.path.basename(path)
    m = re.search(r"(Cell\s*\d+)", base, flags=re.I)
    if m is None:
        return os.path.splitext(base)[0]
    # The pattern accepts any whitespace, so strip any whitespace (not only " ").
    return re.sub(r"\s+", "", m.group(1))


In [8]:
SEARCH_PAT = "Cell*_Cycle_Peak_meanT_SoH.csv"

# Collect the per-cell CSV files in sorted order; stop early if none exist.
csv_paths = sorted(glob.glob(os.path.join(DATA_DIR, SEARCH_PAT)))
if len(csv_paths) == 0:
    raise FileNotFoundError(f"'{SEARCH_PAT}' bulunamadı. Çalışma dizini: {os.getcwd()}")

# Standardize every file and tag its rows with the cell id taken from the file name.
dfs = []
for csv_path in csv_paths:
    raw = pd.read_csv(csv_path)
    std = standardize_columns(raw).assign(Cell=infer_cell_id_from_path(csv_path))
    dfs.append(std)

# One long frame with a fresh 0..n-1 index.
data = pd.concat(dfs, ignore_index=True)
display(data.head())
print(f"Yüklendi: {len(csv_paths)} dosya, {data.shape[0]} satır, Kolonlar: {list(data.columns)}")


Unnamed: 0,cycle,peak,meanT,soh,Cell
0,0,5.098876,40.049519,100.0,Cell1
1,100,4.807921,39.993983,99.220226,Cell1
2,200,4.723264,39.992577,98.608316,Cell1
3,300,4.652136,40.006883,98.03196,Cell1
4,400,4.442668,39.988917,97.487351,Cell1


Yüklendi: 8 dosya, 519 satır, Kolonlar: ['cycle', 'peak', 'meanT', 'soh', 'Cell']


In [9]:
import shap
import numpy as np
import matplotlib.pyplot as plt

def _close_new_figures(open_before):
    """Close every matplotlib figure whose number is not listed in ``open_before``.

    ``open_before`` is a snapshot of ``plt.get_fignums()`` taken before plotting,
    so only figures created since the snapshot are closed.
    """
    for fig_num in set(plt.get_fignums()) - set(open_before):
        plt.close(fig_num)


results = []
# Order the cell ids by their numeric part (Cell1..Cell8). An id without any
# digit gets key -1 (sorts first) instead of crashing on int("").
all_cells = sorted(data["Cell"].unique(), key=lambda x: int(re.sub(r"\D", "", x) or -1))

for test_cell in tqdm(all_cells, desc="LOCO"):
    # Leave-one-cell-out split: the held-out cell is the test set, all others train.
    train_df = data[data["Cell"] != test_cell].copy()
    test_df  = data[data["Cell"] == test_cell].copy()

    if len(test_df) == 0 or len(train_df) == 0:
        print(f"[UYARI] {test_cell}: Boş fold, atlandı.")
        continue

    X_train = train_df[["cycle","peak","meanT"]].copy()
    y_train = train_df["soh"].copy()
    X_test  = test_df[["cycle","peak","meanT"]].copy()
    y_test  = test_df["soh"].copy()

    # Defensive numeric coercion + NaN/inf cleanup
    X_train = X_train.apply(pd.to_numeric, errors="coerce")
    X_test  = X_test.apply(pd.to_numeric,  errors="coerce")
    y_train = pd.to_numeric(y_train, errors="coerce")
    y_test  = pd.to_numeric(y_test,  errors="coerce")

    # Keep only rows where every feature and the target are finite.
    tr_mask = np.isfinite(X_train.values).all(axis=1) & np.isfinite(y_train.values)
    te_mask = np.isfinite(X_test.values).all(axis=1)  & np.isfinite(y_test.values)
    X_train, y_train = X_train.loc[tr_mask], y_train.loc[tr_mask]
    X_test,  y_test  = X_test.loc[te_mask],  y_test.loc[te_mask]

    if len(X_train) == 0 or len(X_test) == 0:
        print(f"[UYARI] {test_cell}: NaN/inf temizliği sonrası boş veri, atlandı.")
        continue

    # FLAML
    automl = AutoML()
    automl.fit(
        X_train=X_train, y_train=y_train,
        task="regression",
        time_budget=TIME_BUDGET_SEC,
        metric="rmse",
        estimator_list=["lgbm","xgboost","rf","extra_tree"],  # tree-focused, SHAP-compatible
        n_jobs=N_JOBS,
        seed=SEED,
        verbose=0
    )

    # Prediction & metrics (RMSE as sqrt(MSE), independent of the sklearn version)
    y_pred = automl.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = float(np.sqrt(mse))
    mae  = mean_absolute_error(y_test, y_pred)
    r2   = r2_score(y_test, y_pred)

    results.append({
        "Test_Cell": test_cell,
        "Best_Estimator": automl.best_estimator,
        "Best_Config": automl.best_config,
        "Best_Iteration": automl.best_iteration,
        "RMSE": rmse,
        "MAE":  mae,
        "R2":   r2,
    })

    # ==== SHAP ====
    # Background sample (keeps the generic Explainer fallback affordable)
    bg = X_train.sample(min(1000, len(X_train)), random_state=SEED)

    # FLAML wrapper -> underlying fitted model (falls back to the wrapper itself)
    base_model = getattr(automl.model, "model", automl.model)
    shap_arr = None

    # 1) TreeExplainer for tree models (fastest / most stable)
    try:
        explainer = shap.TreeExplainer(base_model)
        shap_arr = explainer.shap_values(X_test)
        if isinstance(shap_arr, list):  # a per-class list may be returned in some cases
            shap_arr = shap_arr[0]
        shap_arr = np.asarray(shap_arr)
    except Exception as e_tree:
        # Report why the fast path failed instead of swallowing it silently.
        print(f"[UYARI] {test_cell}: TreeExplainer başarısız ({type(e_tree).__name__}: {e_tree}); genel Explainer deneniyor.")
        # 2) Generic Explainer (with a masker built from the background sample)
        try:
            masker = shap.maskers.Independent(bg, max_samples=1000)
            explainer = shap.Explainer(base_model, masker)
            sv = explainer(X_test, check_additivity=False)
            shap_arr = np.asarray(sv.values)
        except Exception as e_auto:
            print(f"[UYARI] {test_cell}: SHAP hesaplanamadı. ({type(e_auto).__name__}: {e_auto})")
            shap_arr = None

    # SHAP figures / CSV (only when a well-formed 2-D array was produced)
    if shap_arr is not None and shap_arr.ndim == 2 and shap_arr.shape[0] == len(X_test):
        # Summary (beeswarm)
        figs_before = plt.get_fignums()
        try:
            plt.figure()
            shap.summary_plot(shap_arr, X_test, feature_names=X_test.columns, show=False)
            fig_path = os.path.join(OUT_DIR, f"shap_summary_{test_cell}.png")
            plt.tight_layout(); plt.savefig(fig_path, dpi=200, bbox_inches="tight")
        except Exception as e_sum:
            print(f"[UYARI] {test_cell}: SHAP summary çizimi başarısız: {e_sum}")
        finally:
            # Close whatever was opened, also when plotting/saving raised.
            _close_new_figures(figs_before)

        # SHAP values CSV
        try:
            shap_df = pd.DataFrame(shap_arr, columns=X_test.columns)
            shap_csv = os.path.join(OUT_DIR, f"shap_values_{test_cell}.csv")
            shap_df.to_csv(shap_csv, index=False)
        except Exception as e_csv:
            print(f"[UYARI] {test_cell}: SHAP CSV yazımı başarısız: {e_csv}")

        # Dependence plots for the (up to) three most important features
        try:
            mean_abs = np.abs(shap_arr).mean(axis=0)
            order = np.argsort(-mean_abs)
            top_features = [X_test.columns[i] for i in order[:min(3, X_test.shape[1])]]
            for feat in top_features:
                figs_before = plt.get_fignums()
                try:
                    # No plt.figure() here: without an `ax` argument shap.dependence_plot
                    # opens its own figure, so a pre-created one would stay empty and
                    # never be closed (one stray "0 Axes" figure per plot).
                    shap.dependence_plot(
                        ind=feat, shap_values=shap_arr, features=X_test, feature_names=X_test.columns,
                        show=False, interaction_index=None
                    )
                    dep_path = os.path.join(OUT_DIR, f"shap_dependence_{test_cell}_{feat}.png")
                    plt.tight_layout(); plt.savefig(dep_path, dpi=200, bbox_inches="tight")
                finally:
                    _close_new_figures(figs_before)
        except Exception as e_dep:
            print(f"[UYARI] {test_cell}: SHAP dependence çizimi başarısız: {e_dep}")
    else:
        print(f"[UYARI] {test_cell}: SHAP değerleri üretilemedi, görsel atlandı.")

# Result table
res_df = pd.DataFrame(results)
res_csv = os.path.join(OUT_DIR, "flaml_loco_results.csv")
res_df.to_csv(res_csv, index=False)
display(res_df)
print(f"[KAYIT] Sonuç tablosu: {res_csv}")

LOCO:   0%|          | 0/8 [00:00<?, ?it/s]

Unnamed: 0,Test_Cell,Best_Estimator,Best_Config,Best_Iteration,RMSE,MAE,R2
0,Cell1,xgboost,"{'n_estimators': 440, 'max_leaves': 11, 'min_c...",312,0.522933,0.375982,0.993837
1,Cell2,lgbm,"{'n_estimators': 445, 'num_leaves': 7, 'min_ch...",353,0.779598,0.521048,0.986479
2,Cell3,lgbm,"{'n_estimators': 231, 'num_leaves': 14, 'min_c...",270,0.572419,0.42485,0.992138
3,Cell4,xgboost,"{'n_estimators': 280, 'max_leaves': 15, 'min_c...",267,0.6269,0.543891,0.989334
4,Cell5,lgbm,"{'n_estimators': 472, 'num_leaves': 10, 'min_c...",389,0.716865,0.292135,0.98256
5,Cell6,lgbm,"{'n_estimators': 472, 'num_leaves': 9, 'min_ch...",414,0.556837,0.469195,0.98964
6,Cell7,extra_tree,"{'n_estimators': 92, 'max_features': 0.9031334...",346,1.546465,1.394771,0.921774
7,Cell8,lgbm,"{'n_estimators': 442, 'num_leaves': 14, 'min_c...",446,0.752366,0.6912,0.986256


[KAYIT] Sonuç tablosu: ./outputs_flaml_shap\flaml_loco_results.csv


<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

<Figure size 768x576 with 0 Axes>

In [10]:
# Bundle everything under OUT_DIR into one deflate-compressed archive; entries
# are stored with their path relative to the current working directory.
zip_path = "flaml_shap_outputs.zip"
with zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
    for dirpath, _dirs, filenames in os.walk(OUT_DIR):
        for filename in filenames:
            source_file = os.path.join(dirpath, filename)
            archive.write(source_file, arcname=os.path.relpath(source_file, "."))
print(f"ZIP hazır: {zip_path}")


ZIP hazır: flaml_shap_outputs.zip


In [None]:
# === EvalML-style SHAP beeswarm (FLAML) ===
# Goal: read outputs_flaml_shap/shap_values_<CELL_ID>.csv (falling back to /mnt/data when it is missing),
#       align it row by row (same length) with the original X (Cycle, Peak, Temperature),
#       and draw a horizontal beeswarm plot similar to the one EvalML produces.
import os, re, glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

# ---------------- Parameters ----------------
CELL_ID = "Cell1"  # e.g. "Cell4", "Cell8", etc.
OUT_PNG = f"shap_importance_{CELL_ID}_flaml.png"  # file name the finished figure is saved under

# ---------------- Yardımcılar --------------
def _norm(s: str) -> str:
    return re.sub(r"[^a-z0-9]", "", str(s).strip().lower())

def _find_idx(cols, alias_set, min_partial_len=3):
    """Return the position of the first column matching one of ``alias_set``.

    Headers are normalized with ``_norm`` and compared in two passes: an exact
    membership test first, then a substring fallback (alias contained in the
    normalized header). Empty aliases and aliases shorter than
    ``min_partial_len`` are excluded from the substring fallback, because an
    empty string is contained in every header and very short aliases match
    unrelated ones.

    Args:
        cols: iterable of column labels.
        alias_set: collection of already-normalized alias strings.
        min_partial_len: minimum alias length allowed in the substring fallback.

    Returns:
        The zero-based column position, or ``None`` when nothing matches.
    """
    ncols = [_norm(c) for c in cols]

    # Pass 1: exact match.
    for i, n in enumerate(ncols):
        if n in alias_set:
            return i

    # Pass 2: substring match with sufficiently long aliases only.
    partial_aliases = [a for a in alias_set if a and len(a) >= min_partial_len]
    for i, n in enumerate(ncols):
        if any(a in n for a in partial_aliases):
            return i
    return None

# ---------------- Load the SHAP values ---------------
shap_csv_main = f"./outputs_flaml_shap/shap_values_{CELL_ID}.csv"
shap_csv_alt  = f"/mnt/data/shap_values_{CELL_ID}.csv"
shap_xls_alt  = f"/mnt/data/shap_values_{CELL_ID}.xls"

# Try the candidates in priority order; the first existing file wins.
for shap_path, reader in (
    (shap_csv_main, pd.read_csv),
    (shap_csv_alt,  pd.read_csv),
    (shap_xls_alt,  pd.read_excel),
):
    if os.path.exists(shap_path):
        shap_df = reader(shap_path)
        break
else:
    raise FileNotFoundError(f"SHAP dosyası bulunamadı: {shap_csv_main} / {shap_csv_alt} / {shap_xls_alt}")

# ---------------- Load the feature values (X) --------
# Raw file name pattern: CellX_Cycle_Peak_meanT_SoH.csv
cand = sorted(glob.glob(f"./{CELL_ID}_Cycle_Peak_meanT_SoH.csv")) or \
       sorted(glob.glob(f"/mnt/data/{CELL_ID}_Cycle_Peak_meanT_SoH.csv"))
if not cand:
    raise FileNotFoundError(f"{CELL_ID}_Cycle_Peak_meanT_SoH.csv dosyası bulunamadı (., /mnt/data).")
raw = pd.read_csv(cand[0])

cols = list(raw.columns)
# These alias sets mirror the ones the training loader uses, so a header that
# was accepted when the models were trained (e.g. "meanT", "temp") is accepted
# here as well instead of raising.
idx_cycle = _find_idx(cols, {"cycle","cycles","efc","equivalentfullcycles","ncycle","cyclecount"})
idx_peak  = _find_idx(cols, {"peak","pmax","icpeak","normalizedpeak","normalizedpmax","normalizedicpeak",
                             "pmaxnorm","pmaxnormalized"})
idx_temp  = _find_idx(cols, {"meant","temperature","temp","meantemp","avgtemp","meantemperature",
                             "averagetemperature","tmean"})
# SoH is optional here; it is only used to drop the same rows the training
# pipeline dropped, so the remaining rows line up with the SHAP CSV.
idx_soh   = _find_idx(cols, {"soh","stateofhealth","health"})

if None in (idx_cycle, idx_peak, idx_temp):
    raise ValueError(f"Zorunlu kolon(lar) yok. Mevcut başlıklar: {cols}")

feature_columns = {
    "Cycle":           pd.to_numeric(raw.iloc[:, idx_cycle], errors="coerce"),
    "Normalized_Peak": pd.to_numeric(raw.iloc[:, idx_peak],  errors="coerce"),
    "Temperature":     pd.to_numeric(raw.iloc[:, idx_temp],  errors="coerce"),
}
if idx_soh is not None:
    feature_columns["_soh"] = pd.to_numeric(raw.iloc[:, idx_soh], errors="coerce")

# Drop rows with a missing or non-finite value in any feature (and in SoH when
# available), then keep only the three feature columns.
X = (
    pd.DataFrame(feature_columns)
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
    [["Cycle", "Normalized_Peak", "Temperature"]]
)

# --- Align SHAP and X lengths (trim to the shorter one if needed) ---
min_len = min(len(shap_df), len(X))
if len(shap_df) != len(X):
    # Trimming keeps the plot working, but the rows may no longer correspond,
    # so make the mismatch visible instead of hiding it.
    print(f"[UYARI] SHAP ({len(shap_df)}) ve X ({len(X)}) satır sayıları farklı; "
          f"ilk {min_len} satıra kırpılıyor, nokta renkleri yanlış hizalanabilir.")
X = X.iloc[:min_len, :].reset_index(drop=True)

# Map the SHAP columns onto the plot names. The SHAP CSV may carry names such
# as cycle / peak / meanT, so "meant" and "tmean" (the normalized forms of
# meanT / Tmean) are accepted next to anything containing "temp".
peak_keys = {"peak","pmax","icpeak","normalizedpeak","normalizedpmax","normalizedicpeak"}
temp_keys = {"meant","tmean"}
sc_map = {
    "Normalized_Peak": [c for c in shap_df.columns if _norm(c) in peak_keys],
    "Cycle":           [c for c in shap_df.columns if "cycle" in _norm(c)],
    "Temperature":     [c for c in shap_df.columns if "temp" in _norm(c) or _norm(c) in temp_keys],
}
for k in sc_map:
    if not sc_map[k]:
        raise ValueError(f"{k} için SHAP kolonu bulunamadı. SHAP kolonları: {list(shap_df.columns)}")
    sc_map[k] = sc_map[k][0]  # first matching column wins

# Positional slice (first min_len rows), independent of the index labels.
S = shap_df.iloc[:min_len].loc[:, [sc_map["Normalized_Peak"], sc_map["Cycle"], sc_map["Temperature"]]].copy()
S.columns = ["Normalized_Peak","Cycle","Temperature"]

# ---------------- Plot (EvalML-style) -----------------
# Row order: Normalized_Peak on top, Cycle in the middle, Temperature at the bottom
plot_order = ["Normalized_Peak","Cycle","Temperature"]
ypos = {"Temperature":0, "Cycle":1, "Normalized_Peak":2}

fig, ax = plt.subplots(figsize=(10, 3.2))
cmap = plt.cm.coolwarm
# Dedicated, seeded generator: the vertical jitter (and therefore the saved
# figure) is identical on every run and the global NumPy RNG is left untouched.
rng = np.random.default_rng(42)

for feat in plot_order:
    xs = S[feat].values
    if len(xs) == 0:
        # Nothing to draw for this row; np.nanmin/np.nanmax would raise on empty input.
        continue
    ys = np.full_like(xs, ypos[feat], dtype=float)
    # light vertical jitter in [-0.25, 0.25) so overlapping points stay visible
    jitter = (rng.random(len(xs)) - 0.5) * 0.5
    # colour: min-max normalized value of the corresponding feature
    vals = X[feat].values
    vmin, vmax = np.nanmin(vals), np.nanmax(vals)
    if vmax > vmin:
        scaled = (vals - vmin) / (vmax - vmin)
    else:
        scaled = np.zeros(len(vals), dtype=float)  # constant feature -> lowest colour
    ax.scatter(xs, ys + jitter, s=14, alpha=0.9, edgecolors="none", c=cmap(scaled))

# y-tick labels, derived from ypos so rows and labels cannot drift apart
ax.set_yticks(list(ypos.values()))
ax.set_yticklabels(list(ypos.keys()))

# vertical line at SHAP = 0
ax.axvline(0, color="gray", linewidth=1)

# x grid (light, dotted)
ax.grid(True, axis="x", linestyle=":", linewidth=0.6, alpha=0.6)
ax.grid(False, axis="y")

# title and axis label
ax.set_title(f"EvalML-style SHAP (Cell: {CELL_ID})", fontsize=14, pad=8)
ax.set_xlabel("SHAP value (impact on model output)")

# colour bar (Low -> High)
sm = plt.cm.ScalarMappable(cmap=cmap, norm=Normalize(vmin=0, vmax=1))
sm.set_array([])
cbar = fig.colorbar(sm, ax=ax, fraction=0.046, pad=0.04)
cbar.set_label("Feature value", rotation=90)
cbar.set_ticks([0,1])
cbar.set_ticklabels(["Low","High"])

fig.tight_layout()
fig.savefig(OUT_PNG, dpi=200, bbox_inches="tight")
plt.show()
print(f"Kaydedildi: {OUT_PNG}")
