In [1]:
print("Code Checker")

Code Checker


# Import 

In [5]:
import os, json, math, textwrap
import numpy as np
import pandas as pd

from pathlib import Path
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LinearRegression
from scipy.signal import savgol_filter

DATA_DIR = Path("../Data")
RAW = DATA_DIR / "raw" / "raw_battery_dataset.csv"
PROC_DIR = DATA_DIR / "processed"
RES_DIR = DATA_DIR / "results"
PROC_DIR.mkdir(parents=True, exist_ok=True)
RES_DIR.mkdir(parents=True, exist_ok=True)
RAW


PosixPath('../Data/raw/raw_battery_dataset.csv')

# CSV Load

In [6]:
# Read with robust options (handles stray commas/na)
df_raw = pd.read_csv(RAW)
print(df_raw.shape)
df_raw.head()


(680, 11)


Unnamed: 0,battery_id,cycle,chI,chV,chT,disI,disV,disT,BCt,SOH,RUL
0,B5,1,1.440147,4.254682,23.988733,1.894407,3.273523,32.980834,1.986196,99.30979,219
1,B5,2,1.416595,4.159825,25.665347,1.829949,4.038741,32.25792,1.98624,99.311985,218
2,B5,3,1.420272,4.276323,25.40791,1.942105,3.214433,35.134801,1.984252,99.212608,217
3,B5,4,1.33768,4.236697,27.069757,2.073577,3.134529,32.082988,1.969236,98.461812,216
4,B5,5,1.263946,4.142791,26.478353,2.049885,3.729341,32.483154,1.974862,98.743106,215


#### Battery_id - EACH CELL
#### cycle - Indicates Test Cycle Number: A FULL CHARGE + DISCHARGE COUNTS AS 1 CYCLE 
#### chl - CHARGING CURRENT
#### chv - CHARGING VOLTAGE
#### cht - CHARGING TEMPERATURE
#### disl - DISCHARGING CURRENT - CURRENT DRAWN WHILE DISCHARGE
#### disv - DISCHARGING VOLTAGE - VOLTAGE DRAWN WHILE DISCHARGE
#### dist - DISCHARGING TEMPERATURE - TEMPERATURE DRAWN WHILE DISCHARGE
#### BCt - BATTERY CAPACITY - HOW MUCH CHARGE A CELL CAN STORE
#### SOH - STATE OF HEALTH
#### RUL - REMAINING USEFUL LIFE



In [7]:
# === Canonical Mapping (specific to Kaggle Li-ion dataset) ===
canonical_map = {
    "battery_id": "battery_id",        # identifier
    "cycle": "cycle",                  # cycle index
    "chI": "charge_current",           # charging current (A)
    "chV": "charge_voltage",           # charging voltage (V)
    "chT": "charge_temp",              # charging temperature (°C)
    "disI": "discharge_current",       # discharging current (A)
    "disV": "discharge_voltage",       # discharging voltage (V)
    "disT": "discharge_temp",          # discharging temperature (°C)
    "BCt": "capacity",                 # battery capacity (Ah)
    "SOH": "soh",                      # state of health (%)
    "RUL": "rul"                       # remaining useful life (cycles)
}

df = df_raw.rename(columns=canonical_map)
df = df[list(canonical_map.values())]

# Ensure numeric columns are clean
num_cols = [
    "cycle", "charge_current", "charge_voltage", "charge_temp",
    "discharge_current", "discharge_voltage", "discharge_temp",
    "capacity", "soh", "rul"
]
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")

# Sort for consistency
df = df.sort_values(by=["battery_id", "cycle"]).reset_index(drop=True)
print(df.shape)
df.head()


(680, 11)


Unnamed: 0,battery_id,cycle,charge_current,charge_voltage,charge_temp,discharge_current,discharge_voltage,discharge_temp,capacity,soh,rul
0,B5,1,1.440147,4.254682,23.988733,1.894407,3.273523,32.980834,1.986196,99.30979,219
1,B5,2,1.416595,4.159825,25.665347,1.829949,4.038741,32.25792,1.98624,99.311985,218
2,B5,3,1.420272,4.276323,25.40791,1.942105,3.214433,35.134801,1.984252,99.212608,217
3,B5,4,1.33768,4.236697,27.069757,2.073577,3.134529,32.082988,1.969236,98.461812,216
4,B5,5,1.263946,4.142791,26.478353,2.049885,3.729341,32.483154,1.974862,98.743106,215


# PREPROCESS

In [8]:
# === Cleaning ===
df_clean = df.copy()

# Interpolate and fill missing data
df_clean[num_cols] = df_clean[num_cols].interpolate(limit_direction="both")
df_clean[num_cols] = df_clean[num_cols].fillna(method="bfill").fillna(method="ffill")

# Verify per-battery data ranges
summary = df_clean.groupby("battery_id").agg({
    "cycle": ["min", "max"],
    "capacity": ["min", "max"],
    "soh": ["min", "max"],
    "rul": ["min", "max"]
}).round(3)

summary


  df_clean[num_cols] = df_clean[num_cols].fillna(method="bfill").fillna(method="ffill")


Unnamed: 0_level_0,cycle,cycle,capacity,capacity,soh,soh,rul,rul
Unnamed: 0_level_1,min,max,min,max,min,max,min,max
battery_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
B5,1,220,0.898,1.986,44.913,99.312,0,219
B6,1,210,0.942,1.994,47.11,99.685,0,209
B7,1,250,0.751,1.991,37.549,99.545,0,249


# SIGNAL SMOOTHING 

In [9]:
from scipy.signal import savgol_filter
import numpy as np
import pandas as pd

# Columns to smooth
SMOOTH_COLS = ["charge_voltage","discharge_voltage","charge_temp","discharge_temp","capacity"]

def sg_safe(x, window=11, polyorder=2):
    x = np.asarray(x)
    n = len(x)
    # enforce odd window and >= poly+2
    w = max(window, polyorder + 2 if (polyorder + 2) % 2 == 1 else polyorder + 3)
    if n < w: 
        w = max(3, n - (1 - n % 2))  # nearest odd <= n
    if w < 3: 
        return x
    try:
        return savgol_filter(x, w, polyorder)
    except ValueError:
        return x

def ema_vec(x, span=10):
    return pd.Series(x).ewm(span=span, adjust=False).mean().values

# --- (Optional) small auto-tuner per-battery+signal ---
def choose_sg_window(series, candidates=(7,9,11,13,15)):
    x = np.asarray(series)
    best_w, best_score = None, np.inf
    for w in candidates:
        xs = sg_safe(x, window=w, polyorder=2)
        resid = x - xs
        rough = np.var(np.diff(xs))           # smoother wants lower
        fiterr = np.var(resid)                # don’t drift far from data
        score = 0.6*rough + 0.4*fiterr        # weight shape more than fit
        if score < best_score:
            best_w, best_score = w, score
    return best_w or 11

# Apply smoothing per battery_id to respect trajectories
df_smooth = df_clean.copy()
for bid, g in df_smooth.groupby("battery_id"):
    idx = g.index
    # auto-pick one SG window per battery for consistency
    ref_window = choose_sg_window(g["capacity"].values) if "capacity" in g else 11
    for col in SMOOTH_COLS:
        if col in g:
            df_smooth.loc[idx, f"{col}_sg"]  = sg_safe(g[col].values, window=ref_window, polyorder=2)
            df_smooth.loc[idx, f"{col}_ema"] = ema_vec(g[col].values, span=10)

# Replace df_clean with smoothed frame for downstream cells
df_clean = df_smooth

# Quick quality check metrics (optional display)
def smoothing_qc(raw, smooth):
    resid = raw - smooth
    return {
        "resid_std": float(np.nanstd(resid)),
        "derivative_roughness": float(np.nanvar(np.diff(smooth)))
    }

qc_report = {}
for col in SMOOTH_COLS:
    if f"{col}_sg" in df_clean:
        qc_report[col] = smoothing_qc(df_clean[col].values, df_clean[f"{col}_sg"].values)
qc_report


{'charge_voltage': {'resid_std': 0.04186645901180995,
  'derivative_roughness': 0.00016502358598027129},
 'discharge_voltage': {'resid_std': 0.26353847947908626,
  'derivative_roughness': 0.0064116521485236624},
 'charge_temp': {'resid_std': 1.3166496307241928,
  'derivative_roughness': 0.16459058134095864},
 'discharge_temp': {'resid_std': 1.5956915632270232,
  'derivative_roughness': 0.24668977201018358},
 'capacity': {'resid_std': 0.004118676748971825,
  'derivative_roughness': 0.0033926097284148844}}