# Package Imports

In [9]:
import pandas as pd 
from xgboost import XGBClassifier
import os
from sklearn.metrics import classification_report, log_loss
from sklearn.model_selection import train_test_split
import pyodbc 
import numpy as np
from sklearn.preprocessing import LabelEncoder
from IPython.display import HTML, display
from sklearn.model_selection import StratifiedKFold
import optuna
import matplotlib.pyplot as plt
from xgboost import plot_importance
import multiprocessing
from joblib import Parallel, delayed
from scipy.stats import skew, kurtosis, entropy
from scipy.signal import medfilt, find_peaks
from numpy.fft import fft, rfft, rfftfreq
import m2cgen as m2c
import seaborn as sns
import shap
from optuna.samplers import CmaEsSampler
from itertools import groupby
from sqlalchemy import create_engine
from urllib.parse import quote_plus


In [10]:

# Connection parameters.
# Nothing secret is stored in the notebook: the password MUST come from the
# environment (BMIS_SQL_PASSWORD); the other settings may be overridden the same way.
# NOTE(review): a password was previously committed in this cell — rotate that credential.
server = os.environ.get('BMIS_SQL_SERVER', 'prod-bmis2-sqlserver.database.windows.net')
database = os.environ.get('BMIS_SQL_DATABASE', 'BMIS2')  # Change if your database name differs
username = os.environ.get('BMIS_SQL_USERNAME', 'DataIntern')
password = os.environ['BMIS_SQL_PASSWORD']  # raises KeyError when the variable is not set
driver = os.environ.get('BMIS_SQL_DRIVER', 'ODBC Driver 18 for SQL Server')

# One row per test record / line item / graph-point join, restricted to
# ToolTypeID 7, decisions 0/1/5/6 and records that carry a starter-voltage trace.
mvt = '''
SELECT  t.TestRecordDetailId as "Test_Record_Detail_ID", SoftwareVersionNumber,
Voltage, Rated, Measured,g.StarterVoltageGraphpoints, g.AlternatorRippleGraphpoints, BatteryDecision as "Battery_Decision"
FROM BMIS.R_T_TestRecordDetails as t
left join BMIS.R_T_TestRecordLineItemDetail as l on t.TestRecordDetailId=l.TestRecordDetailId
left join BMIS.R_T_TestsGraphPoints as g on g.KeyID=l.GraphpointKeyID
where t.clientid not in (160, 10, 2123, 1, 12345)
and t.TestDate >='01/01/2023'
and t.TestDate <= getdate()
and StarterVoltageGraphpoints is not null
and StarterVoltageGraphpoints like '%:%'
and t.Voltage > 0
and t.Measured > 0
and t.ToolTypeID = 7
and t.BatteryDecision in (0,1,5,6)
'''

# Build the raw ODBC string and URL-quote it so special characters in the
# credentials survive inside the SQLAlchemy URL.
odbc_str = (
    f"DRIVER={{{driver}}};"
    f"SERVER={server};"
    f"DATABASE={database};"
    f"UID={username};"
    f"PWD={password};"
    "Encrypt=yes;TrustServerCertificate=no;"
)
conn_url = f"mssql+pyodbc:///?odbc_connect={quote_plus(odbc_str)}"

# Create engine (fast_executemany helps with large inserts if used later)
engine = create_engine(conn_url, fast_executemany=True)

# Execute the query and load the result into a DataFrame; always release the
# pooled connections, even when the query fails.
try:
    MVT_Original = pd.read_sql_query(mvt, con=engine)
finally:
    engine.dispose()


In [11]:
MVT_Original.shape

(27338, 8)

In [12]:
MVT_Original['Battery_Decision'].value_counts()


Battery_Decision
1    14830
0     6711
5     5229
6      568
Name: count, dtype: int64

In [13]:
# Use the test-record id as the row label for everything that follows.
MVT_Original = MVT_Original.set_index('Test_Record_Detail_ID')

In [14]:

# Battery-decision code -> label text.
# Codes 18 and 19 were previously written as a second 15 and 16, which silently
# replaced "INVALID TEST" / "ABORTED/24V"; the numbering now mirrors the 1xx
# block (118 = IN BALANCE, 119 = CLAMPS REMOVED). Code 4 had a truncated label.
bd_map = {
    0: "GOOD BATTERY",
    1: "GOOD RECHARGE",
    2: "MARGINAL RECHARGE",
    3: "MARGINAL",
    4: "CHARGE & RETEST",
    5: "REPLACE BATTERY",
    6: "BADCELL SHORT REPLACE",
    7: "REMOTE POST",
    8: "SIDE POST",
    9: "BC OPEN OR LOAD FAIL REPLACE",
    10: "BROKEN WELD REPLACE",
    11: "FROZEN BATTERY",
    12: "TOO HOT REPLACE",
    13: "TEMP SENSOR FAILED",
    14: "ABORTED",
    15: "INVALID TEST",
    16: "ABORTED/24V",
    17: "OUT OF BALANCE",
    18: "IN BALANCE",
    19: "CLAMPS REMOVED",
    20: "LOST POWER",
    21: "CHARGE & RETEST QUESTION",
    22: "SIDE POST QUESTION",
    23: "SYSTEM NOISE",
    24: "JUMPER POST QUESTION",
    25: "GOOD PACK",
    26: "CHECK PACK",
    27: "BADCELL SHORT",
    28: "To be reused",
    29: "READY TO INSTALL",
    30: "DECISION NOT REACHED",
    31: "PDI COMPLETE",
    32: "CYCLING REQUIRED",
    33: "REST & RETEST",
    34: "To be reused",
    35: "To be reused",
    36: "Replace SRFCHG",
    37: "Good SRFCHG",
    38: "GR-8 Diag SRFCHG",
    39: "Good Rech SRFCHG",
    100: "GOOD BATTERY",
    101: "GOOD RECHARGE",
    102: "MARGINAL RECHARGE",
    103: "MARGINAL",
    104: "CHARGE & RETEST",
    105: "REPLACE BATTERY",
    106: "BADCELL SHORT REPLACE",
    107: "REMOTE POST",
    108: "SIDE POST",
    109: "BC OPEN OR LOAD FAIL REPLACE",
    110: "BROKEN WELD REPLACE",
    111: "FROZEN BATTERY",
    112: "TOO HOT REPLACE",
    113: "TEMP SENSOR FAILED",
    114: "ABORTED",
    115: "INVALID TEST",
    116: "ABORTED/24V",
    117: "OUT OF BALANCE",
    118: "IN BALANCE",
    119: "CLAMPS REMOVED",
    120: "LOST POWER",
    121: "PDI COMPLETE",
    122: "POOR RESERVE",
    123: "GOOD RESERVE",
    124: "POOR CHARGE ACCEPTANCE",
    125: "GOOD CHARGE ACCEPTANCE",
    126: "NO DECISION",
    136: "Stopped",
    249: "CUSTOMER DECISION 5",
    250: "CUSTOMER DECISION 4",
    251: "CUSTOMER DECISION 3",
    252: "CUSTOMER DECISION 2",
    253: "CUSTOMER DECISION 1",
    254: "ALL BATTERY AND CHARGE DECISIONS",
    255: "No Decision",
}

# Decision labels referenced by this analysis.
decision = [
    'GOOD BATTERY',
    'GOOD RECHARGE',
    'REPLACE BATTERY',
    'CHARGE & RETEST',
    'BADCELL SHORT REPLACE',
]

# Discard rows with any missing value, then rows that repeat an earlier row
# exactly (the index label is not part of the duplicate comparison).
MVT_Original.dropna(axis=0, inplace=True)
MVT_Original.drop_duplicates(inplace=True)

# Translate the integer decision code into its label text.
MVT_Original['Battery_Decision'] = MVT_Original['Battery_Decision'].astype(int).map(bd_map)


In [15]:
# Collapse the detailed decision labels into two classes.
good = ['GOOD BATTERY', 'GOOD RECHARGE']
bad = ['REPLACE BATTERY', 'BADCELL SHORT REPLACE']


def batterydecision(status):
    """Return 'GOOD BATTERY' or 'BAD BATTERY' for labels listed in `good` / `bad`.

    Any label that appears in neither list is returned unchanged.
    """
    if status in good:
        return 'GOOD BATTERY'
    return 'BAD BATTERY' if status in bad else status
    
# Rewrite every label through batterydecision(); labels it does not recognise are kept as they are.
MVT_Original['Battery_Decision'] = MVT_Original['Battery_Decision'].apply(batterydecision)




In [16]:
# Keep only the rows whose label is one of the two target classes.
MVT_Original = MVT_Original.loc[MVT_Original['Battery_Decision'].isin(['GOOD BATTERY', 'BAD BATTERY'])]


In [17]:
MVT_Original['Battery_Decision'].value_counts()

Battery_Decision
GOOD BATTERY    18588
BAD BATTERY      4491
Name: count, dtype: int64

### Clean Starter Voltage Array

In [18]:
# Split the colon-separated trace into one column per sample (shorter traces are padded with missing values).
# The same assignment is repeated in a later cell before the array is cleaned.
Starter_Voltage_Array_Hex = MVT_Original['StarterVoltageGraphpoints'].str.split(':', expand=True)


In [19]:
Starter_Voltage_Array_Hex


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,497,498,499,500,501,502,503,504,505,506
Test_Record_Detail_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
373209073,4f1,4f0,4f1,4f1,4f1,4f1,3b5,3e3,3e6,405,...,,,,,,,,,,
375394165,4fa,4f9,4fa,4fa,4fa,4fa,311,34c,39c,3e1,...,,,,,,,,,,
374794087,4de,4df,4de,4df,4de,4de,31b,34b,36d,35f,...,,,,,,,,,,
369432352,4c7,4c7,4c7,4c7,4c8,4c7,291,2b8,2e0,2de,...,,,,,,,,,,
370672670,4fe,4fe,4ff,4ff,4ff,4fe,454,3bd,3f3,413,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331039853,4f0,4f1,4f0,4f1,4f0,4f1,385,39e,3a7,3c0,...,,,,,,,,,,
335579145,4aa,4aa,4a9,4aa,4a9,4aa,3bf,38d,389,395,...,,,,,,,,,,
330667895,4d1,4d1,4d1,4d1,4d2,4d1,312,331,2fc,2fa,...,46a,475,467,46d,46d,474,46c,46e,474,46b
330665811,4e2,4e2,4e3,4e2,4e3,4e2,3bf,3a6,3ba,3d1,...,,,,,,,,,,


In [20]:
# Split the colon-separated trace into one column per sample. Shorter traces are
# padded with missing values, and the Test_Record_Detail_ID index is carried over.
Starter_Voltage_Array_Hex = MVT_Original['StarterVoltageGraphpoints'].str.split(':', expand=True)

# Keep only traces with at least 20 recorded samples. The padding must stay
# missing for this to work: filling it with the string 'nan' first (as this cell
# used to do) makes every row look complete, so nothing is ever dropped.
Starter_Voltage_Array_Hex = Starter_Voltage_Array_Hex.dropna(thresh=20, axis=0)


In [21]:
Starter_Voltage_Array_Hex.shape

(23079, 507)

In [22]:
Starter_Voltage_Array_Hex

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,497,498,499,500,501,502,503,504,505,506
Test_Record_Detail_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
373209073,4f1,4f0,4f1,4f1,4f1,4f1,3b5,3e3,3e6,405,...,,,,,,,,,,
375394165,4fa,4f9,4fa,4fa,4fa,4fa,311,34c,39c,3e1,...,,,,,,,,,,
374794087,4de,4df,4de,4df,4de,4de,31b,34b,36d,35f,...,,,,,,,,,,
369432352,4c7,4c7,4c7,4c7,4c8,4c7,291,2b8,2e0,2de,...,,,,,,,,,,
370672670,4fe,4fe,4ff,4ff,4ff,4fe,454,3bd,3f3,413,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331039853,4f0,4f1,4f0,4f1,4f0,4f1,385,39e,3a7,3c0,...,,,,,,,,,,
335579145,4aa,4aa,4a9,4aa,4a9,4aa,3bf,38d,389,395,...,,,,,,,,,,
330667895,4d1,4d1,4d1,4d1,4d2,4d1,312,331,2fc,2fa,...,46a,475,467,46d,46d,474,46c,46e,474,46b
330665811,4e2,4e2,4e3,4e2,4e3,4e2,3bf,3a6,3ba,3d1,...,,,,,,,,,,


In [23]:
# One column per colon-separated alternator-ripple sample.
alternator_ripple_array = MVT_Original['AlternatorRippleGraphpoints'].str.split(':', expand=True)


In [24]:
def hex_to_signed_int(h: str) -> int:
    """
    Interpret a hex string as a 32-bit two's-complement number.

    The string is parsed as an unsigned integer; when bit 31 is set, 2**32 is
    subtracted so that e.g. 'FFFFFFFF' becomes -1.
    """
    unsigned = int(h, 16)
    # Bit 31 is the sign bit of a 32-bit word.
    return unsigned - 0x1_0000_0000 if unsigned & 0x8000_0000 else unsigned


Note: You can set the environment variable NUM_PARALLEL_JOBS to a value higher than your CPU count to allow more jobs. However, for CPU‑bound tasks additional jobs may simply share the available cores and lead to increased overhead.

In [25]:
# Blank out (set to None) every cell that is not a string of exactly 8 characters.
# Rows are not removed here; only the individual malformed or missing cells are cleared.
alternator_ripple_array = alternator_ripple_array.map(lambda x: x if isinstance(x, str) and len(x) == 8 else None)

In [26]:
# Convert each remaining hex string with hex_to_signed_int; missing cells are skipped (na_action='ignore').
alternator_ripple_array = alternator_ripple_array.map(hex_to_signed_int, na_action='ignore')

In [27]:
alternator_ripple_array.shape

(23079, 256)

In [28]:
# improved converters + chunked processing that preserves the original index
def _hex_convert(val):
    """Convert a single cell which may be hex (preferred) or decimal; divide result by 100."""
    try:
        if pd.isna(val):
            return np.nan
        s = str(val).strip()
        if s == '':
            return np.nan
        # treat values containing a-f as hex for sure
        if any(c in 'abcdefABCDEF' for c in s):
            return np.float32(int(s, 16) / 100.0)
        # otherwise try decimal/float parse
        try:
            return np.float32(float(s) / 100.0)
        except Exception:
            # fallback: try hex parse (if decimal parse fails)
            try:
                return np.float32(int(s, 16) / 100.0)
            except Exception:
                return np.nan
    except Exception:
        return np.nan

def process_chunk(df_chunk: pd.DataFrame) -> pd.DataFrame:
    """
    Return a converted copy of `df_chunk`; the input frame is not modified.

    Every column except the metadata columns named in `exclude` is passed cell by
    cell through `_hex_convert` and stored as float32. The row index is unchanged.
    """
    # detect columns to convert: exclude obvious metadata columns
    exclude = {'SoftwareVersionNumber', 'Test_Record_Detail_ID'}
    voltage_cols = [c for c in df_chunk.columns if c not in exclude]

    chunk = df_chunk.copy()

    if voltage_cols:
        converted = chunk[voltage_cols].map(_hex_convert).astype(np.float32)
        # Replace the columns rather than writing through .loc[:, cols]: writing
        # into the existing object columns would keep them object-typed and
        # throw away the float32 cast.
        chunk[voltage_cols] = converted

    return chunk

# autodetect cores and decide number of jobs (NUM_PARALLEL_JOBS overrides the core count)
n_cores = multiprocessing.cpu_count()
n_jobs = int(os.environ.get('NUM_PARALLEL_JOBS', n_cores)) or 1
# never more jobs than rows, and never fewer than one (np.array_split needs >= 1 section)
n_jobs = max(1, min(n_jobs, len(Starter_Voltage_Array_Hex)))

# Split by row POSITION. Splitting by label with .loc returns every row that
# carries a label, so a repeated Test_Record_Detail_ID would duplicate rows.
idx_chunks = np.array_split(np.arange(len(Starter_Voltage_Array_Hex)), n_jobs)
df_chunks = [Starter_Voltage_Array_Hex.iloc[idx] for idx in idx_chunks if len(idx) > 0]

if df_chunks:
    results = Parallel(n_jobs=n_jobs)(
        delayed(process_chunk)(chunk) for chunk in df_chunks
    )
    # The chunks are contiguous and returned in submission order, so a plain
    # concat restores the original row order and index.
    Starter_Volt_Array_MVT = pd.concat(results)
else:
    # nothing to convert
    results = []
    Starter_Volt_Array_MVT = Starter_Voltage_Array_Hex.copy()

In [29]:
# Convert object-dtype columns to their real types, drop traces with fewer than
# 20 valid samples, and store everything as float32.
# dropna returns a new frame, so the result has to be assigned back; the earlier
# bare calls (including a fillna(np.nan), which changes nothing) had no effect.
Starter_Volt_Array_MVT = Starter_Volt_Array_MVT.infer_objects(copy=False)
Starter_Volt_Array_MVT = Starter_Volt_Array_MVT.dropna(thresh=20)
Starter_Volt_Array_MVT = Starter_Volt_Array_MVT.astype(np.float32, copy=False)

In [30]:
# Report how many traces remain (the former bare to_numpy() call built an array and discarded it).
print(f'Number of Rows: {len(Starter_Volt_Array_MVT)}')



Number of Rows: 23079


### Calculate Starter Array Features

In [31]:
# ───────────────────────────────────────────────────────────────────────────────
# CONSTANTS (indices correspond to 10 ms steps)  → use Python ints for slicing,
# but cast any index arrays to np.int32 later.
# ───────────────────────────────────────────────────────────────────────────────
PRE_END        = 18
FULL_END       = 167
POST_START     = 19
RECOVERY_END   = 20
START_END      = 5

def _mad(x):
    x = np.asarray(x, dtype=float)
    m = np.median(x)
    return 1.4826 * np.median(np.abs(x - m)) + 1e-12  # robust sigma


def _recovery(row, idx_min, thresh=0.5):
    """Time (ms) until row rises by +thresh V above its min from idx_min."""
    idx_min = int(idx_min)
    base = row[idx_min]
    for j in range(idx_min, row.size):
        if row[j] >= base + thresh:
            return (j - idx_min) * dt_ms
    return (row.size - idx_min) * dt_ms
# ───────────────────────────────────────────────────────────────────────────────
# 1) FILTER OUT ROWS WITH ZERO DATA IN 0–17
# ───────────────────────────────────────────────────────────────────────────────
# to numpy array (float32) for speed and 32-bit control
wave_full = Starter_Volt_Array_MVT.to_numpy(dtype=np.float32)

# boolean mask: keep rows with at least one non-NaN in cols 0–17
has_early = ~np.isnan(wave_full[:, :PRE_END]).all(axis=1)

# filtered array + corresponding original DataFrame
wave        = wave_full[has_early]
filtered_df = Starter_Volt_Array_MVT[has_early]

# ───────────────────────────────────────────────────────────────────────────────
# 2) PRE-SLICE FOR REUSE
# ───────────────────────────────────────────────────────────────────────────────
pre_seg   = wave[:, :PRE_END].astype(np.float32, copy=False)
full_seg  = wave[:, :FULL_END].astype(np.float32, copy=False)
post_seg  = wave[:, POST_START:].astype(np.float32, copy=False)
rec_seg   = wave[:, :RECOVERY_END].astype(np.float32, copy=False)
start_seg = wave[:, :START_END].astype(np.float32, copy=False)

# ───────────────────────────────────────────────────────────────────────────────
# 3) BASIC STATISTICS (force float32 outputs)
# ───────────────────────────────────────────────────────────────────────────────
min_pre   = np.nanmin(pre_seg,  axis=1).astype(np.float32)
min_full  = np.nanmin(full_seg, axis=1).astype(np.float32)
max_full  = np.nanmax(full_seg, axis=1).astype(np.float32)
std_full  = np.nanstd(full_seg, axis=1).astype(np.float32)
mean_full = np.nanmean(full_seg, axis=1).astype(np.float32)
med_full  = np.nanmedian(full_seg, axis=1).astype(np.float32)

max_post    = np.nanmax(post_seg, axis=1).astype(np.float32)
bounce_back = (max_post - min_pre).astype(np.float32)
drop        = (wave[:, 0] - min_pre).astype(np.float32)

# ───────────────────────────────────────────────────────────────────────────────
# 4) SAFE INDICES + SLOPES (indices as int32; computed arrays as float32)
# ───────────────────────────────────────────────────────────────────────────────
min_idx = np.nanargmin(pre_seg, axis=1).astype(np.int32)  # Min_Index_Below_19

# avoid All-NaN error for post_seg:
allnan_post = np.isnan(post_seg).all(axis=1)
raw_max_idx = np.full(wave.shape[0], np.int32(-1), dtype=np.int32)
valid       = ~allnan_post

raw_max_idx[valid] = np.nanargmax(post_seg[valid], axis=1).astype(np.int32)
# If no valid max, keep as -1; convert to float32 for NaN-compatible math
max_idx_float = raw_max_idx + POST_START 

# slopes (float32)
slope_drop = np.divide(drop.astype(np.float32),
                       (-min_idx).astype(np.float32),
                       where=(min_idx != 0),
                       out=np.full_like(drop, np.float32(np.nan)))
slope_bounce_back = np.divide(bounce_back.astype(np.float32),
                              (max_idx_float - min_idx.astype(np.float32)),
                              where=((max_idx_float - min_idx.astype(np.float32)) != 0),
                              out=np.full_like(bounce_back, np.float32(np.nan)))

# ───────────────────────────────────────────────────────────────────────────────
# 5) SHAPE/ENERGY FEATURES (float32 all the way)
# ───────────────────────────────────────────────────────────────────────────────
start_voltage  = np.nanmean(start_seg, axis=1).astype(np.float32)  # Start_Voltage
time_to_min_ms = (min_idx.astype(np.int32) * np.int32(10)).astype(np.int32)     # int32 ms

# kurtosis & skew (SciPy may return float64 → cast to float32)
curve_kurt = kurtosis(full_seg, axis=1, fisher=False, bias=False, nan_policy='omit').astype(np.float32)
curve_skew = skew(full_seg, axis=1, bias=False, nan_policy='omit').astype(np.float32)

# 1st derivative (float32)
dV = np.diff(wave, axis=1).astype(np.float32, copy=False)
dV = np.pad(dV, ((0,0),(0,1)), constant_values=np.float32(np.nan)).astype(np.float32, copy=False)
dV_pre = dV[:, :PRE_END].astype(np.float32, copy=False)
Max_Rise_Rate_0_180  = np.nanmax(dV_pre, axis=1).astype(np.float32)
Max_Fall_Rate_0_180  = np.nanmin(dV_pre, axis=1).astype(np.float32)
Mean_Abs_Slope_0_180 = np.nanmean(np.abs(dV_pre), axis=1).astype(np.float32)
Std_Slope_0_180      = np.nanstd (np.abs(dV_pre), axis=1).astype(np.float32)

# 2nd derivative (float32)
d2V = np.diff(dV, axis=1).astype(np.float32, copy=False)
d2V = np.pad(d2V, ((0,0),(0,2)), constant_values=np.float32(np.nan)).astype(np.float32, copy=False)
d2V_pre = d2V[:, :PRE_END].astype(np.float32, copy=False)
Mean_Abs_Accel_0_180 = np.nanmean(np.abs(d2V_pre), axis=1).astype(np.float32)
Max_Accel_0_180      = np.nanmax(d2V_pre, axis=1).astype(np.float32)
Min_Accel_0_180      = np.nanmin(d2V_pre, axis=1).astype(np.float32)

recovery_argmin  = np.nanargmin(wave, axis=1).astype(np.int32)
Recovery_Time_ms = np.array([_recovery(r, i) for r, i in zip(wave, recovery_argmin)], dtype=np.int32)

# area & threshold counts (float32 to allow NaN if you ever mask; counts kept as int32 then cast to float32)
Area_0_200ms = (np.nansum(rec_seg, axis=1).astype(np.float32) * np.float32(10)).astype(np.float32)
Count_Below7  = np.sum(wave < np.float32(7),  axis=1, dtype=np.int32).astype(np.float32)
Count_Below9  = np.sum(wave < np.float32(9),  axis=1, dtype=np.int32).astype(np.float32)
Count_Below10 = np.sum(wave < np.float32(10), axis=1, dtype=np.int32).astype(np.float32)

# pseudo-resistance & normalized energy (float32)
measured = MVT_Original.loc[filtered_df.index, 'Measured'].astype(np.float32)

I_est = np.where(measured == np.float32(0),
                 np.float32(np.nan),
                 (measured / np.float32(12.0)).astype(np.float32)).astype(np.float32)

R_est = np.divide(drop.astype(np.float32), I_est,
                  out=np.full_like(drop, np.float32(np.nan)),
                  where=~np.isnan(I_est) & (I_est != np.float32(0)))

Norm_Energy_200ms = np.divide(Area_0_200ms.astype(np.float32), measured.astype(np.float32),
                              out=np.full_like(Area_0_200ms, np.float32(np.nan)),
                              where=measured != np.float32(0))

Rec_Slope = np.divide(bounce_back.astype(np.float32), Recovery_Time_ms.astype(np.float32),
                      out=np.full_like(bounce_back, np.float32(np.nan)),
                      where=Recovery_Time_ms != np.int32(0))
# ---- kept building blocks ----


dt_ms = 10
s = np.nan_to_num(np.asarray(full_seg, dtype=float))
n = len(s)
x = np.arange(n)

mean_val = float(np.mean(s))
diff = np.diff(s)
sigma_d = _mad(diff)
flat_mask = np.abs(diff) < 0.25 * sigma_d
longest_flat = int(np.max([len(list(g)) for k,g in groupby(flat_mask) if k] or [0]))

freqs = np.abs(fft(s - mean_val))
hf_energy = float(np.sum(freqs[int(len(freqs)/4):]) / (np.sum(freqs) + 1e-6))
norm_freqs = freqs / (np.sum(freqs) + 1e-6)
spectral_entropy = float(entropy(norm_freqs))
roll_var = float(pd.Series(s).rolling(window=10, min_periods=1).var().mean())

edge_start_diff = float(np.abs(np.median(s[:min(10, n)]) - s[0]))
edge_end_diff   = float(np.abs(np.median(s[-min(10, n):]) - s[-1]))

min_drop = float(np.min(diff))
drop_idx = int(np.argmin(diff))
lookahead = min(20, max(1, n - drop_idx - 1))
recovery_slope = float((s[drop_idx + lookahead] - s[drop_idx]) / lookahead) if drop_idx < n-1 else 0.0

zero_cross_rate = float(np.sum(diff[:-1] * diff[1:] < 0) / max(1, n))

deg = min(2, max(1, n-1))
try:
    coeffs = np.polyfit(x, s, deg)
    trend_poly = np.polyval(coeffs, x)
    poly_resid = float(np.mean((s - trend_poly) ** 2))
except Exception:
    poly_resid = 0.0

k = 4
seg_len = max(1, n // k)
slopes = []
for i in range(k):
    a = i * seg_len
    b = min(n - 1, (i + 1) * seg_len - 1)
    if b > a:
        slopes.append((s[b] - s[a]) / (b - a))
segment_slope_var = float(np.var(slopes)) if len(slopes) > 1 else 0.0

# ---- robust spike/dip on residual ----
k_med = max(5, (n // 50) | 1)              # odd kernel (~2% length)
trend_med = medfilt(s, kernel_size=k_med)
resid = s - trend_med
sigma_res = _mad(resid)

min_dist  = max(2, int(30 / dt_ms))        # ≥30 ms between events
min_width = max(1, int(10 / dt_ms))        # ≥10 ms width
min_prom  = 3.0 * sigma_res                # ≥3σ prominence

p_pos, prop_pos = find_peaks(resid,  prominence=min_prom, distance=min_dist, width=min_width)
p_neg, prop_neg = find_peaks(-resid, prominence=min_prom, distance=min_dist, width=min_width)

spike_count = int(len(p_pos) + len(p_neg))
dip_count   = int(len(p_neg))
spike_prom_sum = float(prop_pos["prominences"].sum() + prop_neg["prominences"].sum()) if spike_count else 0.0
spike_width_mean_ms = float(np.r_[prop_pos["widths"], prop_neg["widths"]].mean() * dt_ms) if spike_count else 0.0

# ---- extras (non-overlapping) ----
W = max(4, int(40 / dt_ms))                # ~40 ms window
if n >= 3*W:
    box = np.ones(W) / W
    m1 = np.convolve(s, box, mode="valid")
    steps = m1[2*W-1:] - m1[W-1:-W]
    step_sigma = _mad(steps)
    step_thr = 4.0 * step_sigma
    step_count_sustained = int(np.sum(np.abs(steps) > step_thr))
    max_step_mag = float(np.max(np.abs(steps))) if steps.size else 0.0
else:
    step_count_sustained, max_step_mag = 0, 0.0

Xr = resid - resid.mean()
F = np.abs(rfft(Xr))**2
freqs_r = rfftfreq(n, d=dt_ms/1000.0)
def _band(lo, hi):
    m = (freqs_r >= lo) & (freqs_r < hi)
    return float(F[m].sum())
Ptot = float(F.sum() + 1e-12)
bp_low  = _band(0.5, 2.0)  / Ptot
bp_mid  = _band(2.0, 8.0)  / Ptot
bp_high = _band(8.0, 20.0) / Ptot
bp_mid_ratio  = bp_mid  / (bp_low + 1e-12)
bp_high_ratio = bp_high / (bp_low + 1e-12)
resid_spectral_entropy = float(entropy((F / Ptot) + 1e-12))

m0 = max(10, min(n//10, 100))
baseline = float(np.median(s[:m0]))
rel_thr = baseline - 2.0 * sigma_res
low_mask = s < rel_thr
rel_below_frac = float(low_mask.mean())
from itertools import groupby as _gb
runs = [len(list(g)) for k,g in _gb(low_mask) if k]
rel_below_longest_ms = float((max(runs) if runs else 0) * dt_ms)

w = max(5, int(100 / dt_ms))
sr = pd.Series(s)
win_range = (sr.rolling(w, min_periods=1).max() - sr.rolling(w, min_periods=1).min())
win_range_max = float(np.nanmax(win_range.values))

tail = s[int(0.9*n):] if n >= 10 else s
tail_std = float(np.std(tail))
if tail.size >= 3:
    t = tail - tail.mean()
    tail_ac1 = float(np.dot(t[:-1], t[1:]) / (np.dot(t, t) + 1e-12))
else:
    tail_ac1 = 0.0

rms = float(np.sqrt(np.mean(s**2)) + 1e-12)
crest_factor = float(np.max(np.abs(s)) / rms)
line_length  = float(np.mean(np.abs(np.diff(s))))

a, b = int(0.2*n), int(0.8*n)
mid_duty_cycle_low = float((s[a:b] < rel_thr).mean()) if b > a else 0.0

# ───────────────────────────────────────────────────────────────────────────────
# 6) BUILD FEATURES DF & DROP PROBLEM ROWS (preserve dtypes)
# ───────────────────────────────────────────────────────────────────────────────
# Note: Pandas may display float32 as float64 in some ops, but underlying dtype stays float32.
feat = pd.DataFrame({
    'Min_Volt_Below_19':     min_pre.astype(np.float32),
    'Min':                   min_full.astype(np.float32),
    'Max':                   max_full.astype(np.float32),
    'Standard_Deviation':    std_full.astype(np.float32),
    'Average':               mean_full.astype(np.float32),
    'Median':                med_full.astype(np.float32),
    'Max_Volt_19_Above':     max_post.astype(np.float32),
    'Bounce_Back':           bounce_back.astype(np.float32),
    'Drop':                  drop.astype(np.float32),
    'Max_Index_19_Above':    max_idx_float.astype(np.float32),   # float32 with NaN support
    'Min_Index_Below_19':    min_idx.astype(np.int32),
    'Slope_Drop':            slope_drop.astype(np.float32),
    'Slope_Bounce_Back':     slope_bounce_back.astype(np.float32),
    'Start_Voltage':         start_voltage.astype(np.float32),
    'Time_To_Min_ms':        time_to_min_ms.astype(np.int32),
    'Recovery_Time_ms':      Recovery_Time_ms.astype(np.int32),
    'Area_0_200ms':          Area_0_200ms.astype(np.float32),
    'Count_Below7':          Count_Below7.astype(np.float32),
    'Count_Below9':          Count_Below9.astype(np.float32),
    'Count_Below10':         Count_Below10.astype(np.float32),
    'Curve_Kurtosis':        curve_kurt.astype(np.float32),
    'Curve_Skew':            curve_skew.astype(np.float32),
    'Max_Rise_Rate_0_180':   Max_Rise_Rate_0_180.astype(np.float32),
    'Max_Fall_Rate_0_180':   Max_Fall_Rate_0_180.astype(np.float32),
    'Mean_Abs_Slope_0_180':  Mean_Abs_Slope_0_180.astype(np.float32),
    'Std_Slope_0_180':       Std_Slope_0_180.astype(np.float32),
    'Mean_Abs_Accel_0_180':  Mean_Abs_Accel_0_180.astype(np.float32),
    'Max_Accel_0_180':       Max_Accel_0_180.astype(np.float32),
    'Min_Accel_0_180':       Min_Accel_0_180.astype(np.float32),
    'Norm_Energy_200ms':     Norm_Energy_200ms.astype(np.float32),
    'Rec_Slope':             Rec_Slope.astype(np.float32),
    'R_est':                 R_est.astype(np.float32),
    "spike_count":          np.int32(spike_count),
    "dip_count":            np.int32(dip_count),
    "spike_prom_sum":       np.float32(spike_prom_sum),
    "spike_width_mean_ms":  np.float32(spike_width_mean_ms),
    "longest_flat":         np.float32(longest_flat),
    "hf_energy":            np.float32(hf_energy),
    "spectral_entropy":     np.float32(spectral_entropy),
    "roll_var":             np.float32(roll_var),
    "edge_start_diff":      np.float32(edge_start_diff),
    "edge_end_diff":        np.float32(edge_end_diff),
    "min_drop":             np.float32(min_drop),
    "recovery_slope":       np.float32(recovery_slope),
    "poly_resid":           np.float32(poly_resid),
    "segment_slope_var":    np.float32(segment_slope_var),
    "zero_cross_rate":      np.float32(zero_cross_rate),
    "step_count_sustained": np.float32(step_count_sustained),
    "max_step_mag":         np.float32(max_step_mag),
    "bp_low":               np.float32(bp_low), "bp_mid": np.float32(bp_mid), "bp_high": np.float32(bp_high),
    "bp_mid_ratio":         np.float32(bp_mid_ratio), "bp_high_ratio": np.float32(bp_high_ratio),
    "resid_spectral_entropy": np.float32(resid_spectral_entropy),
    "rel_below_frac":       np.float32(rel_below_frac),
    "rel_below_longest_ms": np.float32(rel_below_longest_ms),
    "win_range_max":        np.float32(win_range_max),
    "tail_std":             np.float32(tail_std), "tail_ac1": np.float32(tail_ac1),
    "crest_factor":         np.float32(crest_factor), "line_length": np.float32(line_length),
    "mid_duty_cycle_low":   np.float32(mid_duty_cycle_low),
}, index=filtered_df.index)

computed_features_w_array = pd.concat([filtered_df, feat], axis=1).set_index(filtered_df.index)


  max_post    = np.nanmax(post_seg, axis=1).astype(np.float32)


NameError: name 'dt_ms' is not defined

In [None]:
def remove_outliers(df):
    """Return the rows of `df` that pass every range check below.

    A row is kept only when all of these hold (rows with NaN in a checked
    column are dropped, because every comparison against NaN is False):
      * 11.4 <= Voltage < 16
      * Average <= 18
      * Min <= 12
      * Max <= 16
      * Bounce_Back <= 16
      * 0.005 <= Slope_Bounce_Back <= 0.5
      * -2 <= Slope_Drop <= 2
    """
    keep = (
        (df['Voltage'] < 16)
        & (df['Voltage'] >= 11.4)
        & (df['Average'] <= 18)
        & (df['Min'] <= 12)
        & (df['Max'] <= 16)
        & (df['Bounce_Back'] <= 16)
        & df['Slope_Bounce_Back'].between(0.005, 0.5)
        & df['Slope_Drop'].between(-2, 2)
    )
    return df.loc[keep]

In [None]:
# Build the feature table by passing the decoded starter-voltage array, the test metadata and dt_ms=10
# to build_computed_features.
features_df = build_computed_features(Starter_Volt_Array_MVT, MVT_Original, dt_ms=10)


In [None]:
# Right join on the index: keep exactly the rows present in features_df and attach the MVT_Original columns.
final_mvt_data = pd.merge(MVT_Original,features_df, left_index=True, right_index=True,how='right')
# Left join on the index: append the per-sample voltage columns to those rows.
final_mvt_data_w_array = pd.merge(final_mvt_data,Starter_Volt_Array_MVT, left_index=True, right_index=True,how='left')

# Everything that is NOT a metadata/feature column, i.e. the per-sample voltage columns, cast to float.
Starter_Volt_Array_mvt_DF = final_mvt_data_w_array.drop(columns=final_mvt_data.columns).astype(float)

# Drop the per-sample columns again, leaving the metadata and feature columns only.
final_mvt_data = final_mvt_data_w_array.drop(columns=Starter_Volt_Array_mvt_DF.columns)

In [None]:


# The raw colon-separated trace string is no longer needed in the feature table.
final_mvt_data = final_mvt_data.drop(columns=['StarterVoltageGraphpoints'])


In [None]:
final_mvt_data_w_array

In [None]:
# Columns kept for modelling, in the order they are stored; the label is last.
# .copy() makes the selection an independent frame, so assigning to its columns
# later does not raise SettingWithCopyWarning.
final_mvt_data = final_mvt_data[['Voltage', 'Rated','Measured', 'Min', 'Max', 'Standard_Deviation', 'Average', 'Median',
        'Bounce_Back', 'Drop', 'Slope_Bounce_Back', 'Slope_Drop',
        'Min_Volt_Below_19', 'Max_Volt_19_Above',
        'Start_Voltage', 'Time_To_Min_ms', 'Recovery_Time_ms',
        'Area_0_200ms', 'Count_Below7', 'Count_Below9', 'Count_Below10',
        'Curve_Kurtosis','Curve_Skew',
        'Max_Rise_Rate_0_180', 'Max_Fall_Rate_0_180',
        'Mean_Abs_Slope_0_180', 'Std_Slope_0_180', 'Mean_Abs_Accel_0_180',
        'Max_Accel_0_180', 'Min_Accel_0_180', 'Norm_Energy_200ms',
        'Rec_Slope', 'R_est',
        'Spike_Count', 'Dip_Count', 'Spike_Prom_Sum', 'Spike_Width_Mean_Ms',
        'Longest_Flat', 'Hf_Energy', 'Spectral_Entropy', 'Roll_Var',
        'Edge_Start_Diff', 'Edge_End_Diff', 'Min_Drop', 'Recovery_Slope',
        'Poly_Resid', 'Segment_Slope_Var', 'Zero_Cross_Rate',
        'Step_Count_Sustained', 'Max_Step_Mag', 'Bp_Low', 'Bp_Mid', 'Bp_High',
        'Bp_Mid_Ratio', 'Bp_High_Ratio', 'Resid_Spectral_Entropy',
        'Rel_Below_Frac', 'Rel_Below_Longest_Ms', 'Win_Range_Max',
        'Tail_Std', 'Tail_Ac1', 'Crest_Factor', 'Line_Length',
        'Mid_Duty_Cycle_Low', 'Battery_Decision']].copy()
final_mvt_data
                

In [None]:
final_mvt_data.isna().sum()

In [None]:
# Replace the text labels with the integer codes assigned by a freshly fitted LabelEncoder.
label_encoder = LabelEncoder()
encoded_decision = label_encoder.fit_transform(final_mvt_data['Battery_Decision'].values)
final_mvt_data['Battery_Decision'] = encoded_decision


In [None]:
# Class balance after encoding (the stray trailing comma wrapped the output in a 1-tuple).
final_mvt_data['Battery_Decision'].value_counts()

In [None]:
final_mvt_data

In [None]:
Starter_Volt_Array_MVT

In [None]:
# Second name for the same DataFrame object (no copy is made here).
starter_volt_array =Starter_Volt_Array_MVT


In [None]:
starter_volt_array

In [None]:
# Restrict the ripple array to the test records that are still in final_mvt_data.
in_final_data = alternator_ripple_array.index.isin(final_mvt_data.index)
alternator_ripple_array = alternator_ripple_array.loc[in_final_data]
alternator_ripple_array

In [None]:
alternator_ripple_array

In [None]:
final_mvt_data

In [None]:
# True for rows whose ripple samples do not sum to zero.
# NOTE(review): the samples are signed, so a non-empty ripple whose values cancel out is also
# flagged False — confirm that "sum != 0" is the intended test for "has ripple data".
mask = alternator_ripple_array.sum(axis=1).ne(0)

In [None]:
# Order the mask by index label.
mask = mask.sort_index()

In [None]:
# Align the mask to this frame's own row order before filtering; labels absent from the mask are
# treated as False. Passing the differently ordered Series directly makes pandas reindex it
# implicitly and emit a warning.
final_mvt_data:pd.DataFrame = final_mvt_data.loc[
    mask.reindex(final_mvt_data.index, fill_value=False).to_numpy(dtype=bool)
]

In [None]:
final_mvt_data

In [None]:
# starter_volt_array can hold traces that never made it into the mask (rows filtered out during
# feature building). Indexing with the shorter boolean Series raises "Unalignable boolean
# Series", so align explicitly and treat missing labels as False.
starter_volt_array = starter_volt_array.loc[
    mask.reindex(starter_volt_array.index, fill_value=False).to_numpy(dtype=bool)
]

In [None]:
starter_volt_array

In [None]:
# Cast every column of the frame to float32 in one call.
final_mvt_data = final_mvt_data.astype(np.float32)

# Export Transformed Data 

In [None]:
# to_csv does not create missing folders, so make sure the target directory exists first.
os.makedirs('Preprocessed_Datasets', exist_ok=True)
# index=True writes the Test_Record_Detail_ID index as the first CSV column.
final_mvt_data.to_csv('Preprocessed_Datasets/MVT13_Final_Features.csv', index=True)
starter_volt_array.to_csv('Preprocessed_Datasets/MVT13_Starter_Volt_Array.csv', index=True)

# Import the Preprocessed CSV Files

In [None]:
# Load the preprocessed datasets (these are CSV files, not Parquet).
# NOTE(review): these file names differ from the ones written by the export cell, and no index_col
# is given — if the files were saved with index=True, the id comes back as an ordinary first column
# and would be treated as a feature/sample downstream. Confirm the file layout.
final_mvt_data = pd.read_csv('Preprocessed_Datasets/final_mvt_data_ncr.csv') 
starter_volt_array = pd.read_csv('Preprocessed_Datasets/MVT_Starter_Volt_Array_NCR.csv')


In [None]:
starter_volt_array.shape, final_mvt_data.shape



In [None]:
starter_volt_array

In [None]:
# Select one sample by its row index (e.g., index 0)
sample_row = starter_volt_array.iloc[0:1].copy()  # Select the first row as an example
sample_row.bfill(axis = 1).ffill(axis=1)

In [None]:
# Plot the selected starter-voltage trace.
# The x axis is the sample position within the row, not milliseconds, so it is labelled as such.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(sample_row.values.flatten(), marker='o', linestyle='-', color='b')
ax.set_title('Starter Voltage Array Sample')
ax.set_xlabel('Sample index')
ax.set_ylabel('Voltage (V)')
ax.grid(True)
ax.set_xticks(np.arange(0, 170, 10))
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()


In [None]:
final_mvt_data.shape, starter_volt_array.shape

# Split the data into training and testing sets

In [None]:
# Ratio of measured to rated value, added as an extra column.
final_mvt_data['Conductance'] = final_mvt_data['Measured'].div(final_mvt_data['Rated'])

# Pairwise correlation of all columns.
corr = final_mvt_data.corr()

# Annotated heatmap of the correlation matrix.
fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title('Correlation Heatmap')
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
plt.show()



In [None]:
# Feature matrix: every column except the target and the Rated / Conductance / Time_To_Min_ms columns.
X_mvt = final_mvt_data.drop(columns=['Battery_Decision', 'Rated','Conductance', 'Time_To_Min_ms']) #original had no time to min ms
# Target: the battery decision label.
y_mvt = final_mvt_data['Battery_Decision']

# 80% train / 20% temp, stratified on the label; the temp part is then halved into
# validation and test (10% of the data each), again stratified.
Xmvt_train, Xmvt_temp, ymvt_train, ymvt_temp = train_test_split(X_mvt, y_mvt, test_size=0.2, random_state=42, stratify=y_mvt)
Xmvt_val, Xmvt_test, ymvt_val, ymvt_test = train_test_split(Xmvt_temp, ymvt_temp, test_size=0.5, random_state=12, stratify=ymvt_temp)

In [None]:
def evaluate_model(model, X_train, y_train, X_val, y_val, X_test, y_test):
    """
    Score a fitted model on the train, validation and test splits.

    For each split the features are cast to float, labels are predicted and a
    classification report is built. When the model exposes predict_proba, the
    log loss of the predicted probabilities is recorded as well; otherwise a
    notice is printed and the log loss is stored as None.

    Parameters:
        model: A fitted model that implements predict() and optionally predict_proba().
        X_train, X_val, X_test: Feature matrices (must support .astype(float)).
        y_train, y_val, y_test: True labels for the corresponding feature matrix.

    Returns:
        dict keyed by "Train", "Validation" and "Test"; each value is a dict with
        the keys "classification_report" and "log_loss".
    """
    can_score_proba = hasattr(model, "predict_proba")
    splits = (
        ("Train", X_train, y_train),
        ("Validation", X_val, y_val),
        ("Test", X_test, y_test),
    )

    results = {}
    for split_name, features, labels in splits:
        features = features.astype(float)
        entry = {"classification_report": classification_report(labels, model.predict(features))}

        if can_score_proba:
            entry["log_loss"] = log_loss(labels, model.predict_proba(features))
        else:
            print(f"Model does not support predict_proba. Skipping log loss for {split_name} dataset.\n")
            entry["log_loss"] = None

        results[split_name] = entry

    return results


def display_evaluation_metrics(results, cluster_name="Model Evaluation Results"):
    """
    Render evaluation results as side-by-side HTML cards in the notebook output.

    Parameters:
      results: dict
          A dictionary with keys for each dataset (e.g., "Train", "Validation", "Test").
          Each key should map to another dictionary with keys "classification_report" (str)
          and "log_loss" (float or None).
      cluster_name: str
          Heading shown above the cards.

    Returns:
      None. The generated markup is handed to display(HTML(...)).
    """
    # Stdlib helper, imported locally so the function carries its own dependency.
    from html import escape

    # The pre/p rules are scoped to .dataset-card so they do not restyle every
    # <pre>/<p> element elsewhere on the notebook page. The container div is
    # opened exactly once and closed once after the cards.
    page = f"""
        <h1 style="text-align:center;">{escape(str(cluster_name))}</h1>
        <style>
        .results-container {{
            display: flex;
            flex-direction: row;
            justify-content: space-around;
            flex-wrap: wrap;
            gap: 20px;
        }}
        .dataset-card {{
            border: 1px solid #ccc;
            border-radius: 5px;
            padding: 10px;
            box-shadow: 2px 2px 10px rgba(0, 0, 0, 0.3);
            width: 40%;
            min-width: 350px;
            overflow: auto;
            background: #fff;
            color: #333;
        }}
        .dataset-card h2 {{
            text-align: center;
            margin-top: 0;
            color: #222;
        }}
        .dataset-card h3 {{
            color: #444;
            margin-bottom: 5px;
        }}
        .dataset-card pre {{
            white-space: pre-wrap;
            background-color: #eaeaea;
            padding: 10px;
            border-radius: 3px;
            color: #000;
        }}
        .dataset-card p {{
            font-size: 12px;
            margin: 5px 0;
            font-weight: bold;
        }}
        </style>
        <div class="results-container">
    """

    cards = []
    for dataset, metrics in results.items():
        loss = metrics["log_loss"]
        log_loss_val = (f"{loss:.4f}"
                        if loss is not None
                        else "N/A (model does not support predict_proba)")
        # Escape free text so characters such as '<' or '&' in labels cannot break the markup.
        cards.append(f"""
        <div class="dataset-card">
          <h2>{escape(str(dataset))} Dataset</h2>
          <h3>Classification Report</h3>
          <pre>{escape(str(metrics["classification_report"]))}</pre>
          <h3>Log Loss</h3>
          <p>{log_loss_val}</p>
        </div>
        """)

    display(HTML(page + "".join(cards) + "</div>"))

# MVT Only

In [None]:
# 5-fold stratified cross-validation of a baseline XGBoost model over the full feature table.
# NOTE(review): X_mvt still contains the rows that were split off as Xmvt_test, so the
# "Test" card of every fold is computed on rows that some folds trained on.
kf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)

oof_predictions = np.zeros(len(X_mvt))
all_fold_results = []
xgb_mvt = None
for fold, (train_idx, val_idx) in enumerate(kf.split(X_mvt, y_mvt)):
    print(f"Fold {fold + 1}")

    # Positional fold indices -> train / validation frames for this fold
    X_train_fold, X_val_fold = X_mvt.iloc[train_idx], X_mvt.iloc[val_idx]
    y_train_fold, y_val_fold = y_mvt.iloc[train_idx], y_mvt.iloc[val_idx]

    # Fresh model per fold
    xgb_mvt = XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        seed=10,
        n_estimators=500,
        max_depth=5,
        early_stopping_rounds=10,
        n_jobs=-1,
        sampling_method='uniform',
        tree_method='hist',
        device ='cpu',
        scale_pos_weight=.3,
        gamma=10
    )
    # Fit on the fold's training rows; the fold's validation rows drive early stopping
    xgb_mvt.fit(
        X_train_fold.values, y_train_fold.values,
        eval_set=[(X_val_fold.values, y_val_fold.values)],
        verbose=False
    )

    # Out-of-fold probability of the positive class for the validation rows
    oof_predictions[val_idx] = xgb_mvt.predict_proba(X_val_fold.values)[:, 1]

    # Evaluate once per fold and reuse the result for both bookkeeping and display
    fold_results = evaluate_model(
        xgb_mvt,
        X_train_fold,
        y_train_fold,
        X_val_fold,
        y_val_fold,
        Xmvt_test,
        ymvt_test
    )
    all_fold_results.append(fold_results)
    display_evaluation_metrics(fold_results, cluster_name=f"Fold {fold + 1} Evaluation Results")

avg_log_loss = np.mean([fold["Validation"]["log_loss"] for fold in all_fold_results if fold["Validation"]["log_loss"] is not None])
print(f"Average Validation Log Loss Across Folds: {avg_log_loss}")
print("Out-of-Fold Log Loss:", log_loss(y_mvt, oof_predictions))




In [None]:
# ----------------  data  -----------------
# Model inputs: every training column except the record identifier.
FEATURES = [c for c in Xmvt_train.columns if c != "Test_Record_Detail_ID"]

X_tr_full = Xmvt_train[FEATURES].astype(np.float32)          # only the training rows
y_tr_full = ymvt_train

X_val_hold = Xmvt_val[FEATURES].astype(np.float32)           # fixed validation split
y_val_hold = ymvt_val

X_test_hold = Xmvt_test[FEATURES].astype(np.float32)         # fixed test split
y_test_hold = ymvt_test

# Starter-voltage rows looked up by the index labels of the train / validation frames.
starter_tr_full  = starter_volt_array.loc[X_tr_full.index]
starter_val_hold = starter_volt_array.loc[X_val_hold.index]


# Optuna Hyperparameter Tuning

In [None]:

# Starter-voltage rows for the training and validation frames.
# NOTE(review): starter_tr_full repeats the assignment from the data cell, and neither
# name is read by objective() below — confirm whether these are still needed.
starter_tr_full  = starter_volt_array.loc[X_tr_full.index]
starter_holdout  = starter_volt_array.loc[X_val_hold.index]

def objective(trial):
    """
    Optuna objective: 4-fold CV log loss with a train/validation gap penalty.

    Reads the notebook-level frames X_tr_full / y_tr_full (training rows) and
    X_val_hold / y_val_hold (fixed hold-out split). For each fold a model is fitted
    with early stopping on the fold's validation rows; the hold-out probabilities of
    the four fold models are averaged into one ensemble prediction.

    Parameters:
        trial: optuna trial used to sample the tuned hyper-parameters.

    Returns:
        float, lower is better:
        mean(fold-validation loss, hold-out loss)
        + |train loss - fold-validation loss| + |train loss - hold-out loss|

    Previous study best params (kept for reference):
        base_score: 0.4936503028534939
        max_depth: 4
        eta: 0.03255911844213105
        subsample: 0.7615811212830086
        colsample_bytree: 0.9218658878025937
        colsample_bylevel: 0.9657384172389276
        colsample_bynode: 0.9021768577894738
        gamma: 2.149866369005831
        reg_lambda: 4.854207483078419
        reg_alpha: 0.10956776598072049
        min_child_weight: 3
    """

    # ---------------- hyper-parameters to tune ----------------
    params = {
        "objective":        "binary:logistic",
        "eval_metric":      "logloss",
        "seed":     10,
        "n_estimators":     400,
        "tree_method":      "hist",
        "sampling_method":  "uniform",
        "early_stopping_rounds": 20,
        "n_jobs":           -1,
        "base_score":       trial.suggest_float("base_score", 0.48, 0.52),
        "max_depth":        trial.suggest_int("max_depth", 4, 6),
        "eta":              trial.suggest_float("eta", 0.05, 0.15),
        "subsample":        trial.suggest_float("subsample", 0.6, 0.8),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.95, 1),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.97, 1),
        "gamma":            trial.suggest_float("gamma", 4, 7),
        "scale_pos_weight": .31,
    }
    # Cast Python floats to float32 once, before the fold loop (the cast does not depend on the fold).
    # NOTE(review): presumably done to mirror float32 arithmetic of the exported C model — confirm.
    params = {k: np.float32(v) if isinstance(v, float) else v for k, v in params.items()}

    kf = StratifiedKFold(n_splits=4, shuffle=True, random_state=10)

    fold_train_losses = []
    fold_val_losses = []
    ho_pred_accum = np.zeros(len(y_val_hold), dtype=float)   # ensemble probs

    # ---------------- k-fold loop ----------------
    for tr_idx, val_idx in kf.split(X_tr_full, y_tr_full):

        # split core features
        X_tr, X_fold = X_tr_full.iloc[tr_idx], X_tr_full.iloc[val_idx]
        y_tr, y_fold = y_tr_full.iloc[tr_idx], y_tr_full.iloc[val_idx]

        model = XGBClassifier(**params)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_fold, y_fold)],
            verbose=False
        )

        # --- fold-training loss
        fold_train_losses.append(log_loss(y_tr, model.predict_proba(X_tr)[:, 1]))

        # --- fold-validation loss
        fold_val_losses.append(log_loss(y_fold, model.predict_proba(X_fold)[:, 1]))

        # --- accumulate hold-out probabilities (equal-weight average over the folds);
        #     X_val_hold is only read, so no per-fold copy is needed
        ho_pred_accum += model.predict_proba(X_val_hold)[:, 1] / kf.n_splits

    # ------------- aggregate -------------
    val_train_mean  = np.mean(fold_train_losses)
    val_fold_mean   = np.mean(fold_val_losses)
    val_hold_mean   = log_loss(y_val_hold, ho_pred_accum)

    # gap-penalised composite score
    score = (val_fold_mean + val_hold_mean)/2 + abs(val_train_mean - val_fold_mean)+abs(val_train_mean - val_hold_mean)
    return score

# Minimise objective() over 100 trials, running two trials concurrently.
# NOTE(review): no sampler or sampler seed is passed, so trial suggestions can differ between runs;
# each trial's models also request n_jobs=-1, so the two concurrent trials compete for the same cores.
study = optuna.create_study(direction="minimize", study_name="MVT_Optuna_Study")
study.optimize(objective, n_trials=100, n_jobs=2,show_progress_bar=True)

# -------------- best parameters -------------
best_params = study.best_params
for key, value in best_params.items():
    print(f"{key}: {value}")
 

In [None]:

print(f"Best Params: \n {study.best_params}\n")
print(f"Best Value Score: {study.best_value}")

In [None]:
#Previous Best Value: 0.2798202463932981
# Hard-coded parameter set; when run after the study cell it replaces the
# best_params taken from study.best_params.
best_params =  {'base_score': 0.5146740544397742, 'max_depth': 4, 'eta': 0.13719992835710454, 'subsample': 0.6387205994827557, 'colsample_bytree': 0.9623006739544103, 'colsample_bylevel': 0.9532238856222752, 'colsample_bynode': 0.9719535485108595, 'gamma': 4.714334748343621}



In [None]:
# Full constructor parameter set used when re-creating the saved "weighted" model.
# NOTE(review): 'use_label_encoder' appears to be deprecated in recent XGBoost releases —
# confirm it is still accepted by the installed version.
best_params_weighted=    {'objective': 'binary:logistic',
 'use_label_encoder': False,
 'base_score': 0.4910429253932203,
 'colsample_bylevel': 0.9521081666403901,
 'colsample_bynode': 0.9502389233223183,
 'colsample_bytree': 0.9410369777828046,
 'early_stopping_rounds': 20,
 'enable_categorical': False,
 'eval_metric': 'logloss',
 'gamma': 3.051682147783409,
 'grow_policy': 'depthwise',
 'max_delta_step': 0,
 'max_depth': 4,
 'max_leaves': 0,
 'min_child_weight': 1,
 'missing': np.nan,
 'monotone_constraints': '()',
 'n_estimators': 500,
 'n_jobs': -1,
 'reg_alpha': 0,
 'reg_lambda': 2,
 'sampling_method': 'uniform',
 'scale_pos_weight': 0.3,
 'subsample': 0.7887050055975044,
 'tree_method': 'hist',
 'validate_parameters': 1,
 'verbosity': None,
 'seed': 10,
 'eta': 0.0605730463701796}




# Best XGB 

In [None]:
# Cast every Python-float hyper-parameter to np.float32; a "missing" entry, if present,
# is left untouched. Ints and already-cast values pass through unchanged.
best_params= {k: np.float32(v) if (isinstance(v, float) and (k != "missing")) else v for k, v in best_params.items()}
best_params

In [None]:
Xmvt_train.columns

In [None]:
# Initialize the model with the best parameters
# Fit the final model on the training split with the tuned hyper-parameters.
# The FEATURES-filtered float32 frames (X_tr_full / X_val_hold / X_test_hold) are used so the
# model sees exactly the columns the Optuna objective was tuned on; the raw Xmvt_* frames
# may still carry Test_Record_Detail_ID, which must not be learned as a feature.
best_mvt_xgb = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    seed=10,
    n_estimators=500,
    early_stopping_rounds=50,
    n_jobs=-1,
    sampling_method='uniform',
    tree_method='hist',
    scale_pos_weight=.28,
    **best_params
)
# Train on the training rows; the fixed validation split drives early stopping
best_mvt_xgb.fit(
    X_tr_full.values, y_tr_full.values,
    eval_set=[(X_val_hold.values, y_val_hold.values)],
    verbose=True
)

# Report metrics on the train, validation and test splits
val_results = evaluate_model(
    best_mvt_xgb,
    X_tr_full,
    y_tr_full,
    X_val_hold,
    y_val_hold,
    X_test_hold,
    y_test_hold
)
display_evaluation_metrics(val_results, cluster_name="Final Model Evaluation Results")


In [None]:
# Replace the freshly trained model with the previously saved "weighted" model:
# build a classifier from best_params_weighted, then load the stored booster from JSON.
best_mvt_xgb = XGBClassifier(**best_params_weighted)
best_mvt_xgb.load_model('Python_XGB_Models/XGB_MVT_NCR_Model_Weighted.json')

In [None]:
best_mvt_xgb.get_params()

In [None]:
# Evaluate the model on the validation set
# Report train / validation / test metrics for whichever model best_mvt_xgb currently holds
# (the loaded model when the cells are run in order).
val_results = evaluate_model(
    best_mvt_xgb,
    Xmvt_train,
    y_tr_full,
    Xmvt_val,
    y_val_hold,
    Xmvt_test,
    y_test_hold
)
display_evaluation_metrics(val_results, cluster_name="Final Model Evaluation Results")


In [None]:
# Attach a plain dict of estimator tags to this model instance under __sklearn_tags__.
# NOTE(review): recent scikit-learn releases appear to call __sklearn_tags__ as a method;
# a dict here would then not be callable — confirm which library versions this targets
# and what problem the override is meant to work around.
best_mvt_xgb.__sklearn_tags__ = {
    'sample_weights': False,
    'multioutput': False,
    'binary_only': True,
    'requires_y': True,
    'non_deterministic': True,
    'pairwise': False,
    'allow_nan': True,
    'X_types': ['2darray', 'sparse'],
    'X_inner_types': ['float32', 'int32', 'uint8', 'bool'],
    'capability:multiclass': False,
    'capability:multilabel': False
}

In [None]:

# Cross-validate on the held-out rows (validation + test) and report the pooled
# out-of-fold classification report and log loss.
# NOTE(review): each fold's validation rows also drive early stopping, so the
# out-of-fold scores are measured on data the stopping point was chosen on.

X_oof = pd.concat([Xmvt_val, Xmvt_test], axis=0)
y_oof = pd.concat([ymvt_val, ymvt_test], axis=0)


kf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
oof_predictions = np.zeros(len(X_oof))
oof_probabilities = np.zeros(len(X_oof))
oof_true = np.zeros(len(X_oof))

for fold, (train_idx, val_idx) in enumerate(kf.split(X_oof, y_oof)):
    # The fold indices are positions within X_oof / y_oof, so they must be applied to
    # those frames (indexing X_mvt / y_mvt with them would select unrelated rows).
    X_train_fold, X_val_fold = X_oof.iloc[train_idx], X_oof.iloc[val_idx]
    y_train_fold, y_val_fold = y_oof.iloc[train_idx], y_oof.iloc[val_idx]

    # Fresh model per fold.
    # NOTE(review): this rebinds best_mvt_xgb, so after the loop the name refers to the
    # last fold's model rather than the model loaded from disk — confirm that later cells
    # (misclassification plots, C export) are meant to use it.
    # NOTE(review): n_estimators=504000 looks like a typo (other cells use 400/500); training
    # length is bounded by early stopping in practice — confirm the intended value.
    best_mvt_xgb = XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        seed=10,
        n_estimators=504000,
        early_stopping_rounds=50,
        n_jobs=-1,
        sampling_method='uniform',
        tree_method='hist',
        scale_pos_weight=.3,
        **best_params
    )

    # Train on the fold's training rows, early-stopping on its validation rows
    best_mvt_xgb.fit(
        X_train_fold.values, y_train_fold.values,
        eval_set=[(X_val_fold.values, y_val_fold.values)],
        verbose=False
    )

    # Store out-of-fold labels, positive-class probabilities and true labels
    oof_predictions[val_idx] = best_mvt_xgb.predict(X_val_fold.values)
    oof_probabilities[val_idx] = best_mvt_xgb.predict_proba(X_val_fold.values)[:, 1]
    oof_true[val_idx] = y_val_fold.values

# Pooled report over all out-of-fold predictions

print("Average Out-of-Fold Classification Report:")
print(classification_report(oof_true, oof_predictions, target_names=['Bad_Battery','Good_Battery'],digits=4))
print("Average Out-of-Fold Log Loss:", log_loss(oof_true, oof_probabilities))


In [None]:
starter_volt_array

In [None]:
# Rows of the full feature table whose predicted label differs from the true label,
# and the starter-voltage rows carrying the same index labels.
misclassified_rows = X_mvt[best_mvt_xgb.predict(X_mvt) != y_mvt]
misclassified_indices = misclassified_rows.index
misclassified_starter_volt = starter_volt_array.loc[misclassified_indices]

In [None]:
# Ripple rows whose Test_Record_Detail_ID matches a misclassified record.
# NOTE(review): this expects Test_Record_Detail_ID as a column of both frames, whereas an
# earlier cell filters alternator_ripple_array by index label — confirm which layout holds here.
misclassified_alternator_ripple = alternator_ripple_array.loc[alternator_ripple_array['Test_Record_Detail_ID'].isin(misclassified_rows['Test_Record_Detail_ID'])]

In [None]:
# Plot up to max_plots misclassified starter-voltage traces (positions 10 onward of the
# misclassified set) on a 4x5 grid, each titled with the record's raw values and labels.
plt.figure(figsize=(40, 25))
max_plots = 20
for i, (idx, row) in enumerate(misclassified_starter_volt.iloc[10:10 + max_plots].iterrows()):
    test_record_id = misclassified_rows.loc[idx, 'Test_Record_Detail_ID']

    # Look the original record up once and read all fields from that single match
    record = MVT_Original.loc[MVT_Original['Test_Record_Detail_ID'] == test_record_id]
    voltage = record['Voltage'].values[0]
    rated = record['Rated'].values[0]
    measured_conductance = record['Measured'].values[0]
    actual_battery_decision = record['Battery_Decision'].values[0]
    predicted_battery_decision = "GOOD BATTERY" if best_mvt_xgb.predict(misclassified_rows.loc[idx].drop('Test_Record_Detail_ID').values.reshape(1, -1))[0] == 1 else "BAD BATTERY"

    plt.subplot(4, 5, i + 1)
    # NOTE(review): red is used only when the stored label equals the string 'BAD BATTERY';
    # confirm MVT_Original['Battery_Decision'] holds that exact text (not a code or 'BAD_BATTERY').
    color = 'r' if actual_battery_decision == 'BAD BATTERY' else 'g'
    plt.plot(row.values, marker='o', linestyle='-', color= color)
    #Set Test_Record_Detail_ID, Voltage, Rated, Measured Conductance, Actual and Predicted Battery Decision in the title
    plt.title(f'''
    Test_Record_Detail_ID: {test_record_id}
    Voltage: {voltage:.2f} V, Rated: {rated:.2f}
    Measured: {measured_conductance:.2f}, Conductance: {measured_conductance / rated:.2f}
    Actual: {actual_battery_decision}
    Predicted: {predicted_battery_decision}''', fontdict={'fontsize': 18})
    plt.xlabel('Index')
    plt.ylabel('Voltage (V)')
    #make tick label sizes larger
    plt.yticks(ticks = np.arange(0, 14, 1), fontsize=15)
    # Set tick positions and label styling in one call so the styling applies to the final ticks
    plt.xticks(ticks=np.arange(0, 513, 32), fontsize=15, rotation=45)  # 0-512, tick every 32
    plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
best_mvt_xgb.get_xgb_params().get("base_score")

In [None]:
import m2cgen as m2c
from m2cgen.interpreters import CInterpreter

# Custom interpreter intended to emit C `float` instead of `double`.
# NOTE(review): the export cell below calls m2c.export_to_c directly and never references
# this class; whether CInterpreter reads a `dtype` attribute is not visible here — confirm.
class CFloatInterpreter(CInterpreter):
    """CInterpreter subclass that sets a `dtype` attribute to "float" after base initialisation."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.dtype = "float"   # instead of "double"


In [None]:
# Generate the C code for the optimized model

# Ensure base_score is a plain Python float before export
# NOTE(review): presumably needed because base_score may have been stored as np.float32 — confirm.

best_mvt_xgb.set_params(base_score=float(best_mvt_xgb.get_xgb_params()["base_score"]))

# Transpile whichever model best_mvt_xgb currently refers to into C source text
c_code = m2c.export_to_c(best_mvt_xgb)

# Save the C code to a file
with open('Inference_Pipeline/best_xgb_mvt_ncr_model_weighted_2.c', 'w') as f:
    f.write(c_code)

In [None]:
best_mvt_xgb.get_booster().feature_names = Xmvt_train.columns.tolist()


In [None]:
Xmvt_train.columns.tolist()

# Feature Importances

In [None]:
#plot feature importances
# One figure per importance metric reported by XGBoost, in the order gain, weight, cover.
for importance_kind, panel_title in (
    ('gain', 'Feature Importance (Gain)'),
    ('weight', 'Feature Importance (Weight)'),
    ('cover', 'Feature Importance (Cover)'),
):
    fig, ax = plt.subplots(figsize=(9, 6))
    plot_importance(best_mvt_xgb, importance_type=importance_kind, ax=ax, title=panel_title)

# tight_layout acts on the current (most recently created) figure.
plt.tight_layout()
plt.show()


In [None]:
# Correlation of every model feature with each other and with the target label.
features_with_target = pd.concat([X_mvt, y_mvt], axis=1)
corr = features_with_target.corr()

fig, ax = plt.subplots(figsize=(20, 10))
ax.set_title('Feature Correlation')
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
plt.show()

In [None]:
best_mvt_xgb.predict(Xmvt_test.drop(columns = ['Test_Record_Detail_ID']).iloc[0].values.reshape(1, -1))

In [None]:
best_mvt_xgb.predict_proba(Xmvt_test.drop(columns = ['Test_Record_Detail_ID']).iloc[0].values.reshape(1, -1))

In [None]:
Xmvt_test.Test_Record_Detail_ID.iloc[0]

In [None]:
# Inspect one record end to end: raw graph points, software version, then its engineered features.
record = MVT_Original.query('Test_Record_Detail_ID == 358376538')
for column in ('StarterVoltageGraphpoints', 'SoftwareVersionNumber', 'AlternatorRippleGraphpoints'):
    display(record[column].values)
display(Xmvt_test.query('Test_Record_Detail_ID == 358376538'))

In [None]:
X_mvt

In [None]:
starter_volt_array

# Probabilities

In [None]:
# Predict class probabilities for every row of X_mvt; the two probability columns are
# named for class 0 (bad) and class 1 (good) and carry X_mvt's index.
probabilities = best_mvt_xgb.predict_proba(X_mvt.values)
probabilities_df = pd.DataFrame(probabilities, columns=['Probability_Bad', 'Probability_Good']).set_index(X_mvt.index)

# Hard label predictions for the same rows
predictions = best_mvt_xgb.predict(X_mvt.values)
predictions_df = pd.DataFrame(predictions, columns=['Prediction']).set_index(X_mvt.index)
# probabilities_df is rebound here: features + probabilities + predicted label, side by side
probabilities_df = pd.concat([X_mvt, probabilities_df, predictions_df], axis=1)

# Both frames are sorted by index in place so their rows line up for inspection;
# note that this mutates starter_volt_array for all later cells.
probabilities_df.sort_index(inplace=True)
display(probabilities_df)
starter_volt_array.sort_index(inplace=True)
display(starter_volt_array)


In [None]:
MVT_Original.loc[probabilities_df.index].sort_index()

In [None]:
alternator_ripple_array = alternator_ripple_array.loc[probabilities_df.index]

In [None]:
display(starter_volt_array)
display(probabilities_df)
display(alternator_ripple_array)

In [None]:
# Attach the original Battery_Decision to each prediction row.
# The values are assigned by position after sorting the lookup by index.
# NOTE(review): this only lines up if probabilities_df is itself sorted by index and the
# index labels are unique in MVT_Original — confirm.
probabilities_df['Battery_Decision']  = MVT_Original.loc[probabilities_df.index, 'Battery_Decision'].sort_index().values

In [None]:
probabilities_df['Prediction'] = probabilities_df['Prediction'].map({0: 'BAD_BATTERY', 1: 'GOOD_BATTERY'})

In [None]:
MVT12a_input = MVT_Original.loc[probabilities_df.index].sort_index()
display(MVT12a_input)

In [None]:
MVT12a_outut = probabilities_df.loc[MVT12a_input.index].sort_index()

In [None]:

display(MVT12a_input)
display(MVT12a_outut)

In [None]:
starter_volt_array.loc[342468411].values

In [None]:
MVT12a_input.to_csv('Input_Output_Validation/MVT12a_python_input_2.csv')
MVT12a_outut.to_csv('Input_Output_Validation/MVT12a_python_output_2.csv')

In [None]:
# Write predictions and both signal arrays to CSV.
# NOTE(review): index=False drops the row index from all three files; if the index is the
# record identifier, rows can then only be matched across files by position — confirm.
probabilities_df.to_csv('MVT_Without_Conductance_Rated_Proba/Predictions.csv', index=False)
starter_volt_array.to_csv('MVT_Without_Conductance_Rated_Proba/Starter_Volt_Array_MVT.csv', index=False)
alternator_ripple_array.to_csv('MVT_Without_Conductance_Rated_Proba/Alternator_Ripple_Array_MVT.csv', index=False)