# PRE PROCESSING

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
import ast
import matplotlib.pyplot as plt

# ===============================================================
# General helper functions
# ===============================================================

def load_excel(file_path):
    """Read the Excel workbook at ``file_path`` into a DataFrame.

    Thin wrapper around ``pandas.read_excel`` called with its default options.
    """
    return pd.read_excel(file_path)

def convert_smiles_to_mol(df, smiles_col='SMILES'):
    """Add a 'Mol' column built from the SMILES strings in ``smiles_col``.

    The DataFrame is modified in place and also returned. Null SMILES entries
    produce ``None`` in the 'Mol' column; every other entry is stringified and
    passed to ``Chem.MolFromSmiles``.

    Raises:
        ValueError: if ``smiles_col`` is not a column of ``df``.
    """
    if smiles_col not in df.columns:
        raise ValueError(f"❌ Column '{smiles_col}' not found!")

    def _to_mol(value):
        # Missing cells stay None instead of being parsed as the text "nan".
        if pd.notnull(value):
            return Chem.MolFromSmiles(str(value))
        return None

    df['Mol'] = df[smiles_col].apply(_to_mol)
    return df

def drop_duplicates_and_na(df, subset_cols=('SMILES',)):
    """Drop duplicate rows, then drop every row that contains a missing value.

    Args:
        df: Input DataFrame. A new DataFrame is returned.
        subset_cols: Column label(s) used to detect duplicates. Defaults to
            the 'SMILES' column. The default is an immutable tuple so it
            cannot be shared and mutated across calls.

    Returns:
        DataFrame with the first occurrence of each duplicate kept and all
        rows holding a missing value in ANY column removed (``dropna`` is
        applied to the whole frame, not only to ``subset_cols``).
    """
    # Hand pandas a list: a tuple may be interpreted as a single
    # (MultiIndex-style) column label rather than a collection of labels.
    subset = list(subset_cols) if isinstance(subset_cols, tuple) else subset_cols
    return df.drop_duplicates(subset=subset).dropna()

def parse_fingerprint_column(column):
    """Turn a column of fingerprint entries into a float NumPy array.

    String entries are evaluated with ``ast.literal_eval``; a string that
    cannot be evaluated is replaced by an empty list. Non-string entries are
    used as they are. The collected entries are then converted with
    ``np.array(..., dtype=float)``.
    """
    def _parse_entry(entry):
        if not isinstance(entry, str):
            return entry
        try:
            return ast.literal_eval(entry)
        except Exception:
            # Best-effort: an unparsable cell becomes an empty fingerprint.
            return []

    return np.array([_parse_entry(entry) for entry in column], dtype=float)

def prepare_X_y(df, fingerprint_cols, outcome_col='Outcome'):
    """Build the feature matrix X and the binary label vector y from ``df``.

    Args:
        df: DataFrame holding the outcome column and the fingerprint columns.
            The outcome column is cast to ``int`` in place.
        fingerprint_cols: Names of the columns whose entries are parsed with
            ``parse_fingerprint_column`` and concatenated column-wise.
        outcome_col: Name of the label column; only labels 0 and 1 are valid.

    Returns:
        Tuple ``(X, y)``: X is the column-wise concatenation of all parsed
        fingerprint arrays, y is an ``int32`` array of the labels.

    Raises:
        ValueError: if a label other than 0/1 is present, if no fingerprint
            column is given, or if the parsed fingerprint arrays disagree on
            the number of samples.
    """
    # Outcome: cast in place so the column is integer-typed afterwards.
    df[outcome_col] = df[outcome_col].astype(int)
    labels = df[outcome_col]

    # Reject unexpected labels explicitly. Mapping them through {0: 0, 1: 1}
    # would yield NaN, which an int32 cast silently turns into garbage values.
    unexpected = sorted(int(v) for v in set(labels.unique()) - {0, 1})
    if unexpected:
        raise ValueError(
            f"❌ Column '{outcome_col}' contains non-binary labels: {unexpected}"
        )
    y = labels.to_numpy(dtype=np.int32)

    # Fingerprints: one parsed array per requested column.
    X_parts = [parse_fingerprint_column(df[col]) for col in fingerprint_cols]
    if len(X_parts) == 0:
        raise ValueError("❌ No fingerprint columns given!")

    # Every fingerprint block must describe the same number of samples.
    n_samples = {arr.shape[0] for arr in X_parts}
    if len(n_samples) != 1:
        raise ValueError("❌ Number of samples inconsistent across fingerprints!")

    # Concatenate all fingerprints side by side.
    X = np.concatenate(X_parts, axis=1)
    return X, y

def check_numerical(data, name="Data"):
    """Ensure the array ``data`` has a numeric dtype.

    Returns ``None`` when ``data.dtype`` is a sub-dtype of ``np.number``.

    Raises:
        ValueError: otherwise; the message includes ``name``.
    """
    is_numeric = np.issubdtype(data.dtype, np.number)
    if is_numeric:
        return
    raise ValueError(f"❌ {name} contains non-numerical values!")

def plot_outcome_hist(y, bins=None, title="Outcome Histogram"):
    """Show a histogram of the outcome values in ``y``.

    ``bins`` is passed straight to ``plt.hist``; x-axis ticks are placed at
    0, 1, ... up to ``np.max(y)``. The figure is displayed with ``plt.show()``
    and nothing is returned.
    """
    plt.figure(figsize=(6, 4))
    plt.hist(y, bins=bins, edgecolor='k')

    # One tick per integer from 0 up to the largest observed outcome.
    tick_positions = np.arange(0, np.max(y) + 1)
    plt.xticks(tick_positions)

    plt.ylabel("Count")
    plt.xlabel("Outcome")
    plt.title(title)
    plt.show()

# ===============================================================
# Paths & fingerprint columns
# ===============================================================
# Names of the DataFrame columns that hold the fingerprint vectors; each one
# is parsed and the results are concatenated column-wise to form X.
fingerprint_columns = [
    'Morgan_Descriptors',
    'MACCS_Descriptors',
    'APF_Descriptors',
    'RDK_Descriptors'
]

# Absolute paths of the training and test Excel workbooks. Raw strings keep
# the Windows backslashes from being read as escape sequences.
train_file = r"C:\Fauzan\Manuscripts QSAR-RA 2\Old Endpoints\Acute Dermal Toxicity (manual split)\Read Across\Train_set_Dermal_balanced_with_fingerprints.xlsx"
test_file  = r"C:\Fauzan\Manuscripts QSAR-RA 2\Old Endpoints\Acute Dermal Toxicity (manual split)\Read Across\Test_set_Dermal_balanced_with_fingerprints.xlsx"

# ===============================================================
# Load & clean data
# ===============================================================
# Training set: read workbook -> add 'Mol' column -> drop duplicates / NA rows,
# then order the rows by ascending outcome class.
train_df = drop_duplicates_and_na(convert_smiles_to_mol(load_excel(train_file)))
train_df = train_df.sort_values(by=['Outcome'], ascending=True)

# Test set: identical pipeline.
test_df = drop_duplicates_and_na(convert_smiles_to_mol(load_excel(test_file)))
test_df = test_df.sort_values(by=['Outcome'], ascending=True)

# ===============================================================
# Prepare X and y
# ===============================================================
x_train, y_train = prepare_X_y(train_df, fingerprint_columns)
x_test, y_test = prepare_X_y(test_df, fingerprint_columns)

# Numeric validation of both feature matrices.
for arr, name in ((x_train, 'Train X'), (x_test, 'Test X')):
    check_numerical(arr, name)

# ===============================================================
# Info outcome
# ===============================================================
def print_class_info(y, label="Dataset"):
    """Print a class summary for ``y``, plot its histogram, return a class map.

    Prints the distinct classes, the number of samples per class, the total
    number of samples and a ``{str(class): position}`` mapping, then draws the
    outcome histogram with unit-wide bins starting at -0.5.

    Returns:
        dict mapping ``str(class)`` to the class's position in the sorted
        array of unique classes.
    """
    outcomes = np.unique(y)
    class_sizes = [int(np.count_nonzero(y == cls)) for cls in outcomes]
    info = dict(zip(map(str, outcomes), range(len(outcomes))))

    print(f"{label} Classes                          : ", outcomes)
    print(f"{label} Number of compounds in each class: ", class_sizes)
    print(f"{label} Total number of compounds        : ", len(y))
    print(f"{label} Class info mapping               : ", info)

    # Bin edges at -0.5, 0.5, ... so each integer class sits in its own bin.
    bin_edges = np.arange(-0.5, np.max(y) + 1, 1)
    plot_outcome_hist(y, bins=bin_edges, title=f"{label} Outcome Histogram")
    return info

# Class summaries (printed and plotted) for both splits.
train_info = print_class_info(y_train, label="Train")
test_info = print_class_info(y_test, label="Test")

# ===============================================================
# Print shapes
# ===============================================================
print(f"✅ Train shape: {x_train.shape} {y_train.shape}")
print(f"✅ Test shape : {x_test.shape} {y_test.shape}")


# CV WEIGHT OPTIMIZATION

In [None]:
import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem.Scaffolds import MurckoScaffold
from rdkit.DataStructs.cDataStructs import ExplicitBitVect
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.model_selection import KFold
import ast
import os

# ===============================================================
# Fingerprint conversion (uses the existing DataFrame format)
# ===============================================================
def convert_list_to_bitvect(fp_list):
    """Build an ``ExplicitBitVect`` from a sequence of 0/1 values.

    ``fp_list`` may be the sequence itself or its string representation, which
    is then evaluated with ``ast.literal_eval``. The bit vector has one bit
    per element; a bit is set wherever ``int(element)`` is non-zero.
    """
    bits = ast.literal_eval(fp_list) if isinstance(fp_list, str) else fp_list
    bitvect = ExplicitBitVect(len(bits))

    # Lazily walk the positions whose value is "on" and set them one by one.
    on_positions = (pos for pos, flag in enumerate(bits) if int(flag))
    for pos in on_positions:
        bitvect.SetBit(pos)
    return bitvect

def prepare_fingerprints(df, fingerprint_cols):
    """Convert every listed fingerprint column of ``df`` to bit vectors.

    Returns:
        dict mapping each column name to a list with one
        ``convert_list_to_bitvect`` result per row, in row order.
    """
    return {
        col: [convert_list_to_bitvect(entry) for entry in df[col]]
        for col in fingerprint_cols
    }

# ===============================================================
# Metric helpers & balanced accuracy (BACC)
# ===============================================================
def compute_bacc_from_probs(y_true, y_prob, thresh=0.5):
    """Return the balanced accuracy of thresholded probabilities.

    A sample is predicted positive (1) when its probability is ``>= thresh``.
    Balanced accuracy is the mean of sensitivity and specificity; a rate
    whose denominator is zero (class absent from ``y_true``) counts as 0.0.

    The four confusion counts are computed directly for the labels {0, 1}.
    This keeps the function working when only one class is present, a case
    where a confusion matrix inferred from the data collapses to a single
    cell and cannot be unpacked into tn/fp/fn/tp.

    Args:
        y_true: Binary ground-truth labels (0/1).
        y_prob: Predicted probabilities, same length as ``y_true``.
        thresh: Decision threshold.

    Returns:
        Balanced accuracy as a float in [0, 1].

    Raises:
        ValueError: if ``y_true`` and ``y_prob`` have different shapes.
    """
    y_true = np.asarray(y_true, dtype=int)
    y_pred = (np.asarray(y_prob) >= thresh).astype(int)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_prob must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )

    actual_pos = y_true == 1
    actual_neg = y_true == 0
    tp = int(np.count_nonzero(actual_pos & (y_pred == 1)))
    fn = int(np.count_nonzero(actual_pos & (y_pred == 0)))
    tn = int(np.count_nonzero(actual_neg & (y_pred == 0)))
    fp = int(np.count_nonzero(actual_neg & (y_pred == 1)))

    sens = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    return 0.5 * (sens + spec)

# ===============================================================
# Read-Across: similarity-weighted kNN (Eq. 4 of the manuscript)
# ===============================================================
def compute_tanimoto_similarity(target_fp, reference_fps):
    """Return the Tanimoto similarity of ``target_fp`` to each reference.

    The result is a NumPy array with one ``DataStructs.TanimotoSimilarity``
    value per element of ``reference_fps``, in the same order.
    """
    scores = []
    for candidate_fp in reference_fps:
        scores.append(DataStructs.TanimotoSimilarity(target_fp, candidate_fp))
    return np.array(scores)

def predict_properties_weighted(
    test_fps,
    train_fps,
    y_train,
    k=5,
    tanimoto_cutoff=0.0
):
    """Similarity-weighted k-nearest-neighbour read-across prediction.

    For each query fingerprint q:

        P_f(q) = Σ w_j y_j / Σ w_j,   w_j = Tanimoto(q, j)

    where j runs over the (at most) ``k`` most similar training compounds
    whose similarity passes the cut-off.

    Args:
        test_fps: Query fingerprints.
        train_fps: Training fingerprints, aligned with ``y_train``.
        y_train: Training labels (converted to float).
        k: Maximum number of neighbours per query; must be >= 1.
        tanimoto_cutoff: When > 0, only training compounds with similarity
            >= this value are eligible as neighbours.

    Returns:
        Float array with one predicted value per query. A query with no
        eligible neighbour gets the mean of ``y_train``; if all selected
        neighbours have zero similarity, the unweighted mean of their labels
        is used.

    Raises:
        ValueError: if ``k`` < 1 or ``train_fps`` and ``y_train`` differ in
            length.
    """
    # k == 0 would make the slice [-k:] select ALL neighbours; reject it.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    y_train = np.asarray(y_train, dtype=float)
    if len(train_fps) != len(y_train):
        raise ValueError(
            f"train_fps and y_train must have the same length, "
            f"got {len(train_fps)} and {len(y_train)}"
        )

    probs = np.zeros(len(test_fps), dtype=float)

    for i, test_fp in enumerate(test_fps):
        sims = compute_tanimoto_similarity(test_fp, train_fps)

        # Optionally restrict the analogues with the similarity cut-off.
        if tanimoto_cutoff > 0.0:
            valid_idx = np.flatnonzero(sims >= tanimoto_cutoff)
        else:
            valid_idx = np.arange(len(sims))

        # No analogue available: fall back to the training-set mean.
        if len(valid_idx) == 0:
            probs[i] = y_train.mean()
            continue

        # Keep the k most similar of the eligible neighbours.
        if len(valid_idx) > k:
            nn_idx = valid_idx[np.argsort(sims[valid_idx])[-k:]]
        else:
            nn_idx = valid_idx

        weights = sims[nn_idx]
        nn_labels = y_train[nn_idx]

        total_weight = weights.sum()
        if total_weight == 0:
            # All neighbours are completely dissimilar: plain average.
            probs[i] = nn_labels.mean()
        else:
            probs[i] = np.sum(weights * nn_labels) / total_weight

    return probs

# ===============================================================
# 10-fold scaffold CV for Read-Across, per fingerprint
# ===============================================================
def get_bemis_murcko_scaffold(smiles):
    """Return the SMILES of the Murcko scaffold of ``smiles``.

    Returns ``None`` when ``Chem.MolFromSmiles`` cannot parse the input or
    when ``MurckoScaffold.GetScaffoldForMol`` yields ``None``.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    core = MurckoScaffold.GetScaffoldForMol(mol)
    if core is None:
        return None
    return Chem.MolToSmiles(core)

def scaffold_kfold_indices(smiles_list, n_splits=10, random_state=42):
    """Create (approximate) scaffold-based K-fold train/validation indices.

    Compounds are grouped by their Murcko scaffold, the unique scaffolds are
    shuffled with a ``RandomState`` seeded by ``random_state``, and an
    unshuffled ``KFold`` is run over the scaffolds, so all compounds sharing
    a scaffold land on the same side of a split.

    Returns:
        List of ``(train_idx, val_idx)`` tuples of integer arrays holding
        positions into ``smiles_list``, one tuple per fold.
    """
    # Group compound positions by scaffold (insertion order = first seen).
    scaffold_to_indices = {}
    for position, smi in enumerate(smiles_list):
        key = get_bemis_murcko_scaffold(smi)
        scaffold_to_indices.setdefault(key, []).append(position)

    unique_scaffolds = list(scaffold_to_indices)
    np.random.RandomState(random_state).shuffle(unique_scaffolds)

    splitter = KFold(n_splits=n_splits, shuffle=False)
    scaffold_array = np.array(unique_scaffolds)

    def _members(scaffold_positions):
        # Expand scaffold positions into the positions of their compounds.
        collected = []
        for pos in scaffold_positions:
            collected.extend(scaffold_to_indices[scaffold_array[pos]])
        return np.array(collected, dtype=int)

    return [
        (_members(scaf_train_idx), _members(scaf_val_idx))
        for scaf_train_idx, scaf_val_idx in splitter.split(scaffold_array)
    ]

def compute_Sf_via_scaffold_cv(
    df,
    fingerprint_cols,
    outcome_col="Outcome",
    n_splits=10,
    k_neighbors=5,
    tanimoto_cutoff=0.0,
    smiles_col="SMILES",
    random_state=42
):
    """Compute mean AUC, mean BACC and Sf per fingerprint via scaffold CV.

    For each fingerprint column, a similarity-weighted kNN read-across model
    is evaluated on scaffold-based folds of ``df``. Sf is the average of the
    mean AUC and the mean BACC across folds; when no fold yields a defined
    AUC, Sf falls back to the mean BACC alone.

    Args:
        df: Training DataFrame with the SMILES, outcome and fingerprint columns.
        fingerprint_cols: Fingerprint column names to evaluate.
        outcome_col: Name of the binary outcome column.
        n_splits: Number of scaffold folds.
        k_neighbors: Number of neighbours used by the read-across predictor.
        tanimoto_cutoff: Similarity cut-off passed to the predictor.
        smiles_col: Name of the SMILES column used for scaffold grouping.
        random_state: Seed used when shuffling the scaffolds.

    Returns:
        dict mapping each fingerprint column to a dict with the keys
        ``"Mean_AUC_CV"``, ``"Mean_BACC_CV"`` and ``"Sf"``.
    """
    # Outcome labels.
    y = df[outcome_col].astype(int).values

    # SMILES strings drive the scaffold grouping.
    smiles = df[smiles_col].astype(str).values

    # Pre-compute the fingerprint objects once per column.
    fps_all = prepare_fingerprints(df, fingerprint_cols)

    folds = scaffold_kfold_indices(
        smiles, n_splits=n_splits, random_state=random_state
    )

    Sf_results = {}

    for fp_col in fingerprint_cols:
        print(f"\n=== CV Read-Across untuk fingerprint: {fp_col} ===")
        auc_scores = []
        bacc_scores = []

        # Keep the fingerprints in a plain list and pick fold members by
        # index. Wrapping them in np.array(..., dtype=object) risks NumPy
        # unpacking sequence-like fingerprint objects into a 2-D array of
        # their elements instead of a 1-D array of objects.
        fps = fps_all[fp_col]

        for fold_id, (idx_train, idx_val) in enumerate(folds, start=1):
            y_tr = y[idx_train]
            y_val = y[idx_val]

            fps_tr = [fps[j] for j in idx_train]
            fps_val = [fps[j] for j in idx_val]

            # Weighted-kNN read-across prediction for this fold.
            probs_val = predict_properties_weighted(
                test_fps=fps_val,
                train_fps=fps_tr,
                y_train=y_tr,
                k=k_neighbors,
                tanimoto_cutoff=tanimoto_cutoff
            )

            # AUC is undefined for a degenerate fold (roc_auc_score raises
            # ValueError); record NaN so the fold is skipped in the mean.
            try:
                auc = roc_auc_score(y_val, probs_val)
            except ValueError:
                auc = np.nan
            bacc = compute_bacc_from_probs(y_val, probs_val, thresh=0.5)

            auc_scores.append(auc)
            bacc_scores.append(bacc)

            print(f"Fold {fold_id}: AUC={auc:.4f}, BACC={bacc:.4f}")

        # Average over folds, ignoring folds whose AUC is NaN.
        auc_scores = np.array(auc_scores, dtype=float)
        bacc_scores = np.array(bacc_scores, dtype=float)

        valid = ~np.isnan(auc_scores)
        mean_auc = auc_scores[valid].mean() if valid.any() else np.nan
        mean_bacc = bacc_scores.mean()

        Sf = 0.5 * (mean_auc + mean_bacc) if not np.isnan(mean_auc) else mean_bacc

        Sf_results[fp_col] = {
            "Mean_AUC_CV": mean_auc,
            "Mean_BACC_CV": mean_bacc,
            "Sf": Sf
        }

        print(f"===> {fp_col}: mean AUC={mean_auc:.4f}, mean BACC={mean_bacc:.4f}, Sf={Sf:.4f}")

    return Sf_results

# ===============================================================
# CALL THE FUNCTION ON YOUR TRAINING SET
# (train_df has already been prepared by the preprocessing cell above)
# ===============================================================

# Fingerprint columns to evaluate in the cross-validation.
fingerprint_columns = [
    'Morgan_Descriptors',
    'MACCS_Descriptors',
    'APF_Descriptors',
    'RDK_Descriptors'
]

# Read-Across parameters
k_neighbors = 5
tanimoto_cutoff = 0.0    # set to 0.3/0.4/0.5 for an explicit similarity cut-off

# Run the 10-fold scaffold CV on the training set; the result maps each
# fingerprint column to its mean AUC, mean BACC and Sf.
Sf_results = compute_Sf_via_scaffold_cv(
    df=train_df,
    fingerprint_cols=fingerprint_columns,
    outcome_col="Outcome",
    n_splits=10,
    k_neighbors=k_neighbors,
    tanimoto_cutoff=tanimoto_cutoff
)

# ===============================================================
# Save Sf to CSV in the same output location used previously
# ===============================================================
# Destination folder (created if missing) and CSV file for the Sf table.
output_dir = r"C:\Fauzan\Manuscripts QSAR-RA 2\q-RASAR\Acute Dermal Toxicity\Read Across"
os.makedirs(output_dir, exist_ok=True)
sf_csv_path = os.path.join(output_dir, "Sf_per_fingerprint.csv")

# One row per fingerprint: its name followed by the three CV metrics.
rows = []
for fp, vals in Sf_results.items():
    rows.append({
        "Fingerprint": fp,
        **{metric: vals[metric] for metric in ("Mean_AUC_CV", "Mean_BACC_CV", "Sf")},
    })

sf_df = pd.DataFrame(rows)
sf_df.to_csv(sf_csv_path, index=False)

print("\n=== Sf per fingerprint disimpan ke ===", sf_csv_path, sep="\n")
