In [None]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.Descriptors import MolLogP
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.Chem import Descriptors
from rdkit.Chem import PandasTools
from rdkit.DataStructs import ExplicitBitVect
import sys
import multiprocessing
from standardiser import break_bonds, neutralise, rules, unsalt
from standardiser.utils import StandardiseException, sanity_check
%reload_ext autoreload
%autoreload 2
def warn(*args, **kwargs):
    """Accept any positional/keyword arguments and do nothing.

    Installed below as a replacement for ``warnings.warn`` so that
    warning calls become silent no-ops.
    """
    return None
import warnings
warnings.filterwarnings("ignore")
warnings.warn = warn
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
import sys
from sklearn.metrics import cohen_kappa_score
import csv
from rdkit.Chem import MACCSkeys
from sklearn.model_selection import ShuffleSplit
import _pickle as cPickle
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit    
import bz2
from glob import glob
import _pickle as cPickle
import pickle
#Draw.DrawingOptions.atomLabelFontFace = "DejaVu Sans"
#Draw.DrawingOptions.atomLabelFontSize = 18
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors, rdMolDescriptors
from CDK_pywrapper import CDK
import pandas as pd
import os
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys
from rdkit.Chem.AtomPairs import Pairs
from rdkit.Chem import rdMolDescriptors

# Preprocessing

In [None]:
from rdkit import Chem
# rdMolStandardize is a submodule of the rdkit.Chem.MolStandardize package;
# importing it straight from rdkit.Chem fails with ImportError.
from rdkit.Chem.MolStandardize import rdMolStandardize
import pandas as pd
import numpy as np
import os

# ==========================
# Input & output paths
# ==========================
raw_path = r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Original_dataset_acute dermal.xlsx"
out_path = r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Curated_dataset_acute_dermal.xlsx"

df = pd.read_excel(raw_path)

# Fail early if the columns used by the curation step are missing.
assert "SMILES" in df.columns, "Kolom SMILES tidak ditemukan"
assert "Outcome" in df.columns, "Kolom Outcome tidak ditemukan"

# ==========================
# RDKit standardization tools
# ==========================
# Created once at module level and reused by standardize_smiles() for every row.
normalizer = rdMolStandardize.Normalizer()
reionizer = rdMolStandardize.Reionizer()
tautomer_enumerator = rdMolStandardize.TautomerEnumerator()

def standardize_smiles(smi):
    """Return a standardized canonical SMILES for ``smi``, or ``None`` if unusable.

    Curation steps, in order:
      1. reject non-string / blank input and SMILES that RDKit cannot parse;
      2. keep only the largest fragment (by atom count) as the parent structure;
      3. normalize and reionize the parent (best effort);
      4. replace it with its canonical tautomer (best effort);
      5. write an isomeric canonical SMILES.

    Relies on the module-level ``normalizer``, ``reionizer`` and
    ``tautomer_enumerator`` objects.

    Parameters
    ----------
    smi : Any
        Raw SMILES value; anything that is not a non-blank ``str`` yields ``None``.

    Returns
    -------
    str or None
        Canonical SMILES of the curated parent, or ``None`` when the input is
        invalid or the final SMILES cannot be generated.
    """
    if not isinstance(smi, str):
        return None
    smi = smi.strip()
    if not smi:
        return None

    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return None  # structurally invalid -> discard

    # 1) Reduce to the parent: drop counter-ions/solvents by keeping the
    #    fragment with the most atoms. The keyword is `sanitizeFrags`; the
    #    previous `sanitize=` is not a parameter of GetMolFrags and raised on
    #    every call.
    # NOTE(review): fragments are left unsanitized, as the original call
    # intended; failures in the best-effort steps below are swallowed, so
    # consider sanitizeFrags=True if normalization appears to be skipped.
    frags = Chem.GetMolFrags(mol, asMols=True, sanitizeFrags=False)
    if not frags:
        return None  # no atoms, nothing to curate
    # max() returns the first largest fragment, the same pick as a stable
    # descending sort followed by [0].
    mol = max(frags, key=lambda frag: frag.GetNumAtoms())

    # 2) Normalization & reionization (functional-group and charge clean-up).
    #    Best effort: on failure the molecule is kept as it was.
    try:
        mol = normalizer.normalize(mol)
        mol = reionizer.reionize(mol)
    except Exception:
        pass

    # 3) Tautomers: pick the canonical form. Best effort as above.
    try:
        mol = tautomer_enumerator.Canonicalize(mol)
    except Exception:
        pass

    # 4) Canonical SMILES
    try:
        return Chem.MolToSmiles(mol, isomericSmiles=True)
    except Exception:
        return None

# ==========================
# Apply curation to the whole dataset
# ==========================
df["Curated_SMILES"] = [standardize_smiles(smi) for smi in df["SMILES"]]
df["Is_Valid"] = df["Curated_SMILES"].notna()

# Drop structures that could not be standardized
df_valid = df[df["Is_Valid"]].copy().reset_index(drop=True)

# ==========================
# Remove duplicates (by canonical SMILES)
# ==========================
# Rule: a structure that occurs several times is kept once when all of its
# records share the same Outcome; structures with conflicting labels are
# dropped entirely. Missing outcomes count as their own label
# (dropna=False), matching Series.unique().
labels_per_structure = df_valid.groupby("Curated_SMILES")["Outcome"].nunique(dropna=False)
consistent_smiles = labels_per_structure.index[labels_per_structure == 1]

# A single vectorized pass replaces the per-duplicate frame scan and the
# rebuild-from-Series step, so column dtypes are preserved. Rows stay in
# dataset order (the loop version listed unique structures first and appended
# the de-duplicated ones afterwards); the set of retained rows is the same.
df_unique = (
    df_valid[df_valid["Curated_SMILES"].isin(consistent_smiles)]
    .drop_duplicates(subset="Curated_SMILES", keep="first")
    .drop(columns=["Is_Valid"])  # helper column
    .rename(columns={"Curated_SMILES": "SMILES_canonical"})
    .reset_index(drop=True)
)

assert df_unique["SMILES_canonical"].is_unique, "duplicate structures left after curation"

print("Sebelum kurasi:", len(df), "baris")
print("Setelah validasi & deduplikasi:", len(df_unique), "baris")

# ==========================
# Save the curated dataset
# ==========================
df_unique.to_excel(out_path, index=False)
print("Curated dataset disimpan di:", out_path)


# Dataset Splitting

In [None]:
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
import pandas as pd
import numpy as np
import os

# ==========================
# Load dataset
# ==========================
# NOTE(review): this is the same file the preprocessing cell reads as
# `raw_path`, not the curated file it writes to `out_path` — confirm that
# splitting the uncurated data is intended.
file_path = r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Original_dataset_acute dermal.xlsx"
df = pd.read_excel(file_path)

# Show which columns were loaded.
print("Kolom dataset:", df.columns)

# SMILES column coerced to str; consumed by the scaffold computation below.
smiles_all = df["SMILES"].astype(str).values

# ==========================
# Helper: Bemis–Murcko scaffold
# ==========================
def get_bemis_murcko_scaffold(smiles):
    """Return the Murcko scaffold of ``smiles`` as a SMILES string.

    Returns ``None`` when RDKit cannot parse ``smiles`` or when no scaffold
    object is produced.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return None

    core = MurckoScaffold.GetScaffoldForMol(parsed)
    if core is None:
        return None
    return Chem.MolToSmiles(core)

# Compute the scaffold of every molecule
scaffolds = list(map(get_bemis_murcko_scaffold, smiles_all))

# Group row positions by scaffold
scaf_to_idx = {}
for position, scaffold in enumerate(scaffolds):
    if scaffold not in scaf_to_idx:
        scaf_to_idx[scaffold] = []
    scaf_to_idx[scaffold].append(position)

# Visit scaffold groups in a seeded random order
unique_scaffolds = list(scaf_to_idx)
rng = np.random.RandomState(42)
rng.shuffle(unique_scaffolds)

# ==========================
# Scaffold-based 80:20 split
# ==========================
n_total = len(df)
target_train = 0.8 * n_total
train_idx, test_idx = [], []
current_train = 0

# A whole scaffold group goes to train while it still fits within the 80%
# budget; any group that would overshoot the budget goes to test.
for scaffold in unique_scaffolds:
    members = scaf_to_idx[scaffold]
    if current_train + len(members) > target_train:
        test_idx.extend(members)
        continue
    train_idx.extend(members)
    current_train += len(members)

train_idx = np.array(train_idx, dtype=int)
test_idx = np.array(test_idx, dtype=int)

train_df = df.iloc[train_idx].reset_index(drop=True)
test_df = df.iloc[test_idx].reset_index(drop=True)

# ==========================
# Save the split
# ==========================
# Both files are written next to the input file.
folder = os.path.dirname(file_path)
train_file = os.path.join(folder, "Train_set_acute_dermal_scaffoldsplit.xlsx")
test_file = os.path.join(folder, "Test_set_acute_dermal_scaffoldsplit.xlsx")

train_df.to_excel(train_file, index=False)
test_df.to_excel(test_file, index=False)

print(f"Train set disimpan di: {train_file}")
print(f"Test set disimpan di: {test_file}")
print(f"Total sampel: {n_total}, Train: {len(train_df)}, Test: {len(test_df)}")

# Class balance of each partition (relative frequencies)
print("\nDistribusi Outcome Train:")
print(train_df["Outcome"].value_counts(normalize=True))

print("\nDistribusi Outcome Test:")
print(test_df["Outcome"].value_counts(normalize=True))


# FINGERPRINTS COMPUTATION

In [None]:
# Morgan fingerprint as a list of bits
def compute_morgan_fp(smiles, radius=2, n_bits=1024):
    """Return the Morgan bit fingerprint of ``smiles`` as a list of 0/1 ints.

    Parameters
    ----------
    smiles : Any
        SMILES string. Non-string values (e.g. NaN from a blank spreadsheet
        cell) and unparsable SMILES take the fallback below instead of raising.
    radius : int
        Morgan radius passed to RDKit.
    n_bits : int
        Fingerprint length.

    Returns
    -------
    list of int
        ``n_bits`` bits; all zeros when no molecule could be built.
    """
    # Guard the type first: MolFromSmiles raises on non-string input.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return [0] * n_bits  # fallback: zero vector
    return list(AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits))

# MACCS keys as a list of bits
def compute_maccs_fp(smiles):
    """Return the MACCS keys of ``smiles`` as a list of 0/1 ints.

    Non-string values (e.g. NaN from a blank spreadsheet cell) and unparsable
    SMILES return a zero vector of length 167 instead of raising.
    """
    # Guard the type first: MolFromSmiles raises on non-string input.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return [0] * 167  # fallback length used for MACCS keys in this notebook
    return list(MACCSkeys.GenMACCSKeys(mol))

# Atom-pair fingerprint (APF) as a list of bits
def compute_apf_fp(smiles, n_bits=1024):
    """Return the hashed atom-pair bit fingerprint of ``smiles`` as a list of ints.

    Parameters
    ----------
    smiles : Any
        SMILES string. Non-string values and unparsable SMILES take the
        zero-vector fallback instead of raising.
    n_bits : int
        Fingerprint length.

    Returns
    -------
    list of int
        ``n_bits`` bits; all zeros when no molecule could be built.
    """
    # Guard the type first: MolFromSmiles raises on non-string input.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        return [0] * n_bits
    fp = rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol, nBits=n_bits)
    return list(fp)

# Input files
# Raw strings: the Windows paths contain backslash sequences such as "\F" and
# "\M" that are invalid escapes in a normal string literal (a warning today,
# slated to become an error). The resulting path values are unchanged.
input_files = [
    r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Test_set.xlsx",
    r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Train_set.xlsx",
]


# Process each input file
for input_path in input_files:
    # Load dataset
    df = pd.read_excel(input_path)

    # Compute the three fingerprints for every molecule; each cell holds the
    # whole bit list.
    df["Morgan_Descriptors"] = df["SMILES"].apply(compute_morgan_fp)
    df["MACCS_Descriptors"] = df["SMILES"].apply(compute_maccs_fp)
    df["APF_Descriptors"] = df["SMILES"].apply(compute_apf_fp)

    # Sort rows by the Outcome column
    df_sorted = df.sort_values(by="Outcome", ascending=True)

    # Output goes next to the input file
    folder = os.path.dirname(input_path)
    filename = os.path.splitext(os.path.basename(input_path))[0]
    output_path = os.path.join(folder, f"{filename}_with_fingerprints_sorted.xlsx")

    # Save the result
    df_sorted.to_excel(output_path, index=False)
    print(f"Hasil untuk '{input_path}' telah disimpan ke '{output_path}'.")

print("Selesai memproses semua file.")


# RDKIT-CDK COMPUTATION

In [None]:
# Helper: convert a SMILES string to an RDKit Mol
def smiles_to_mol(smiles):
    """Return the RDKit molecule parsed from ``smiles``, or ``None`` on failure.

    Non-string input returns ``None`` without calling RDKit. Only ``Exception``
    subclasses are swallowed, so KeyboardInterrupt/SystemExit still propagate
    (the previous bare ``except:`` caught those as well).
    """
    if not isinstance(smiles, str):
        return None
    try:
        return Chem.MolFromSmiles(smiles)
    except Exception:
        return None

# Compute RDKit descriptors
def calculate_rdkit_descriptors(mol):
    """Return a dict mapping descriptor column names to values for ``mol``.

    When ``mol`` is ``None`` every column is present with value ``None``, so
    downstream frames keep the same columns. Column order follows the table
    below.
    """
    # One (column name, calculator) pair per descriptor. Lambdas defer the
    # RDKit lookups until a molecule is actually evaluated.
    descriptor_table = (
        ('Molecular Weight', lambda m: Descriptors.MolWt(m)),
        ('logP', lambda m: Descriptors.MolLogP(m)),
        ('LabuteASA', lambda m: rdMolDescriptors.CalcLabuteASA(m)),
        ('TPSA', lambda m: Descriptors.TPSA(m)),
        # Same call as 'Molecular Weight', so the two columns are identical.
        ('AMW', lambda m: Descriptors.MolWt(m)),
        ('NumRotatableBonds', lambda m: Descriptors.NumRotatableBonds(m)),
        ('NumAromaticRings', lambda m: Descriptors.NumAromaticRings(m)),
        ('NumSaturatedRings', lambda m: Descriptors.NumSaturatedRings(m)),
        ('NumAliphaticRings', lambda m: Descriptors.NumAliphaticRings(m)),
        ('NumAromaticHeterocycles', lambda m: Descriptors.NumAromaticHeterocycles(m)),
        ('NumSaturatedHeterocycles', lambda m: Descriptors.NumSaturatedHeterocycles(m)),
        ('NumAliphaticHeterocycles', lambda m: Descriptors.NumAliphaticHeterocycles(m)),
        ('NumAromaticCarbocycles', lambda m: Descriptors.NumAromaticCarbocycles(m)),
        ('NumSaturatedCarbocycles', lambda m: Descriptors.NumSaturatedCarbocycles(m)),
        ('NumAliphaticCarbocycles', lambda m: Descriptors.NumAliphaticCarbocycles(m)),
        ('FractionCSP3', lambda m: Descriptors.FractionCSP3(m)),
        ('Chi0v', lambda m: Descriptors.Chi0v(m)),
        ('Chi1v', lambda m: Descriptors.Chi1v(m)),
        ('Chi2v', lambda m: Descriptors.Chi2v(m)),
        ('Chi3v', lambda m: Descriptors.Chi3v(m)),
        ('Chi4v', lambda m: Descriptors.Chi4v(m)),
        ('Chi1n', lambda m: Descriptors.Chi1n(m)),
        ('Chi2n', lambda m: Descriptors.Chi2n(m)),
        ('Chi3n', lambda m: Descriptors.Chi3n(m)),
        ('Chi4n', lambda m: Descriptors.Chi4n(m)),
        ('HallKierAlpha', lambda m: Descriptors.HallKierAlpha(m)),
        ('Heavy Atom Count', lambda m: Descriptors.HeavyAtomCount(m)),
        ('Ring Count', lambda m: Descriptors.RingCount(m)),
        ('Num H Donors', lambda m: Descriptors.NumHDonors(m)),
        ('Num H Acceptors', lambda m: Descriptors.NumHAcceptors(m)),
    )

    if mol is None:
        return {label: None for label, _ in descriptor_table}
    return {label: compute(mol) for label, compute in descriptor_table}

# Initialise CDK
cdk = CDK()

# Input files
# Raw strings: the Windows paths contain backslash sequences such as "\F" and
# "\M" that are invalid escapes in a normal string literal (a warning today,
# slated to become an error). The resulting path values are unchanged.
input_files = [
    r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Train_set_with_fingerprints_sorted.xlsx",
    r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Test_set_with_fingerprints_sorted.xlsx",
]

# CDK features to keep
cdk_features = [
    'ALogP', 'ALogp2', 'AMR', 'MLogP', 'nAtomP', 'naAromAtom', 'bpol',
    'nB', 'ECCEN', 'fragC', 'nHBAcc', 'nHBDon', 'nAtomLAC', 'nAtomLC',
    'PetitjeanNumber', 'nRotB', 'LipinskiFailures', 'TopoPSA', 'VAdjMat',
    'XLogP', 'Fsp3'
]

# Process each file
for input_path in input_files:
    # Read data
    df = pd.read_excel(input_path)

    # Build RDKit molecules
    df['Mol'] = df['SMILES'].apply(smiles_to_mol)

    # Compute RDKit descriptors (one dict per row)
    df['RDKit_Descriptors'] = df['Mol'].apply(calculate_rdkit_descriptors)

    # Expand the descriptor dicts into columns
    rdkit_df = pd.json_normalize(df['RDKit_Descriptors'])

    # Compute CDK features and keep the selected ones
    # NOTE(review): the concat below aligns rows by position, which presumes
    # cdk.calculate returns one row per input molecule in the same order —
    # confirm, especially when 'Mol' contains None.
    cdk_descriptors = cdk.calculate(df['Mol'])
    cdk_df = cdk_descriptors[cdk_features].reset_index(drop=True)

    # Combine all features with the original columns
    df_final = pd.concat([df.drop(columns=['Mol', 'RDKit_Descriptors']), rdkit_df, cdk_df], axis=1)

    # Output path derived from the input file name
    output_path = input_path.replace('.xlsx', '_with_RDKit_and_CDK_features.xlsx')

    # Save the result
    df_final.to_excel(output_path, index=False)
    print(f"Dataset '{input_path}' dengan RDKit + CDK features sudah disimpan di:\n{output_path}")

print("Selesai memproses semua file.")


In [None]:
# `plt` is used below but is not imported anywhere else in the notebook;
# without this import the cell fails with NameError on a fresh kernel.
import matplotlib.pyplot as plt

# Excel input files
# Raw strings avoid invalid escape sequences ("\F", "\M", "\C", ...) in the
# Windows paths; the resulting path values are unchanged.
# NOTE(review): these point at the Carcinogenicity dataset while the cells
# above process Acute Dermal Toxicity files — confirm this is intended.
input_files = [
    r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Carcinogencity (manual split)\Dataset\Train_set_Carcinogenicity_with_fingerprints_sorted_with_RDKit_and_CDK_features.xlsx",
    r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Carcinogencity (manual split)\Dataset\Test_set_Carcinogenicity_with_fingerprints_sorted_with_RDKit_and_CDK_features.xlsx",
]


for file_path in input_files:
    # Read the Excel file
    df = pd.read_excel(file_path)

    # Skip files without an 'Outcome' column
    if 'Outcome' not in df.columns:
        print(f"Kolom 'Outcome' tidak ditemukan di {file_path}.")
        continue

    # Counted once, reused for both the printout and the bar chart
    outcome_counts = df['Outcome'].value_counts()

    # Basic information
    print(f"\nFile: {file_path}")
    print(f"Total data: {len(df)}")
    print("Distribusi Outcome:")
    print(outcome_counts)

    # File name is taken outside the f-string: a backslash inside an f-string
    # expression is a SyntaxError before Python 3.12.
    file_name = file_path.split("\\")[-1]

    # Bar chart of the Outcome distribution
    plt.figure(figsize=(8, 6))
    outcome_counts.plot(kind='bar', color='skyblue')
    plt.title(f'Distribusi Outcome - {file_name}')
    plt.xlabel('Outcome')
    plt.ylabel('Frekuensi')
    plt.xticks(rotation=45)
    plt.show()
