In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
from mordred import Calculator, descriptors
import pandas as pd
from rdkit.Chem import rdMolDescriptors
import numpy as np
from rdkit.Chem import Descriptors, AllChem


In [None]:

# Competition training table, loaded without its 'ID' column.
train_original = pd.read_csv('../data_preprocessing/dacon/train.csv').drop(columns=['ID'])

# Auxiliary tables from processed_data/. Two of them carry a leftover
# 'Unnamed: 0' column that is removed right after loading.
train_884 = pd.read_csv('../data_preprocessing/processed_data/aid_884_cyp3a4.csv')
train_1851 = pd.read_csv('../data_preprocessing/processed_data/aid_1851_cyp3a4.csv')
train_1645841 = pd.read_csv('../data_preprocessing/processed_data/aid_1645841_cyp3a4.csv')
train_1671201 = pd.read_csv(
    '../data_preprocessing/processed_data/aid_1671201_mean.csv'
).drop(columns=['Unnamed: 0'])
train_chembl = pd.read_csv(
    '../data_preprocessing/processed_data/chembl_final.csv'
).drop(columns=['Unnamed: 0'])
train_coldf = pd.read_csv('../data_preprocessing/processed_data/col_df.csv')
train_store = pd.read_csv('../data_preprocessing/processed_data/testosterone_df.csv')

# Stack every source row-wise under a fresh 0..n-1 index.
train_original = pd.concat(
    [
        train_original,
        train_884,
        train_1851,
        train_1645841,
        train_1671201,
        train_chembl,
        train_coldf,
        train_store,
    ],
    axis=0,
    ignore_index=True,
)

test = pd.read_csv('../data_preprocessing/dacon/test.csv')
submission = pd.read_csv('../data_preprocessing/dacon/sample_submission.csv')

# `train` is an alias of the merged table, not a copy.
train = train_original

In [None]:
# Step 1: drop exact (SMILES, Inhibition) repeats so they are not counted as
# separate measurements below.
df_unique = train.drop_duplicates(subset=["Canonical_Smiles", "Inhibition"], keep="first")

# Step 2: per-molecule summary of the remaining distinct measurements.
grouped = df_unique.groupby("Canonical_Smiles")
stats = grouped["Inhibition"].agg(["count", "mean", "min", "max"]).reset_index()

# Step 3: molecules with exactly one measurement pass through unchanged.
not_duplicated = stats.loc[stats["count"] == 1, ["Canonical_Smiles"]]
result = df_unique[df_unique["Canonical_Smiles"].isin(not_duplicated["Canonical_Smiles"])]

# Step 4: molecules with several distinct measurements get one merged label.
duplicated_stats = stats[stats["count"] > 1]

processed_rows = []
for smiles, inh_min, inh_max, inh_mean in zip(
    duplicated_stats["Canonical_Smiles"],
    duplicated_stats["min"],
    duplicated_stats["max"],
    duplicated_stats["mean"],
):
    if inh_max - inh_min >= 30:
        # Spread of 30 or more: the molecule is left out entirely.
        continue

    if inh_mean <= 20:
        inh_value = inh_min      # low mean -> keep the lowest measurement
    elif inh_mean >= 70:
        inh_value = inh_max      # high mean -> keep the highest measurement
    else:
        inh_value = inh_mean     # otherwise -> keep the mean

    processed_rows.append({"Canonical_Smiles": smiles, "Inhibition": inh_value})

processed_df = pd.DataFrame(processed_rows)

# Step 5: single-measurement rows followed by the merged rows.
final_df = pd.concat(
    [result[["Canonical_Smiles", "Inhibition"]], processed_df],
    ignore_index=True,
).reset_index(drop=True)
train = final_df.copy()

In [None]:
# Mordred calculator built from the imported `descriptors` collection.
# NOTE(review): ignore_3D=False presumably leaves 3D descriptors enabled; the
# molecules passed in later are parsed from SMILES only, so confirm that is
# intended.
calc = Calculator(descriptors, ignore_3D=False)
# Bare last expression: shows calc.descriptors as the cell output.
calc.descriptors

In [None]:
def count_fragments(smiles):
    """Return how many disconnected fragments a SMILES string contains.

    Args:
        smiles: SMILES text handed to ``Chem.MolFromSmiles``.

    Returns:
        ``len(Chem.GetMolFrags(mol))``, or 0 when the SMILES does not parse
        or an RDKit call raises.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return 0
        return len(Chem.GetMolFrags(mol))
    except Exception:
        # Best-effort: input RDKit rejects counts as zero fragments.
        # `Exception` rather than a bare `except` so that a kernel interrupt
        # (KeyboardInterrupt) still stops a long-running `.apply`.
        return 0


train['Fragment_Count'] = train['Canonical_Smiles'].apply(count_fragments)
test['Fragment_Count'] = test['Canonical_Smiles'].apply(count_fragments)

In [None]:
def get_largest_fragment(smiles):
    """Return the SMILES of the fragment with the most atoms.

    Args:
        smiles: SMILES text handed to ``Chem.MolFromSmiles``.

    Returns:
        ``Chem.MolToSmiles`` of the fragment with the largest
        ``GetNumAtoms()``, or None when the SMILES does not parse or an RDKit
        call raises.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        frags = Chem.GetMolFrags(mol, asMols=True, sanitizeFrags=True)
        largest = max(frags, key=lambda m: m.GetNumAtoms())
        return Chem.MolToSmiles(largest)
    except Exception:
        # Best-effort: any RDKit failure yields None. `Exception` rather than
        # a bare `except` so that a kernel interrupt (KeyboardInterrupt) still
        # stops a long-running `.apply`.
        return None


# Replace each SMILES with its largest fragment (None where that failed).
train['Canonical_Smiles'] = train['Canonical_Smiles'].apply(get_largest_fragment)
test['Canonical_Smiles'] = test['Canonical_Smiles'].apply(get_largest_fragment)
train['Canonical_Smiles']

In [None]:
# Parse every training SMILES into an RDKit Mol, then run the mordred
# calculator over the whole list.
train_mols = [Chem.MolFromSmiles(smi) for smi in train['Canonical_Smiles']]
x_train = calc.pandas(train_mols)
x_train

In [None]:
# Column by column, turn anything non-numeric into NaN.
x_train = x_train.apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [None]:
# Put the descriptor columns next to the SMILES / label columns.
train_complete = pd.concat((train, x_train), axis='columns')
train_complete

In [None]:
# Same descriptor calculation as for the training set, on the test SMILES.
test_mols = [Chem.MolFromSmiles(smi) for smi in test['Canonical_Smiles']]
x_test = calc.pandas(test_mols)
x_test

In [None]:
# Column by column, turn anything non-numeric into NaN.
x_test = x_test.apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [None]:
# Put the descriptor columns next to the original test columns.
test_complete = pd.concat((test, x_test), axis='columns')
test_complete

In [None]:
# Fraction of missing values in each training column.
nan_ratio = train_complete.isna().mean()

# Columns missing in 30% or more of the training rows are removed from both
# tables; the decision is based on the training table only.
cols_to_drop = nan_ratio.loc[nan_ratio >= 0.3].index
train_complete = train_complete.drop(columns=cols_to_drop)
test_complete = test_complete.drop(columns=cols_to_drop)


In [None]:
# Persist both tables (without the index) so later cells can reload them.
for frame, csv_path in ((train_complete, "train_clean.csv"), (test_complete, "test_clean.csv")):
    frame.to_csv(csv_path, index=False)
train_complete

In [None]:

# Reload the cleaned tables written by the previous cell.
train, test = (pd.read_csv(csv_path) for csv_path in ('train_clean.csv', 'test_clean.csv'))

In [None]:
import re
def tokenize_smiles(smiles_string):
    """Split a SMILES string into its tokens.

    A token is one of: a whole bracket atom (``[NH+]``), a one- or two-letter
    organic-subset atom (``C``, ``Cl``, ``B``, ``Br``, ``N``, ...), an aromatic
    atom (``c``, ``n``, ...), a bond or branch symbol (including ``/`` and
    ``\\``), a two-digit ring closure (``%12``) or a single digit.

    Args:
        smiles_string: SMILES text.

    Returns:
        List of token strings in order of appearance. Characters that match
        no token are skipped.
    """
    # Raw string with no capturing group, so `re.findall` returns the matched
    # text itself for every alternative (a capturing group would make it
    # return '' for bracket atoms). `Br?` / `Cl?` also cover plain B and C.
    token_regex = (
        r'\[[^\]]+\]'
        r'|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p'
        r'|\(|\)|\.|=|#|-|\+|\\|/|:|~|@|\?|>|\*|\$'
        r'|%[0-9]{2}|[0-9]'
    )
    return re.findall(token_regex, smiles_string)

# Token list per molecule.
train['SMILES_tokens'] = train['Canonical_Smiles'].apply(tokenize_smiles)
test['SMILES_tokens'] = test['Canonical_Smiles'].apply(tokenize_smiles)

from sklearn.preprocessing import MultiLabelBinarizer

# One indicator column per token; the vocabulary is fitted on train only and
# reused for test.
mlb = MultiLabelBinarizer()

train_encoded = mlb.fit_transform(train['SMILES_tokens'])
test_encoded = mlb.transform(test['SMILES_tokens'])

# Prefixed column names: raw tokens such as 'N', '=' or '1' make poor column
# labels. NOTE(review): they could presumably also clash with descriptor
# columns; confirm.
mlb_classes = [f'SMILES_{c}' for c in mlb.classes_]

# Use the prefixed names (previously computed but never applied).
train_encoded_df = pd.DataFrame(train_encoded, columns=mlb_classes)
test_encoded_df = pd.DataFrame(test_encoded, columns=mlb_classes)

train_extended = pd.concat([train, train_encoded_df], axis=1)
test_extended = pd.concat([test, test_encoded_df], axis=1)

train_extended.head()


In [None]:

def generate_descriptors(smiles):
    """Compute a fixed set of 16 RDKit descriptors for one SMILES string.

    Returns a dict keyed by descriptor name. When the SMILES does not parse,
    every value in the dict is 0.
    """
    mol = Chem.MolFromSmiles(smiles)

    # Output keys, in column order. Every name except 'NumAtoms' and
    # 'NumHeavyAtoms' is also the name of the function in `Descriptors`.
    columns = (
        "MolWt", "MolLogP", "NumHDonors", "NumHAcceptors",
        "NumRotatableBonds", "NumAromaticRings", "NumSaturatedRings",
        "NumAliphaticRings", "RingCount", "TPSA",
        "NumValenceElectrons", "NumRadicalElectrons", "NumAtoms",
        "NumHeavyAtoms", "ExactMolWt", "NumHeteroatoms",
    )
    if not mol:
        return dict.fromkeys(columns, 0)

    # The two atom counts come from methods on the Mol object itself.
    mol_methods = {
        "NumAtoms": mol.GetNumAtoms,
        "NumHeavyAtoms": mol.GetNumHeavyAtoms,
    }
    values = {}
    for name in columns:
        if name in mol_methods:
            values[name] = mol_methods[name]()
        else:
            values[name] = getattr(Descriptors, name)(mol)
    return values

# One dict of descriptors per molecule, expanded into one column per key and
# appended to the feature tables.
train_desc_records = train_extended['Canonical_Smiles'].apply(generate_descriptors)
train_desc_df = train_desc_records.apply(pd.Series)
train_extended = pd.concat([train_extended, train_desc_df], axis=1)

test_desc_records = test_extended['Canonical_Smiles'].apply(generate_descriptors)
test_desc_df = test_desc_records.apply(pd.Series)
test_extended = pd.concat([test_extended, test_desc_df], axis=1)

def generate_all_descriptors(smiles):
    """Call every public callable of ``Descriptors`` on one molecule.

    Args:
        smiles: SMILES text handed to ``Chem.MolFromSmiles``.

    Returns:
        Dict mapping attribute name to the value it returned. Callables that
        raise for this molecule are left out, so an unparsable SMILES
        typically yields an empty dict.
    """
    mol = Chem.MolFromSmiles(smiles)
    # NOTE(review): this picks up every public callable in the module, which
    # may include helpers that are not descriptors; confirm the resulting
    # column set is what is wanted.
    descriptor_names = [
        attr for attr in dir(Descriptors)
        if not attr.startswith("_") and callable(getattr(Descriptors, attr))
    ]

    values = {}
    for name in descriptor_names:
        try:
            values[name] = getattr(Descriptors, name)(mol)
        except Exception:
            # Best-effort: skip callables that fail for this molecule.
            # `Exception` rather than a bare `except` so that a kernel
            # interrupt is not swallowed once per descriptor.
            continue

    return values

# Full RDKit descriptor sweep, one 'RDKit_'-prefixed column per descriptor.
# The result is deliberately NOT named `descriptors`: that name is the mordred
# module imported at the top and used to build `calc`, and rebinding it would
# break a re-run of that cell.
descriptors_tr = train['Canonical_Smiles'].apply(generate_all_descriptors).apply(pd.Series)
descriptors_tr.columns = [f'RDKit_{col}' for col in descriptors_tr.columns]
train_extended = pd.concat([train_extended, descriptors_tr], axis=1)

descriptors_te = test['Canonical_Smiles'].apply(generate_all_descriptors).apply(pd.Series)
descriptors_te.columns = [f'RDKit_{col}' for col in descriptors_te.columns]
test_extended = pd.concat([test_extended, descriptors_te], axis=1)
test_extended

In [None]:
from rdkit.DataStructs import ConvertToNumpyArray

def morgan_fp_array(smiles, radius=4, nBits=2048):
    """Return the Morgan bit-vector fingerprint of a SMILES as an int array.

    An unparsable SMILES is replaced by the molecule 'C' before
    fingerprinting.
    """
    parsed = Chem.MolFromSmiles(smiles)
    mol = parsed if parsed is not None else Chem.MolFromSmiles('C')  # fallback
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)
    bits = np.zeros((nBits,), dtype=int)
    ConvertToNumpyArray(bit_vector, bits)
    return bits

train_fp_arrs = np.array([morgan_fp_array(smi) for smi in train_extended['Canonical_Smiles']])
test_fp_arrs = np.array([morgan_fp_array(smi) for smi in test_extended['Canonical_Smiles']])

# Per-bit mean and std, computed on the training fingerprints only.
fp_global_mean = train_fp_arrs.mean(axis=0)
fp_global_std = train_fp_arrs.std(axis=0) + 1e-8  # avoid division by zero


# Per-sample summaries of the fingerprint after standardising each bit with
# the training statistics above.
def sample_global_std(arr):
    """Std of one fingerprint's standardised bits."""
    standardized = (arr - fp_global_mean) / fp_global_std
    return np.std(standardized)


def sample_global_mean(arr):
    """Mean of one fingerprint's standardised bits."""
    standardized = (arr - fp_global_mean) / fp_global_std
    return np.mean(standardized)


for frame, fp_arrs in ((train_extended, train_fp_arrs), (test_extended, test_fp_arrs)):
    frame['fp_std'] = [sample_global_std(fp) for fp in fp_arrs]
    frame['fp_mean'] = [sample_global_mean(fp) for fp in fp_arrs]

In [None]:
from rdkit.Chem import AllChem, rdMolDescriptors

def get_fsp3(smiles):
    """Return ``rdMolDescriptors.CalcFractionCSP3`` for a SMILES, or 0 if it does not parse."""
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return 0
    return rdMolDescriptors.CalcFractionCSP3(mol)

def get_aromatic_ratio(smiles):
    """Return aromatic carbons divided by (all carbons + 1e-5).

    Returns 0 when the SMILES does not parse. The 1e-5 in the denominator
    avoids a division by zero for molecules without carbon.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return 0
    carbons = [atom for atom in mol.GetAtoms() if atom.GetSymbol() == 'C']
    aromatic_carbons = sum(1 for atom in carbons if atom.GetIsAromatic())
    return aromatic_carbons / (len(carbons) + 1e-5)

# Add both carbon-based ratios to the train and test feature tables.
for frame in (train_extended, test_extended):
    frame['fsp3'] = frame['Canonical_Smiles'].apply(get_fsp3)
    frame['aromatic_ratio'] = frame['Canonical_Smiles'].apply(get_aromatic_ratio)

In [None]:

def generate_fingerprints(smiles):
    """Return the radius-4, 2048-bit Morgan fingerprint of a SMILES as a list.

    On any failure (including an unparsable SMILES) the error is printed and a
    list of 2048 zeros is returned instead.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            raise ValueError(f"Invalid SMILES: {smiles}")
        bit_vector = AllChem.GetMorganFingerprintAsBitVect(mol, radius=4, nBits=2048)
        return list(bit_vector)
    except Exception as e:
        print(f"[Error] {e}")
        # fallback: all-zero vector
        return [0] * 2048


# One column per fingerprint bit, named FP_0 .. FP_2047, appended to the
# feature tables.
fingerprints = (
    train['Canonical_Smiles']
    .apply(generate_fingerprints)
    .apply(pd.Series)
    .add_prefix('FP_')
)
train_extended = pd.concat([train_extended, fingerprints], axis=1)

fingerprints_te = (
    test['Canonical_Smiles']
    .apply(generate_fingerprints)
    .apply(pd.Series)
    .add_prefix('FP_')
)
test_extended = pd.concat([test_extended, fingerprints_te], axis=1)


In [None]:

def safe_mol(smiles):
    """Parse a SMILES string, returning None instead of raising.

    Args:
        smiles: SMILES text handed to ``Chem.MolFromSmiles``.

    Returns:
        Whatever ``Chem.MolFromSmiles`` returns, or None when it raises.
    """
    try:
        return Chem.MolFromSmiles(smiles)
    except Exception:
        # `Exception` rather than a bare `except` so that a kernel interrupt
        # is not turned into a silent None.
        return None

# Text-based counters: each one counts literal, non-overlapping substring
# occurrences in the SMILES text. None of them parses the molecule.

def count_ester(smiles):
    """Occurrences of '-C(=O)O-' plus occurrences of '-COO-'."""
    return sum(smiles.count(pattern) for pattern in ('-C(=O)O-', '-COO-'))


def count_amide(smiles):
    """Occurrences of '-C(=O)N-'."""
    return smiles.count('-C(=O)N-')


def count_carboxyl(smiles):
    """Occurrences of '-C(=O)O'."""
    return smiles.count('-C(=O)O')


def count_hydroxyl(smiles):
    """Occurrences of '-OH'."""
    return smiles.count('-OH')


def count_alkene(smiles):
    """Occurrences of '=C'."""
    return smiles.count('=C')


def count_alkyne(smiles):
    """Occurrences of '#C'."""
    return smiles.count('#C')


def count_aromatic(smiles):
    """Occurrences of the letter 'c' after lower-casing the whole string."""
    lowered = smiles.lower()
    return lowered.count('c')


def count_ring(smiles):
    """Total number of digit characters '1' through '9' in the string."""
    return sum(smiles.count(digit) for digit in '123456789')


def count_halides(smiles):
    """Occurrences of 'F', 'Cl', 'Br' and 'I', summed."""
    return sum(smiles.count(symbol) for symbol in ('F', 'Cl', 'Br', 'I'))


def count_nitro(smiles):
    """Occurrences of '-NO2' plus occurrences of 'N(=O)=O'."""
    return sum(smiles.count(pattern) for pattern in ('-NO2', 'N(=O)=O'))


def count_sulfonyl(smiles):
    """Occurrences of 'SO2' plus occurrences of 'S(=O)(=O)'."""
    return sum(smiles.count(pattern) for pattern in ('SO2', 'S(=O)(=O)'))


def count_phosphate(smiles):
    """Occurrences of 'PO4' plus occurrences of 'P(=O)(O)(O)O'."""
    return sum(smiles.count(pattern) for pattern in ('PO4', 'P(=O)(O)(O)O'))


def count_ammonium(smiles):
    """Occurrences of 'N+'."""
    return smiles.count('N+')

def _count_literal_patterns(smiles, patterns):
    """Sum of literal, non-overlapping substring counts of each pattern."""
    total = 0
    for pattern in patterns:
        total += smiles.count(pattern)
    return total


def count_cyp_substrates(smiles):
    """Literal substring hits for four fixed pattern strings.

    The patterns are matched as plain text: '[N,O]S(=O)(=O)' only matches that
    exact character sequence.
    """
    cyp_patterns = ('[N,O]S(=O)(=O)', 'N[C@@H](C)C(=O)O', 'NC(=O)C(N)N', 'n1ccccc1')
    return _count_literal_patterns(smiles, cyp_patterns)


def count_pg_pgp_substrates(smiles):
    """Literal substring hits for 'NC(=O)N', 'n1cncn1' and 'N[C@@H](C)CO'."""
    pgp_patterns = ('NC(=O)N', 'n1cncn1', 'N[C@@H](C)CO')
    return _count_literal_patterns(smiles, pgp_patterns)


def count_functional_groups(smiles):
    """Literal substring hits for 'C(=O)O', 'C(=O)N' and 'N(=O)=O'."""
    functional_patterns = ('C(=O)O', 'C(=O)N', 'N(=O)=O')
    return _count_literal_patterns(smiles, functional_patterns)


def count_ugt_substrates(smiles):
    """Literal substring hits for four fixed carboxyl-like pattern strings."""
    ugt_patterns = ('O=C(O)', 'O=C(C)O', 'Nc1ccc(cc1)C(=O)O', 'c1ccccc1C(=O)O')
    return _count_literal_patterns(smiles, ugt_patterns)


def count_gst_substrates(smiles):
    """Literal substring hits for 'S=C', 'N=C' and 'O=C'."""
    gst_patterns = ('S=C', 'N=C', 'O=C')
    return _count_literal_patterns(smiles, gst_patterns)

# More text-based counters: literal, non-overlapping substring counts on the
# SMILES text.

def count_alcohol(smiles):
    """Occurrences of 'O-H' plus occurrences of 'OH'."""
    return sum(smiles.count(pattern) for pattern in ('O-H', 'OH'))


def count_ketone(smiles):
    """Occurrences of 'C=O'."""
    return smiles.count('C=O')


def count_aldehyde(smiles):
    """Occurrences of 'CHO'."""
    return smiles.count('CHO')


def count_amine(smiles):
    """Occurrences of 'NH2', 'NH-' and '-N<', summed."""
    return sum(smiles.count(pattern) for pattern in ('NH2', 'NH-', '-N<'))


def count_ether(smiles):
    """Occurrences of '-O-'."""
    return smiles.count('-O-')


def count_thiol(smiles):
    """Occurrences of '-SH'."""
    return smiles.count('-SH')


def count_thioether(smiles):
    """Occurrences of '-S-'."""
    return smiles.count('-S-')


def count_azo(smiles):
    """Occurrences of '-N=N-'."""
    return smiles.count('-N=N-')


def count_isocyanate(smiles):
    """Occurrences of '-N=C=O'."""
    return smiles.count('-N=C=O')


def count_thiocyanate(smiles):
    """Occurrences of '-N=C=S'."""
    return smiles.count('-N=C=S')


def count_cyano(smiles):
    """Occurrences of '-C#N'."""
    return smiles.count('-C#N')


def count_imide(smiles):
    """Occurrences of 'C=NC=N'."""
    return smiles.count('C=NC=N')


def count_azirine(smiles):
    """Occurrences of 'C=NN'."""
    return smiles.count('C=NN')


def count_hydrazine(smiles):
    """Occurrences of 'NN' (non-overlapping)."""
    return smiles.count('NN')

# RDKit-based structure counts
def count_atoms(smiles):
    """Number of atoms in the parsed molecule (0 if parsing fails)."""
    mol = safe_mol(smiles)
    if not mol:
        return 0
    return mol.GetNumAtoms()


def count_bonds(smiles):
    """Number of bonds in the parsed molecule (0 if parsing fails)."""
    mol = safe_mol(smiles)
    if not mol:
        return 0
    return mol.GetNumBonds()


def count_positive_atoms(smiles):
    """Number of atoms with a formal charge above zero (0 if parsing fails)."""
    mol = safe_mol(smiles)
    if not mol:
        return 0
    charges = [atom.GetFormalCharge() for atom in mol.GetAtoms()]
    return sum(1 for charge in charges if charge > 0)


def count_negative_atoms(smiles):
    """Number of atoms with a formal charge below zero (0 if parsing fails)."""
    mol = safe_mol(smiles)
    if not mol:
        return 0
    charges = [atom.GetFormalCharge() for atom in mol.GetAtoms()]
    return sum(1 for charge in charges if charge < 0)


def count_polar_atoms(smiles):
    """Number of N, O, S, P, F, Cl, Br or I atoms (0 if parsing fails)."""
    mol = safe_mol(smiles)
    if not mol:
        return 0
    polar_atoms = {'N', 'O', 'S', 'P', 'F', 'Cl', 'Br', 'I'}
    symbols = [atom.GetSymbol() for atom in mol.GetAtoms()]
    return sum(1 for symbol in symbols if symbol in polar_atoms)

# Output column -> counter applied to each SMILES. Columns are added to both
# tables in exactly this order.
feature_counters = {
    'num_alkene': count_alkene,
    'num_alkyne': count_alkyne,
    'num_aromatic': count_aromatic,
    'num_ring': count_ring,
    'num_halides': count_halides,
    'num_nitro': count_nitro,
    'num_sulfonyl': count_sulfonyl,
    'num_phosphate': count_phosphate,
    'num_ammonium': count_ammonium,
    'num_cyp_substrates': count_cyp_substrates,
    'num_pgp_substrates': count_pg_pgp_substrates,
    'num_functional_groups': count_functional_groups,
    'num_ugt_substrates': count_ugt_substrates,
    'num_gst_substrates': count_gst_substrates,
    'num_alcohol': count_alcohol,
    'num_ketone': count_ketone,
    'num_aldehyde': count_aldehyde,
    'num_amine': count_amine,
    'num_ether': count_ether,
    'num_thiol': count_thiol,
    'num_thioether': count_thioether,
    'num_azo': count_azo,
    'num_isocyanate': count_isocyanate,
    'num_thiocyanate': count_thiocyanate,
    'num_cyano': count_cyano,
    'num_imide': count_imide,
    'num_azirine': count_azirine,
    'num_hydrazine': count_hydrazine,
    'num_atoms': count_atoms,
    'num_bonds': count_bonds,
    'num_positive_atoms': count_positive_atoms,
    'num_negative_atoms': count_negative_atoms,
    'num_polar_atoms': count_polar_atoms,
}

for df in [train_extended, test_extended]:
    for column, counter in feature_counters.items():
        df[column] = df['Canonical_Smiles'].apply(counter)

train_extended.head()

In [None]:
def calculate_aromatic_proportion(smiles):
    """Fraction of atoms flagged aromatic (0 for unparsable or empty molecules)."""
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return 0
    total_atoms = mol.GetNumAtoms()
    if not total_atoms:
        return 0
    aromatic_count = sum(1 for atom in mol.GetAtoms() if atom.GetIsAromatic())
    return aromatic_count / total_atoms

def calculate_heavy_atom_count(smiles):
    """``GetNumHeavyAtoms()`` of the parsed molecule, or 0 if parsing fails."""
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return 0
    return mol.GetNumHeavyAtoms()


def calculate_formal_charge(smiles):
    """``Chem.rdmolops.GetFormalCharge`` of the parsed molecule, or 0 if parsing fails."""
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return 0
    return Chem.rdmolops.GetFormalCharge(mol)


def calculate_ring_count(smiles):
    """``GetRingInfo().NumRings()`` of the parsed molecule, or 0 if parsing fails."""
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return 0
    ring_info = mol.GetRingInfo()
    return ring_info.NumRings()

def calculate_stereocenters(smiles):
    """Number of chiral centers, unassigned ones included (0 if parsing fails)."""
    mol = Chem.MolFromSmiles(smiles)
    return len(Chem.FindMolChiralCenters(mol, includeUnassigned=True)) if mol else 0

def count_NandO(smiles):
    """Number of nitrogen plus oxygen atoms (0 if parsing fails)."""
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return 0
    return sum(1 for atom in mol.GetAtoms() if atom.GetSymbol() in ('N', 'O'))

def calculate_rule_of_five_satisfactions(smiles):
    """Count how many of four rule-of-five thresholds the molecule reaches or exceeds.

    Despite the name, the returned number grows with each threshold that is
    crossed: MolWt >= 500, MolLogP >= 5, NumHDonors > 5, NumHAcceptors > 10.
    Returns 0 when the SMILES does not parse.
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return 0

    threshold_hits = (
        Descriptors.MolWt(mol) >= 500,
        Descriptors.MolLogP(mol) >= 5,
        Descriptors.NumHDonors(mol) > 5,
        Descriptors.NumHAcceptors(mol) > 10,
    )
    # Each True counts as 1.
    return sum(threshold_hits)

# Output column -> function applied to each SMILES. 'num_ring' overwrites any
# column of that name already present in the tables.
structure_features = {
    'aromatic_proportion': calculate_aromatic_proportion,
    'heavy_atom_count': calculate_heavy_atom_count,
    'formal_charge': calculate_formal_charge,
    'num_ring': calculate_ring_count,
    'stereocenters': calculate_stereocenters,
    'NandO': count_NandO,
    'rule_of_five_satisfactions': calculate_rule_of_five_satisfactions,
}

for frame in (train_extended, test_extended):
    for column, compute in structure_features.items():
        frame[column] = frame['Canonical_Smiles'].apply(compute)

In [None]:

# The token lists were only needed to build the one-hot token columns.
train_extended = train_extended.drop(columns=['SMILES_tokens'])
test_extended = test_extended.drop(columns=['SMILES_tokens'])

In [None]:
# Columns holding a single distinct value in the training table.
single_value_cols = [col for col in train_extended.columns if train_extended[col].nunique() == 1]

print("train_extended에서 유니크값이 1개인 컬럼들:", single_value_cols)

# Record the shapes BEFORE dropping so the report below shows a real
# before/after comparison.
train_shape_before = train_extended.shape
test_shape_before = test_extended.shape

# The same columns are removed from both tables; the choice is based on the
# training table only.
train_extended = train_extended.drop(columns=single_value_cols)
test_extended = test_extended.drop(columns=single_value_cols)

print("train_extended shape before:", train_shape_before, "after:", train_extended.shape)
print("Test_extended shape before:", test_shape_before, "after:", test_extended.shape)

In [None]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.DataStructs import TanimotoSimilarity, BulkTanimotoSimilarity

def smiles_to_fp(smiles, radius=4, nBits=2048):
    """Morgan bit-vector fingerprint of a SMILES, or None if it does not parse."""
    mol = Chem.MolFromSmiles(smiles)
    return None if mol is None else AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)


def _fp_or_fallback(fp):
    """Return `fp`, or a fresh fingerprint of the molecule 'C' when `fp` is None."""
    if fp is not None:
        return fp
    return AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles('C'), 4, 2048)


train_extended_fps = [smiles_to_fp(smi) for smi in train_extended['Canonical_Smiles']]
test_extended_fps = [smiles_to_fp(smi) for smi in test_extended['Canonical_Smiles']]

# Replace missing fingerprints so every row has one.
train_extended_fps = [_fp_or_fallback(fp) for fp in train_extended_fps]
test_extended_fps = [_fp_or_fallback(fp) for fp in test_extended_fps]

def avg_similarity(fp, reference_fps):
    """Mean Tanimoto similarity between `fp` and every fingerprint in `reference_fps`."""
    return np.mean(BulkTanimotoSimilarity(fp, reference_fps))


# Both tables are compared against the training fingerprints. For the training
# rows the reference list includes the row's own fingerprint.
for frame, fps in ((train_extended, train_extended_fps), (test_extended, test_extended_fps)):
    frame['avg_similarity'] = [avg_similarity(fp, train_extended_fps) for fp in fps]


In [None]:
# Bare expression: shows the current train_extended table as the cell output.
train_extended

In [None]:
# Remove the 'MW' and 'TPSA' columns from both tables.
columns_to_remove = ['MW', 'TPSA']
train_extended = train_extended.drop(columns=columns_to_remove)
test_extended = test_extended.drop(columns=columns_to_remove)

In [None]:
from make_descriptos2 import CYP3A4EssentialDescriptors 
# Plain Python lists of SMILES for the external descriptor generator.
train_smiles = list(train_extended['Canonical_Smiles'])
test_smiles = list(test_extended['Canonical_Smiles'])

generator = CYP3A4EssentialDescriptors()

print("Generating descriptors for TRAIN set...")
train_descriptors_df = generator.generate_essential_descriptors(train_smiles)
train_descriptors_shape = train_descriptors_df.shape
print("Train descriptors shape:", train_descriptors_shape)

print("\nGenerating descriptors for TEST set...")
test_descriptors_df = generator.generate_essential_descriptors(test_smiles)
test_descriptors_shape = test_descriptors_df.shape
print("Test descriptors shape:", test_descriptors_shape)

In [None]:
# Drop the generator's own 'SMILES' column when it is present, then append the
# remaining descriptor columns to the feature tables.
train_descriptors_df = train_descriptors_df.drop(columns=['SMILES'], errors='ignore')
test_descriptors_df = test_descriptors_df.drop(columns=['SMILES'], errors='ignore')

train_extended = pd.concat((train_extended, train_descriptors_df), axis='columns')
test_extended = pd.concat((test_extended, test_descriptors_df), axis='columns')

In [None]:
# Write the finished feature tables (without the index).
for frame, csv_path in ((train_extended, 'train_final.csv'), (test_extended, 'test_final.csv')):
    frame.to_csv(csv_path, index=False)
