In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
import pandas as pd
!pip install rdkit-pypi
!pip install rdkit
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from collections import Counter

df = pd.read_csv('/content/drive/MyDrive/newdrug/starting_data/train.csv')
smiles_list = df['Canonical_Smiles'].tolist()
y = df['Inhibition']



In [7]:

scaffold_smarts = {
    'imidazole':                  'n1cc[nH]c1',
    '1,2,4-triazole':             '[n]1cncn1',
    'pyridine':                   'c1ccncc1',
    'benzene':                    'c1ccccc1',
    'furan':                      'o1cccc1',
    'pyrrole':                    '[nH]1cccc1',
    'tertiary_amine':             '[NX3;H0;!$(NC=O)]',
    # 개별 inhibitor
    'ketoconazole':               'CC(C)OC1=CC=CC=C1C(=O)N2CCN(CC2)C3=NC(=CN3)C4=CC=C(Cl)C=C4Cl',
    'itraconazole':               'CC(C)N1CCN(Cc2nc(c[nH]2)C3=CC=C(Cl)C=C3Cl)CC1',
    'fluconazole':                'C[C@H](O)C1=NC=NN1C2=CC=CC=C2F',
    'posaconazole':               'COC1=CC=CC(=C1)OCC2CN(CCO2)C(=O)C3=CC(=C(C=C3F)F)C(F)(F)F',
    'voriconazole':               'C[C@H](O)C1=NC=NN1C2=CC(=C(C=C2F)F)N3CCOCC3',
    'clarithromycin':             'CC[C@H]1O[C@@H]2C[C@H](O)C(O[C@@H]2C[C@H]([C@H]1OC(=O)C)O)OC(=O)C',
    'erythromycin':               'CC[C@H]1O[C@@H]2C[C@H](O)C(O[C@H]2C[C@H]([C@H]1OC(=O)C)O)OC',
    'ritonavir':                  'CC(C)(C)C(=O)N[C@H](Cc1ccccc1)C(=O)N(CC2=CC=CC=C2)C(C)(C)CO',
    'piperine':                   'COC1=C(C=CC=C1)CC(=O)N2CCCCC2',
    'bergapten':                  'COC1=CC2=C(C=C1)OCO2',
    'bergamottin':                'COC1=CC2=C(C=C1)O[C@H]2CC=C(C)C(=O)O',
    'diltiazem':                  'CCOC(=O)C1=CC(=C(C=C1)COC)SCCN2CCOCC2',
    'verapamil':                  'CC[C@H](O)C1=CC=C(C=C1)CCN(CC)CC',
    'diallyl_disulfide':          'C=CCSSCC=C',
    'caffeic_acid':               'c1ccc(cc1O)C=CC(=O)O',
    'quercetin':                  'c1c(c2c(c(=O)c3c(c(c(c3O2)O)O)O)O)c(cc1O)O',
    'N-p-coumaroyltyramine':      'c1ccc(cc1)C=CC(=O)NCCc2ccc(O)cc2',
    'N-feruloyltyramine':         'COC1=CC=C(C=C1O)C=CC(=O)NCCc2ccc(O)cc2',
    'ciprofloxacin':              'C1CN(CC2=CC(=C(C=C2F)F)C(=O)C3=CC=CC=C3N1)C4=O',
    'fluvoxamine':                'COC1=CC=C(C=C1)OCCNCCO',
    'nefazodone':                 'CCN(CC)CCOC1=CC=C(C=C1)N2CCNCC2',
    'cimetidine':                 'CS(C(=N)NCCCN)NC',
    'amiodarone':                 'CC(C)C(O)CC1=CC(=C(C=C1I)O)O',
    'aprepitant':                 'C[C@H]1CN(CCN1C2=CC=CC=C2)C(=O)N3CCC[C@H]3C',
    # 구조적 특징
    'macrocyclic_lactone':        'C(=O)OCCCCCCCCCCC1',
    'desosamine_sugar':           'OC[C@H]1O[C@H](C)[C@H](O)[C@H](N(C)C)[C@H]1O',
    'thiazole':                   'n1ccs1',
    'quinolone':                  'O=C2Nc1ccccc1C(=O)N2',
    'hydroxyethylene':            'C(CO)C(O)C',
    'furanocoumarin':             'COC1=CC2=C(C=C1)OCO2',
    'epoxide':                    'C1OC1',
    'nitroso':                    '[NX1]=O',
    'michael_acceptor':           'C=CC=O',
    'benzothiazepine':            'c1cc2C(=C(C1)SCCN2)COC(=O)C',
    'phenylalkylamine':           'c1ccc(cc1)CCNCCN',
    'phenylpiperazine':           'c1ccc(cc1)N2CCNCC2',
    'halogenated_phenoxy':        'c1cc(cc(c1)F)OCC',
    'pah_skeleton':               'c1ccc2ccccc2c1',
    'terminal_acetylene':         '[CX2]#C',
    'catechol':                   'c1c(O)ccc(O)c1',
    'sulfonamide':                'S(=O)(=O)N',
    'urea':                       'NC(=O)N',
    'carbamate':                  'NC(=O)O',
    'tert_butyl':                 'C(C)(C)C',
    'trifluoromethyl':            'C(F)(F)F',
}

# Mol 객체로 변환
patterns = {}
for name, sm in scaffold_smarts.items():
    mol = Chem.MolFromSmarts(sm)
    if mol is None:
        print(f"Warning: Could not parse SMARTS for '{name}': {sm}")
    patterns[name] = mol

records = []
for idx, smi in enumerate(smiles_list):
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        continue
    hit_dict = {'ID': df.loc[idx, 'ID']}
    for name, patt in patterns.items():
        if patt is not None:
            hit_dict[name] = int(mol.HasSubstructMatch(patt))
        else:
            hit_dict[name] = 0
    records.append(hit_dict)

hits_df = pd.DataFrame(records)
hits_df.to_csv('scaffold_hits.csv', index=False)

scaffolds = []
for smi in smiles_list:
    mol = Chem.MolFromSmiles(smi)
    if mol is None: continue
    core = MurckoScaffold.GetScaffoldForMol(mol)
    core_smiles = Chem.MolToSmiles(core)
    scaffolds.append(core_smiles)

scaffold_counts = Counter(scaffolds)

pd.DataFrame(scaffold_counts.items(), columns=['scaffold', 'count']) \
  .sort_values('count', ascending=False) \
  .to_csv('murcko_scaffold_counts.csv', index=False)

[18:52:56] SMARTS Parse Error: unclosed ring for input: 'C1CN(CC2=CC(=C(C=C2F)F)C(=O)C3=CC=CC=C3N1)C4=O'
[18:52:56] SMARTS Parse Error: unclosed ring for input: 'C(=O)OCCCCCCCCCCC1'


  25  c1ccccc1
  11  c1ccc2[nH]ccc2c1
   9  O=c1cc[nH]c2ccccc12
   8  c1ccncc1
   6  c1ccc2ncccc2c1
   6  c1ccc(Cc2ccccc2)cc1
   5  O=C(Nc1ccccc1)c1ccccc1
   4  
   4  O=c1ccc2ccccc2o1
   4  c1ccc(COc2ccccc2)cc1
   4  c1ccc(-c2ccccc2)cc1
   3  O=C(Nc1ccccc1)c1cnccn1
   3  c1ccc(-c2ccccn2)cc1
   3  c1ccc2c(c1)[nH]c1ncncc12
   3  C1CCNCC1
   3  c1ccc(Nc2ccncc2)cc1
   3  c1ccc2ccccc2c1
   3  c1ccc(C2CCCCC2)cc1
   3  c1cncnc1
   3  c1ccc(Nc2ccccc2)cc1


In [18]:
sh = pd.read_csv('./scaffold_hits.csv')
pattern_cols = [c for c in sh.columns if c not in ['ID']]

ones_count = sh[pattern_cols].sum()
zero_cols = ones_count[ones_count == 0].index.tolist()

sh_filtered = sh.drop(columns=zero_cols)
sh_filtered.to_csv('input_candidates.csv', index=False)

print("각 패턴별 1의 개수:")
print(ones_count)

display(sh_filtered.head())

각 패턴별 1의 개수:
imidazole                  47
1,2,4-triazole            157
pyridine                  376
benzene                  1190
furan                      57
pyrrole                    55
tertiary_amine            437
ketoconazole                0
itraconazole                0
fluconazole                 0
posaconazole                0
voriconazole                0
clarithromycin              0
erythromycin                0
ritonavir                   0
piperine                    0
bergapten                   0
bergamottin                 0
diltiazem                   0
verapamil                   0
diallyl_disulfide           0
caffeic_acid                2
quercetin                   0
N-p-coumaroyltyramine       0
N-feruloyltyramine          0
ciprofloxacin               0
fluvoxamine                 0
nefazodone                  0
cimetidine                  0
amiodarone                  0
aprepitant                  0
macrocyclic_lactone         0
desosamine_sugar           

Unnamed: 0,ID,imidazole,"1,2,4-triazole",pyridine,benzene,furan,pyrrole,tertiary_amine,caffeic_acid,hydroxyethylene,...,phenylpiperazine,halogenated_phenoxy,pah_skeleton,terminal_acetylene,catechol,sulfonamide,urea,carbamate,tert_butyl,trifluoromethyl
0,TRAIN_0000,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,TRAIN_0001,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,TRAIN_0002,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
3,TRAIN_0003,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,TRAIN_0004,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
