In [None]:
import pandas as pd
import rdkit
from functools import cmp_to_key

from rdkit import Chem
import rdkit.Chem.Descriptors

# Donor : 01-FZA-10
# SMILES : c1c(c(cc(c1OCCCCCCCCCCCC)-c2ccc(s2)C)OCCCCCCCCCCCC)-c3ccc(s3)-c4cc5c(c6c(c(c5s4)-c7cc(c(s7)CC(CCCC)CC)F)cc(s6)C)-c8cc(c(s8)CC(CCCC)CC)F
# Expected encoding, three values (bb, sc1, sc2) per scaffold:
# Scaffold1 = 1 1 1 | Scaffold2 = 2 0 0 | Scaffold3 = 10 21 21 | Scaffold4 = 2 0 0

# Comparison rule used for the custom sort.
def order_by_weight(a, b):
    """Compare two fragment records by exact molecular weight, heaviest first.

    Args:
        a: Mapping whose 'smiles' entry is the SMILES string of a fragment.
        b: Mapping whose 'smiles' entry is the SMILES string of a fragment.

    Returns:
        -1 when ``a`` is heavier than ``b``, 0 when both weights are equal,
        and 1 otherwise, so sorting with this comparator yields descending
        weight order.
    """
    # Parse and weigh both records, ``a`` first, exactly once each.
    weight_a, weight_b = (
        rdkit.Chem.Descriptors.ExactMolWt(rdkit.Chem.MolFromSmiles(record['smiles']))
        for record in (a, b)
    )

    if weight_a == weight_b:
        return 0
    return -1 if weight_a > weight_b else 1

# Load the classification fragments.
scaffold_df = pd.read_csv('scaffold_mod.csv')

scaffold1_fragment_list_not_sort = []
scaffold2_fragment_list_not_sort = []
scaffold3_fragment_list_not_sort = []
scaffold4_fragment_list_not_sort = []

# Columns copied from each CSV row into a fragment record.
_fragment_columns = ('number', 'smiles', 'bb', 'sc1', 'sc2')

# 'Part1' rows feed scaffold 1 and 'Part3' rows feed scaffold 3; a row with
# any other Group value is recorded for both scaffold 2 and scaffold 4.
_buckets_by_group = {
    'Part1': (scaffold1_fragment_list_not_sort,),
    'Part3': (scaffold3_fragment_list_not_sort,),
}
_fallback_buckets = (
    scaffold2_fragment_list_not_sort,
    scaffold4_fragment_list_not_sort,
)

for idx, row in scaffold_df.iterrows():
    for bucket in _buckets_by_group.get(row['Group'], _fallback_buckets):
        # Build a fresh dict per bucket so scaffold 2 and scaffold 4 never
        # share one record object.
        bucket.append({column: row[column] for column in _fragment_columns})

# Order every fragment list with the order_by_weight comparator.
_by_weight = cmp_to_key(order_by_weight)
scaffold1_fragment_list = sorted(scaffold1_fragment_list_not_sort, key=_by_weight)
scaffold2_fragment_list = sorted(scaffold2_fragment_list_not_sort, key=_by_weight)
scaffold3_fragment_list = sorted(scaffold3_fragment_list_not_sort, key=_by_weight)
scaffold4_fragment_list = sorted(scaffold4_fragment_list_not_sort, key=_by_weight)

def _assign_scaffold(search_mol, strip_mol, fragment_list, number_key, codes):
    """Record the first fragment of ``fragment_list`` found in ``search_mol``.

    Args:
        search_mol: Molecule in which the substructure search is run.
        strip_mol: Molecule from which the matched fragment is deleted.
        fragment_list: Fragment records (mappings with 'number', 'smiles',
            'bb', 'sc1' and 'sc2' entries), tried in list order.
        number_key: Key under which the matched fragment's 'number' is stored.
        codes: List extended in place with the stringified 'bb', 'sc1' and
            'sc2' values of the matched fragment.

    Returns:
        A ``(scaffold, remaining_mol)`` tuple. ``scaffold`` holds
        ``number_key`` and 'mol_idx_list' (the atom-index matches) and is
        empty when nothing matched. ``remaining_mol`` is ``strip_mol`` with
        the matched fragment deleted, or ``strip_mol`` itself when nothing
        matched, so later stages always have a molecule to work with.
    """
    scaffold = {}
    for fragment in fragment_list:
        fragment_mol = Chem.MolFromSmiles(fragment['smiles'])
        if fragment_mol is None:
            # MolFromSmiles returns None for SMILES it cannot parse; such a
            # fragment can never match, so skip it instead of crashing.
            continue

        # One search is enough: an empty result means "no match".
        matches = search_mol.GetSubstructMatches(fragment_mol)
        if not matches:
            continue

        scaffold[number_key] = fragment['number']
        scaffold['mol_idx_list'] = matches
        codes.extend(str(fragment[field]) for field in ('bb', 'sc1', 'sc2'))
        return scaffold, Chem.DeleteSubstructs(strip_mol, fragment_mol)

    return scaffold, strip_mol


# 01-FZA-10
mol = Chem.MolFromSmiles("c1c(c(cc(c1OCCCCCCCCCCCC)-c2ccc(s2)C)OCCCCCCCCCCCC)-c3ccc(s3)-c4cc5c(c6c(c(c5s4)-c7cc(c(s7)CC(CCCC)CC)F)cc(s6)C)-c8cc(c(s8)CC(CCCC)CC)F")

# Stringified bb/sc1/sc2 codes of every matched scaffold, in scaffold order.
bin_codes = []

# Find Scaffold1
scaffold1, mol_without_sc1 = _assign_scaffold(
    mol, mol, scaffold1_fragment_list, 'sc1_number', bin_codes)

# Find Scaffold2
scaffold2, mol_without_sc2 = _assign_scaffold(
    mol_without_sc1, mol_without_sc1, scaffold2_fragment_list, 'sc2_number', bin_codes)

# NOTE(review): Scaffold3 and Scaffold4 are searched in mol_without_sc1, not
# in the progressively stripped molecule; only the deletion uses the previous
# stage's result. Kept exactly as before so existing output does not change --
# confirm that this is intended.

# Find Scaffold3
scaffold3, mol_without_sc3 = _assign_scaffold(
    mol_without_sc1, mol_without_sc2, scaffold3_fragment_list, 'sc3_number', bin_codes)

# Find Scaffold4
scaffold4, mol_without_sc4 = _assign_scaffold(
    mol_without_sc1, mol_without_sc3, scaffold4_fragment_list, 'sc4_number', bin_codes)
if 'sc4_number' in scaffold4:
    # Legacy alias: this value used to be stored under the copy-pasted key
    # 'sc3_number'; keep it so any code reading the old key still works.
    scaffold4['sc3_number'] = scaffold4['sc4_number']

# Joining avoids a dangling ', ' when a later scaffold has no match.
bin_print = ', '.join(bin_codes)

bin_print

'1, 1, 1, 2, 0, 0, 10, 21, 21, 2, 0, 0'