In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl (36.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m36.2/36.2 MB[0m [31m66.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.9.1


In [None]:
import os

import numpy as np
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import rdFingerprintGenerator
from rdkit.ML.Cluster import Butina
from tqdm import tqdm

# --- Settings and file mapping (this part is unchanged) ---
# Tab-separated, header-less score table read in step 1.
FP_SCORE_FILE = "selected_ligands3.smi"
# CSV read in step 1; its ID and 'pharma_sig' columns are merged onto the score table.
PHARMA_SIG_FILE = "pharma_sig_output_parallel.csv"
# Destination of the diversity picks, written as SMILES<TAB>ID rows without a header.
OUTPUT_DIV_FILE = "diversity_3000_smi_id_only.smi"

# Column names used for the score table and as the merge key.
ID_COLUMN = 'ID'
SMILES_COLUMN = 'smi'
# Score column used to rank molecules in step 2.
SIMILARITY_COLUMN = 'Score_FP_main'
# Field separator for the .smi input and output files.
DELIMITER = '\t'
# Number of top-ranked molecules set aside before clustering (step 2).
TARGET_SIM = 7000
# Number of molecules to pick from the clustered remainder (step 3).
TARGET_DIV = 3000
CLUSTER_THRESHOLD = 0.55 # passed to Butina.ClusterData as distThresh; value kept as-is

# --------------------------------
# 1. Load the two input tables and merge them into the candidate pool
# --------------------------------
print("1. Îç∞Ïù¥ÌÑ∞ Î°úÎìú Î∞è Î≥ëÌï© (ÎÇòÎ®∏ÏßÄ Î∂ÑÏûê ÌíÄ ÌôïÎ≥¥)")

# Column layout assigned to the header-less score table; only ID, SMILES and the
# similarity score are carried into the merge below.
fp_columns = [SMILES_COLUMN, ID_COLUMN, 'Mol_Object_Garbage', SIMILARITY_COLUMN, 'Score_2', 'Score_3', 'Score_4']
df_fp = pd.read_csv(FP_SCORE_FILE, sep=DELIMITER, header=None, names=fp_columns)
# Values that cannot be parsed as numbers become NaN and are dropped after the merge.
df_fp[SIMILARITY_COLUMN] = pd.to_numeric(df_fp[SIMILARITY_COLUMN], errors='coerce')

df_pharma = pd.read_csv(PHARMA_SIG_FILE)
# Discard rows whose ID cell is the literal column name (presumably repeated header lines).
keep_rows = df_pharma[ID_COLUMN] != ID_COLUMN
df_pharma_clean = df_pharma[keep_rows].copy()

# Inner join on ID, then remove rows without a usable similarity score.
df_final = (
    df_fp[[ID_COLUMN, SMILES_COLUMN, SIMILARITY_COLUMN]]
    .merge(df_pharma_clean[[ID_COLUMN, 'pharma_sig']], on=ID_COLUMN, how='inner')
    .dropna(subset=[SIMILARITY_COLUMN])
)

# --------------------------------
# 2. Set aside the TARGET_SIM best-scoring molecules; the rest is the Butina input
# --------------------------------
df_sorted = df_final.sort_values(SIMILARITY_COLUMN, ascending=False)
df_similarity_7k = df_sorted.iloc[:TARGET_SIM]

# Exclusion is done by ID, so every row sharing an ID with a top-ranked row is removed.
in_top_set = df_final[ID_COLUMN].isin(df_similarity_7k[ID_COLUMN])
df_remaining = df_final.loc[~in_top_set].copy()
print(f"   -> Butina ÌÅ¥Îü¨Ïä§ÌÑ∞ÎßÅ ÎåÄÏÉÅ ÎÇòÎ®∏ÏßÄ Î∂ÑÏûê ÌíÄ: {len(df_remaining)}Í∞ú")

# --------------------------------
# 3. Butina clustering of the remaining pool (basis for the diversity picks)
# --------------------------------
print("3. Butina ÌÅ¥Îü¨Ïä§ÌÑ∞ÎßÅ (3,000Í∞ú Îã§ÏñëÏÑ±) ÏãúÏûë...")
# Parse every SMILES; rows whose 'mol' is null after parsing are filtered out on the next line.
df_remaining['mol'] = df_remaining[SMILES_COLUMN].apply(Chem.MolFromSmiles)
df_butina_target = df_remaining[df_remaining['mol'].notnull()].reset_index(drop=True)

# Morgan fingerprints (radius 2, 2048 bits) through the generator API.
# The legacy AllChem.GetMorganFingerprintAsBitVect call is deprecated in current RDKit.
morgan_generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)
fps = [morgan_generator.GetFingerprint(m)
       for m in tqdm(df_butina_target['mol'], desc="Butina FPs")]

# Tanimoto distances in the flat layout Butina.ClusterData reads when isDistData=True:
#     for i in range(nPts): for j in range(i): d(i, j)
# i.e. row i holds the distances to all EARLIER points. Building the list from
# fps[i] vs fps[i+1:] gives the same length but pairs distances with the wrong molecules.
dists = []
n = len(fps)
for i in tqdm(range(1, n), desc="Butina Distances"):
    sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
    dists.extend(1 - s for s in sims)

# The threshold keyword of ClusterData is distThresh (not cutoff).
clusters = Butina.ClusterData(dists, nPts=n, distThresh=CLUSTER_THRESHOLD, isDistData=True)
print(f"   -> ÏÉùÏÑ±Îêú ÌÅ¥Îü¨Ïä§ÌÑ∞ Ïàò: {len(clusters)}Í∞ú")

# Distribute the TARGET_DIV picks over the clusters in proportion to cluster size.
RANDOM_SEED = 42  # fixed seed so that re-running the notebook reproduces the same picks


def _allocate_cluster_quotas(sizes, target):
    """Split a pick budget across clusters proportionally to their sizes.

    Args:
        sizes: Number of members in each cluster.
        target: Total number of picks wanted.

    Returns:
        A list of per-cluster pick counts aligned with ``sizes``. The counts sum to
        ``min(target, sum(sizes))`` (0 for a non-positive target), never exceed the
        cluster size, and give every non-empty cluster at least one pick whenever
        the budget is large enough to do so.
    """
    budget = max(0, min(target, sum(sizes)))
    quotas = [1 if size > 0 else 0 for size in sizes]
    if sum(quotas) > budget:
        # The budget cannot cover every cluster: one pick each for the largest ones only.
        order_by_size = sorted(range(len(sizes)), key=lambda k: sizes[k], reverse=True)
        chosen = set(order_by_size[:budget])
        return [1 if k in chosen else 0 for k in range(len(sizes))]

    spare = [size - quota for size, quota in zip(sizes, quotas)]
    remaining = budget - sum(quotas)
    if remaining > 0:
        total_spare = sum(spare)
        # Integer largest-remainder split of the remaining picks over the spare capacity,
        # so no float rounding is involved and no cluster exceeds its own size.
        shares = [divmod(remaining * room, total_spare) for room in spare]
        for k, (whole, _) in enumerate(shares):
            quotas[k] += whole
        leftover = remaining - sum(whole for whole, _ in shares)
        by_fraction = sorted(range(len(sizes)), key=lambda k: shares[k][1], reverse=True)
        for k in by_fraction[:leftover]:
            quotas[k] += 1
    return quotas


cluster_sizes = [len(c) for c in clusters]
total_size = sum(cluster_sizes)
# Quotas sum exactly to min(TARGET_DIV, total_size). The earlier max(1, int(...)) rule
# could overshoot, and the overshoot was cut from the tail of the cluster list,
# leaving those clusters without any representative.
cluster_select_counts = _allocate_cluster_quotas(cluster_sizes, TARGET_DIV)

rng = np.random.default_rng(RANDOM_SEED)
selected_butina_indices = []
for clust, take_n in zip(clusters, cluster_select_counts):
    # Random members of the cluster, drawn with the seeded generator.
    shuffled_members = rng.permutation(list(clust)).tolist()
    selected_butina_indices.extend(shuffled_members[:take_n])

# Safeguard only: de-duplicate while keeping order and cap at TARGET_DIV.
selected_butina_indices = list(dict.fromkeys(selected_butina_indices))[:TARGET_DIV]
df_diversity_3k = df_butina_target.iloc[selected_butina_indices].copy()
print(f"   -> Butina Îã§ÏñëÏÑ± ÏµúÏ¢Ö ÏÑ†ÌÉù: {len(df_diversity_3k)}Í∞ú")

# --------------------------------
# 4. Write only the diversity picks, as SMILES<TAB>ID rows without header or index
# --------------------------------
smi_id_table = df_diversity_3k.loc[:, [SMILES_COLUMN, ID_COLUMN]]
smi_id_table.to_csv(OUTPUT_DIV_FILE, sep=DELIMITER, header=False, index=False)

separator = "-" * 30
print(separator)
print(f"‚≠ê ÏµúÏ¢Ö 3,000Í∞ú Îã§ÏñëÏÑ± ÌõÑÎ≥¥Í∞Ä SMI ID ÌòïÌÉúÎ°ú '{OUTPUT_DIV_FILE}' ÌååÏùºÏóê Ï†ÄÏû• ÏôÑÎ£å! üéâ")

1. Îç∞Ïù¥ÌÑ∞ Î°úÎìú Î∞è Î≥ëÌï© (ÎÇòÎ®∏ÏßÄ Î∂ÑÏûê ÌíÄ ÌôïÎ≥¥)
   -> Butina ÌÅ¥Îü¨Ïä§ÌÑ∞ÎßÅ ÎåÄÏÉÅ ÎÇòÎ®∏ÏßÄ Î∂ÑÏûê ÌíÄ: 23259Í∞ú
3. Butina ÌÅ¥Îü¨Ïä§ÌÑ∞ÎßÅ (3,000Í∞ú Îã§ÏñëÏÑ±) ÏãúÏûë...


[1;30;43mÏä§Ìä∏Î¶¨Î∞ç Ï∂úÎ†• ÎÇ¥Ïö©Ïù¥ Í∏∏Ïñ¥ÏÑú ÎßàÏßÄÎßâ 5000Ï§ÑÏù¥ ÏÇ≠Ï†úÎêòÏóàÏäµÎãàÎã§.[0m
Butina FPs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 23259/23259 [00:01<00:00, 13532.10it/s]
Butina Distances: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 23259/23259 [01:02<00:00, 372.59it/s] 


   -> ÏÉùÏÑ±Îêú ÌÅ¥Îü¨Ïä§ÌÑ∞ Ïàò: 1626Í∞ú
   -> Butina Îã§ÏñëÏÑ± ÏµúÏ¢Ö ÏÑ†ÌÉù: 3000Í∞ú
------------------------------
‚≠ê ÏµúÏ¢Ö 3,000Í∞ú Îã§ÏñëÏÑ± ÌõÑÎ≥¥Í∞Ä SMI ID ÌòïÌÉúÎ°ú 'diversity_3000_smi_id_only.smi' ÌååÏùºÏóê Ï†ÄÏû• ÏôÑÎ£å! üéâ


# 새 섹션 (New section)

In [None]:
import pandas as pd
import numpy as np

# --- File settings ---
# First ID list to compare.
# NOTE(review): absolute path, unlike the relative paths used for the other files — confirm it is intended.
FILE_7K = "/content/top_7060smi_id_only.smi"
# Second ID list to compare (same file name as the diversity output written earlier in the notebook).
FILE_3K = "diversity_3000_smi_id_only.smi"
# Both files are read as tab-separated, header-less tables with these two columns.
DELIMITER = '\t'
COLUMN_NAMES = ['smi', 'ID']
ID_COLUMN = 'ID'

def load_ids(filename):
    """Load the ID column of a two-column file and return the IDs as a set.

    The file is read with ``DELIMITER`` as separator, no header row, and the
    column names from ``COLUMN_NAMES``. Each ID is converted to ``str`` and
    stripped of surrounding whitespace before being collected.

    Returns:
        The set of IDs, or None if the file is missing or any other error
        occurs while loading (a message is printed in both failure cases).
    """
    try:
        table = pd.read_csv(filename, sep=DELIMITER, header=None, names=COLUMN_NAMES)
        cleaned = table[ID_COLUMN].astype(str).str.strip()
        ids = set(cleaned)
        print(f"   -> {filename} ID Ïàò: {len(ids)}Í∞ú")
    except FileNotFoundError:
        print(f"üö® Ïò§Î•ò: ÌååÏùºÏùÑ Ï∞æÏùÑ Ïàò ÏóÜÏäµÎãàÎã§. ÌååÏùºÎ™ÖÏùÑ ÌôïÏù∏Ìï¥ Ï£ºÏÑ∏Ïöî: {filename}")
        ids = None
    except Exception as e:
        # Best-effort loader: report the problem and signal failure through None.
        print(f"üö® ÌååÏùº Î°úÎìú Ï§ë ÏòàÏÉÅÏπò Î™ªÌïú Ïò§Î•ò Î∞úÏÉù ({filename}): {e}")
        ids = None
    return ids

# --------------------------------
# 1. Load both files and extract their ID sets
# --------------------------------
print("1. ÌååÏùº Î°úÎìú Î∞è ID Ï∂îÏ∂ú ÏãúÏûë...")

ids_7k = load_ids(FILE_7K)
ids_3k = load_ids(FILE_3K)

# load_ids returns None on failure, so both results must be present to continue.
if ids_7k is not None and ids_3k is not None:
    # --------------------------------
    # 2. IDs present in both files (set intersection)
    # --------------------------------
    print("\n2. Ï§ëÎ≥µ ID (ÍµêÏßëÌï©) ÌôïÏù∏...")

    duplicate_ids = ids_7k & ids_3k
    num_duplicates = len(duplicate_ids)

    # --------------------------------
    # 3. Report the result; write the shared IDs to a file when there are any
    # --------------------------------
    print("-" * 40)
    if num_duplicates != 0:
        OUTPUT_DUPLICATES_FILE = "duplicate_ids.txt"

        print(f"‚ö†Ô∏è Ï§ëÎ≥µÎêòÎäî Î∂ÑÏûê IDÍ∞Ä **Ï¥ù {num_duplicates}Í∞ú** Î∞úÍ≤¨ÎêòÏóàÏäµÎãàÎã§.")
        print(f"   -> Ï§ëÎ≥µ ID Î™©Î°ùÏùÑ '{OUTPUT_DUPLICATES_FILE}'Ïóê Ï†ÄÏû•Ìï©ÎãàÎã§.")

        # One ID per line, in sorted order.
        with open(OUTPUT_DUPLICATES_FILE, 'w') as out_file:
            for _id in sorted(duplicate_ids):
                out_file.write(f"{_id}\n")
    else:
        print(f"‚úÖ Ï§ëÎ≥µÎêòÎäî Î∂ÑÏûê IDÎäî **Ï¥ù 0Í∞ú**ÏûÖÎãàÎã§. (Îëê ÌååÏùºÏùÄ ÏôÑÎ≤ΩÌïòÍ≤å Î∂ÑÎ¶¨ÎêòÏóàÏäµÎãàÎã§.)")

    print("-" * 40)
    print("ÏûëÏóÖ ÏôÑÎ£å.")
else:
    print("\n‚ö†Ô∏è ÌååÏùº Î°úÎìúÏóê Ïã§Ìå®ÌïòÏó¨ Ï§ëÎ≥µ ÌôïÏù∏ÏùÑ ÏßÑÌñâÌï† Ïàò ÏóÜÏäµÎãàÎã§. ÌååÏùº Í≤ΩÎ°ú Î∞è Ïù¥Î¶ÑÏùÑ ÌôïÏù∏Ìï¥Ï£ºÏÑ∏Ïöî.")

1. ÌååÏùº Î°úÎìú Î∞è ID Ï∂îÏ∂ú ÏãúÏûë...
   -> /content/top_7060smi_id_only.smi ID Ïàò: 7060Í∞ú
   -> diversity_3000_smi_id_only.smi ID Ïàò: 3000Í∞ú

2. Ï§ëÎ≥µ ID (ÍµêÏßëÌï©) ÌôïÏù∏...
----------------------------------------
‚ö†Ô∏è Ï§ëÎ≥µÎêòÎäî Î∂ÑÏûê IDÍ∞Ä **Ï¥ù 55Í∞ú** Î∞úÍ≤¨ÎêòÏóàÏäµÎãàÎã§.
   -> Ï§ëÎ≥µ ID Î™©Î°ùÏùÑ 'duplicate_ids.txt'Ïóê Ï†ÄÏû•Ìï©ÎãàÎã§.
----------------------------------------
ÏûëÏóÖ ÏôÑÎ£å.


In [None]:
import pandas as pd

# --- File settings ---
# NOTE(review): the variable name and the original comment describe a file with the
# bottom 5 entries removed (7055 rows), but the path points at the 7060 file; the
# recorded run loaded 7060 rows and produced 10005 merged rows. Confirm which file is intended.
FILE_7055 = "/content/top_7060smi_id_only.smi"  # described as: file with the bottom 5 removed
FILE_3K = "diversity_3000_smi_id_only.smi" # the 3000-molecule diversity file
# Output file for the merged, de-duplicated candidate list.
OUTPUT_MERGED = "final_merged_10000_candidates.smi"

# Both inputs are read as tab-separated, header-less tables with these two columns.
DELIMITER = '\t'
COLUMN_NAMES = ['smi', 'ID']
ID_COLUMN = 'ID'

# --------------------------------
# 1. Load both files
# --------------------------------
print("1. Îëê ÌååÏùº Î°úÎìú ÏãúÏûë...")

try:
    df_7055 = pd.read_csv(FILE_7055, sep=DELIMITER, header=None, names=COLUMN_NAMES)
    df_3k = pd.read_csv(FILE_3K, sep=DELIMITER, header=None, names=COLUMN_NAMES)

    print(f"   -> 7055 ÌååÏùº Î°úÎìú Î∂ÑÏûê Ïàò: {len(df_7055)}Í∞ú")
    print(f"   -> 3000 ÌååÏùº Î°úÎìú Î∂ÑÏûê Ïàò: {len(df_3k)}Í∞ú")

except FileNotFoundError as e:
    print(f"üö® Ïò§Î•ò: ÌååÏùºÏùÑ Ï∞æÏùÑ Ïàò ÏóÜÏäµÎãàÎã§. ÌååÏùºÎ™ÖÏùÑ ÌôïÏù∏Ìï¥ Ï£ºÏÑ∏Ïöî: {e.filename}")
    # Re-raise instead of exit(): in a notebook exit() asks the kernel to shut down
    # and discards all state, whereas the exception simply stops this cell.
    raise
except Exception as e:
    print(f"üö® ÌååÏùº Î°úÎìú Ï§ë ÏòàÏÉÅÏπò Î™ªÌïú Ïò§Î•ò Î∞úÏÉù: {e}")
    # Same reasoning as above: stop the cell with the original error and traceback.
    raise

# --------------------------------
# 2. Stack the two tables and remove rows whose ID already appeared
# --------------------------------
print("\n2. Îç∞Ïù¥ÌÑ∞ Î≥ëÌï© Î∞è Ï§ëÎ≥µ ÌôïÏù∏/Ï†úÍ±∞...")

# Vertical concatenation with a fresh 0..n-1 index; rows of the first file come first.
df_combined = pd.concat((df_7055, df_3k), axis=0, ignore_index=True)
initial_combined_count = df_combined.shape[0]

# For a repeated ID only the first occurrence is kept, i.e. the row from the first
# file wins over the same ID in the diversity file.
repeated_id_mask = df_combined.duplicated(subset=[ID_COLUMN], keep='first')
df_final_merged = df_combined[~repeated_id_mask]

final_merged_count = df_final_merged.shape[0]
duplicates_removed = initial_combined_count - final_merged_count

print(f"   -> Ìï©Í≥Ñ ÏòàÏÉÅ Î∂ÑÏûê Ïàò: {initial_combined_count}Í∞ú")
print(f"   -> Ï†úÍ±∞Îêú Ï§ëÎ≥µ Î∂ÑÏûê Ïàò: {duplicates_removed}Í∞ú")
print(f"   -> ÏµúÏ¢Ö Ï§ëÎ≥µ Ï†úÍ±∞ Î∂ÑÏûê Ïàò: {final_merged_count}Í∞ú")

# --------------------------------
# 3. Write the merged table and report how many rows it holds
# --------------------------------
print("\n3. ÏµúÏ¢Ö ÌååÏùº Ï†ÄÏû•...")

# Same layout as the inputs: tab-separated, no index column, no header row.
csv_options = dict(sep=DELIMITER, index=False, header=False)
df_final_merged.to_csv(OUTPUT_MERGED, **csv_options)

# The reported count is taken from the DataFrame row count; the file is not re-read here.
final_lines_count = final_merged_count

rule = "-" * 40
print(rule)
print(f"‚úÖ Ï§ëÎ≥µÏù¥ Ï†úÍ±∞Îêú ÏµúÏ¢Ö ÌõÑÎ≥¥Íµ∞ ÌååÏùº '{OUTPUT_MERGED}' Ï†ÄÏû•Ïù¥ ÏôÑÎ£åÎêòÏóàÏäµÎãàÎã§.")
print(f"‚≠ê ÏµúÏ¢Ö ÌååÏùºÏùò **Ï¥ù Ï§Ñ Ïàò(Î∂ÑÏûê Í∞úÏàò)**Îäî **{final_lines_count}Í∞ú**ÏûÖÎãàÎã§. ‚≠ê")
print(rule)

1. Îëê ÌååÏùº Î°úÎìú ÏãúÏûë...
   -> 7055 ÌååÏùº Î°úÎìú Î∂ÑÏûê Ïàò: 7060Í∞ú
   -> 3000 ÌååÏùº Î°úÎìú Î∂ÑÏûê Ïàò: 3000Í∞ú

2. Îç∞Ïù¥ÌÑ∞ Î≥ëÌï© Î∞è Ï§ëÎ≥µ ÌôïÏù∏/Ï†úÍ±∞...
   -> Ìï©Í≥Ñ ÏòàÏÉÅ Î∂ÑÏûê Ïàò: 10060Í∞ú
   -> Ï†úÍ±∞Îêú Ï§ëÎ≥µ Î∂ÑÏûê Ïàò: 55Í∞ú
   -> ÏµúÏ¢Ö Ï§ëÎ≥µ Ï†úÍ±∞ Î∂ÑÏûê Ïàò: 10005Í∞ú

3. ÏµúÏ¢Ö ÌååÏùº Ï†ÄÏû•...
----------------------------------------
‚úÖ Ï§ëÎ≥µÏù¥ Ï†úÍ±∞Îêú ÏµúÏ¢Ö ÌõÑÎ≥¥Íµ∞ ÌååÏùº 'final_merged_10000_candidates.smi' Ï†ÄÏû•Ïù¥ ÏôÑÎ£åÎêòÏóàÏäµÎãàÎã§.
‚≠ê ÏµúÏ¢Ö ÌååÏùºÏùò **Ï¥ù Ï§Ñ Ïàò(Î∂ÑÏûê Í∞úÏàò)**Îäî **10005Í∞ú**ÏûÖÎãàÎã§. ‚≠ê
----------------------------------------


In [None]:
# Count the rows actually present in the merged output file.
# The previous expression never opened a file: it referenced an undefined name, and the
# recorded value 26 equals the character length of the file name it mentioned.
# NOTE(review): assumes the file meant here is OUTPUT_MERGED written by the previous cell — confirm.
with open(OUTPUT_MERGED) as merged_file:
    merged_line_count = sum(1 for _ in merged_file)
merged_line_count

26