In [1]:
!pip install pandas rdkit

Collecting rdkit
  Downloading rdkit-2023.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.4


In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools

# Assuming you have two datasets in CSV format with a column named 'SMILES'
dataset1 = pd.read_csv('dili_tdc_dataset.csv')
dataset2 = pd.read_csv('dilismiles.csv')

# Convert SMILES strings to RDKit molecule objects
PandasTools.AddMoleculeColumnToFrame(dataset1, 'Drug', 'Molecule1')
PandasTools.AddMoleculeColumnToFrame(dataset2, 'col_smiles', 'Molecule2')

# Find the intersection of SMILES
overlap_smiles = set(dataset1['Drug']).intersection(set(dataset2['col_smiles']))

# Calculate the number of drugs in each dataset
num_drugs_paper = len(set(dataset2['col_smiles']))
num_drugs_tdc = len(set(dataset1['Drug']))

# Calculate the number of common drugs and the percentage of overlap
num_common_drugs = len(overlap_smiles)
percent_overlap = (num_common_drugs / num_drugs_paper) * 100

print(f"Number of drugs in paper's dataset: {num_drugs_paper}")
print(f"Number of drugs in TDC's dataset: {num_drugs_tdc}")
print(f"Number of common drugs: {num_common_drugs}")
print(f"Percentage of common drugs relative to paper's dataset: {percent_overlap:.2f}%")

# Convert the set of overlapping SMILES to a DataFrame
overlap_df = pd.DataFrame(list(overlap_smiles), columns=['Overlapping SMILES'])

overlap_df.head()


Number of drugs in paper's dataset: 587
Number of drugs in TDC's dataset: 475
Number of common drugs: 67
Percentage of common drugs relative to paper's dataset: 11.41%


Unnamed: 0,Overlapping SMILES
0,CCN(CC)CC(=O)Nc1c(C)cccc1C
1,CCCCNC(=O)NS(=O)(=O)c1ccc(C)cc1
2,Clc1ccccc1CN1CCc2sccc2C1
3,OC1C(O)C(O)C(O)C(O)C1O
4,CCOc1ccc(NC(C)=O)cc1
