# In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

# Function to load SMILES from a CSV file
def load_smiles_from_csv(file_path, smiles_column):
    """Read one column of SMILES strings from a CSV file.

    Args:
        file_path: Location of the CSV file, handed to ``pandas.read_csv``.
        smiles_column: Name of the column that holds the SMILES strings.

    Returns:
        List of the values found in ``smiles_column``, in file order, with
        missing (empty) cells removed.

    Raises:
        KeyError: If ``smiles_column`` is not a column of the CSV.
    """
    df = pd.read_csv(file_path)
    # Empty cells are read as NaN floats, which are not SMILES strings and
    # cannot be parsed into molecules later on, so they are dropped here.
    return df[smiles_column].dropna().tolist()

# Function to compare datasets based on InChIKey
def compare_datasets(datasets):
    """Return the InChIKeys that occur in every one of the given datasets.

    Args:
        datasets: Iterable of dicts, each with a 'path' entry (CSV file
            location) and a 'smiles_column' entry (name of the column that
            holds the SMILES strings).

    Returns:
        Set of the InChIKeys present in all datasets. An empty set is
        returned when ``datasets`` contains no entries.
    """
    inchikeys_sets = []

    for dataset in datasets:
        smiles_list = load_smiles_from_csv(dataset['path'], dataset['smiles_column'])
        # A set per dataset: duplicates inside one file do not matter for
        # the intersection.
        inchikeys_sets.append({get_inchikey(smiles) for smiles in smiles_list})

    # set.intersection() cannot be called without at least one set, so the
    # "no datasets" case is answered directly.
    if not inchikeys_sets:
        return set()

    return set.intersection(*inchikeys_sets)

# Function to generate InChIKey for a given SMILES
def get_inchikey(smiles):
    """Compute the InChIKey of the molecule described by a SMILES string.

    Args:
        smiles: SMILES string of the molecule.

    Returns:
        The InChIKey derived from the molecule's InChI.

    Raises:
        ValueError: If RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail here
        # with the offending input instead of passing None on to MolToInchi.
        raise ValueError("Could not parse SMILES: {!r}".format(smiles))
    return AllChem.InchiToInchiKey(Chem.MolToInchi(mol))

# Datasets to compare, written as (CSV path, SMILES column name) pairs and
# expanded into the {'path': ..., 'smiles_column': ...} dicts used below.
datasets = [
    {'path': csv_path, 'smiles_column': column}
    for csv_path, column in (
        ('/home/leila/hERG_Datasets/data/eos2ta5/external_test_set_neg.csv', 'smiles'),
        ('/home/leila/hERG_Datasets/data/eos2ta5/external_test_set_new.csv', 'smiles'),
        ('/home/leila/hERG_Datasets/data/eos2ta5/external_test_set_pos.csv', 'smiles'),
        ('/home/leila/hERG_Datasets/data/eos30f3/Cai_TableS3_fixed.csv', 'smiles'),
        ('/home/leila/hERG_Datasets/data/eos4tcc/pretraining/MLSMR_training.csv', 'smiles'),
        ('/home/leila/hERG_Datasets/data/eos4tcc/pretraining/MLSMR_validation.csv', 'smiles'),
        # eos43at ChEMBL activity files, currently commented out of the comparison:
        # ('/home/leila/hERG_Datasets/data/eos43at/CHEMBL25-chembl_activity-1m2HMChbuzgnuKoch-_5efSR65fjwOM9pQjnWp6r8E0=.csv', 'smiles'),
        # ('/home/leila/hERG_Datasets/data/eos43at/CHEMBL25-chembl_activity-CfiEKFSvJvK1sbkN1-m6Y60GXkk2gwgfoWZ8y_AnWGg=.csv', 'smiles'),
        # ('/home/leila/hERG_Datasets/data/eos43at/CHEMBL25-chembl_activity-LXNv_MaWOPU1wfg21EleC0uDbYuU1p-O1oEDG5EhtJA=.csv', 'smiles'),
        # ('/home/leila/hERG_Datasets/data/eos43at/CHEMBL25-chembl_activity-N2lYBOSQqtctxULhQ_8DuOj_esrEWtrk5G-0c-VoN2Y=.csv', 'smiles'),
        # ('/home/leila/hERG_Datasets/data/eos43at/CHEMBL25-chembl_activity-Ru4vxBv5KmiI2wbPCIldE79Ig8JJMajCXi-ogJAwhEM=.csv', 'smiles'),
        # ('/home/leila/hERG_Datasets/data/eos43at/CHEMBL25-chembl_activity-_rgjmkhzrj6IhYHygNP4gqOL4rJ6ZQV56ggbSJPqcT0=.csv', 'smiles'),
        # ('/home/leila/hERG_Datasets/data/eos43at/CHEMBL25-chembl_activity-cWQMQAJNiIqwfiDELDFvYiBDTDl-9B559Ya6AzP8qQI=.csv', 'smiles'),
        # ('/home/leila/hERG_Datasets/data/eos43at/CHEMBL25-chembl_activity-cZodZxzy9FXMMon1EsDfYV6ltHwVb2_jbHYjVJCfdoQ=.csv', 'smiles'),
        # ('/home/leila/hERG_Datasets/data/eos43at/CHEMBL25-chembl_activity-fexMS7V1A-_WngPdQNllc6wniss2x5FKayXLvep8GK8=.csv', 'smiles'),
        # ('/home/leila/hERG_Datasets/data/eos43at/CHEMBL25-chembl_activity-hJmsqZxeLNB3ca3NNhqMUDllDr4v7NXYowpV1HdpOz8=.csv', 'smiles'),
        # ('/home/leila/hERG_Datasets/data/eos43at/CHEMBL25-chembl_activity-ixEejoTDPuLeRXnkUbv4S4p6Ge51wdzq5iLJdEpsLMc=.csv', 'smiles'),
        # ('/home/leila/hERG_Datasets/data/eos43at/CHEMBL25-chembl_activity-jI_-YFCed36-KYiHKYmPTo9ybMQd_4D18nsGKTLEk9Q=.csv', 'smiles'),
        # ('/home/leila/hERG_Datasets/data/eos43at/CHEMBL25-chembl_activity-oTNh9anmVYzuHUxVgBzNxM8vE-3FsPiYMazrHAijAIQ=.csv', 'smiles'),
        # ('/home/leila/hERG_Datasets/data/eos43at/CHEMBL25-chembl_activity-q4oac-qnZvfoQqtfOJL4fn-C1qCYufYt8cpPPuoYa10=.csv', 'smiles'),
        # ('/home/leila/hERG_Datasets/data/eos43at/CHEMBL25-chembl_activity-qnz5hxAXh2CxwDn5Yf0_oYRk79kGi5eV1WvXU2ugJ9k=.csv', 'smiles'),
        # Unfilled placeholder for further eos43at files:
        # ('/home/leila/hERG_Datasets/data/eos43at/.csv', 'smiles'),
        # Add more (path, column) pairs as needed
    )
]

# Load every CSV listed in `datasets` and keep only the InChIKeys that are
# present in all of them.
common_inchikeys = compare_datasets(datasets)

# Show the shared InChIKeys; the result is a set, so the printed order is not fixed.
print("Common InChIKeys:", common_inchikeys)









































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































