In [1]:
import pandas as pd
import numpy as np

# Define the dataset paths
datasets = [
    {'path': '../data/eos2ta5/eos2ta5.csv'},
    {'path': '../data/eos30f3/eos30f3.csv'},
    {'path': '../data/eos4tcc/eos4tcc.csv'},
    {'path': '../data/eos43at/eos43at.csv'},
]


def unique_keys(df, column='InChIKey'):
    """Return the set of distinct, non-missing values in ``df[column]``.

    Missing entries are dropped so that a NaN key is never counted as a
    molecule shared between datasets.
    """
    return set(df[column].dropna())


def percentage_of(part, whole):
    """Return ``part / whole`` as a percentage, or 0.0 when ``whole`` is 0."""
    if not whole:
        # An empty dataset has no molecules to overlap with anything.
        return 0.0
    return (part / whole) * 100


def overlap_with_all(paths, key_sets, common_keys):
    """Method 1: share of each dataset's unique keys present in ALL datasets.

    Args:
        paths: dataset labels, used to build the result keys.
        key_sets: one set of unique keys per dataset, aligned with ``paths``.
        common_keys: keys found in every dataset.

    Returns:
        dict mapping a description string to a percentage (float).
    """
    n_common = len(common_keys)
    return {
        f"Percentage overlap for {path} with all models": percentage_of(n_common, len(keys))
        for path, keys in zip(paths, key_sets)
    }


def pairwise_overlap(paths, key_sets):
    """Method 2: overlap for every unordered pair of datasets.

    The shared key count is divided by the size of the smaller dataset of
    the pair (in unique keys), so 100% means the smaller one is fully
    contained in the larger one.

    Args:
        paths: dataset labels, used to build the result keys.
        key_sets: one set of unique keys per dataset, aligned with ``paths``.

    Returns:
        dict mapping a description string to a percentage (float), in the
        order (0, 1), (0, 2), ..., (1, 2), ...
    """
    results = {}
    for i, (path_i, keys_i) in enumerate(zip(paths, key_sets)):
        for path_j, keys_j in zip(paths[i + 1:], key_sets[i + 1:]):
            shared = keys_i & keys_j
            smaller = min(len(keys_i), len(keys_j))
            results[f"Percentage overlap between {path_i} and {path_j}"] = percentage_of(len(shared), smaller)
    return results


# ``__name__`` is "__main__" both in a notebook cell and when run as a script,
# so the file reads and the printed report still happen there; the guard only
# keeps the helpers importable without touching the CSV files.
if __name__ == "__main__":
    # Read datasets into a list of DataFrames
    dfs = [pd.read_csv(dataset['path']) for dataset in datasets]
    paths = [dataset['path'] for dataset in datasets]

    # Build each key set once; numerators and denominators below are both
    # counted in unique keys so duplicate rows cannot skew the percentages.
    key_sets = [unique_keys(df) for df in dfs]

    # Method 1: Get inchi keys common to ALL the models' datasets
    common_keys = set.intersection(*key_sets) if key_sets else set()
    total_common_molecules = len(common_keys)

    # Dictionary to store the results (Method 1 entries first, then Method 2)
    overlap_results = {}
    overlap_results.update(overlap_with_all(paths, key_sets, common_keys))
    overlap_results.update(pairwise_overlap(paths, key_sets))

    # Display the results
    for key, value in overlap_results.items():
        print(f"{key}: {value:.2f}%")


Percentage overlap for ../data/eos2ta5/eos2ta5.csv with all models: 2.80%
Percentage overlap for ../data/eos30f3/eos30f3.csv with all models: 4.77%
Percentage overlap for ../data/eos4tcc/eos4tcc.csv with all models: 0.12%
Percentage overlap for ../data/eos43at/eos43at.csv with all models: 2.28%
Percentage overlap between ../data/eos2ta5/eos2ta5.csv and ../data/eos30f3/eos30f3.csv: 89.61%
Percentage overlap between ../data/eos2ta5/eos2ta5.csv and ../data/eos4tcc/eos4tcc.csv: 87.00%
Percentage overlap between ../data/eos2ta5/eos2ta5.csv and ../data/eos43at/eos43at.csv: 6.22%
Percentage overlap between ../data/eos30f3/eos30f3.csv and ../data/eos4tcc/eos4tcc.csv: 90.05%
Percentage overlap between ../data/eos30f3/eos30f3.csv and ../data/eos43at/eos43at.csv: 5.43%
Percentage overlap between ../data/eos4tcc/eos4tcc.csv and ../data/eos43at/eos43at.csv: 9.41%
