In [5]:
import pandas as pd
import numpy as np

# Column that identifies a molecule across datasets. It is named explicitly so
# the comparison does not depend on how the shared column names happen to sort.
KEY_COLUMN = 'InChIKey'

# Define the dataset paths
datasets = [
    {'path': '../data/eos2ta5/eos2ta5.csv'},
    {'path': '../data/eos30f3/eos30f3.csv'},
    {'path': '../data/eos4tcc/eos4tcc.csv'},
    {'path': '../data/eos43at/eos43at.csv'}
]

# Read datasets into a list of DataFrames
dfs = [pd.read_csv(dataset['path']) for dataset in datasets]


def count_shared_values(left, right, column):
    """Count the distinct, non-null values of `column` present in both frames.

    Parameters
    ----------
    left, right : pandas.DataFrame
        Frames to compare; both must contain `column`.
    column : str
        Name of the column whose values are compared.

    Returns
    -------
    int
        Number of unique values that occur in `column` of both frames.

    Notes
    -----
    A set intersection is used instead of ``len(pd.merge(..., how='inner'))``:
    an inner merge yields one row per matching *pair* of rows, so repeated keys
    are counted several times, and missing keys (NaN) match one another.
    """
    left_values = set(left[column].dropna())
    right_values = set(right[column].dropna())
    return len(left_values & right_values)


# Dictionary to store the results
overlap_results = {}

# Check for overlapping molecules between all pairs of datasets
for i in range(len(dfs)):
    for j in range(i + 1, len(dfs)):
        # Prefer the molecule identifier column when both datasets provide it
        if KEY_COLUMN in dfs[i].columns and KEY_COLUMN in dfs[j].columns:
            common_column = KEY_COLUMN
        else:
            # Fall back to the first shared column name (np.intersect1d returns
            # the shared names sorted), or None when nothing is shared
            common_values = np.intersect1d(dfs[i].columns, dfs[j].columns)
            common_column = common_values[0] if common_values.size > 0 else None

        if common_column is None:
            # Make the omission visible instead of silently dropping the pair
            print(f"Skipping {datasets[i]['path']} and {datasets[j]['path']}: no shared column")
            continue

        key = f"Overlap between {datasets[i]['path']} and {datasets[j]['path']} based on {common_column}"
        overlap_results[key] = count_shared_values(dfs[i], dfs[j], common_column)

# Display the results
for key, value in overlap_results.items():
    print(f"{key}: {value} molecules")


Overlap between ../data/eos2ta5/eos2ta5.csv and ../data/eos30f3/eos30f3.csv based on InChIKey: 7575 molecules
Overlap between ../data/eos2ta5/eos2ta5.csv and ../data/eos4tcc/eos4tcc.csv based on InChIKey: 13515 molecules
Overlap between ../data/eos2ta5/eos2ta5.csv and ../data/eos43at/eos43at.csv based on InChIKey: 2737 molecules
Overlap between ../data/eos30f3/eos30f3.csv and ../data/eos4tcc/eos4tcc.csv based on InChIKey: 8533 molecules
Overlap between ../data/eos30f3/eos30f3.csv and ../data/eos43at/eos43at.csv based on InChIKey: 2391 molecules
Overlap between ../data/eos4tcc/eos4tcc.csv and ../data/eos43at/eos43at.csv based on InChIKey: 5543 molecules
