In [1]:
#TASK: CHECK IF TRAIN SET (train.csv) SHARES SMILES WITH TRAINSET SUPPLEMENTS (dataset1/2/3/4.csv)

# Import packages
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Draw, Descriptors

In [None]:
# Load the main training table and the four supplementary tables.
train = pd.read_csv('train.csv')
dataset1, dataset2, dataset3, dataset4 = map(
    pd.read_csv,
    ('dataset1.csv', 'dataset2.csv', 'dataset3.csv', 'dataset4.csv'),
)

# Collect the distinct raw SMILES strings per table (a set drops repeats).
# NOTE(review): strings are compared verbatim, so two different spellings of
# the same molecule would not be counted as shared -- confirm this is intended.
train_smiles = set(train['SMILES'])
dataset1_smiles, dataset2_smiles, dataset3_smiles, dataset4_smiles = (
    set(frame['SMILES']) for frame in (dataset1, dataset2, dataset3, dataset4)
)

# SMILES present both in train.csv and in each supplementary table.
overlap1 = train_smiles & dataset1_smiles
overlap2 = train_smiles & dataset2_smiles
overlap3 = train_smiles & dataset3_smiles
overlap4 = train_smiles & dataset4_smiles

for csv_name, shared in (
    ('dataset1.csv', overlap1),
    ('dataset2.csv', overlap2),
    ('dataset3.csv', overlap3),
    ('dataset4.csv', overlap4),
):
    print(f'Overlap between train.csv and {csv_name}: {len(shared)} SMILES')

Overlap between train.csv and dataset1.csv: 737 SMILES
Overlap between train.csv and dataset2.csv: 5191 SMILES
Overlap between train.csv and dataset3.csv: 0 SMILES
Overlap between train.csv and dataset4.csv: 0 SMILES


In [None]:
# Report (rows, columns) for every loaded table.
for csv_name, frame in (
    ('train.csv', train),
    ('dataset1.csv', dataset1),
    ('dataset2.csv', dataset2),
    ('dataset3.csv', dataset3),
    ('dataset4.csv', dataset4),
):
    print(f'Shape of {csv_name}: {frame.shape}')

Shape of train.csv: (7973, 7)
Shape of dataset1.csv: (874, 2)
Shape of dataset2.csv: (7208, 1)
Shape of dataset3.csv: (46, 2)
Shape of dataset4.csv: (862, 2)


In [None]:
# Count the distinct SMILES strings across train.csv and all supplementary datasets.
total_unique_smiles = len(
    train_smiles.union(
        dataset1_smiles,
        dataset2_smiles,
        dataset3_smiles,
        dataset4_smiles,
    )
)
print(f'Total number of unique SMILES across all datasets: {total_unique_smiles}')


Total number of unique SMILES across all datasets: 10345


In [26]:
# Keep only the rows that carry a Tc value: train.csv stores it in 'Tc',
# dataset1.csv stores it in 'TC_mean'.
train_with_Tc = train.loc[train['Tc'].notnull()]
dataset1_with_Tc = dataset1.loc[dataset1['TC_mean'].notnull()]

print(f"Number of SMILES with Tc values in train.csv: {len(train_with_Tc)}")
print(f"Number of SMILES with Tc values in dataset1.csv: {len(dataset1_with_Tc)}")



Number of SMILES with Tc values in train.csv: 737
Number of SMILES with Tc values in dataset1.csv: 874


In [33]:
# Stack the Tc-labelled rows of train.csv and dataset1.csv for model training.
# NOTE(review): the saved outputs report 737 SMILES shared between train.csv
# and dataset1.csv and 737 Tc rows in train.csv, so this concat probably holds
# the same molecules twice -- confirm whether rows should be de-duplicated.
combined_data = pd.concat([train_with_Tc, dataset1_with_Tc], ignore_index=True)

# Unify the target column: take 'TC_mean' where it is present, otherwise keep
# the existing 'Tc'. Vectorised replacement for the former row-wise apply();
# it yields the same values and also works when the frame has no rows.
combined_data['Tc'] = combined_data['TC_mean'].fillna(combined_data['Tc'])

In [34]:
# Display the first rows of combined_data (head() with its default row count).
combined_data.head()

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg,TC_mean
0,87817.0,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,,,
1,2986007.0,*c1ccc(-c2ccc3c(c2)C(CCCCCCC#N)(CCCCCCC#N)c2cc...,,0.402397,0.487,0.901123,28.682441,
2,3013292.0,*CC(*)c1ccc(C(=O)O)c(C(=O)O)c1,,,0.171,1.184354,13.534248,
3,6645418.0,*CCCCCNC(=O)CCCCC(=O)N*,,0.332741,0.327,,,
4,7687820.0,*CCCCCCCCCCCCCCCCCCNC(=O)NCCCCCCNC(=O)N*,,,0.383,,,


In [31]:
# (rows, columns) of the concatenated frame.
combined_data.shape

(1611, 8)

In [22]:
# Canonicalize a SMILES string with RDKit (isomeric output by default)
def canonicalize_smiles(smiles, isomeric:bool=True):
    """
    Convert a SMILES string to RDKit's canonical form.

    Args:
        smiles (str): Input SMILES string. Any non-string value (for example
            None, or the float NaN pandas uses for a missing cell) is treated
            as invalid.
        isomeric (bool): Forwarded to RDKit as ``isomericSmiles``; when True,
            stereochemistry information is kept in the output.

    Returns:
        str or None: Canonical SMILES, or None when the input is not a string
        or RDKit cannot parse it.
    """
    # RDKit's Python binding raises on non-string arguments instead of
    # returning None, which would abort a column-wide apply on a missing value.
    if not isinstance(smiles, str):
        return None
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None  # Invalid SMILES
    return Chem.MolToSmiles(mol, isomericSmiles=isomeric)


In [23]:
# Apply canonicalization to every entry of the SMILES column and store the
# result in a new 'Canonical_SMILES' column.
canonical_column = combined_data['SMILES'].map(canonicalize_smiles)
combined_data['Canonical_SMILES'] = canonical_column

In [None]:
# Display the first rows again; the frame now also carries 'Canonical_SMILES'.
combined_data.head()

Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg,TC_mean,Canonical_SMILES


In [25]:
# Print column names, non-null counts and dtypes of combined_data.
# NOTE(review): the saved output of this cell lists 0 entries, while the
# earlier shape cell reports 1611 rows, and the execution counts are out of
# order -- re-run the notebook top to bottom to confirm the real state.
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                0 non-null      float64
 1   SMILES            0 non-null      object 
 2   Tg                0 non-null      float64
 3   FFV               0 non-null      float64
 4   Tc                0 non-null      float64
 5   Density           0 non-null      float64
 6   Rg                0 non-null      float64
 7   TC_mean           0 non-null      float64
 8   Canonical_SMILES  0 non-null      object 
dtypes: float64(7), object(2)
memory usage: 0.0+ bytes
