In [1]:
import pandas as pd
import torch
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit.Chem import Draw

In [2]:
# Step 1: Read the CSV file
file_path = '13321_2024_820_MOESM2_ESM.csv'  # Replace with the path to your CSV file
data = pd.read_csv(file_path)

In [3]:
# Step 2: Inspect the data
# Show the first few rows
print("First 5 rows of the dataset:")
print(data.head())

# Show data summary
print("\nDataset summary:")
print(data.info())

# Show basic statistics
print("\nDescriptive statistics:")
print(data.describe())

# Optional: Display column names
print("\nColumn names:")
print(data.columns)

# Show molecular structures
"""
# Convert SMILES to RDKit Molecules
data['Molecule'] = data['CANON_SMILES'].apply(lambda x: Chem.MolFromSmiles(x))

for idx, mol in enumerate(data['Molecule']):
    if mol:  # Ensure the molecule is valid
        img = Draw.MolToImage(mol, size=(300, 300))  # Generate molecule image
        display(img)
    else:
        print(f"Molecule {idx + 1} could not be processed.")
"""

First 5 rows of the dataset:
   Index                     Name CANON_SMILES  Temperature (K)  \
0      0  1,2-acetylene dibromide    Br/C=C\Br           274.01   
1      1  1,2-acetylene dibromide    Br/C=C\Br           284.10   
2      2  1,2-acetylene dibromide    Br/C=C\Br           293.08   
3      3  1,2-acetylene dibromide    Br/C=C\Br           303.29   
4      4  1,2-acetylene dibromide    Br/C=C\Br           312.64   

   Inverse temperature (1/K)  Viscosity (cP)  log(Viscosity)  MD_density  \
0                   0.003650           1.217        0.085291    2.425891   
1                   0.003520           1.070        0.029384    2.407717   
2                   0.003412           0.960       -0.017729    2.396062   
3                   0.003297           0.859       -0.066007    2.358438   
4                   0.003199           0.782       -0.106793    2.341529   

   MD_FV      MD_Rg   MD_SP_E    MD_SP_V      MD_SP     MD_HV    MD_RMSD  \
0   0.01  27.678297  8.026725  20.6

'\n# Convert SMILES to RDKit Molecules\ndata[\'Molecule\'] = data[\'CANON_SMILES\'].apply(lambda x: Chem.MolFromSmiles(x))\n\nfor idx, mol in enumerate(data[\'Molecule\']):\n    if mol:  # Ensure the molecule is valid\n        img = Draw.MolToImage(mol, size=(300, 300))  # Generate molecule image\n        display(img)\n    else:\n        print(f"Molecule {idx + 1} could not be processed.")\n'

In [6]:

# Step 3: Generate Features with RDkit
def compute_features(smiles):
    try:
        # Convert SMILES to RDKit molecule
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        # Extract features
        features = {
            'MolecularWeight': Descriptors.MolWt(mol),
            'NumAtoms': mol.GetNumAtoms(),
            'NumBonds': mol.GetNumBonds(),
            'LogP': Descriptors.MolLogP(mol),
            'NumRotatableBonds': Descriptors.NumRotatableBonds(mol),
            'TPSA': Descriptors.TPSA(mol)
        }
        return features
    except Exception as e:
        print(f"Error processing SMILES {smiles}: {e}")
        return None

def compute_fingerprint(smiles, radius=3, nBits=1024):
    try:
        # Convert SMILES to RDKit molecule
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        # Generate Morgan fingerprint as a bit vector
        fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
        # Convert to a list of binary values
        return list(fingerprint)
    except Exception as e:
        print(f"Error processing SMILES {smiles}: {e}")
        return None
        
# Apply the function to the SMILES column
data['MolecularFeatures'] = data['CANON_SMILES'].apply(compute_features)
# Apply the fingerprint computation function
data['Fingerprint'] = data['CANON_SMILES'].apply(compute_fingerprint)

# Expand the features dictionary into separate columns
features_df = data['MolecularFeatures'].apply(pd.Series)
fingerprint_df = data['Fingerprint'].apply(pd.Series)
data = pd.concat([data, features_df, fingerprint_df], axis=1)

# Drop the temporary 'MolecularFeatures' column
data.drop(columns=['MolecularFeatures'], inplace=True)

# Save or view the updated dataset
print(data.head())
data.to_csv('processed_data_with_fingerprints.csv', index=False)

   Index                     Name CANON_SMILES  Temperature (K)  \
0      0  1,2-acetylene dibromide    Br/C=C\Br           274.01   
1      1  1,2-acetylene dibromide    Br/C=C\Br           284.10   
2      2  1,2-acetylene dibromide    Br/C=C\Br           293.08   
3      3  1,2-acetylene dibromide    Br/C=C\Br           303.29   
4      4  1,2-acetylene dibromide    Br/C=C\Br           312.64   

   Inverse temperature (1/K)  Viscosity (cP)  log(Viscosity)  MD_density  \
0                   0.003650           1.217        0.085291    2.425891   
1                   0.003520           1.070        0.029384    2.407717   
2                   0.003412           0.960       -0.017729    2.396062   
3                   0.003297           0.859       -0.066007    2.358438   
4                   0.003199           0.782       -0.106793    2.341529   

   MD_FV      MD_Rg  ...  1014  1015  1016  1017  1018 1019 1020  1021  1022  \
0   0.01  27.678297  ...     0     0     0     0     0    0 