In [32]:
import numpy as np
import pandas as pd

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem

## Raw data

In [33]:
# Load the raw dataset; later cells read its 'Z' and 'SMILES' columns.
data = pd.read_csv(filepath_or_buffer='immuno_big.csv')

## Shuffle

In [34]:
# Shuffle all rows and renumber the index from 0.
# A fixed seed keeps the resulting row order (and therefore every file written
# from `data` afterwards) identical across kernel restarts; without it each
# run produces a different permutation.
SHUFFLE_SEED = 16  # same seed value the conformer embedding step uses
data = data.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

## Separate

In [35]:
# Write the 'Z' column and the 'SMILES' column to separate single-column CSVs.
# The row index is not written, so both files keep only a header and values.
data.loc[:, 'Z'].to_csv('y.csv', index=False)
data.loc[:, 'SMILES'].to_csv('smiles.csv', index=False)

## Generate Coordinates

In [36]:
# Read back the single-column SMILES file written in the previous step.
smiles = pd.read_csv(filepath_or_buffer='smiles.csv')

In [55]:
# Largest heavy-atom count over every SMILES string that RDKit can parse.
# Chem.MolFromSmiles returns None for an unparseable string, so those rows are
# filtered out instead of raising AttributeError; `default=0` covers the case
# where nothing parses (or the column is empty).
max_len = max(
    (
        mol.GetNumHeavyAtoms()
        for mol in map(Chem.MolFromSmiles, smiles['SMILES'])
        if mol is not None
    ),
    default=0,
)

In [57]:
# Heavy-atom count of every SMILES string that RDKit can parse.
# Chem.MolFromSmiles returns None for an unparseable string; such rows are
# skipped rather than raising AttributeError, so `lengths` can be shorter than
# `smiles` when the column contains invalid entries.
lengths = [
    mol.GetNumHeavyAtoms()
    for mol in map(Chem.MolFromSmiles, smiles['SMILES'])
    if mol is not None
]

In [45]:
# Build one 3D conformer per molecule and collect its heavy-atom coordinates.
#
# Results of this cell:
#   coordinates  - list of (n_heavy_atoms, 3) float arrays, one per molecule
#                  that was parsed and embedded successfully
#   kept_indices - positional row numbers in `smiles` matching `coordinates`
#                  entry-for-entry; molecules that fail to parse or embed are
#                  skipped, so use this to select the matching rows of any
#                  per-molecule data (e.g. the targets) before pairing them
#   max_len      - largest heavy-atom count over all parseable SMILES
coordinates = []
kept_indices = []
max_len = 0

for row, smile in enumerate(smiles['SMILES']):

    mol = Chem.MolFromSmiles(smile)
    if mol is None:  # unparseable SMILES
        continue

    # Tracked here so the SMILES column is parsed only once and an invalid
    # string cannot crash the size computation.
    max_len = max(max_len, mol.GetNumHeavyAtoms())

    mol = Chem.AddHs(mol)
    embed = AllChem.EmbedMolecule(mol, randomSeed=16)  # Set a random seed for reproducibility
    if embed != 0:  # embedding did not yield conformer 0
        continue
    AllChem.UFFOptimizeMolecule(mol)  # Perform an optimization using UFF force field

    # Take all atom positions at once, then keep only the heavy atoms.
    # Atomic number > 1 is used so the filter matches the molecule's
    # heavy-atom count.
    positions = mol.GetConformer().GetPositions()
    heavy_idx = [atom.GetIdx() for atom in mol.GetAtoms() if atom.GetAtomicNum() > 1]
    coords = positions[heavy_idx]

    coordinates.append(coords)
    kept_indices.append(row)

In [52]:
# Pad every coordinate array with rows of +inf so all of them share the same
# number of rows (that of the largest molecule in `coordinates`).
max_len = max(len(mol_coords) for mol_coords in coordinates)

padded_coordinates = []
for mol_coords in coordinates:
    # Start from an all-inf block and overwrite the leading rows with the
    # real coordinates; the remaining rows stay as padding.
    padded = np.full((max_len, 3), np.inf)
    padded[:len(mol_coords)] = mol_coords
    padded_coordinates.append(padded)

(26, 3)