In [None]:
# Mount Google Drive at /content/drive inside this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.3.2


In [None]:
import pickle
import pandas as pd
import torch
import torch.nn as nn
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import MolFromSmiles
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

# Step 1: Load the pickle file and create a DataFrame
def load_pickle_to_df(pickle_file):
    """Unpickle ``pickle_file`` and wrap the loaded object in a DataFrame.

    Args:
        pickle_file: Path of the pickle file to read (opened in binary mode).

    Returns:
        A ``pd.DataFrame`` constructed from the unpickled object.

    Note:
        ``pickle.load`` can execute arbitrary code embedded in the file, so
        only point this at pickles from a trusted source.
    """
    with open(pickle_file, 'rb') as handle:
        return pd.DataFrame(pickle.load(handle))

# Step 2: Create a custom dataset to convert SMILES data into embeddings
class SMILESDataset(Dataset):
    """Dataset yielding Morgan fingerprint tensors for the parseable SMILES in a DataFrame.

    Rows whose SMILES entry is missing (not a string) or cannot be parsed by
    RDKit are skipped, so item ``i`` of the dataset corresponds to row position
    ``valid_indices[i]`` of ``df``, not to row ``i``.

    Args:
        df: DataFrame holding the SMILES strings.
        smiles_column: Name of the column containing the SMILES strings.
        radius: Morgan fingerprint radius.
        n_bits: Length of the fingerprint bit vector (and of each returned tensor).

    Raises:
        KeyError: If ``smiles_column`` is not a column of ``df``.
    """

    def __init__(self, df, smiles_column='isosmiles', radius=2, n_bits=2048):
        self.df = df
        self.smiles_column = smiles_column
        self.radius = radius
        self.n_bits = n_bits
        # Extract the column once. Looking values up with `df.iloc[i][col]`
        # materialises a whole row Series per access, which dominates the
        # run time on large frames.
        self._smiles = df[smiles_column].tolist()
        self.valid_indices = self.get_valid_indices()

    def get_valid_indices(self):
        """Return the row positions whose SMILES entry RDKit can parse."""
        valid_indices = []
        for idx, smiles in enumerate(self._smiles):
            # Missing values (None/NaN) are not strings; handing them to
            # MolFromSmiles raises an argument-type error instead of
            # returning None, so filter them out before parsing.
            if not isinstance(smiles, str):
                continue
            if MolFromSmiles(smiles) is not None:
                valid_indices.append(idx)
        return valid_indices

    def __len__(self):
        """Number of rows with a parseable SMILES string."""
        return len(self.valid_indices)

    def __getitem__(self, idx):
        """Return the fingerprint of the ``idx``-th valid row as a float tensor of length ``n_bits``."""
        row = self.valid_indices[idx]
        mol = MolFromSmiles(self._smiles[row])

        fp = AllChem.GetMorganFingerprintAsBitVect(mol, self.radius, nBits=self.n_bits)
        # The bit string is a sequence of '0'/'1' characters, one per bit.
        fp_arr = torch.tensor([int(bit) for bit in fp.ToBitString()], dtype=torch.float)
        return fp_arr

# Step 3: Define the SimCLR architecture
class SimCLR(nn.Module):
    """Small MLP encoder: Linear -> ReLU -> Linear.

    Args:
        input_size: Number of features in each input vector.
        output_size: Number of features in each produced embedding.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        # Layers are built in input-to-output order and wrapped in a single
        # Sequential, so their parameters live under `encoder.0` and `encoder.2`.
        hidden_layer = nn.Linear(input_size, 1024)
        activation = nn.ReLU()
        output_layer = nn.Linear(1024, output_size)
        self.encoder = nn.Sequential(hidden_layer, activation, output_layer)

    def forward(self, x):
        """Pass ``x`` through the encoder and return its output."""
        embedding = self.encoder(x)
        return embedding

# Step 4: Define the cosine similarity loss function
def cosine_similarity_loss(x, y):
    """Return one minus the mean cosine similarity between ``x`` and ``y``.

    Similarity is computed along dimension 1 and then averaged over the
    remaining elements, so the result is a scalar tensor.
    """
    similarity = nn.functional.cosine_similarity(x, y, dim=1)
    return 1 - similarity.mean()


In [None]:

# Step 1: Load the pickle file into a DataFrame
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
input_pickle_file = '/content/drive/MyDrive/description_df.pkl'
df = load_pickle_to_df(input_pickle_file)

# Step 2: Convert SMILES data into embeddings using SimCLR architecture
input_size = 2048  # Size of the input fingerprint
output_size = 128  # Size of the output embeddings
seed = 0  # Fixes the encoder's random initial weights so reruns give identical embeddings
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

dataset = SMILESDataset(df)
data_loader = DataLoader(dataset, batch_size=64, shuffle=False)

# Seed immediately before building the model: its weights are drawn from the
# global RNG, and without a fixed seed the saved embeddings change on every run.
torch.manual_seed(seed)
model = SimCLR(input_size, output_size).to(device)
# NOTE(review): no checkpoint is loaded and nothing is trained in this cell, so
# the embeddings below come from a randomly initialised encoder - confirm this
# is intended, or load trained weights here.

# Set the model to evaluation mode
model.eval()

embeddings = []
with torch.no_grad():
    for batch in tqdm(data_loader, desc='Converting SMILES to embeddings'):
        batch = batch.to(device)
        output = model(batch)
        embeddings.extend(output.cpu().numpy())

# The dataset skips rows whose SMILES cannot be parsed, so embeddings[i]
# belongs to df.iloc[dataset.valid_indices[i]], not to df.iloc[i].
assert len(embeddings) == len(dataset), "expected one embedding per valid SMILES row"

# Step 3: Save the embeddings into a new pickle file
output_pickle_file = 'embeddings.pickle'
with open(output_pickle_file, 'wb') as file:
    pickle.dump(embeddings, file)

[17:02:37] Explicit valence for atom # 1 Cl, 7, is greater than permitted
[17:02:37] Explicit valence for atom # 1 Br, 3, is greater than permitted
[17:02:37] Explicit valence for atom # 1 Br, 5, is greater than permitted
[17:02:37] Explicit valence for atom # 1 Cl, 3, is greater than permitted
[17:02:38] Explicit valence for atom # 1 Si, 8, is greater than permitted
[17:02:38] Explicit valence for atom # 1 Si, 8, is greater than permitted
[17:02:38] Explicit valence for atom # 1 Si, 8, is greater than permitted
[17:02:38] Explicit valence for atom # 3 Si, 8, is greater than permitted
[17:02:38] Explicit valence for atom # 1 Si, 8, is greater than permitted
[17:02:38] Explicit valence for atom # 1 Cl, 5, is greater than permitted
[17:02:38] Explicit valence for atom # 1 Si, 8, is greater than permitted
[17:02:38] Explicit valence for atom # 1 Si, 8, is greater than permitted
[17:02:41] Explicit valence for atom # 1 Cl, 6, is greater than permitted
[17:02:56] Explicit valence for atom #

KeyboardInterrupt: ignored