# Self-sup Learning with SELFIES

## Prep the dataset.

In [1]:
import pickle
import random

import selfies as sf
import torch
from tqdm.auto import tqdm
from transformers import BertConfig, BertLMHeadModel

from mol_data.dataset import read_smiles

raw_smiles = read_smiles("./data/pubchem-10m-clean.txt")
random.shuffle(raw_smiles)
raw_smiles_2m = raw_smiles[:2000000]


def process_smiles(smile):
    try:
        return sf.encoder(smile)
    except:
        return None

Read SELFIES

In [None]:
raw_selfies = []
for smile in tqdm(raw_smiles_2m):
    raw_selfies.append(process_smiles(smile))

  0%|          | 0/2000000 [00:00<?, ?it/s]

In [None]:
# Filter out None values if any
raw_selfies = [selfie for selfie in raw_selfies if selfie is not None]

alphabet = sf.get_alphabet_from_selfies(raw_selfies)
alphabet.add("[nop]")  # [nop] is a special padding symbol
alphabet = list(sorted(alphabet))
pad_to_len = max(sf.len_selfies(s) for s in raw_selfies)  # Find the max length
symbol_to_idx = {s: i for i, s in enumerate(alphabet)}


# Function to convert selfies to encoding
def process_to_encoding(selfie):
    return sf.selfies_to_encoding(selfies=selfie,
                                  vocab_stoi=symbol_to_idx,
                                  pad_to_len=pad_to_len,
                                  enc_type="label")


with open("raw_selfies.pickle", 'wb') as f:
    pickle.dump(raw_selfies, f)


In [None]:
cfg = BertConfig(vocab_size=200, num_hidden_layers=6, max_position_embeddings=955)
model = BertLMHeadModel(cfg)

In [None]:
enc = sf.selfies_to_encoding(selfies=raw_selfies[0],
                             vocab_stoi=symbol_to_idx,
                             pad_to_len=pad_to_len,
                             enc_type="label")
enc_tensor = torch.Tensor(enc).unsqueeze(0).int()

In [None]:
model.forward(enc_tensor)  #%%
raw_selfies = []
for smile in tqdm(raw_smiles_2m):
    raw_selfies.append(process_smiles(smile))

In [None]:
model.forward(enc_tensor)  #%%
cfg = BertConfig(vocab_size=200, num_hidden_layers=6, max_position_embeddings=955)
model = BertLMHeadModel(cfg)

In [57]:
enc = sf.selfies_to_encoding(selfies=raw_selfies[0],
                             vocab_stoi=symbol_to_idx,
                             pad_to_len=pad_to_len,
                             enc_type="label")
enc_tensor = torch.Tensor(enc).unsqueeze(0).int()

In [69]:
model.forward(enc_tensor)

CausalLMOutputWithCrossAttentions(loss=None, logits=tensor([[[ 0.0000,  0.1593,  0.4132,  ...,  0.3299,  0.8596,  0.5625],
         [ 0.0000, -0.8123,  0.9486,  ..., -0.4053,  0.5365,  0.0542],
         [ 0.0000,  0.0087,  0.5161,  ...,  0.4030,  0.9750,  0.2213],
         ...,
         [ 0.0000, -0.0752,  1.2702,  ...,  0.2011,  0.3855,  0.3431],
         [ 0.0000, -0.0474,  1.0749,  ..., -0.4253,  0.2335,  0.8612],
         [ 0.0000,  0.6605,  1.3869,  ...,  0.0016,  0.2408,  0.9332]]],
       grad_fn=<ViewBackward0>), past_key_values=None, hidden_states=None, attentions=None, cross_attentions=None)