## NOTES

- [Basic Molecular Representation for Machine Learning - Medium](https://towardsdatascience.com/basic-molecular-representation-for-machine-learning-b6be52e9ff76)
- [Stack Overflow method](https://stackoverflow.com/questions/66131399/extracting-layer-output-from-classification-model-of-simpletransformer)


## DATA

In [10]:
# List of SMILES compounds - Kinase
# Example kinase compounds as SMILES strings, used by the exploratory cells below.
smiles_compounds = [
    "O=C(Cc1cccc2ccccc12)Nc1n[nH]c2ccc(N3CCCS3(=O)=O)cc12",
    "COC(=O)NC[C@@H](NC(=O)c1ccc(-c2nc(C3CCOCC3)cnc2N)cc1F)c1cccc(Br)c1",
    "COc1ccccc1Nc1cc(Oc2cc(C)c(C)nc2-c2ccccn2)ccn1",
    "O=C(/C=C/CN1CCCC1)N1CCOc2cc3ncnc(Nc4ccc(F)c(Cl)c4)c3cc21",
    "O=C(Nc1cccc(Nc2cc3c(=O)[nH][nH]c(=O)c3cc2Cl)c1)c1cccc(Cl)c1",
    "Cc1cc(CNc2nc(Nc3cc(C4CC4)[nH]n3)cc(NC3CC4CCC(C3)N4C)n2)on1",
    "Cc1cc(-c2cc(O)ccc2Cl)cc2nnc(Nc3ccc(S(N)(=O)=O)cc3)nc12",
    # Raw string: this SMILES contains a literal backslash (bond-direction
    # marker). In a normal string "\C" is an invalid escape sequence, which
    # Python reports as a warning and will eventually reject.
    r"NS(=O)(=O)c1cccc(N/C=C2\C(=O)Nc3ccccc32)c1",
    "CC(=O)Nc1ccc2cnn(-c3cc(NC4CC4)n4ncc(C#N)c4n3)c2c1",
    "CS(=O)(=O)c1cccc(Nc2nccc(N(CC#N)c3c(Cl)ccc4c3OCO4)n2)c1",
    "Cc1cnc(-c2ccnc(C(C)(C)O)n2)cc1-n1c(C)cc(OCc2ccc(F)cc2F)c(Cl)c1=O",
    "Cc1ccc(C(=O)Nc2cc(C(F)(F)F)ccn2)cc1/C=C/n1cnc2cncnc21",
    "CNC(=O)c1cnn2ccc(N3C[C@@H](O)C[C@@H]3c3cccc(F)c3)nc12",
    "COc1cc2c(cc1OC1CCOC1)Cc1c-2n[nH]c1-c1ccc(C#N)cc1",
]

In [1]:
import pandas as pd

# Protein family whose SMILES file is loaded; switch by (un)commenting one line.
protein_type = "gpcr"
# protein_type = "kinase"
# protein_type = "protease"


# Input CSV with the raw SMILES strings, and the output CSV that a later cell
# writes once the embedding column has been added.
df_path = f"../thesis_work/data/{protein_type}_smiles.csv"
df_path_vector = f"../thesis_work/data/{protein_type}_smiles_vector.csv"

# Only the "smiles" column is read from the input file.
smiles_df = pd.read_csv(df_path, usecols=["smiles"])
# smiles_compounds = smiles_df["smiles"].to_numpy().tolist()
# smiles_compounds = smiles_df[:1_000]

In [2]:
smiles_df

Unnamed: 0,smiles
0,O=C(C1CC12CCN(C1CCOCC1)CC2)N1CCN(C2CCCCCC2)CC1
1,CC(C)c1nc(C(F)(F)F)cnc1N[C@H]1CCC[C@@H]1NC(=O)...
2,CC(C)N1CCN(C(=O)OC2CCN(c3ccc(C(F)(F)F)cn3)CC2)CC1
3,Cc1ccc2c(=O)c(C(=O)NC3CCCCC3)cn(Cc3ccc(F)cc3)c2n1
4,CCn1nc(C)cc1CCOC(=O)N1CCc2ncsc2C1c1cc(Cl)ccc1O...
...,...
68004,COCC(c1ccc(Cl)cc1)C1CCN(S(=O)(=O)c2c(C)n[nH]c2...
68005,O=C(Nc1ccccc1)NS(=O)(=O)c1ccc(OCCCN2CCCCC2)cc1
68006,O=C(N[C@H](Cc1ccc(Cl)cc1)C(=O)N1CCC(Cn2cncn2)(...
68007,c1cc(CN2CCOCC2)cc(OCC2CN(C3CC3)CCO2)c1


## Utility Functions

In [None]:
# From: https://discuss.huggingface.co/t/generate-raw-word-embeddings-using-transformer-models-like-bert-for-downstream-process/2958/2

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
 
 
def get_word_idx(sent: str, word: str):
    """Return the position of `word` among the space-separated pieces of `sent`.

    The sentence is split on a literal single space, so consecutive spaces
    produce empty pieces that still count towards the position. `list.index`
    raises `ValueError` when `word` is not one of the pieces.
    """
    pieces = sent.split(" ")
    position = pieces.index(word)
    return position


def get_hidden_states(encoded, token_ids_word, model, layers):
    """Run `encoded` through `model` and pool the hidden states of one word.

    The hidden states of the requested `layers` are summed, the rows selected
    by `token_ids_word` are picked out, and those rows are averaged.

    Args:
        encoded: Mapping of model inputs; unpacked as keyword arguments into
            `model`. Assumed to hold a single sequence (batch size 1).
        token_ids_word: Index (e.g. the tuple returned by `np.where`) selecting
            the token positions that make up the word of interest.
        model: Callable whose output exposes `.hidden_states`, indexable per
            layer. For a transformers model this presumably requires
            `output_hidden_states=True` at load time -- confirm at the caller.
        layers: Iterable of layer indices into `hidden_states` to sum. No
            default is applied here; the caller chooses (e.g. the last four).

    Returns:
        The mean over the selected token rows of the summed layer outputs.
    """
    with torch.no_grad():
        model_output = model(**encoded)

    # Get all hidden states
    states = model_output.hidden_states
    # Stack and sum all requested layers
    summed = torch.stack([states[i] for i in layers]).sum(0)
    # Drop only the leading batch axis. A bare `squeeze()` would also collapse
    # a length-1 token axis (or a size-1 hidden axis), after which the
    # indexing below would select and average the wrong dimension.
    summed = summed.squeeze(0)
    # Only select the tokens that constitute the requested word
    word_tokens_output = summed[token_ids_word]

    return word_tokens_output.mean(dim=0)


def get_word_vector(sent, idx, tokenizer, model, layers):
    """
    Embed one word of `sent`.

    The sentence is tokenized, the positions of every token whose word id equals
    `idx` are collected, and `get_hidden_states` pools the hidden states found at
    those positions.

    NOTE: `BertTokenizer` doesn't support `word_ids`. Use `BertTokenizerFast` instead.
    """
    encoding = tokenizer.encode_plus(sent, return_tensors="pt")
    # Per-token word ids as an array so they can be compared element-wise with `idx`
    word_id_per_token = np.array(encoding.word_ids())
    matching_positions = np.where(word_id_per_token == idx)

    return get_hidden_states(encoding, matching_positions, model, layers)


def main(layers=None):
    """Embed the word "cookies" of a fixed example sentence with bert-base-cased.

    Args:
        layers: Hidden-state layer indices to sum. `None` selects the last four
            layers, `[-4, -3, -2, -1]`.

    Returns:
        The vector produced by `get_word_vector` for the word "cookies".
    """
    # Use last four layers by default
    layers = [-4, -3, -2, -1] if layers is None else layers
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    # Hidden states of every layer are requested so they can be summed later.
    model = AutoModel.from_pretrained("bert-base-cased", output_hidden_states=True)

    sent = "I like cookies ."
    idx = get_word_idx(sent, "cookies")

    # The body previously stopped after computing `idx`, leaving tokenizer,
    # model and idx unused; the embedding step and its result are restored.
    word_embedding = get_word_vector(sent, idx, tokenizer, model, layers)

    return word_embedding

In [3]:
# From Selformer: https://github.com/HUBioDataLab/SELFormer/blob/main/get_embeddings.py
# TODO: pandarallel can be used: https://github.com/nalepae/pandarallel

from transformers import RobertaTokenizer, RobertaTokenizerFast, RobertaModel
from typing import Union


def get_vector_embedding(model: RobertaModel, tokenizer: Union[RobertaTokenizer, RobertaTokenizerFast], smile_str: str):
    """Return one fixed-size embedding for a single SMILES string.

    The string is encoded (special tokens added, truncated to at most 512
    tokens), passed through `model` as a batch of one, and the first output
    (for `RobertaModel`, the last hidden state) is averaged over the token
    axis.

    Args:
        model: Encoder called with the token-id tensor of shape (1, n_tokens).
        tokenizer: Tokenizer whose `encode` returns the list of token ids.
        smile_str: The SMILES string to embed.

    Returns:
        A 1-D NumPy array with the mean token vector; call `.tolist()` on it
        for a plain Python list.
    """
    token_ids = tokenizer.encode(smile_str, add_special_tokens=True, max_length=512, padding=True, truncation=True)
    token = torch.tensor([token_ids])

    # Inference only: the result is detached right away, so recording the
    # autograd graph during the forward pass would just waste time and memory.
    with torch.no_grad():
        output = model(token)

    sequence_out = output[0]
    return torch.mean(sequence_out[0], dim=0).detach().numpy()

  from .autonotebook import tqdm as notebook_tqdm


## Word BERT Models

In [None]:
from transformers import RobertaModel, AutoTokenizer

# Load the general-purpose "roberta-base" checkpoint and its tokenizer.
model_name = "roberta-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name)

# A single sentence is encoded here; the commented list is the batch variant.
# sentences = ["apples taste good", "monkeys like bananas", "dogs are nice"]
sentences = "apples taste good"
model_inputs = tokenizer(sentences, return_tensors="pt")
# Forward pass with the tokenizer's outputs unpacked as keyword arguments.
outputs = model(**model_inputs)
# outputs

# Display the last hidden state as the cell's output.
last_hidden_states = outputs.last_hidden_state
last_hidden_states

In [None]:
# Tensorflow - Not working

from transformers import RobertaTokenizer, TFRobertaModel

# TensorFlow variant of the roberta-base example (marked "Not working" above).
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaModel.from_pretrained('roberta-base')

# Encode one sentence as TensorFlow tensors and run the model on it.
inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
outputs = model(inputs)

last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple


In [None]:
# Pytorch

from transformers import RobertaTokenizer, RobertaForTokenClassification

# PyTorch variant using the token-classification head on top of roberta-base.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForTokenClassification.from_pretrained('roberta-base', return_dict=True)


# Encode one sentence as PyTorch tensors.
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

# Forward pass; the output object is displayed as the cell's result.
outputs = model(**inputs)
outputs
# NOTE(review): hidden states are not requested when the model is loaded above,
# so the line below presumably needs output_hidden_states=True -- confirm first.
## last_layer_features = outputs.hidden_states[-1]

## Deepchem

In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs


# Checkpoint name; passed below as `model_name`, while `model_type` is "roberta".
model_type = "DeepChem/ChemBERTa-77M-MLM"

# Training/evaluation settings. NOTE: this object is built but not used below,
# because the `args=model_args` line is commented out.
model_args = ClassificationArgs(
    evaluate_each_epoch=True,
    evaluate_during_training_verbose=True,
    no_save=True,
    num_train_epochs=10,
    # overwrite_output_dir=True,
    # auto_weights=True, # NOTE: Not working
    # NOTE: Necessary for training outside of Colab
    use_multiprocessing=False,
    # dataloader_num_workers=0,
    # process_count=1,
    use_multiprocessing_for_evaluation=False,
)

# The arguments actually passed to the model: only hidden-state output is set.
model_args_dict = {
    "output_hidden_states": True
}

model = ClassificationModel(
    model_type="roberta",
    model_name=model_type,
    # args=model_args,
    args=model_args_dict,
    # use_cuda=False,
)

# Inspect the wrapper's attributes as the cell's output.
model.__dict__

In [4]:
from transformers import RobertaTokenizerFast, RobertaModel
import torch

# ChemBERTa checkpoint used for both the tokenizer and the encoder.
model_name = "DeepChem/ChemBERTa-77M-MLM"

# tokenizer = RobertaTokenizerFast.from_pretrained('seyonec/PubChem10M_SMILES_BPE_450k')
# model = RobertaModel.from_pretrained('seyonec/ChemBERTa-77M-MLM', output_hidden_states = True)
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
# Hidden states of every layer are requested; a later cell reads `out.hidden_states`.
model = RobertaModel.from_pretrained(model_name, output_hidden_states = True)
# Switch to evaluation mode (the printed model contains dropout layers).
model.eval()

Some weights of the model checkpoint at DeepChem/ChemBERTa-77M-MLM were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MLM and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be 

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(600, 384, padding_idx=1)
    (position_embeddings): Embedding(515, 384, padding_idx=1)
    (token_type_embeddings): Embedding(1, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.144, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.109, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dr

In [None]:
# word_embedding = get_vector_embedding(model, tokenizer, smiles_compounds[0])
# states = [get_vector_embedding(model, tokenizer, x).tolist() for x in smiles_compounds]
# from pandarallel import pandarallel
# pandarallel.initialize(nb_workers=5)

# Recorded run times of the line below, per protein family:
# Gpcr: Took 3m 41s
# Kinase: Took 3m 24s
# Protease: Took 1m 56s
# Embed every SMILES string one row at a time and store each vector as a plain
# Python list in a new "vectors" column.
smiles_df["vectors"] = smiles_df["smiles"].apply(lambda x: get_vector_embedding(model, tokenizer, x).tolist())
# smiles_df["vectors"] = smiles_df["smiles"].parallel_apply(lambda x: get_vector_embedding(model, tokenizer, x).tolist())

# smiles_df["vectors"]

In [6]:
# Write the DataFrame to the "_smiles_vector.csv" path without the index column.
# NOTE(review): if "vectors" holds Python lists, they are presumably written as
# their string form and must be parsed again on load -- confirm.
smiles_df.to_csv(df_path_vector, index=False)

In [None]:
# Vector size: [65, 384]
#  65: Token length
#  384: Embedding size

# Tokenize all of `smiles_compounds` as a single padded, truncated batch.
# max_length=128
inputs = tokenizer(smiles_compounds, return_tensors='pt', padding=True, truncation=True)

# Forward pass without gradient tracking; `out` is consumed by the pooling cell.
with torch.no_grad():
    out = model(**inputs)

    # TODO: Should we use this?
    # out = out[0][:, 0, :].numpy()  # Take the [CLS] token's embedding for each sentence

# Shape of the token-id tensor for the batch.
print(inputs['input_ids'].shape)

In [None]:
# Print the decoded text of each token id of the first sequence in the batch,
# one token per line, to inspect how the tokenizer split it.
for token in inputs['input_ids'][0]:
  print(tokenizer.decode([token]))

In [7]:
# TODO: What is the best way to convert 3d states to 2d states?
# FIXME: Doesn't work with more than 1,000 samples

# Last-layer hidden states. The batch axis is kept as-is: a bare `.squeeze()`
# would drop it for a single-sample batch and make the pooling below average
# over the wrong axis.
states = out.hidden_states[-1]

# print(states.shape)
# print(states[0].shape)
# print(states[0])

# Average the token vectors of each sample into one vector per sample.
# The batch was padded, so a plain mean over the token axis would also average
# the padding positions and dilute short sequences. Weighting by the attention
# mask averages real tokens only, matching what `get_vector_embedding` computes
# for a single, unpadded sequence.
token_mask = inputs["attention_mask"].unsqueeze(-1).to(states.dtype)
token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
states_2d = ((states * token_mask).sum(dim=1) / token_counts).numpy()
states_2d.shape

(5, 384)

## Cluster

In [None]:
from sklearn.cluster import KMeans

# Fit a 2-cluster k-means model (fixed seed) on the pooled embeddings.
kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(states_2d)

# Cluster label of every embedded sample; displayed as the cell's output.
clusters = kmeans.predict(states_2d)
clusters

### AIO

In [2]:
# Deepchem
from transformers import RobertaTokenizerFast, RobertaModel
import torch

# "All in one" setup, ChemBERTa variant: defines `tokenizer`, `model` and `texts`.
# NOTE: unlike the earlier ChemBERTa loading cell, the model is loaded here
# without output_hidden_states and `model.eval()` is not called.
model_name = "DeepChem/ChemBERTa-77M-MLM"
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name)
# The kinase example SMILES list is used as the texts to cluster.
texts = smiles_compounds

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at DeepChem/ChemBERTa-77M-MLM were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MLM and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should prob

In [None]:
# Bert - Uncased
import torch
from transformers import BertTokenizer, BertModel

# "All in one" setup, English BERT variant: defines `tokenizer`, `model` and
# `texts`, replacing whatever a previously run setup cell assigned to them.
## model_type = "roberta-base"
model_type = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_type)
model = BertModel.from_pretrained(model_type)
# Small set of example sentences to embed and cluster.
texts = [
    "This is an example sentence.",
    "BERT is a powerful language model.",
    "Clustering with BERT is interesting.",
    "Unsupervised learning can be fun.",
    "Transformers library is great.",
    "BERT embeddings are useful for clustering.",
    "Python programming is versatile.",
    "K-means is a popular clustering algorithm."
]

In [None]:
import numpy as np
from sklearn.cluster import KMeans

# Tokenize and convert texts to BERT embeddings.
# The tokenizer is called on the whole batch so that it also returns the
# attention mask. Every text is padded to 128 tokens; without the mask the
# model attends to the padding positions as if they were real tokens, which
# distorts the [CLS] embedding of each text.
encoded = tokenizer(texts, add_special_tokens=True, max_length=128, padding='max_length', truncation=True, return_tensors="pt")
input_ids = encoded["input_ids"]
with torch.no_grad():
    model_output = model(**encoded)
    embeddings = model_output[0][:, 0, :].numpy()  # Take the [CLS] token's embedding for each sentence


# Set the number of clusters you want to create
num_clusters = 2

# Apply k-means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(embeddings)

# Print the clustering results: the texts of each cluster under a 1-based header
for cluster_id in range(num_clusters):
    print(f"Cluster {cluster_id + 1}:")
    cluster_text_indices = np.where(cluster_labels == cluster_id)[0]
    for index in cluster_text_indices:
        print(texts[index])
    print("\n")

