## NOTES

- [Basic Molecular Representation for Machine Learning - Medium](https://towardsdatascience.com/basic-molecular-representation-for-machine-learning-b6be52e9ff76)
- [Stack Overflow method](https://stackoverflow.com/questions/66131399/extracting-layer-output-from-classification-model-of-simpletransformer)


## DATA

In [None]:
# List of SMILES compounds.
# NOTE: a SMILES string may contain a backslash; such entries are written as raw
# strings so Python keeps the backslash literally instead of treating it as the
# start of an (invalid) escape sequence.
smiles_compounds = [
    "O=C(Cc1cccc2ccccc12)Nc1n[nH]c2ccc(N3CCCS3(=O)=O)cc12",
    "COC(=O)NC[C@@H](NC(=O)c1ccc(-c2nc(C3CCOCC3)cnc2N)cc1F)c1cccc(Br)c1",
    "COc1ccccc1Nc1cc(Oc2cc(C)c(C)nc2-c2ccccn2)ccn1",
    "O=C(/C=C/CN1CCCC1)N1CCOc2cc3ncnc(Nc4ccc(F)c(Cl)c4)c3cc21",
    "O=C(Nc1cccc(Nc2cc3c(=O)[nH][nH]c(=O)c3cc2Cl)c1)c1cccc(Cl)c1",
    "Cc1cc(CNc2nc(Nc3cc(C4CC4)[nH]n3)cc(NC3CC4CCC(C3)N4C)n2)on1",
    "Cc1cc(-c2cc(O)ccc2Cl)cc2nnc(Nc3ccc(S(N)(=O)=O)cc3)nc12",
    r"NS(=O)(=O)c1cccc(N/C=C2\C(=O)Nc3ccccc32)c1",
    "CC(=O)Nc1ccc2cnn(-c3cc(NC4CC4)n4ncc(C#N)c4n3)c2c1",
    "CS(=O)(=O)c1cccc(Nc2nccc(N(CC#N)c3c(Cl)ccc4c3OCO4)n2)c1",
    "Cc1cnc(-c2ccnc(C(C)(C)O)n2)cc1-n1c(C)cc(OCc2ccc(F)cc2F)c(Cl)c1=O",
    "Cc1ccc(C(=O)Nc2cc(C(F)(F)F)ccn2)cc1/C=C/n1cnc2cncnc21",
    "CNC(=O)c1cnn2ccc(N3C[C@@H](O)C[C@@H]3c3cccc(F)c3)nc12",
    "COc1cc2c(cc1OC1CCOC1)Cc1c-2n[nH]c1-c1ccc(C#N)cc1",
]

In [None]:
import pandas as pd

# Read only the "smiles" column of the CSV, convert it to a plain Python list
# and keep the first 1,000 entries.
smiles_compounds = (
    pd.read_csv("../thesis_work/data/kinase_smiles.csv", usecols=["smiles"])["smiles"]
    .to_numpy()
    .tolist()
)[:1_000]

## Utility Functions

In [None]:
# From: https://discuss.huggingface.co/t/generate-raw-word-embeddings-using-transformer-models-like-bert-for-downstream-process/2958/2

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
 
 
def get_word_idx(sent: str, word: str):
    """Return the position of `word` among the single-space-separated pieces of `sent`.

    The sentence is split on the literal space character, so consecutive spaces
    produce empty pieces that still count towards the position. The first match
    is returned; `ValueError` (from `list.index`) is raised if `word` is absent.
    """
    pieces = sent.split(" ")
    return pieces.index(word)


def get_hidden_states(encoded, token_ids_word, model, layers):
    """Return one vector for a word by pooling selected hidden layers.

    Runs `model` on `encoded` without tracking gradients, sums the hidden states
    of the requested `layers`, keeps only the token positions given by
    `token_ids_word`, and averages those token vectors.

    Args:
        encoded: Mapping of model inputs; unpacked as keyword arguments.
        token_ids_word: Index selecting the token positions of the word of
            interest (e.g. the tuple returned by `np.where`).
        model: Callable whose output exposes `hidden_states`. Presumably loaded
            with `output_hidden_states=True` -- confirm at the call site.
        layers: Indices into `hidden_states` to sum, e.g. `[-4, -3, -2, -1]`
            for the last four layers. There is no default here; callers must
            pass the list explicitly.

    Returns:
        The mean over dim 0 of the selected token vectors.
    """
    with torch.no_grad():
        output = model(**encoded)

    # Get all hidden states
    states = output.hidden_states
    # Stack and sum all requested layers, then drop ONLY the leading batch axis
    # (assumes a single-sentence batch -- TODO confirm). A bare `squeeze()` would
    # also remove the sequence axis for a one-token input, and the indexing
    # below would then select hidden units instead of tokens.
    summed = torch.stack([states[i] for i in layers]).sum(0).squeeze(0)
    # Only select the tokens that constitute the requested word
    word_tokens_output = summed[token_ids_word]

    return word_tokens_output.mean(dim=0)


def get_word_vector(sent, idx, tokenizer, model, layers):
    """
    Embed the word at position `idx` of `sent`.

    The sentence is tokenized, the positions of every token whose word id equals
    `idx` are collected, and those positions are handed to `get_hidden_states`
    together with the encoding, the model and the layer list.

    NOTE: `BertTokenizer` doesn't support `word_ids`. Use `BertTokenizerFast` instead.
    """
    encoding = tokenizer.encode_plus(sent, return_tensors="pt")
    # One entry per token: the index of the word that token came from.
    word_ids = np.array(encoding.word_ids())
    # Positions of all tokens that belong to the word of interest.
    positions = np.where(word_ids == idx)

    return get_hidden_states(encoding, positions, model, layers)


def main(layers=None):
    """Embed the word "cookies" in a demo sentence with `bert-base-cased`.

    Args:
        layers: Hidden-layer indices to sum; defaults to the last four
            (`[-4, -3, -2, -1]`) when `None`.

    Returns:
        The vector produced by `get_word_vector` for the word "cookies".
    """
    # Use last four layers by default
    layers = [-4, -3, -2, -1] if layers is None else layers
    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    model = AutoModel.from_pretrained("bert-base-cased", output_hidden_states=True)

    sent = "I like cookies ."
    idx = get_word_idx(sent, "cookies")

    # Previously the tokenizer, model and index were built and then discarded;
    # actually compute the embedding and hand it back to the caller.
    return get_word_vector(sent, idx, tokenizer, model, layers)

## Word BERT Models

In [None]:
import torch
from transformers import RobertaModel, AutoTokenizer

model_name = "roberta-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name)

# Single-sentence input; the list variant is kept for reference.
# sentences = ["apples taste good", "monkeys like bananas", "dogs are nice"]
sentences = "apples taste good"
model_inputs = tokenizer(sentences, return_tensors="pt")

# Inference only: run the forward pass without building an autograd graph.
with torch.no_grad():
    outputs = model(**model_inputs)
# outputs

# Displayed as the cell result.
last_hidden_states = outputs.last_hidden_state
last_hidden_states

In [None]:
# TensorFlow variant of the RoBERTa forward pass.
# NOTE: marked as not working by the author; the failure cause is not recorded here.

from transformers import RobertaTokenizer, TFRobertaModel

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = TFRobertaModel.from_pretrained('roberta-base')

# Ask the tokenizer for TensorFlow tensors ("tf") and pass the encoding to the model as-is.
inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
outputs = model(inputs)

last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple


In [None]:
# PyTorch: RoBERTa with a token-classification head.

from transformers import RobertaForTokenClassification, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = RobertaForTokenClassification.from_pretrained("roberta-base", return_dict=True)

# Encode one sentence as PyTorch tensors.
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

# Unpack the tokenizer encoding as keyword arguments; the result is shown as the cell output.
outputs = model(**inputs)
outputs
# last_layer_features = outputs.hidden_states[-1]

## Deepchem

In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs


model_type = "DeepChem/ChemBERTa-77M-MLM"

# Training configuration. It is built here but not handed to the model below
# (the `args=model_args` line is disabled).
model_args = ClassificationArgs(
    num_train_epochs=10,
    no_save=True,
    evaluate_each_epoch=True,
    evaluate_during_training_verbose=True,
    # NOTE: Necessary for training outside of Colab
    use_multiprocessing=False,
    use_multiprocessing_for_evaluation=False,
    # Disabled options:
    # overwrite_output_dir=True,
    # auto_weights=True, # NOTE: Not working
    # dataloader_num_workers=0,
    # process_count=1,
)

# Only setting actually passed to the model: expose the hidden states.
model_args_dict = dict(output_hidden_states=True)

model = ClassificationModel(
    model_type="roberta",
    model_name=model_type,
    # args=model_args,
    args=model_args_dict,
    # use_cuda=False,
)

# Show the wrapper's attributes as the cell output.
vars(model)

In [None]:
# Vector size: [65, 384]
#   65  -> token length
#   384 -> embedding size

import torch
from transformers import RobertaModel, RobertaTokenizerFast

model_name = "DeepChem/ChemBERTa-77M-MLM"

# Alternative checkpoints (disabled):
# tokenizer = RobertaTokenizerFast.from_pretrained('seyonec/PubChem10M_SMILES_BPE_450k')
# model = RobertaModel.from_pretrained('seyonec/ChemBERTa-77M-MLM', output_hidden_states = True)
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name, output_hidden_states=True)

# Switch to evaluation mode before running inference.
model.eval()

# Encode all compounds as one padded, truncated batch of PyTorch tensors.
# max_length=128
inputs = tokenizer(smiles_compounds, padding=True, truncation=True, return_tensors="pt")

# Forward pass without gradient tracking.
with torch.no_grad():
    out = model(**inputs)

print(inputs["input_ids"].shape)

In [None]:
# Print the decoded text of every token id in the first encoded compound, one per line.
for token in inputs["input_ids"][0]:
    print(tokenizer.decode([token]))

In [None]:
# Convert the 3D hidden states into one 2D row per sample with a masked mean:
# average only the real tokens of each sample, not the padding that
# `padding=True` appended to shorter sequences.

# Last hidden layer. No `squeeze()` here: with a single sample it would drop the
# batch axis and the mean over dim=1 would then average the wrong axis.
states = out.hidden_states[-1]

# print(states.shape)
# print(states[0].shape)
# print(states[0])

# 1 for real tokens, 0 for padding; the extra trailing axis lets it broadcast
# over the embedding dimension.
# Assumes the tokenizer returned `attention_mask` (the Hugging Face default) -- TODO confirm.
token_mask = inputs["attention_mask"].unsqueeze(-1).to(states.dtype)

# Sum the unmasked token vectors of each sample and divide by the number of real
# tokens (clamped to avoid division by zero), giving a single vector per sample.
states_2d = ((states * token_mask).sum(dim=1) / token_mask.sum(dim=1).clamp(min=1)).numpy()
states_2d.shape

## Cluster

In [None]:
from sklearn.cluster import KMeans

# Fit a 2-cluster k-means model on the pooled embeddings with a fixed seed
# (`fit` returns the estimator itself, so it can be chained).
kmeans = KMeans(n_clusters=2, random_state=0).fit(states_2d)

# Assign each embedding to a cluster and show the labels as the cell output.
clusters = kmeans.predict(states_2d)
clusters