# Attention readout for examples

In [1]:
import numpy as np
import pandas as pd
from constants import TASK_MODEL_PATH, TASK_PATH, TOKENIZER_PATH, FAIRSEQ_PREPROCESS_PATH, MODEL_PATH
from scoring import load_model
from fairseq.data import Dictionary
from preprocessing import canonize_smile, translate_selfie
import re
from tokenisation import get_tokenizer, tokenize_with_space
from fairseq_utils import compute_attention_output
# Let pandas display up to 500 columns before it truncates wide frames.
pd.set_option('display.max_columns', 500)
# NOTE(review): not referenced by any cell visible in this part of the notebook —
# presumably consumed elsewhere; confirm or remove.
CUDA_DEVICE = "cpu"

  from .autonotebook import tqdm as notebook_tqdm
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/jgut/miniconda3/envs/SMILES_OR_SELFIES/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'pytorch_lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [2]:
# Print the installed fairseq version so it is recorded next to the results.
import fairseq
print(fairseq.__version__)

0.12.2


In [3]:
# Model variant to inspect; the print shows where its last checkpoint is expected
# below MODEL_PATH.
model_identifier = "smiles_atom_isomers"
print(MODEL_PATH/model_identifier/"checkpoint_last.pt")

/data/jgut/SMILES_or_SELFIES/fairseq/smiles_atom_isomers/checkpoint_last.pt


In [4]:
def get_dictionary(target_dict_path):
    """Load a fairseq ``Dictionary`` from ``target_dict_path``.

    Args:
        target_dict_path: Location of the dictionary file; any object whose
            ``str()`` form is the path (e.g. a ``pathlib.Path``).

    Returns:
        The object returned by ``Dictionary.load``.
    """
    # Dictionary.load is handed a plain string rather than the path object.
    dictionary_file = str(target_dict_path)
    loaded = Dictionary.load(dictionary_file)
    return loaded

def gather_attention_model(input, model_identifier, task="delaney", checkpoint_dir="1e-05_0.2_based_norm"):
    """Compute the attention readout of one fine-tuned model for one input string.

    Args:
        input: Molecule string passed to ``model.encode`` and, unencoded, to
            ``compute_attention_output``. The name shadows the builtin but is
            kept so existing keyword callers continue to work.
        model_identifier: Directory name shared by the tokenizer, the fairseq
            dictionary, the task data and the checkpoint
            (e.g. ``"smiles_atom_isomers"``).
        task: Task directory below ``TASK_PATH`` and ``TASK_MODEL_PATH``.
            Defaults to the previously hard-coded ``"delaney"``.
        checkpoint_dir: Directory below ``TASK_MODEL_PATH/task/model_identifier``
            that contains ``checkpoint_best.pt``. Defaults to the previously
            hard-coded ``"1e-05_0.2_based_norm"``.

    Returns:
        Whatever ``compute_attention_output`` returns for the single-item batch.
    """
    tokenizer = get_tokenizer(TOKENIZER_PATH / model_identifier)
    source_dictionary = get_dictionary(FAIRSEQ_PREPROCESS_PATH / model_identifier / "dict.txt")

    # The task data directory is both echoed for the reader and given to load_model.
    task_data_path = TASK_PATH / task / model_identifier
    print(task_data_path)

    checkpoint_path = TASK_MODEL_PATH / task / model_identifier / checkpoint_dir / "checkpoint_best.pt"
    model = load_model(checkpoint_path, task_data_path)

    preprocessed = model.encode(input)
    print(preprocessed)

    # compute_attention_output is called with one-element batches of the encoded
    # and of the raw input.
    return compute_attention_output([preprocessed], model, [input], source_dictionary, tokenizer)

def _attention_table(attention):
    """Render ``(score, token)`` items as a two-row markdown table, one column per token."""
    if len(attention) == 0:
        # Nothing to tabulate; also avoids dividing by a zero length below.
        return "_(no tokens)_"
    uniform = 1 / len(attention)
    tokens = [str(item[1]) for item in attention]
    # First row: score minus the uniform share 1/n; second row: the score itself.
    centred = [f"{item[0] - uniform:.3f}" for item in attention]
    raw = [f"{item[0]:.3f}" for item in attention]
    frame = pd.DataFrame([centred, raw], index=["attention - 1/n", "attention"], columns=tokens)
    return frame.to_markdown()


def to_markdown(molecule, smiles_atom, smiles_sentencepiece, selfies_atom, selfies_sentencepiece):
    """Format the attention readouts of the four model variants as one markdown document.

    Every readout is a sized sequence of items where ``item[0]`` is a numeric
    score and ``item[1]`` is the token it belongs to.

    Args:
        molecule: Readout whose tokens are concatenated and shown under the
            "Molecule" heading (its scores are not displayed).
        smiles_atom: Readout shown under "SMILES atomwise".
        smiles_sentencepiece: Readout shown under "SMILES SentencePiece".
        selfies_atom: Readout shown under "SELFIES atomwise".
        selfies_sentencepiece: Readout shown under "SELFIES SentencePiece".

    Returns:
        A markdown string. Headings, the molecule text and the tables are
        separated by blank lines so that each renders as its own block. Each
        table has one column per token and two labelled rows: the score minus
        ``1/len(readout)`` and the raw score, both with three decimals. An empty
        readout is rendered as a short placeholder instead of raising.

    Raises:
        ImportError: Propagated from ``DataFrame.to_markdown`` when its optional
            ``tabulate`` dependency is missing.
    """
    sections = [
        "## Molecule",
        "".join(str(item[1]) for item in molecule),
    ]
    # Section order is kept as before: both atom-wise tables first, then SentencePiece.
    for title, attention in (
        ("SMILES atomwise", smiles_atom),
        ("SELFIES atomwise", selfies_atom),
        ("SMILES SentencePiece", smiles_sentencepiece),
        ("SELFIES SentencePiece", selfies_sentencepiece),
    ):
        sections.append(f"## {title}")
        sections.append(_attention_table(attention))
    # Markdown headings and tables must be separated by blank lines to render.
    return "\n\n".join(sections) + "\n"

def gather_attention(SMILES):
    """Build the markdown attention report of all four model variants for one molecule.

    The input is canonised with ``canonize_smile`` and translated with
    ``translate_selfie``; the SMILES form is fed to the two ``smiles_*`` models
    and the SELFIES form to the two ``selfies_*`` models.

    Args:
        SMILES: SMILES string of the molecule.

    Returns:
        The markdown string produced by ``to_markdown``.
    """
    SMILES = canonize_smile(SMILES)
    SELFIES = translate_selfie(SMILES)
    smiles_atom = gather_attention_model(SMILES, "smiles_atom_isomers")
    smiles_sentencepiece = gather_attention_model(SMILES, "smiles_sentencepiece_isomers")
    selfies_atom = gather_attention_model(SELFIES, "selfies_atom_isomers")
    selfies_sentencepiece = gather_attention_model(SELFIES, "selfies_sentencepiece_isomers")
    # to_markdown requires five arguments: the first (`molecule`) supplies the
    # tokens shown under the "Molecule" heading. The atom-wise SMILES readout is
    # used for it.
    # NOTE(review): presumably its tokens concatenate back to the SMILES string — confirm.
    return to_markdown(smiles_atom, smiles_atom, smiles_sentencepiece, selfies_atom, selfies_sentencepiece)

In [5]:
# Produce the attention report for the example SMILES string "CCOCC".
gather_attention("CCOCC")

/data/jgut/SMILES_or_SELFIES/task/delaney/smiles_atom_isomers
tensor([0, 3, 3, 3, 2])
tensor([[0, 3, 3, 3, 2]])
tensor([[2, 0, 3, 3, 3]])
RobertaHubInterface(
  (model): RobertaModel(
    (encoder): RobertaEncoder(
      (sentence_encoder): TransformerEncoder(
        (dropout_module): FairseqDropout()
        (embed_tokens): Embedding(433, 768, padding_idx=1)
        (embed_positions): LearnedPositionalEmbedding(514, 768, padding_idx=1)
        (layernorm_embedding): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (layers): ModuleList(
          (0-11): 12 x TransformerEncoderLayerBase(
            (self_attn): MultiheadAttention(
              (dropout_module): FairseqDropout()
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768

TypeError: tuple indices must be integers or slices, not str