In [1]:
import numpy as np
from constants import TASK_MODEL_PATH, TASK_PATH, TOKENIZER_PATH
from scoring import load_model, load_dataset
from pathlib import Path
from fairseq.models.bart import BARTModel
from fairseq.data import Dictionary
from constants import MOLNET_DIRECTORY
from deepchem.feat import RawFeaturizer
from preprocessing import canonize_smile, translate_selfie
import re
from tokenisation import get_tokenizer


CUDA_DEVICE = "cpu"
task = "delaney"

  from .autonotebook import tqdm as notebook_tqdm
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. cannot import name 'DMPNN' from 'deepchem.models.torch_models' (/home/jgut/miniconda3/envs/SoS3/lib/python3.10/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'pytorch_lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [2]:
regex =r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"

def genereate_prev_output_tokens(sample):
    tokens = sample.unsqueeze(-1)
    prev_output_tokens = tokens.clone()
    prev_output_tokens[:, 0] = tokens.gather(
        0,
        (tokens.ne(source_dictionary.pad()).sum(0)-1).unsqueeze(-1)
    ).squeeze()
    prev_output_tokens[:, 1:] = tokens[:, :-1]
    return prev_output_tokens

def compute_attention_output(dataset, model, text, tokenizer=None):    
    attentions = []
    for counter, sample in enumerate(dataset):
        if tokenizer is None:
            parsed_tokens=[parsed_token for parsed_token in re.split(regex, text[counter]) if parsed_token]
        else:
            parsed_tokens=tokenizer.convert_ids_to_tokens(tokenizer(str(text[counter])).input_ids)
        prev_output_tokens = genereate_prev_output_tokens(sample)
        # same as in predict
        attention = model.model(sample.unsqueeze(0), None, prev_output_tokens)[1]["attn"][0][0][0].tolist()
        attentions.append(list(zip(attention, parsed_tokens)))
    return attentions

model_path = Path("/data/SoS_models/")
attention_encodings = []
for encoding in ["smiles_atom", "selfies_atom", "smiles_sentencepiece", "selfies_sentencepiece"]:
    specific_model_path = model_path/task/encoding/"5e-05_0.3_based_norm"/"5e-05_0.3_based_norm"/"checkpoint_last.pt"
    data_path = TASK_PATH/task/encoding
    model = BARTModel.from_pretrained(
        str(specific_model_path.parent),
        data_name_or_path=str(data_path),
        checkpoint_file=str(specific_model_path.name),
        layernorm_embedding=True,
    )
    model.to(CUDA_DEVICE)
    model.eval()
    model.zero_grad()
    test = MOLNET_DIRECTORY[task]["load_fn"](
            featurizer=RawFeaturizer(smiles=True), splitter=MOLNET_DIRECTORY[task]["split"]
        )[1][2]
    test_SMILES = test.X
    test_labels = test.y
    data_path = data_path/"input0" / "test"
    dataset = load_dataset(data_path)

    assert len(test_SMILES)==len(dataset), f"Real and filtered dataset {task} do not have same length."
    text = [canonize_smile(smile) for smile in test_SMILES]
    if encoding.startswith("selfies"):
        text = [translate_selfie(smile)[0] for smile in text]
    source_dictionary = Dictionary.load(str(data_path.parent / "dict.txt"))
    if encoding.endswith("sentencepiece"):
        tokenizer = get_tokenizer(TOKENIZER_PATH/encoding)
    else:
        tokenizer = None
    attention_encodings.append(compute_attention_output(dataset, model, text, tokenizer))
output = list(zip(*attention_encodings))
output

[([(0.03820929303765297, 'c'),
   (0.017093926668167114, '1'),
   (0.02830050326883793, 'c'),
   (0.03773948922753334, 'c'),
   (0.023588141426444054, '2'),
   (0.03547489643096924, 'c'),
   (0.04334151744842529, 'c'),
   (0.03772232308983803, 'c'),
   (0.04043123498558998, '3'),
   (0.03727398440241814, 'c'),
   (0.03921506553888321, 'c'),
   (0.03831299766898155, 'c'),
   (0.04603942856192589, 'c'),
   (0.036191102117300034, '4'),
   (0.04593060910701752, 'c'),
   (0.04261321201920509, 'c'),
   (0.041808392852544785, 'c'),
   (0.03570577874779701, '('),
   (0.0390562079846859, 'c'),
   (0.02592811919748783, '1'),
   (0.040388964116573334, ')'),
   (0.040555939078330994, 'c'),
   (0.035503316670656204, '2'),
   (0.042800839990377426, 'c'),
   (0.04167011380195618, '3'),
   (0.0367802195250988, '4')],
  [(0.029546329751610756, '[C]'),
   (0.04266010597348213, '[=C]'),
   (0.02664215862751007, '[C]'),
   (0.043345797806978226, '[=C]'),
   (0.03070697747170925, '[C]'),
   (0.039371181279

In [14]:
def aggregate_SMILE_attention(line):
    # http://www.dalkescientific.com/writings/diary/archive/2004/01/05/tokens.html
    output_dict = {}
    for (score, token) in line:
        if token in ["(", ")"] or token.isnumeric():
            output_dict["structure_att"] = output_dict.get("structure_att", 0)+score
            output_dict["structure_count"] = output_dict.get("structure_count", 0)+1
        elif token in ['=', '#', '/', '\\', ':', '~', '-']:
            output_dict["bond_att"] = output_dict.get("bond_att", 0)+score
            output_dict["bond_count"] = output_dict.get("bond_count", 0)+1
        else:
            output_dict["atom_att"] = output_dict.get("atom_att", 0)+score
            output_dict["atom_count"] = output_dict.get("atom_count", 0)+1
    # distribute bond attention
    output_dict["structure_att_added"] = output_dict.get("structure_att",0)+ output_dict.get("bond_att",0)*(output_dict.get("structure_att",0)/(output_dict.get("structure_att",0)+output_dict.get("bond_att",0)))
    output_dict["atom_att_added"] = output_dict["atom_att"]+output_dict.get("bond_att",0)*(output_dict["atom_att"]/(output_dict.get("structure_att",0)+output_dict["atom_att"] ))
    
    output_dict["structure_count_added"] = output_dict.get("structure_count",0)+output_dict.get("bond_count", 0)*(output_dict.get("structure_count",0)/(output_dict.get("structure_count",0)+output_dict.get("bond_count", 0)))
    output_dict["atom_count_added"] = output_dict.get("atom_count",0)+output_dict.get("bond_count", 0)*(output_dict.get("atom_count",0)/(output_dict.get("structure_count",0)+output_dict.get("atom_count",0)))
    return output_dict

def aggregate_SELFIE_attention(line):
    output_dict = {}
    structure_tokens = 0
    for (score, token) in line:
        if structure_tokens >0:
            structure_tokens -= 1
            output_dict["structure_att"] = output_dict.get("structure_att", 0)+score
            output_dict["structure_count"] = output_dict.get("structure_count", 0)+1
        elif "Ring" in token or "Branch" in token:
            output_dict["structure_att"] = output_dict.get("structure_att", 0)+score
            output_dict["structure_count"] = output_dict.get("structure_count", 0)+1
            structure_tokens = int(token[-2])
        else:
            output_dict["atom_att"] = output_dict.get("atom_att", 0)+score
            output_dict["atom_count"] = output_dict.get("atom_count", 0)+1
    return output_dict

SMILE_dict = {}
SELFIE_dict = {}
for line in output:
    curr_dict = aggregate_SMILE_attention(line[0])
    SMILE_dict = {key: SMILE_dict.get(key,0)+curr_dict.get(key,0) for key in SMILE_dict|curr_dict}
    curr_dict = aggregate_SELFIE_attention(line[1])
    SELFIE_dict = {key: SELFIE_dict.get(key,0)+curr_dict.get(key,0) for key in SELFIE_dict|curr_dict}

print(f"On average there are {SMILE_dict.get('structure_count',0)/len(output):.3f} structure SMILES tokens")
print(f"They accumulate {SMILE_dict.get('structure_att',0)/len(output):.3f} attention per sample.")
print(f"Which is {SMILE_dict.get('structure_att',0)/SMILE_dict.get('structure_count',1):.3f} attention per token.")
print(f"Distributing the bond tokens according to the amount of tokens, we have {SMILE_dict.get('structure_count_added',0)/len(output):.3f} structure SMILES tokens")
print(f"They accumulate {SMILE_dict.get('structure_att_added',0)/len(output):.3f} attention per sample.")
print(f"Which is {SMILE_dict.get('structure_att_added',0)/SMILE_dict.get('structure_count_added',1):.3f} attention per token.")

print(f"On average there are {SMILE_dict.get('atom_count',0)/len(output):.3f} atom SMILES tokens")
print(f"They accumulate {SMILE_dict.get('atom_att',0)/len(output):.3f} attention per sample.")
print(f"Which is {SMILE_dict.get('atom_att',0)/SMILE_dict.get('atom_count',1):.3f} attention per token.")
print(f"Distributing the bond tokens according to the amount of tokens, we have {SMILE_dict.get('atom_count_added',0)/len(output):.3f} atom SMILES tokens")
print(f"They accumulate {SMILE_dict.get('atom_att_added',0)/len(output):.3f} attention per sample.")
print(f"Which is {SMILE_dict.get('atom_att_added',0)/SMILE_dict.get('atom_count_added',1):.3f} attention per token.")

print(f"On average there are {SMILE_dict.get('bond_count',0)/len(output):.3f} bond SMILES tokens")
print(f"They accumulate {SMILE_dict.get('bond_att',0)/len(output):.3f} attention per sample.")
print(f"Which is {SMILE_dict.get('bond_att',0)/SMILE_dict.get('bond_count',1):.3f} attention per token.")

print(f"On average there are {SELFIE_dict.get('structure_count',0)/len(output):.3f} structure SELFIES tokens")
print(f"They accumulate {SELFIE_dict.get('structure_att',0)/len(output):.3f} attention per sample.")
print(f"Which is {SELFIE_dict.get('structure_att',0)/SELFIE_dict.get('structure_count',1):.3f} attention per token.")

print(f"On average there are {SELFIE_dict.get('atom_count',0)/len(output):.3f} atom SELFIES tokens")
print(f"They accumulate {SELFIE_dict.get('atom_att',0)/len(output):.3f} attention per sample.")
print(f"Which is {SELFIE_dict.get('atom_att',0)/SELFIE_dict.get('atom_count',1):.3f} attention per token.")

On average there are 11.504 structure SMILES tokens
They accumulate 0.276 attention per sample.
Which is 0.024 attention per token.
Distributing the bond tokens according to the amount of tokens, we have 12.730 structure SMILES tokens
They accumulate 0.306 attention per sample.
Which is 0.024 attention per token.
On average there are 18.752 atom SMILES tokens
They accumulate 0.658 attention per sample.
Which is 0.035 attention per token.
Distributing the bond tokens according to the amount of tokens, we have 19.660 atom SMILES tokens
They accumulate 0.685 attention per sample.
Which is 0.035 attention per token.
On average there are 1.478 bond SMILES tokens
They accumulate 0.039 attention per sample.
Which is 0.026 attention per token.
On average there are 12.080 structure SELFIES tokens
They accumulate 0.373 attention per sample.
Which is 0.031 attention per token.
On average there are 18.752 atom SELFIES tokens
They accumulate 0.598 attention per sample.
Which is 0.032 attention per 

In [11]:
import pandas as pd
pd.set_option('display.max_columns', 500)
md = ""
labels = np.array(test_labels).transpose()[0]
for i in range(20):
    md+=f'# Sample {i+1} with value {labels[i]:.3f}'+"""\ """
    md+=f'## Molecule'+"""\ """
    input_data = np.array([(letter[1], f"{letter[0]-1/len(output[i][0]):.3f}") for letter in output[i][0]])
    md+=f'{"".join(input_data[:,0])}'+"""\ """
    md+=f'## SMILES'+"""\ """
    input_data = np.array([(letter[1], f"{letter[0]-1/len(output[i][0]):.3f}") for letter in output[i][0]])
    input_data2 = np.array([(letter[1], f"{letter[0]:.3f}") for letter in output[i][0]])
    df = pd.concat([pd.DataFrame(data = input_data[:,1], index = input_data[:,0]).transpose(),pd.DataFrame(data = input_data2[:,1], index = input_data2[:,0]).transpose()])
    md += df.to_markdown()+"""\ """
    md+=f'## SELFIES'+"""\ """
    input_data = np.array([(letter[1], f"{letter[0]-1/len(output[i][1]):.3f}") for letter in output[i][1]])
    input_data2 = np.array([(letter[1], f"{letter[0]:.3f}") for letter in output[i][1]])
    df = pd.concat([pd.DataFrame(data = input_data[:,1], index = input_data[:,0]).transpose(),pd.DataFrame(data = input_data2[:,1], index = input_data2[:,0]).transpose()])
    md += df.to_markdown()+"""\ """
    md+=f'## SMILES SentencePiece'+"""\ """
    input_data = np.array([(letter[1], f"{letter[0]-1/len(output[i][2]):.3f}") for letter in output[i][2]])
    input_data2 = np.array([(letter[1], f"{letter[0]:.3f}") for letter in output[i][2]])
    df = pd.concat([pd.DataFrame(data = input_data[:,1], index = input_data[:,0]).transpose(),pd.DataFrame(data = input_data2[:,1], index = input_data2[:,0]).transpose()]) 
    md += df.to_markdown()+"""\ """
    md+=f'## SELFIES SentencePiece'+"""\ """
    input_data = np.array([(letter[1], f"{letter[0]-1/len(output[i][3]):.3f}") for letter in output[i][3]])
    input_data2 = np.array([(letter[1], f"{letter[0]:.3f}") for letter in output[i][3]])
    df = pd.concat([pd.DataFrame(data = input_data[:,1], index = input_data[:,0]).transpose(),pd.DataFrame(data = input_data2[:,1], index = input_data2[:,0]).transpose()]) 
    md += df.to_markdown()+"""\ """
md

'# Sample 1 with value -1.601\\ ## Molecule\\ c1cc2ccc3cccc4ccc(c1)c2c34\\ ## SMILES\\ |    |      c |      1 |      c |      c |      2 |      c |     c |      c |     3 |      c |     c |      c |     c |      4 |     c |     c |     c |      ( |     c |      1 |     ) |     c |      2 |     c |     3 |      4 |\n|---:|-------:|-------:|-------:|-------:|-------:|-------:|------:|-------:|------:|-------:|------:|-------:|------:|-------:|------:|------:|------:|-------:|------:|-------:|------:|------:|-------:|------:|------:|-------:|\n|  0 | -0     | -0.021 | -0.01  | -0.001 | -0.015 | -0.003 | 0.005 | -0.001 | 0.002 | -0.001 | 0.001 | -0     | 0.008 | -0.002 | 0.007 | 0.004 | 0.003 | -0.003 | 0.001 | -0.013 | 0.002 | 0.002 | -0.003 | 0.004 | 0.003 | -0.002 |\n|  0 |  0.038 |  0.017 |  0.028 |  0.038 |  0.024 |  0.035 | 0.043 |  0.038 | 0.04  |  0.037 | 0.039 |  0.038 | 0.046 |  0.036 | 0.046 | 0.043 | 0.042 |  0.036 | 0.039 |  0.026 | 0.04  | 0.041 |  0.036 | 0.043 | 0.042 |  0.