# Baselines

Using the Nucleotide Transformer pre-trained model and fine-tuning on the GUE dataset for promoter prediction.

In [2]:
!nvidia-smi --query-gpu=memory.used --format=csv

/bin/bash: /home/chris/miniconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
memory.used [MiB]
167 MiB


In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F

In [2]:
df_train = pd.read_csv("./promoter_detection/train.csv", header=0)
df_val = pd.read_csv("./promoter_detection/dev.csv", header=0)
df_test = pd.read_csv("./promoter_detection/test.csv", header=0)

In [3]:
df_train_slice = df_train[:100]
df_test_slice = df_test[:100]

In [4]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification
import torch

# Import the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")
model = AutoModelForSequenceClassification.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref", num_labels=2)


  from .autonotebook import tqdm as notebook_tqdm
2023-07-11 15:24:04.523606: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-11 15:24:04.551644: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the model checkpoint at InstaDeepAI/nucleotide-transformer-500m-human-ref were not used when initializing EsmForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you a

Defining the metrics:

In [4]:
from transformers import EvalPrediction
import evaluate

def make_metrics_func(*dataset_load_args):
    def compute_metrics(eval_pred: EvalPrediction):
        accuracy = evaluate.load("accuracy")
        logits, labels = eval_pred
        pred_class = np.argmax(logits, axis=-1)  # take the max-scoring logit as the predicted class ID
        return accuracy.compute(predictions=pred_class,
                                references=labels)
    return compute_metrics

compute_metrics = make_metrics_func()

  from .autonotebook import tqdm as notebook_tqdm
2023-07-11 21:53:44.539955: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-11 21:53:44.567615: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Creating the dataset objects for the HuggingFace Transformers API:

In [5]:
from datasets import load_dataset

data_files = dict(
    train="./promoter_detection/train.csv",
    test="./promoter_detection/test.csv",
    val="./promoter_detection/dev.csv"
)

promoter_dataset = load_dataset("csv", data_files=data_files)

promoter_dataset

Found cached dataset csv (/home/chris/.cache/huggingface/datasets/csv/default-68212c3a0ebc43dc/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)
100%|██████████| 3/3 [00:00<00:00, 1497.07it/s]


DatasetDict({
    train: Dataset({
        features: ['sequence', 'label'],
        num_rows: 47356
    })
    test: Dataset({
        features: ['sequence', 'label'],
        num_rows: 5920
    })
    val: Dataset({
        features: ['sequence', 'label'],
        num_rows: 5920
    })
})

In [6]:
train_dataset = promoter_dataset["train"]
val_dataset = promoter_dataset["val"]
test_dataset = promoter_dataset["test"]

How long is one sequence:

In [7]:
len(train_dataset[0]['sequence'])

300

Fine-tuning the Nucleotide Transformer model:

In [8]:
from transformers import TrainingArguments, Trainer
from datasets import Dataset
from pathlib import Path

model_save_dir = Path("results/model_nucleotide_transformer")
training_args = TrainingArguments(model_save_dir,
                                    evaluation_strategy="epoch")


def tokenize(dataset: Dataset):
    return tokenizer(dataset["sequence"], padding=True)


def training_pipeline(tokenizer,
                      model,
                      training_args,
                      train_dataset,
                      val_dataset):

    tokenized_train_dataset = train_dataset.map(tokenize, batched=True)
    tokenized_val_dataset = val_dataset.map(tokenize, batched=True)

    trainer = Trainer(
        model,
        training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    return trainer

In [None]:
trainer = training_pipeline(tokenizer, model, training_args, train_dataset, val_dataset)
trainer.train()

Save the fine-tuned model:

In [21]:
trainer.save_model("nucleotide_transformer_prom300")

**At this point you might need to restart the kernel to clear CUDA memory and load the fine-tuned model back again**

In [10]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification
import torch

# Import the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")
model = AutoModelForSequenceClassification.from_pretrained("./nucleotide_transformer_prom300", num_labels=2)


Send the model to the GPU:

In [11]:
model = model.to("cuda")

Check how much memory you have left on the GPU after loading the model:

In [12]:
!nvidia-smi --query-gpu=memory.used --format=csv

/bin/bash: /home/chris/miniconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
memory.used [MiB]
2443 MiB


The Nucleotide Transformer model is about 2.3 GB.

Evaluate the test dataset and collect metrics:

In [9]:
import evaluate
from typing import Callable, Tuple, Union, Iterable, List
import numpy as np
from sklearn.metrics import accuracy_score

# Splitting the test set into batches to avoid OOM errors with my lovely RTX 4080:
# 5920 / 16 = 370

Metric = Callable[[torch.Tensor | np.ndarray, torch.Tensor | np.ndarray], torch.Tensor | np.ndarray]

def _eval(model: nn.Module, test_batch: torch.Tensor, attention_mask: torch.Tensor, output_hidden_states: bool):
    with torch.no_grad():
        torch_outs = model(
            test_batch,
            attention_mask=attention_mask,
            output_hidden_states=output_hidden_states
        )
    return torch_outs


def test_and_calculate_metrics(tokens_ids: torch.Tensor,
                               labels: torch.Tensor,
                               model: nn.Module,
                               metrics: List[Metric] | Metric,
                               split_into: int = 4,
                               output_hidden_states: bool = False):
    metric_vals = dict()

    if not type(metrics) is list:
        # wrap singular metric into a list to use the same interface downstream
        metrics = [metric]

    for metric in metrics:
        # initialize lists where we will store collected metrics
        metric_vals[metric.__name__] = []

    slice_size = tokens_ids.shape[0] // split_into
    for test_batch, batch_labels in zip(tokens_ids.split(slice_size),
                                        torch.tensor(labels).split(slice_size)):

        # Compute the embeddings:
        attention_mask = test_batch != tokenizer.pad_token_id

        # Send tokens and attention mask to the GPU:
        test_batch = test_batch.to("cuda")
        attention_mask = attention_mask.to("cuda")

        # Model outputs:
        torch_outs = _eval(model,
                           test_batch,
                           attention_mask,
                           output_hidden_states)
        
        y_hat_prob = nn.Sigmoid()(torch_outs.logits)
        y_hat = torch.argmax(y_hat_prob, axis=-1)
        for metric in metrics:
            metric_value = metric(batch_labels.to("cpu").detach().numpy(), y_hat.to("cpu").detach().numpy())
            metric_vals[metric.__name__].append(metric_value)

    return metric_vals


Define a pipeline for testing. This is what will collect and present our metrics:

In [10]:
from typing import Dict, List
from sklearn.metrics import precision_score, recall_score
from pprint import pprint


def testing_pipeline(tokenizer, model, test_dataset):

    split_into = 4

    tokens_ids = tokenizer(test_dataset["sequence"], return_tensors="pt")["input_ids"]

    metrics = [accuracy_score, precision_score, recall_score]

    metric_vals = test_and_calculate_metrics(tokens_ids,
                                             test_dataset["label"],
                                             model,
                                             metrics,
                                             split_into)

    avgs_of_metric_vals = dict_average(metric_vals)

    pprint(f"Metrics averaged over batches: {avgs_of_metric_vals}", indent=4)


def dict_average(dict_of_metrics: Dict[str, np.array | List[int] | List[float]]) -> Dict[str, np.ndarray]:
    avg_dict = dict()
    for k, v in dict_of_metrics.items():
        avg_acc = np.mean(v)
        avg_dict[k] = avg_acc
    return avg_dict

In [15]:
testing_pipeline(tokenizer, model, test_dataset)

("Metrics averaged over batches: {'accuracy_score': 0.9030405405405405, "
 "'precision_score': 0.9084807312891368, 'recall_score': 0.8987181337691119}")


## DNABERT

We now attempt to fine-tune DNABERT on the same task with the same dataset. **You might need to reload your kernel before you start** because your GPU memory could be overloaded.

In [28]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset
from pathlib import Path

tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True)

model_save_dir = Path("results/model_dnabert")
training_args = TrainingArguments(model_save_dir,
                                    evaluation_strategy="epoch")

Some weights of the model checkpoint at zhihan1996/DNA_bert_6 were not used when initializing DNABertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing DNABertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DNABertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DNABertForSequenceClassification were not initialized from the model checkpoint at zhihan1996/

TrainOutput(global_step=17760, training_loss=0.7012527543145257, metrics={'train_runtime': 546.339, 'train_samples_per_second': 260.036, 'train_steps_per_second': 32.507, 'total_flos': 219021453591120.0, 'train_loss': 0.7012527543145257, 'epoch': 3.0})

In [None]:

trainer = training_pipeline(tokenizer, model, training_args, train_dataset, val_dataset)
trainer.train()

In [29]:
trainer.save_model("dnabert_prom300")

**You might want to restart the kernel and load the model again to save GPU memory before proceeding**. Loading the model back from the saved state:

In [16]:
tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True)
model = AutoModelForSequenceClassification.from_pretrained("dnabert_prom300", trust_remote_code=True)

Send model to the GPU:

In [18]:
model = model.to("cuda")

In [22]:
!nvidia-smi --query-gpu=memory.used --format=csv

/bin/bash: /home/chris/miniconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
memory.used [MiB]
12063 MiB


In [23]:
testing_pipeline(tokenizer, model, test_dataset)

("Metrics averaged over batches: {'accuracy_score': 0.5033783783783784, "
 "'precision_score': 0.5033783783783784, 'recall_score': 1.0}")


We already know that Nucleotide Transformer has 500M params (since we're using the smaller one) but how many params does DNABERT actually have?

In [24]:
sum(p.numel() for p in model.parameters() if p.requires_grad)

89192450

## Monte Carlo Trainings

We also initialize the classifier heads with a number of different random weights using a few different random seeds and we evaluate their test stage metrics:

In [15]:
import transformers
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments
from pathlib import Path

In [None]:
tokenizer = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref")
model_loader = lambda: AutoModelForSequenceClassification.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-human-ref", num_labels=2)
model_save_dir = Path("results/model_nucleotide_transformer")
training_args = TrainingArguments(model_save_dir,
                                  evaluation_strategy="epoch")

In [11]:
np.random.seed(420)
how_many_monte_carlo_runs = 3
random_seeds = np.random.random_integers(1, 100, how_many_monte_carlo_runs)
print(random_seeds)

[50 73  7]


  random_seeds = np.random.random_integers(1, 100, how_many_monte_carlo_runs)


In [12]:
def monte_carlo_train_and_test_pipeline(tokenizer, model_loader, training_args, train_dataset, val_dataset, seed: int):
    transformers.set_seed(seed)
    # Start from the pretrained model at each iteration:
    model = model_loader()
    # If you enjoy faster training times...
    model.to("cuda")
    # Train:
    trainer = training_pipeline(tokenizer, model, training_args, train_dataset, val_dataset)
    trainer.train()
    # Test:
    testing_pipeline(tokenizer, model, test_dataset)

Due to memory limitations we need to fine-tune the model on each Monte Carlo run manually and we need to **restart the kernel** after each seed is used to guarantee that the memory is freed:

In [None]:
monte_carlo_train_and_test_pipeline(tokenizer, model_loader, training_args, train_dataset, val_dataset, random_seeds[0])

In [15]:
monte_carlo_train_and_test_pipeline(tokenizer, model_loader, training_args, train_dataset, val_dataset, random_seeds[1])

Some weights of the model checkpoint at InstaDeepAI/nucleotide-transformer-500m-human-ref were not used when initializing EsmForSequenceClassification: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing EsmForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EsmForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at InstaDeepAI/nucleotide-transformer-500m-human-ref and are newly initialized: ['classifier.out_proj.bias', 'classifier

Epoch,Training Loss,Validation Loss


("Metrics averaged over batches: {'accuracy_score': 0.9109797297297297, "
 "'precision_score': 0.9133445170612462, 'recall_score': 0.9101584816064393}")


In [14]:
monte_carlo_train_and_test_pipeline(tokenizer, model_loader, training_args, train_dataset, val_dataset, random_seeds[2])

Some weights of the model checkpoint at InstaDeepAI/nucleotide-transformer-500m-human-ref were not used when initializing EsmForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing EsmForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing EsmForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at InstaDeepAI/nucleotide-transformer-500m-human-ref and are newly initialized: ['classifier.out_proj.weight', 'classifi

Epoch,Training Loss,Validation Loss


("Metrics averaged over batches: {'accuracy_score': 0.9119932432432432, "
 "'precision_score': 0.9211313731447485, 'recall_score': 0.9031749725928688}")


In [13]:
!nvidia-smi --query-gpu=memory.used --format=csv

/bin/bash: /home/chris/miniconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)
memory.used [MiB]
169 MiB


Then we do the same for DNABERT:

In [16]:
tokenizer = AutoTokenizer.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True)
model_loader = lambda: AutoModelForSequenceClassification.from_pretrained("zhihan1996/DNA_bert_6", trust_remote_code=True)

model_save_dir = Path("results/model_dnabert")
training_args = TrainingArguments(model_save_dir,
                                  evaluation_strategy="epoch")

In [17]:
monte_carlo_train_and_test_pipeline(tokenizer, model_loader, training_args, train_dataset, val_dataset, random_seeds[0])

Some weights of the model checkpoint at zhihan1996/DNA_bert_6 were not used when initializing DNABertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing DNABertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DNABertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DNABertForSequenceClassification were not initialized from the model checkpoint at zhihan1996/

("Metrics averaged over batches: {'accuracy_score': 0.5033783783783784, "
 "'precision_score': 0.5033783783783784, 'recall_score': 1.0}")


In [18]:

monte_carlo_train_and_test_pipeline(tokenizer, model_loader, training_args, train_dataset, val_dataset, random_seeds[1])

Some weights of the model checkpoint at zhihan1996/DNA_bert_6 were not used when initializing DNABertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing DNABertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DNABertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DNABertForSequenceClassification were not initialized from the model checkpoint at zhihan1996/

("Metrics averaged over batches: {'accuracy_score': 0.5033783783783784, "
 "'precision_score': 0.5033783783783784, 'recall_score': 1.0}")


In [19]:
monte_carlo_train_and_test_pipeline(tokenizer, model_loader, training_args, train_dataset, val_dataset, random_seeds[2])

Some weights of the model checkpoint at zhihan1996/DNA_bert_6 were not used when initializing DNABertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing DNABertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DNABertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DNABertForSequenceClassification were not initialized from the model checkpoint at zhihan1996/

("Metrics averaged over batches: {'accuracy_score': 0.5033783783783784, "
 "'precision_score': 0.5033783783783784, 'recall_score': 1.0}")


## Entropy of 4-letter DNA Representation vs. DNA+P Representation

In [7]:
import pandas as pd
from typing import Dict
from scipy.stats import entropy

# The probability table has been extracted from:
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7127678/
probability_table_dnap = {
    "A": 0.0777,
    "C": 0.0157,
    "D": 0.0530,
    "E": 0.0656,
    "F": 0.0405,
    "G": 0.0691,
    "H": 0.0227,
    "I": 0.0591,
    "K": 0.0595,
    "L": 0.0960,
    "M": 0.0238,
    "N": 0.0427,
    "P": 0.0469,
    "Q": 0.0393,
    "R": 0.0526,
    "S": 0.0694,
    "T": 0.0550,
    "V": 0.0667,
    "W": 0.0118,
    "Y": 0.0311,
}

probability_of_gene = {
    # approximately 1.5% of the human genome consists of protein-encoding genes
    # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9186530/
    "M": 0.015,
    "*": 0.015
}

probability_table_given_gene = {k: v * probability_of_gene["M"] for k, v in probability_table_dnap.items()}

protein_alphabet_map = {
    "A": "Ala",
    "C": "Cys",
    "D": "Asp",
    "E": "Glu",
    "F": "Phe",
    "G": "Gly",
    "H": "His",
    "I": "Ile",
    "K": "Lys",
    "L": "Leu",
    "M": "Met",
    "N": "Asp",
    "P": "Pro",
    "Q": "Gln",
    "R": "Arg",
    "S": "Ser",
    "T": "Thr",
    "V": "Val",
    "W": "Trp",
    "Y": "Tyr",
    "*": "*"
}

probability_table_given_gene_3_letter = {protein_alphabet_map[k]: v for k, v in probability_table_given_gene.items()}
expected_probability_of_gene = sum(probability_table_given_gene_3_letter.values())
print(expected_probability_of_gene)  # should come out to about 15%

0.014177999999999998


In [12]:
# What is the probability that a base does not belong to a particular gene:
expected_probability_of_non_gene = 1 - expected_probability_of_gene

# Assume general base distribution in human genome for those sequences that are non-genes:
# https://pubs.acs.org/doi/pdf/10.1021/ja01111a016
# We take this from the revolutionary Chargaff's paper and we take values for the human genome
# rounded up so that pairing bases have the exact same probability (A, T and G, C)

probabilities_of_bases_in_general = {
    "A": 0.30,  # Chargaff: 0.304
    "T": 0.30,  # Chargaff: 0.301
    "G": 0.19,  # Chargaff: 0.196
    "C": 0.19   # Chargaff: 0.199
}

probabilities_of_bases_elsewhere = {k: v * expected_probability_of_non_gene
                                    for k, v in probabilities_of_bases_in_general.items()
}

print(probabilities_of_bases_elsewhere)

{'A': 0.29574659999999997, 'T': 0.29574659999999997, 'G': 0.18730618, 'C': 0.18730618}


In [13]:
# Now we can put all probability values into one table:
from copy import deepcopy

probability_table_dnap = deepcopy(probability_table_given_gene_3_letter)
probability_table_dnap.update(probabilities_of_bases_elsewhere)

In [14]:
print(probability_table_dnap)

{'Ala': 0.0011655, 'Cys': 0.00023549999999999998, 'Asp': 0.0006405, 'Glu': 0.000984, 'Phe': 0.0006075, 'Gly': 0.0010364999999999999, 'His': 0.0003405, 'Ile': 0.0008864999999999999, 'Lys': 0.0008925, 'Leu': 0.0014399999999999999, 'Met': 0.000357, 'Pro': 0.0007034999999999999, 'Gln': 0.0005895, 'Arg': 0.000789, 'Ser': 0.001041, 'Thr': 0.000825, 'Val': 0.0010004999999999999, 'Trp': 0.000177, 'Tyr': 0.00046649999999999996, 'A': 0.29574659999999997, 'T': 0.29574659999999997, 'G': 0.18730618, 'C': 0.18730618}


Check if we got it right, whether this roughly sums up to $1$:

In [15]:
print(sum(probability_table_dnap.values()))

0.9802835599999999


Looks good. Now with this distribution at hand, it's possible to calculate sequence entropies, including expected entropies:

In [19]:
def entropy_of_dnap_sequence(seq: str,
                             probability_table_dnap: Dict[str, float]):
    probs = []
    for n in seq:
        prob = probability_table_dnap[n.upper()]
        probs.append(prob)
    return entropy(probs)


def expected_entropy(probability_table: Dict[str, float]): return entropy(list(probability_table.values()))

def expected_entropy_dnap(): return expected_entropy(probability_table_dnap)

def expected_entropy_voss(): return expected_entropy(probabilities_of_bases_in_general)

# TODO: calculate the expected entropy for a 6-mer representation, use all combinations of probabilities, check Large Mental Diarrhea Notebook
def expected_entropy_6_mer(): return None

In [20]:
print(expected_entropy_dnap())
print(expected_entropy_voss())
print(expected_entropy_6_mer())

1.4579007912957895
1.3608804979761429
None


- [ ] Compare the Shannon Entropy for samples labeled as containing promoter sequences vs those that are labeled as not containing promoter sequences