# MaskedLM Results Notebook

This notebook shows the results of the MaskedLM model on the k-mer data.

In [None]:
# Imports #
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Loading in results from the model:

In [None]:
# Fraction of the dataset that was run through the model #
# CHANGE THE PERCENTAGE TO THE ONE USED IN THE CONFIG FILE (model.json) #
percentage = 0.00004727318

# Dimensions of the stored results array #
# NOTE(review): 2_115_364 is presumably the number of sequences in the full dataset - confirm #
seq_len = 509
vocab_size = 4101
total_length = int(np.floor(2_115_364 * percentage))

# Map the results file read-only instead of loading it into memory #
# NOTE(review): np.memmap reads the file as raw float32 bytes from offset 0; this assumes the #
# file was written as a raw memmap and not with np.save (which prepends a header) - confirm #
results = np.memmap(
    '../data/results.npy',
    dtype=np.float32,
    mode='r',
    shape=(total_length, seq_len, vocab_size),
)

Calculating nucleotide scores:

In [None]:
# Average the first four vocabulary entries over all sequences (axis 0), #
# leaving one row per position and one column per entry #
# NOTE(review): assumes vocabulary indices 0-3 are the nucleotides A, C, G, T - confirm against the tokenizer #
nucleotide_means = results[..., :4].mean(axis=0)

# Rescale each position so that its four scores sum to 1 #
position_totals = nucleotide_means.sum(axis=1, keepdims=True)
nucleotide_scores = nucleotide_means / position_totals

Plotting:

In [None]:
# Nucleotide labels, in the column order of nucleotide_scores, and the bar color used for each #
nucleotides = ['A', 'C', 'G', 'T']
colors = {'A': 'green', 'C': 'blue', 'G': 'orange', 'T': 'red'}

# Window of positions to visualize: 0-based, start inclusive, end exclusive #
start_pos = 9
end_pos = 28
x = np.arange(start_pos, end_pos)
scores_in_range = nucleotide_scores[start_pos:end_pos, :]

fig, ax = plt.subplots(figsize=(10, 6))

# Draw stacked bars: each nucleotide's bar starts where the previous ones ended #
bottom = np.zeros(scores_in_range.shape[0])
for nucleotide, column in zip(nucleotides, scores_in_range.T):
    ax.bar(x, column, bottom=bottom, color=colors[nucleotide], label=nucleotide)
    bottom += column

# Tick labels and title show 1-based positions #
ax.set_xticks(x)
ax.set_xticklabels([f"Pos {i+1}" for i in x], rotation=90)
ax.set(xlabel="Position in Sequence", ylabel="Normalized Score")
ax.legend(title="Nucleotides")
ax.set_title(f"Sequence Logo for Positions {start_pos + 1} to {end_pos}")
fig.tight_layout()
plt.show()