# Motif Notebook

This notebook visualizes the motif data: how often each motif occurs, and how the nucleotides are distributed across the positions of the motifs.

In [None]:
# Imports #
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict

Loading data in:

In [None]:
# Load motifs and counts #
# Each non-blank line of the file is parsed as "<motif>: <count>"; the two
# lists stay index-aligned (counts[i] is the count for motifs[i]).
motifs = []
counts = []
# Explicit encoding so parsing does not depend on the platform default.
with open('../data/motifs.txt', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        # Skip blank lines; otherwise the two-name unpack below would raise
        # ValueError on them.
        if not line:
            continue
        motif, count = line.split(": ")
        motifs.append(motif)
        counts.append(int(count))

Plotting frequency of each motif (top 50 frequencies):

In [None]:
# Plotting #
# Bar chart with one bar per loaded motif; bar height is that motif's count.
# NOTE(review): the heading above mentions the top 50, but nothing in this cell
# limits the number of bars -- presumably the input file is already filtered;
# confirm.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(motifs, counts, color='blue')

# Draw the x tick labels (the motif strings) vertically #
ax.tick_params(axis='x', labelrotation=90)

# Labeling and display settings #
ax.set_xlabel("Motifs")
ax.set_ylabel("Frequency")
ax.set_title("Motif Frequency Visualization")
fig.tight_layout()
plt.show()

Calculating frequencies:

In [None]:
# Determine k-mer length #
# Taken from the first motif only; every position index used below must be < k.
k = len(motifs[0])

# One count table per position, defaulting unseen nucleotides to 0 #
nucleotide_counts = [defaultdict(int) for _ in range(k)]

# Weight every character of every k-mer by that k-mer's count #
for kmer, weight in zip(motifs, counts):
    for idx, base in enumerate(kmer):
        position_table = nucleotide_counts[idx]
        position_table[base] += weight


def _normalize(table):
    """Return {nucleotide: share of the position's total count} for one position."""
    position_total = sum(table.values())
    return {base: tally / position_total for base, tally in table.items()}


# Convert the per-position counts into per-position frequencies #
nucleotide_frequencies = [_normalize(table) for table in nucleotide_counts]

Creating visualization to show frequency of each nucleotide at each position in the motifs:

In [None]:
# Plotting #
# Stacked bar chart: one bar per k-mer position, one coloured segment per
# nucleotide, segment height = that nucleotide's frequency at the position.
fig, ax = plt.subplots(figsize=(10, 4))

# Nucleotide -> colour; insertion order (A, C, G, T) is also the stacking order #
colors = {'A': 'green', 'C': 'blue', 'G': 'orange', 'T': 'red'}

# Bar positions and their tick labels (1-based for display) #
x = np.arange(k)
tick_labels = [f"Pos {i+1}" for i in range(k)]

# Running top of the stack at each position #
bottom = np.zeros(k)

# Add one layer per nucleotide; nucleotides absent at a position contribute 0 #
for nucleotide, shade in colors.items():
    heights = [nucleotide_frequencies[pos].get(nucleotide, 0) for pos in range(k)]
    ax.bar(x, heights, bottom=bottom, color=shade, label=nucleotide)
    bottom += heights

# Labeling and display settings #
ax.set_xticks(x)
ax.set_xticklabels(tick_labels)
ax.set_xlabel("Position in k-mer")
ax.set_ylabel("Frequency")
ax.legend(title="Nucleotides")
ax.set_title("Sequence Logo Visualization Based on k-mer Frequencies")
fig.tight_layout()
plt.show()