# Example Notebook to Generate Positive and Negative Sequences


In [1]:
from seqinfer.infer.generators import (
    MotifInserter,
    PWMInserter,
    RandomSequenceGenerator,
)
from IPython.display import display, HTML
import re
import random

# Seed Python's global RNG so the random sequence lengths and insert positions
# drawn with random.randint in the cells below are reproducible across runs.
random.seed(42)

To simulate positive and negative sequences (sequences with and without a given motif, respectively), we can use `RandomSequenceGenerator`, `MotifInserter`, and `PWMInserter`.

The `RandomSequenceGenerator` can generate random sequences using the characters from the input alphabet. 
The `MotifInserter` and `PWMInserter` replace a portion of a random sequence with the motif at a user-specified location. Note that `PWMInserter` is an extended version of `MotifInserter`: it allows the motif to be defined via a position weight matrix.

In [2]:
# Target motif as a plain string. "-" marks a gap position; it is turned into
# a regex wildcard when the sequences are highlighted further down.
motif = "AT-GC"

# The same motif expressed as a position weight matrix: one
# {character: weight} dict per motif position. Only the first position is
# ambiguous (A or C); every remaining position reuses the motif's own
# character with weight 1.0.
pwm = [{"A": 0.9, "C": 0.1}] + [{symbol: 1.0} for symbol in motif[1:]]

# Background generator (equal weight for each of the four bases) and one
# inserter per motif representation. Seeds are fixed so reruns match.
rsg = RandomSequenceGenerator(
    alphabet="ATCG",
    char_distribution=[0.25] * 4,
    seed=42,
)
motif_inserter = MotifInserter(motif=motif)
pwm_inserter = PWMInserter(pwm=pwm, seed=42)

In [3]:
positive_sequences, negative_sequences = [], []

for _ in range(10):
    # Each positive/negative pair shares one length, drawn from 20..30
    # (random.randint includes both endpoints).
    seq_len = random.randint(20, 30)
    background = rsg.generate_sequence(seq_len)

    # Positive example: place the motif at a start index drawn from 5..10,
    # i.e. roughly in the middle of the sequence.
    motif_start = random.randint(5, 10)
    positive_sequences.append(motif_inserter.insert_motif(background, motif_start))

    # Negative example: a fresh random sequence of the same length from the
    # same generator, with no motif inserted.
    negative_sequences.append(rsg.generate_sequence(seq_len))



In [4]:
# Turn the motif into a regex: every gap ("-") becomes the wildcard ".".
motif_pattern = motif.replace("-", ".")
motif_regex = re.compile(f"({motif_pattern})")

# Replacement that wraps each match (capture group 1) in a red <span>.
red_span = r"<span style='color: red;'>\1</span>"

# Render both sets the same way: a heading, then one HTML line per sequence
# with every motif match coloured red.
for heading, sequences in (
    ("Positive Sequences:", positive_sequences),
    ("\nNegative Sequences:", negative_sequences),
):
    print(heading)
    for seq in sequences:
        display(HTML(motif_regex.sub(red_span, seq)))



Positive Sequences:



Negative Sequences:


In [5]:
positive_sequences, negative_sequences = [], []

for _ in range(10):
    # Each positive/negative pair shares one length, drawn from 20..30
    # (random.randint includes both endpoints).
    seq_len = random.randint(20, 30)
    background = rsg.generate_sequence(seq_len)

    # Positive example: let the PWM inserter place its motif at a start index
    # drawn from 5..10, i.e. roughly in the middle of the sequence.
    pwm_start = random.randint(5, 10)
    positive_sequences.append(pwm_inserter.insert_pwm(background, pwm_start))

    # Negative example: a fresh random sequence of the same length from the
    # same generator, with no motif inserted.
    negative_sequences.append(rsg.generate_sequence(seq_len))

In [6]:
# Sanity check of the highlighting regexes on one hand-written sequence.
probe_seq = "ATATATGTGGATAGCTCGCGGAAA"

# With a literal "-" the pattern cannot match a gap-free sequence. The result
# of this first call is not displayed: a cell only shows its last expression.
re.findall("[AC]T-GC", probe_seq)

# With the gap written as the wildcard "." the motif occurrence is found.
re.findall("AT.GC", probe_seq)

['ATAGC']

In [7]:
# Regex equivalent of the PWM defined earlier in the notebook: the first
# position may be A or C, and the gap ("-") becomes the wildcard ".".
pwm_pattern = "[AC]T-GC".replace("-", ".")

# Replacement that wraps each match (capture group 1) in a red <span>.
red_span = r"<span style='color: red;'>\1</span>"

# Highlight BOTH sets with the PWM pattern. The negative loop previously used
# `motif_pattern` from the plain-motif cell, so chance "CT.GC" occurrences in
# the negatives were never highlighted and the cell silently depended on that
# earlier cell having been run.
for heading, sequences in (
    ("Positive Sequences:", positive_sequences),
    ("\nNegative Sequences:", negative_sequences),
):
    print(heading)
    for seq in sequences:
        display(HTML(re.sub(f"({pwm_pattern})", red_span, seq)))





Positive Sequences:



Negative Sequences:
