In [1]:
import random


def generate_kmer_6(sequence):
    kmers = []
    for i in range(len(sequence) - 5):
        kmer = sequence[i:i + 6]
        if 'N' not in kmer:  # Skip k-mers with non-ATGC characters
            kmers.append(kmer)
    return kmers


def sliding_window_with_skip(sequences, window_size=100, skip_step=1):
    windows = []
    for seq in sequences:
        if type(seq) is not str:
            continue
        if len(seq) < window_size:
            continue

        for i in range(0, len(seq) - window_size + 1, skip_step):
            window = seq[i:i + window_size]
            if all(base in "ATGC" for base in window):
                windows.append(window)

    return windows


def clean_sequence(sequence):
    bases = list(sequence)
    for i, base in enumerate(bases):
        if base not in "ATGC":
            bases[i] = random.choice("ATGC")
    return ''.join(bases)


def preprocess_data(lysogenic_seqs, lytic_seqs, window_size=100):

    # Apply sliding window with skip_step=1 for lysogenic sequences
    lysogenic_windows = sliding_window_with_skip(lysogenic_seqs, window_size=window_size, skip_step=1)

    # Apply sliding window with skip_step=91 for lytic sequences
    lytic_windows = sliding_window_with_skip(lytic_seqs, window_size=window_size, skip_step=91)

    # Convert sequences to k-mer 6 representation
    # lysogenic_kmers = [generate_kmer_6(window) for window in lysogenic_windows]
    # lytic_kmers = [generate_kmer_6(window) for window in lytic_windows]

    print(f"Generated {len(lysogenic_windows)} lysogenic windows and {len(lytic_windows)} lytic windows")

    # Return balanced dataset by sampling if needed
    min_count = min(len(lysogenic_windows), len(lytic_windows))

    if len(lysogenic_windows) > min_count:
        lysogenic_windows = random.sample(lysogenic_windows, min_count)
        lysogenic_kmers = [generate_kmer_6(window) for window in lysogenic_windows]

    if len(lytic_windows) > min_count:
        lytic_windows = random.sample(lytic_windows, min_count)
        lytic_kmers = [generate_kmer_6(window) for window in lytic_windows]

    return {
        'lysogenic_windows': lysogenic_windows,
        'lytic_windows': lytic_windows,
        # 'lysogenic_kmers': lysogenic_kmers,
        # 'lytic_kmers': lytic_kmers
    }

In [2]:
import pandas as pd

lysogenic_df = pd.read_csv('lysogenic_train.csv')
lysogenic_df.dropna(inplace=True)
lytic_df = pd.read_csv('lytic_train.csv')

In [3]:
prepared_data = preprocess_data(lysogenic_df['sequence_filled'].values, lytic_df['sequence'].values, window_size=500)

KeyError: 'sequence_filled'

In [21]:
columns = ['sequence', 'label']
labels = [0] * len(prepared_data['lysogenic_windows'])
lysogenic_df = pd.DataFrame(zip(prepared_data['lysogenic_windows'], labels), columns=columns)
lysogenic_df.head()

Unnamed: 0,sequence,label
0,GTTACTCTACTGTGGACACTGTGTGGACACTCTCGGCCTCAGTACC...,0
1,TTACTCTACTGTGGACACTGTGTGGACACTCTCGGCCTCAGTACCA...,0
2,TACTCTACTGTGGACACTGTGTGGACACTCTCGGCCTCAGTACCAC...,0
3,ACTCTACTGTGGACACTGTGTGGACACTCTCGGCCTCAGTACCACC...,0
4,CTCTACTGTGGACACTGTGTGGACACTCTCGGCCTCAGTACCACCT...,0


In [22]:
lysogenic_df.size

38028

In [4]:
columns = ['sequence', 'label']
labels = [1] * len(prepared_data['lytic_windows'])
lytic_df = pd.DataFrame(zip(prepared_data['lytic_windows'], labels), columns=columns)
lytic_df.head()

NameError: name 'prepared_data' is not defined

In [24]:
lytic_df.size

38028

In [25]:
combined_df = pd.concat([lysogenic_df, lytic_df], ignore_index=True)
shuffled_df = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)
shuffled_df.head()

Unnamed: 0,sequence,label
0,AGCACCATTGCGGTTTTCATTTCCGGCGTGTAGCGGATATCACGCG...,0
1,CCTCGTTGATGGTTTCATTGATATCAGGGAGGGTGAGCTTGGTTGC...,1
2,AGCAAGCACAAACAATACTAGATAAAAACGCAAACGACGTTATATT...,1
3,GGCTGATGCCGATGGCCTCAGCGTACGAATTTCACCGAAGGGGGTC...,0
4,ACGGTTCTGACAAACACCTCATCACCCGGGAATACTTTGGTGTTAG...,0


In [26]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(shuffled_df, test_size=0.2, random_state=42)

print("Training DataFrame:")
print(train_df.head())

print("\nTesting DataFrame:")
print(test_df.head())

Training DataFrame:
                                                sequence  label
22193  GTGTTCTTCACTGTTATAGCTGCGGATTTTTACTTGTTCGTTTGGC...      0
2499   TATAGTTTATTATATGGTGTAGACTCATCTAGAATACTCTTAAGTG...      1
15920  AGCTTCCGTCTTCGCACGGTTTGCTTGCGTGTTGGCTTGGGTGGCA...      1
8475   ACCTGAAAATTCGGGATCCAGAAAATCTCATCGACGTACAGGTCGC...      0
21925  GAAGTGAAGAAAGTCGTTAGAACATCTCCTGTAGAAGTAGGAGATA...      1

Testing DataFrame:
                                                sequence  label
12890  AGGAGAGTAAAAAAGAAGTGGTAAGAGTATATAAGGATAAAGACTA...      1
29193  ATGTCATACTGACTCGTTCTTCCCCGAACGATCCAATCGCACTTAA...      1
20374  ATTGCTTCTGGATGATGACAGCAGTGAGCGCGTCCAGAAAGTTCTG...      0
3930   GTTTTGTACTTGGGTATGATAACGAATGTAATGTAATGTTATTATC...      1
10226  GAATTGGTAACACCTTATTTTGTAGAATATGATTTAAATGGAGATA...      1


In [27]:
train_df.to_csv("../../data/dnabert_2_preparation/train.csv", index=False)
test_df.to_csv("../../data/dnabert_2_preparation/dev.csv", index=False)