<a href="https://colab.research.google.com/github/joycao1/salzmanwork/blob/main/hyena_training_pipeline_carrots.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install biopython pandas



In [2]:
import pandas as pd
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

In [3]:
# Upload file
import io

from google.colab import files
uploaded = files.upload()
if not uploaded:
    # An empty mapping means nothing was uploaded; fail with a clear message
    # instead of an opaque IndexError further down.
    raise ValueError("No file was uploaded; re-run this cell and select the input TSV.")

# Load into DataFrame
# Only the first uploaded file is used.
file_name = next(iter(uploaded))
# Parse the bytes returned by the upload rather than re-opening `file_name` from disk:
# the mapping is keyed by the original filename, which may differ from the path Colab
# actually saved to when a same-named file already exists (stale data would be read).
df = pd.read_csv(
    io.BytesIO(uploaded[file_name]),
    sep='\t',
    header=None,
    names=["sample_id", "cbc", "anchor", "target", "count"],
)

df.head()

Saving ERR13720418_pair_1.txt to ERR13720418_pair_1.txt


Unnamed: 0,sample_id,cbc,anchor,target,count
0,0,ATAGAGACACAAACGG,AAAACAAG,CGGATTCTCATGGAACACATCCACAAGCTGA,2
1,0,ATAGAGACACAAACGG,AAAACAAG,GATATCAGGAAATTTTTGGATGGTATCGATG,2
2,0,ATAGAGACACAAACGG,AAAACACT,CAACAAACTAGGAATATAATGAAACTGCCTC,2
3,0,ATAGAGACACAAACGG,AAAACACT,GAAAAATATTCCAAGCTTCATATTAACCCTA,2
4,0,ATAGAGACACAAACGG,AAAACGAT,GAATTAACAGAAAGAATAAAGCATTTATCCT,1


In [4]:
# Order rows by barcode, then anchor, then by descending count so that within
# each (cbc, anchor) pair the highest-count targets come first.
df_sorted = df.sort_values(
    by=["cbc", "anchor", "count"],
    ascending=[True, True, False],
)

# For every cell barcode, emit each anchor followed by all of its targets and
# concatenate the pieces into a single string.
cbc_to_seq = {
    barcode: "".join(
        piece
        for anchor_seq, anchor_rows in barcode_rows.groupby("anchor")
        for piece in [anchor_seq, *anchor_rows["target"].tolist()]
    )
    for barcode, barcode_rows in df_sorted.groupby("cbc")
}

# Tabular preview of the barcode -> sequence mapping
cbc_to_seq_list = [
    {"cbc": barcode, "sequence": sequence}
    for barcode, sequence in cbc_to_seq.items()
]
pd.DataFrame(cbc_to_seq_list).head()

Unnamed: 0,cbc,sequence
0,AAACCCAGTATGAGCG,AAAACAAAAATTAGCTGGGTATGGTGACGTGCCTGTGATAAAACAA...
1,AAAGAACAGTCTGCGC,AAAACAAAGCACATGCTGCCCAGTGGCTTCCGGAAGTTCAAAACAA...
2,AAAGGATGTTTCTATC,AAAACAAAAATAAACGATGATAATCTTTACTGGTGAAAAAAAACAA...
3,AAAGGGCAGAGAGTTT,AAAACAAAAATCAAGGAATTGAACAACACATGTGAACCCAAAACAA...
4,AAAGGGCTCACCATAG,AAAACAAAAAAAGAATATCATGGTGGCTTACACCAGCAATTGGTCA...


In [None]:
# # Directly exporting FASTA without cleaning (carrot output).
# records = [
#     SeqRecord(Seq(seq), id=cbc, description="")
#     for cbc, seq in cbc_to_seq.items()
# ]

# SeqIO.write(records, "output.fasta", "fasta")

# # Download
# from google.colab import files
# files.download("output.fasta")

In [5]:

# ---- Config ----
CONTEXT_LEN = 250_000         # target context window (characters per chunk)
SEP = "N" * 10                # separator between cell barcodes
ALLOW = set("ACGTN")          # allowed alphabet; everything else -> 'N'
PAD_FINAL = True              # pad last chunk to CONTEXT_LEN
# Filler appended as literal characters to the final chunk. It must be a single
# character from ALLOW: the digit "4" is the *id* of HyenaDNA's [PAD] special token,
# not a character in its vocabulary, and it is not a valid FASTA residue either.
# NOTE(review): if true padding is required, add [PAD] at tokenization time instead.
PAD_TOKEN = "N"
WRITE_FASTA = True            # also emit a FASTA with chunk headers
WRITE_FASTA = True            # also emit a FASTA with chunk headers

# ---- Input ----
# Assumes you already have: cbc_to_seq: Dict[str, str]
# (from your earlier notebook code)

def clean_seq(s: str) -> str:
    """Upper-case `s` and replace every character not in ALLOW with 'N'.

    Args:
        s: Raw sequence string.

    Returns:
        The upper-cased sequence with each character outside ALLOW mapped to 'N'.
    """
    normalized = []
    for base in s.upper():
        if base not in ALLOW:
            base = "N"
        normalized.append(base)
    return "".join(normalized)

# 1) Clean each barcode sequence (only the sequences are needed, not the barcode keys)
cleaned = [clean_seq(seq) for seq in cbc_to_seq.values()]

# 2) Join with separators
joined = SEP.join(cleaned)

# 3) Chunk into CONTEXT_LEN windows (the final window may be shorter)
chunks = [joined[i:i+CONTEXT_LEN] for i in range(0, len(joined), CONTEXT_LEN)]

# 4) Pad last chunk with PAD_TOKEN if needed
if PAD_FINAL and chunks:
    # The pad count below is measured in characters, so an empty or multi-character
    # PAD_TOKEN would silently produce a chunk of the wrong length; fail loudly instead.
    if len(PAD_TOKEN) != 1:
        raise ValueError(f"PAD_TOKEN must be a single character, got {PAD_TOKEN!r}")
    shortfall = CONTEXT_LEN - len(chunks[-1])
    if shortfall > 0:
        chunks[-1] += PAD_TOKEN * shortfall

# Label used in file names and messages, derived from CONTEXT_LEN so that it stays
# truthful when the config changes (250_000 -> "250k", 1_500 -> "1500").
ctx_label = f"{CONTEXT_LEN // 1000}k" if CONTEXT_LEN % 1000 == 0 else str(CONTEXT_LEN)

# 5) Save plain-text chunks, one chunk per line
txt_path = f"hyena_input_{ctx_label}.txt"
with open(txt_path, "w") as f:
    f.writelines(chunk + "\n" for chunk in chunks)

# 6) Save as FASTA (optional), one record per chunk
fasta_path = f"hyena_chunks_{ctx_label}.fasta"
if WRITE_FASTA:
    records = [SeqRecord(Seq(chunk), id=f"chunk_{i}", description=f"hyena_{ctx_label}")
               for i, chunk in enumerate(chunks)]
    SeqIO.write(records, fasta_path, "fasta")

# 7) Sanity check
print(f"Total barcodes: {len(cleaned)}")
print(f"Joined length:  {len(joined):,}")
print(f"Chunks ({ctx_label}):  {len(chunks)}")
if chunks:
    print(f"First chunk length: {len(chunks[0])}")
    print(f"Last  chunk length: {len(chunks[-1])}")

# 8) Download outputs (`files` is google.colab.files, imported in the upload cell)
files.download(txt_path)
if WRITE_FASTA:
    files.download(fasta_path)

Total barcodes: 314
Joined length:  101,303,817
Chunks (250k):  406
First chunk length: 250000
Last  chunk length: 250000


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>