In [2]:
# 🧙 Notebook magic: autoreload modules
%load_ext autoreload
%autoreload 2
import bagpipe_io as bpio
import bagpipe_config as bconfig
from pathlib import Path
from read_breaker_io import FastqReader, FastqWriter
from read_breaker_parser import ReadParser
from read_breaker_parser import read_clip_and_write
import yaml

## # — Set up I/O paths & parser (you’ll already have this) —

## provides configuration for the pipeline
## and list of input files
bagpipe_yaml = "../configs/bagpipe_config.yaml"
bagpipe_config = bconfig.load_config(bagpipe_yaml)

## create output directory; parents=True so that a nested output_dir whose
## parent folders do not exist yet is created instead of raising
out_dir = Path(bagpipe_config["output_dir"])
out_dir.mkdir(parents=True, exist_ok=True)

## configuration for parsing the reads
read_breaker_config_path = "../configs/bagpipe_2N.yaml"
with open(read_breaker_config_path, "r") as f:
    read_breaker_config = yaml.safe_load(f)

## instantiate the parser: the "pipeline" section supplies the steps and the
## "params" section is passed as the globals, under the "params" namespace
read_parser = ReadParser(
    pipeline_cfg={"pipeline": read_breaker_config["pipeline"]},
    globals_cfg=read_breaker_config["params"],
    globals_namespace="params",
    base_dir="../",
)

print(read_parser)

## show how this works for one sample
sample = bagpipe_config["samples"][0]
r1_path = sample["fastq_r1"]
r2_path = sample["fastq_r2"]

stub = sample["id"]  ## for output file names

## create reader from the input files and a writer for the output;
## the reader is used as a context manager and the writer is closed in a
## `finally` so neither is left open if parsing fails part-way through
with FastqReader(r1_path, r2_path, trim_tail=True) as reader:
    writer = FastqWriter(str(out_dir), stub)
    try:
        ## read the input files, parse the reads, clip and write the output
        read_clip_and_write(
            reader,
            read_parser,
            writer,
            default_start_r1=read_parser.globals.get("start_r1", 0),
            default_start_r2=read_parser.globals.get("start_r2", 0),
        )
    finally:
        writer.close()

print("Finished writing output files.")


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Loaded 95 barcodes for bc1
Loaded 96 barcodes for bc2
Loaded 96 barcodes for bc3
ReadParser Pipeline:
[  0 ]  check_upstream_motif_r1: hamming_test (read 1, must pass: True)
[  1 ]  extract_bc3: extract (read 1, must pass: False)
[  2 ]  extract_bc2: extract (read 1, must pass: False)
[  3 ]  extract_bc1: extract (read 1, must pass: False)
[  4 ]  good_barcodes: test ({{ bc1_match and bc2_match and bc3_match }}, must pass: True)
[  5 ]  extract_vt3: extract (read 1, must pass: True)
[  6 ]  extract_vt2: extract (read 1, must pass: True)
[  7 ]  extract_vt1: extract (read 1, must pass: True)
[  8 ]  find_adapter_r1: regex_search (read 1, must pass: False)
[  9 ]  find_polya_r1: regex_search (read 1, must pass: False)
[ 10 ]  find_adapter_r2: regex_search (read 2, must pass: False)
[ 11 ]  compute_end_r1: compute ({{ (adapter1_start - params.trim_offset_r1) if adapter1_start is not none else (polya_st

In [55]:
## compare the results of the two output files
r1_path = out_dir / f"{stub}.R1.fastq.gz"
r2_path = out_dir / f"{stub}.R2.fastq.gz"

## original trimmed output files
r1_orig_path = out_dir / "r1.fastq.gz"
r2_orig_path = out_dir / "r2.fastq.gz"

## index every read pair by its first element (the read id); if an id occurs
## more than once the last pair wins. Readers are opened as context managers
## so the underlying files are closed once the reads are loaded.
with FastqReader(r1_orig_path, r2_orig_path) as original_reader:
    original_reads = {read_pair[0]: read_pair for read_pair in original_reader}

with FastqReader(r1_path, r2_path) as new_reader:
    new_reads = {read_pair[0]: read_pair for read_pair in new_reader}


In [56]:
import pandas as pd

## report how many read pairs were loaded from each run
n_original, n_new = len(original_reads), len(new_reads)
print("Original reads:", n_original)
print("New reads:", n_new)

## one DataFrame per run: dictionary keys become the row index and the
## elements of each stored read pair become positional columns
df_orig, df_new = (
    pd.DataFrame.from_dict(read_map, orient="index")
    for read_map in (original_reads, new_reads)
)


Original reads: 132
New reads: 132


In [57]:
import pandas as pd

# Precondition: df_orig and df_new already exist and are indexed by read ID.

# Step 1 - replace the positional column labels (0..4) with descriptive names.
cols = ["id", "seq1", "qual1", "seq2", "qual2"]
df_orig.columns = cols
df_new.columns = cols

# Suffixed column names as they appear after the join below.
orig_cols = [f"{name}_orig" for name in cols]
new_cols = [f"{name}_new" for name in cols]

# Step 2 - outer join on the index keeps every read ID seen on either side;
# the suffixes keep the two sets of columns apart.
cmp = df_orig.join(df_new, how="outer", lsuffix="_orig", rsuffix="_new")

# Step 3 - a missing id on one side means the read exists only on the other.
in_orig = cmp["id_orig"].notna()
in_new = cmp["id_new"].notna()
only_orig = cmp[~in_new]
only_new = cmp[~in_orig]

print("Missing from NEW:", len(only_orig))
print("Missing from ORIG:", len(only_new))

# Step 4 - among reads present on both sides, compare the paired columns
# element-wise and keep the rows where at least one column differs.
shared = cmp[in_orig & in_new]
diff_mask = shared[orig_cols].values != shared[new_cols].values
rows_with_diffs = shared[diff_mask.any(axis=1)]

print("Rows with content differences:", len(rows_with_diffs))

# (optional) inspect a few mismatches
print(rows_with_diffs.head())


Missing from NEW: 0
Missing from ORIG: 0
Rows with content differences: 0
Empty DataFrame
Columns: [id_orig, seq1_orig, qual1_orig, seq2_orig, qual2_orig, id_new, seq1_new, qual1_new, seq2_new, qual2_new]
Index: []


In [3]:

## check the output
#open the output file with a fastq reader
from read_breaker_io import FastqReader

r1_path = out_dir / f"{stub}.R1.fastq.gz"
r2_path = out_dir / f"{stub}.R2.fastq.gz"

# Open the written output as a context manager so the files are closed again
# once the listing is done (re-running the cell no longer leaks open handles).
with FastqReader(r1_path, r2_path) as reader:
    for ind, read_pair in enumerate(reader):
        # Print the running index, the read id, and the length of every
        # remaining field of the pair (sequences and quality strings).
        print(ind, read_pair[0], [len(x) for x in read_pair[1:]])



0 A00296:241:HMWTTDRXX:1:1101:17201:6073 1:N:0:CGTACTAG/1_CTGTTAGTTCCCACTTC_ACTTCGCTAGGC [70, 70, 150, 150]
1 A00296:241:HMWTTDRXX:1:1101:17219:6073 1:N:0:CGTACTAG/1_CTAAGAATTTACCGTGA_GTTTTAGGGCAG [70, 70, 46, 46]
2 A00296:241:HMWTTDRXX:1:1101:17255:6073 1:N:0:CGTACTAG/1_TCTGGAATACCGTGTAA_AAGTACCTCGCG [70, 70, 150, 150]
3 A00296:241:HMWTTDRXX:1:1101:17291:6073 1:N:0:CGTACTAG/1_GCTACTGAAGCGGGTCA_CGGGTTATTGAA [70, 70, 42, 42]
4 A00296:241:HMWTTDRXX:1:1101:17327:6073 1:N:0:CGTACTAG/1_TGATGGATTCAGATTCC_TGTACAATGCAC [70, 70, 150, 150]
5 A00296:241:HMWTTDRXX:1:1101:17345:6073 1:N:0:CGTACTAG/1_GATTTGTAGGAAATACG_TTCAAGTTGGCC [70, 70, 41, 41]
6 A00296:241:HMWTTDRXX:1:1101:17363:6073 1:N:0:CGTACTAG/1_GCTTCATAGGAAAGAAG_CGATACATCCGC [70, 70, 150, 150]
7 A00296:241:HMWTTDRXX:1:1101:17418:6073 1:N:0:CGTACTAG/1_CATTTAAGTTTGGCTAA_GGGTGCATGTAC [70, 70, 119, 119]
8 A00296:241:HMWTTDRXX:1:1101:17616:6073 1:N:0:CGTACTAG/1_TTCACGGAACAATTACC_CAGGATTTAAGA [70, 70, 76, 76]
9 A00296:241:HMWTTDRXX:1:1101:17653:

Everything above appears to be working. The cells below are "legacy" code left over from the time spent building the pipeline above.

In [40]:
import yaml
from read_breaker_parser import ReadParser
import bagpipe_config as bconfig # Still needed to get FASTQ paths

# --- Configuration ---
# The original bagpipe config is loaded here only for its sample FASTQ paths.
bagpipe_yaml = "../bagpipe_config.yaml"
bagpipe_config_data = bconfig.load_config(bagpipe_yaml)

# Location of the read_breaker YAML configuration (adjust if it moves).
read_breaker_yaml_path = "../configs/bagpipe_2N.yaml"

# Read the read_breaker configuration. Any failure is reported and leaves
# read_breaker_config as None so the rest of the cell can still run.
try:
    with open(read_breaker_yaml_path, 'r') as yaml_handle:
        read_breaker_config = yaml.safe_load(yaml_handle)
    print("Successfully loaded read_breaker config.")
except FileNotFoundError:
    print(f"ERROR: read_breaker config file not found at {read_breaker_yaml_path}")
    read_breaker_config = None
except Exception as exc:
    print(f"ERROR: Failed to load or parse read_breaker config: {exc}")
    read_breaker_config = None

# Build the parser only when a (non-empty) configuration was loaded.
if not read_breaker_config:
    read_parser = None
    print("ReadParser could not be initialized.")
else:
    # The "pipeline" and "params" sections are handed over explicitly;
    # "params" is registered under the "params" globals namespace.
    read_parser = ReadParser(
        pipeline_cfg={"pipeline": read_breaker_config["pipeline"]},
        globals_cfg=read_breaker_config["params"],
        globals_namespace="params",
        base_dir="../",
    )
    print("ReadParser initialized.")

print(read_parser)

Successfully loaded read_breaker config.
Loaded 95 barcodes for bc1
Loaded 96 barcodes for bc2
Loaded 96 barcodes for bc3
ReadParser initialized.
ReadParser Pipeline:
[  0 ]  check_upstream_motif_r1: hamming_test (read 1, must pass: True)
[  1 ]  extract_bc3: extract (read 1, must pass: False)
[  2 ]  extract_bc2: extract (read 1, must pass: False)
[  3 ]  extract_bc1: extract (read 1, must pass: False)
[  4 ]  good_barcodes: test ({{ bc1_match and bc2_match and bc3_match }}, must pass: True)
[  5 ]  extract_vt3: extract (read 1, must pass: True)
[  6 ]  extract_vt2: extract (read 1, must pass: True)
[  7 ]  extract_vt1: extract (read 1, must pass: True)
[  8 ]  find_adapter_r1: regex_search (read 1, must pass: False)
[  9 ]  find_polya_r1: regex_search (read 1, must pass: False)
[ 10 ]  find_adapter_r2: regex_search (read 2, must pass: False)
[ 11 ]  compute_end_r1: compute ({{ (adapter1_start - params.trim_offset_r1) if adapter1_start else (polya_start if polya_start else len_seq1) }

In [41]:

from read_breaker_io import FastqReader  # Make sure to use the new reader import

# The parser setup cell leaves read_parser as None when the configuration
# could not be loaded; stop here with a clear message in that case.
if read_parser is None:
    raise RuntimeError("read_parser is not initialized; run the parser setup cell first.")

# Parser statistics before this cell processes any read.
print(read_parser.get_parse_log())

# Get paths from the previously loaded bagpipe_config_data
samples = bagpipe_config_data.get("samples", [])
if not samples:
    raise ValueError("No samples listed in the bagpipe config; nothing to process.")
sample_info = samples[0]
path1 = sample_info["fastq_r1"]
path2 = sample_info["fastq_r2"]
print(f"Processing R1: {path1}")
print(f"Processing R2: {path2}")

read_count = 0
processed_count = 0
max_reads_to_show = 50  # Print at most this many per-read failure messages
failure_count = 0

good_rows = []  # result dicts of reads whose status is "ok"
bad_rows = []   # result dicts of reads whose status is "fail"

# Use the FastqReader from read_breaker_io
with FastqReader(path1, path2, trim_tail=True) as read_walker:
    for read_pair in read_walker:
        read_count += 1
        # The parser takes (read_id, seq1, qual1, seq2, qual2) and returns a
        # result dictionary; anything falsy is treated as an unexpected value.
        result_dict = read_parser.parse(*read_pair)
        status = result_dict.get("status") if result_dict else None

        if status == "ok":
            # Successfully parsed according to the pipeline. Building the new
            # read id, trimming and writing are not done in this cell.
            processed_count += 1
            good_rows.append(result_dict)
            continue

        if status == "fail":
            bad_rows.append(result_dict)
            failure_text = f"Read {read_count} FAIL: {result_dict['message']}"
        else:
            failure_text = f"Read {read_count} FAIL: Parser returned unexpected value: {result_dict}"

        # Honour the display limit so a large input does not flood the output.
        failure_count += 1
        if failure_count <= max_reads_to_show:
            print(failure_text)

suppressed_count = failure_count - max_reads_to_show
if suppressed_count > 0:
    print(f"... {suppressed_count} further failure messages not shown")

print(f"\nFinished processing. Total reads: {read_count}, Successfully parsed: {processed_count}")
# Print final parsing stats from the parser instance
print("\nParser Stats:")
print(read_parser.get_parse_log())


{'total_reads': 0, 'successful_reads': 0, 'failed_reads': 0, 'failures_by_step': {'check_upstream_motif_r1': 0, 'extract_bc3': 0, 'extract_bc2': 0, 'extract_bc1': 0, 'good_barcodes': 0, 'extract_vt3': 0, 'extract_vt2': 0, 'extract_vt1': 0, 'find_adapter_r1': 0, 'find_polya_r1': 0, 'find_adapter_r2': 0, 'compute_end_r1': 0, 'compute_end_r2': 0, 'check_read_lens': 0, 'assemble_bead_tag': 0, 'assemble_varietal_tag': 0, 'assemble_read_tag': 0}}
Processing R1: ../data/test_data/r1_new2N.fastq.gz
Processing R2: ../data/test_data/r2_new2N.fastq.gz
Read 4 FAIL: Test operation failed
Read 9 FAIL: Hamming_test operation failed
Read 11 FAIL: Hamming_test operation failed
Read 12 FAIL: Hamming_test operation failed
Read 13 FAIL: Hamming_test operation failed
Read 16 FAIL: Hamming_test operation failed
Read 17 FAIL: Test operation failed
Read 22 FAIL: Test operation failed
Read 26 FAIL: Test operation failed
Read 29 FAIL: Hamming_test operation failed
Read 31 FAIL: Test operation failed
Read 33 FAI

In [42]:
# Inspect the parser's `regex_patterns` attribute; as the last expression of
# the cell its value is displayed (here: compiled patterns keyed by name).
read_parser.regex_patterns

{'adapter_r1': re.compile(r'CCAAACACACCCAA|(CCAA|CCAAA|CCAAAC|CCAAACA|CCAAACAC|CCAAACACA|CCAAACACAC|CCAAACACACC|CCAAACACACCC|CCAAACACACCCA|CCAAACACACCCAA)$',
            re.UNICODE),
 'polyA_tail': re.compile(r'AAAAAAAAAAAAAA', re.UNICODE),
 'adapter_r2': re.compile(r'GAGCGGACTCTGCG|(GAGC|GAGCG|GAGCGG|GAGCGGA|GAGCGGAC|GAGCGGACT|GAGCGGACTC|GAGCGGACTCT|GAGCGGACTCTG|GAGCGGACTCTGC|GAGCGGACTCTGCG)$',
            re.UNICODE)}

In [43]:

import pandas as pd


# Turn the collected parser result dictionaries into two tables, one built
# from good_rows and one from bad_rows, then display the good one.
df_good, df_bad = pd.DataFrame(good_rows), pd.DataFrame(bad_rows)
df_good

Unnamed: 0,read_id,len_seq1,len_seq2,motif_ok,bc3,bc3_match,bc2,bc2_match,bc1,bc1_match,...,adapter1_start,polya_start,adapter2_start,end_r1,end_r2,read_lens_ok,bead_tag,varietal_tag,read_tag,status
0,A00296:241:HMWTTDRXX:1:1101:17201:6073,150,150,True,CTGTTA,True,GTTCCC,True,ACTTC,True,...,,,,150,150,True,CTGTTAGTTCCCACTTC,ACTTCGCTAGGC,CTGTTAGTTCCCACTTC_ACTTCGCTAGGC,ok
1,A00296:241:HMWTTDRXX:1:1101:17219:6073,150,150,True,CTAAGA,True,ATTTAC,True,CGTGA,True,...,,,71.0,150,46,True,CTAAGAATTTACCGTGA,GTTTTAGGGCAG,CTAAGAATTTACCGTGA_GTTTTAGGGCAG,ok
2,A00296:241:HMWTTDRXX:1:1101:17255:6073,150,150,True,TCTGGA,True,ATACCG,True,TGTAA,True,...,,,,150,150,True,TCTGGAATACCGTGTAA,AAGTACCTCGCG,TCTGGAATACCGTGTAA_AAGTACCTCGCG,ok
3,A00296:241:HMWTTDRXX:1:1101:17291:6073,150,150,True,GCTACT,True,GAAGCG,True,GGTCA,True,...,,,67.0,150,42,True,GCTACTGAAGCGGGTCA,CGGGTTATTGAA,GCTACTGAAGCGGGTCA_CGGGTTATTGAA,ok
4,A00296:241:HMWTTDRXX:1:1101:17327:6073,150,150,True,TGATGG,True,ATTCAG,True,ATTCC,True,...,,,,150,150,True,TGATGGATTCAGATTCC,TGTACAATGCAC,TGATGGATTCAGATTCC_TGTACAATGCAC,ok
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127,A00296:241:HMWTTDRXX:1:1101:27706:6073,150,150,True,CTCATA,True,TTGGCC,True,ACAGC,True,...,,,,150,150,True,CTCATATTGGCCACAGC,TTTTATCTGAGC,CTCATATTGGCCACAGC_TTTTATCTGAGC,ok
128,A00296:241:HMWTTDRXX:1:1101:27832:6073,150,150,True,TCAGAT,True,AAAGAT,True,GATAC,True,...,,,,150,150,True,TCAGATAAAGATGATAC,AGTAAACCCGGC,TCAGATAAAGATGATAC_AGTAAACCCGGC,ok
129,A00296:241:HMWTTDRXX:1:1101:27887:6073,150,150,True,GATTTG,True,TGGTGC,True,GATGT,True,...,,,69.0,150,44,True,GATTTGTGGTGCGATGT,GTTTCATGGGGG,GATTTGTGGTGCGATGT_GTTTCATGGGGG,ok
130,A00296:241:HMWTTDRXX:1:1101:27923:6073,150,150,True,TCTTTT,True,GTTCCC,True,ACAGC,True,...,,,,150,150,True,TCTTTTGTTCCCACAGC,TCGTATTGGGCG,TCTTTTGTTCCCACAGC_TCGTATTGGGCG,ok


In [44]:
# Look up a single read by id in the failed-read table and show its first
# matching row (swap df_bad for df_good to search the parsed reads instead).
read_id_of_interest = "A00296:241:HMWTTDRXX:1:1101:27145:6073"
df_bad.loc[df_bad["read_id"] == read_id_of_interest].iloc[0]

read_id        A00296:241:HMWTTDRXX:1:1101:27145:6073
status                                           fail
failed_step                           check_read_lens
message                         Test operation failed
Name: 108, dtype: object