In [1]:
from pathlib import Path
from Bio import SeqIO
from Bio.Seq import Seq
import gzip
from tqdm import tqdm
import pandas as pd
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import json

In [None]:
# Toggle for the FASTQ pass below; when False this cell reads and writes nothing.
run_fastq_processing = True
if run_fastq_processing:
    # Maps FASTQ file name -> {occurrence count: number of distinct sequences seen that many times}.
    duplicate_counters_per_file: dict[str, dict] = {}

    data_folder = Path('/cellfile/datapublic/jkoubele/FLI_total_RNA/20240219_866_YC')
    # iterdir() yields entries in arbitrary order; sorting makes the processing order
    # (and therefore the key order of the JSON written below) reproducible.
    for file_path in sorted(data_folder.iterdir()):
        # Only regular files with 'UMI' in their name are read; a sub-directory that
        # happens to match the name filter would make gzip.open() fail.
        if not file_path.is_file() or 'UMI' not in file_path.name:
            continue

        # Number of reads observed for each distinct sequence in this file.
        umi_duplicates = Counter()
        with gzip.open(file_path, mode='rt') as file:
            for read in tqdm(SeqIO.parse(file, "fastq"), desc=f'Processing reads from {file_path.name}'):
                # Key on the plain sequence string rather than the Seq object, so the
                # grouping does not depend on how the installed Biopython hashes/compares
                # Seq instances, and only a lightweight str is kept per distinct sequence.
                umi_duplicates[str(read.seq)] += 1

        # Collapse to a histogram: how many distinct sequences occurred once, twice, ...
        duplicate_counters_per_file[file_path.name] = dict(Counter(umi_duplicates.values()))

    # NOTE: json.dump writes the integer histogram keys as strings; convert them back
    # with int(...) when loading this file.
    with open('umi_duplicate_counters.json', 'w') as output_file:
        json.dump(duplicate_counters_per_file, output_file)

Processing reads from no010-0_OND1_UMI.fastq.gz: 1319894it [00:14, 82677.42it/s]