In [1]:
from pathlib import Path
from Bio import SeqIO
from Bio.Seq import Seq
import gzip
from tqdm import tqdm
import pandas as pd
from collections import defaultdict, Counter
import matplotlib.pyplot as plt
import json

In [2]:
def count_duplicate_levels(umi_sequences) -> dict[int, int]:
    """Summarise how often UMI sequences repeat.

    Parameters
    ----------
    umi_sequences
        Iterable of hashable UMI sequences (plain strings in this notebook).
        It is consumed exactly once, so a generator is fine.

    Returns
    -------
    dict[int, int]
        Maps a multiplicity ``k`` to the number of distinct sequences that
        occurred exactly ``k`` times. Empty input yields an empty dict.
    """
    reads_per_umi = Counter(umi_sequences)
    return dict(Counter(reads_per_umi.values()))


# Parsing the raw FASTQ files is slow (the recorded run took roughly 10-13
# minutes per file), so it is guarded by this flag. Set it to True to
# regenerate 'umi_duplicate_counters.json'.
run_fastq_processing = False
if run_fastq_processing:
    # file name -> {multiplicity: number of distinct UMIs with that multiplicity}
    duplicate_counters_per_file: dict[str, dict] = {}

    data_folder = Path('/cellfile/datapublic/jkoubele/FLI_total_RNA/20240219_866_YC')
    # iterdir() yields entries in arbitrary order; sorting keeps the processing
    # order (and therefore the key order of the JSON output) reproducible.
    for file_path in sorted(data_folder.iterdir()):
        if 'UMI' not in file_path.name:
            continue

        with gzip.open(file_path, mode='rt') as file:
            reads = tqdm(SeqIO.parse(file, "fastq"), desc=f'Processing reads from {file_path.name}')
            # Key by plain str instead of Bio.Seq objects: cheaper to keep in
            # memory for many distinct UMIs and independent of how Seq
            # implements hashing/equality.
            duplicate_counters_per_file[file_path.name] = count_duplicate_levels(
                str(read.seq) for read in reads
            )

    # json.dump converts the integer multiplicities to string keys.
    with open('umi_duplicate_counters.json', 'w') as output_file:
        json.dump(duplicate_counters_per_file, output_file)

Processing reads from no010-0_OND1_UMI.fastq.gz: 61812292it [11:05, 92875.90it/s]
Processing reads from no024-0_OD6_UMI.fastq.gz: 64319131it [11:55, 89881.89it/s]
Processing reads from no005-0_OD2_UMI.fastq.gz: 56278072it [10:14, 91516.51it/s]
Processing reads from no028-0_OND4_UMI.fastq.gz: 66329880it [11:55, 92768.81it/s]
Processing reads from no026-0_ON5_UMI.fastq.gz: 65257883it [11:41, 92986.15it/s]
Processing reads from no021-0_OA6_UMI.fastq.gz: 72284370it [12:56, 93117.21it/s]
Processing reads from no020-0_OA5_UMI.fastq.gz: 53299134it [09:32, 93133.80it/s]
Processing reads from no030-0_OND6_UMI.fastq.gz: 64716704it [11:34, 93217.61it/s]
Processing reads from no034-0_ON3_UMI.fastq.gz: 58222801it [10:25, 93047.03it/s]
Processing reads from no011-0_OND2_UMI.fastq.gz: 55611686it [09:53, 93764.92it/s]
Processing reads from no019-0_OA4_UMI.fastq.gz: 52543956it [09:23, 93172.97it/s]
Processing reads from no022-0_OD4_UMI.fastq.gz: 70044023it [12:33, 92986.30it/s]
Processing reads from no