In [78]:
import pandas as pd 
import os 
import re
from tqdm import tqdm 

input_fastq_simplified = "/media/hieunguyen/HNSD01/src/ampliconSeq_UMI/src/1-TML1S1_S7501-S7701_R1_umi_seq.txt"

# Headerless, tab-separated table; the columns are labelled UMI and seq below.
df = pd.read_csv(input_fastq_simplified, sep="\t", header=None)
df.columns = ["UMI", "seq"]

# Count the non-null `seq` entries per UMI and order the UMIs from most to least frequent.
reads_per_umi = df.groupby("UMI").count()
reads_per_umi = reads_per_umi.sort_values("seq", ascending=False).reset_index()

# Keep only the UMIs that do not contain the letter "N".
umi_has_n = reads_per_umi["UMI"].str.contains("N")
count_UMI = reads_per_umi[umi_has_n.eq(False)]

from itertools import combinations

def hamming_distance(s1, s2):
    """Return the number of positions at which two equal-length sequences differ.

    Parameters
    ----------
    s1, s2 : sequences (typically str) of the same length.

    Returns
    -------
    int
        Count of positions where the elements of ``s1`` and ``s2`` differ.

    Raises
    ------
    ValueError
        If the inputs have different lengths. ``zip`` would otherwise stop at
        the shorter input and silently ignore the extra characters, reporting
        a distance that is too small (e.g. 0 for a string and its prefix).
    """
    if len(s1) != len(s2):
        raise ValueError(
            f"hamming_distance requires equal-length inputs, got {len(s1)} and {len(s2)}"
        )
    return sum(c1 != c2 for c1, c2 in zip(s1, s2))

# Greedy UMI collapsing: walk the UMIs in the order of count_UMI (sorted by
# descending read count above) and fold every not-yet-merged UMI that differs
# by exactly one substitution into the UMI currently being visited.
umi_counts = dict(zip(count_UMI['UMI'], count_UMI['seq']))

# Characters that actually occur in the UMIs; single-substitution neighbours
# are generated from these, so no fixed alphabet is assumed.
umi_alphabet = sorted(set("".join(umi_counts)))

merged = set()
new_umi_counts = {}

# Iterating the dict preserves the insertion order, i.e. the order of count_UMI.
for umi in umi_counts:
    if umi in merged:
        continue
    total_count = umi_counts[umi]
    # Enumerate every sequence at Hamming distance 1 and look it up in the
    # count table instead of comparing this UMI against all other UMIs; this
    # avoids the quadratic all-pairs scan. Neighbours always have the same
    # length as `umi`, so UMIs of different lengths are never merged.
    for pos, base in enumerate(umi):
        for alt in umi_alphabet:
            if alt == base:
                continue
            neighbour = umi[:pos] + alt + umi[pos + 1:]
            if neighbour in umi_counts and neighbour not in merged:
                total_count += umi_counts[neighbour]
                merged.add(neighbour)
    new_umi_counts[umi] = total_count
    merged.add(umi)

combined_UMI = pd.DataFrame(list(new_umi_counts.items()), columns=['UMI', 'seq']).sort_values('seq', ascending=False).reset_index(drop=True)

# Pick the primer column from the input file name: a file name containing
# "R1" selects the forward primers, anything else the reverse primers.
fastq_file_name = str(input_fastq_simplified).rsplit("/", 1)[-1]
col = "forward" if "R1" in fastq_file_name else "reverse"

def parse_fasta_to_df(fasta_path):
    """Read a FASTA file into a two-column DataFrame.

    Parameters
    ----------
    fasta_path : str or path-like
        Path of the FASTA file to read.

    Returns
    -------
    pandas.DataFrame
        One row per record with columns ``id`` (the header line without the
        leading ``>``) and ``seq`` (all sequence lines of the record joined
        together). The two columns are present even when the file contains no
        records, so callers can rename them unconditionally.

    Notes
    -----
    Lines are stripped of surrounding whitespace; blank lines and any text
    that appears before the first header are ignored.
    """
    records = []
    seq_id = None
    chunks = []
    with open(fasta_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith('>'):
                # A new header closes the record collected so far.
                if seq_id is not None:
                    records.append({'id': seq_id, 'seq': ''.join(chunks)})
                seq_id = line[1:]
                chunks = []
            elif seq_id is not None:
                chunks.append(line)
    # Flush the last record, which has no following header to close it.
    if seq_id is not None:
        records.append({'id': seq_id, 'seq': ''.join(chunks)})
    # Explicit columns keep the schema stable for an empty record list.
    return pd.DataFrame(records, columns=['id', 'seq'])

# Load the forward and reverse primer FASTA files and relabel the parsed
# columns so that the sequence column carries the primer direction.
primer_tables = {}
for direction, fasta_file in (
    ("forward", "../primers/20250526/forward_primers.fa"),
    ("reverse", "../primers/20250526/reverse_primers.fa"),
):
    primer_table = parse_fasta_to_df(fasta_file)
    primer_table.columns = ["name", direction]
    primer_tables[direction] = primer_table

forward_primerdf = primer_tables["forward"]
reverse_primerdf = primer_tables["reverse"]

# Outer join on the record name: names present in only one file are kept,
# with a missing value in the other direction's column.
primerdf = pd.merge(forward_primerdf, reverse_primerdf, on="name", how="outer")

# IUPAC nucleotide ambiguity codes expressed as regex character classes.
# "N" also accepts a literal N so that uncalled bases in a read still match.
IUPAC_REGEX = {
    "R": "[AG]", "Y": "[CT]", "S": "[CG]", "W": "[AT]", "K": "[GT]", "M": "[AC]",
    "B": "[CGT]", "D": "[AGT]", "H": "[ACT]", "V": "[ACG]", "N": "[ACGTN]",
}


def primer_to_regex(primer):
    """Compile a primer sequence into a case-insensitive regular expression.

    IUPAC ambiguity codes are expanded to the set of bases they stand for;
    every other character is matched literally (escaped, so it can never be
    interpreted as regex syntax).
    """
    return re.compile(
        "".join(IUPAC_REGEX.get(base.upper(), re.escape(base)) for base in primer),
        re.IGNORECASE,
    )


# For every primer region add a 0/1 column to df: 1 when the primer pattern
# occurs anywhere in the read sequence, 0 otherwise.
for region_name, primer in tqdm(zip(primerdf["name"], primerdf[col]), total=primerdf.shape[0]):
    # The outer merge leaves a missing value when a region has no primer for
    # this read direction; an empty primer would match every read.
    if pd.isna(primer) or not primer:
        tqdm.write(f"Skipping {region_name}: no {col} primer available")
        continue
    pattern = primer_to_regex(primer)
    df[region_name] = df['seq'].apply(lambda x: int(bool(pattern.search(x))))


100%|██████████| 17/17 [00:51<00:00,  3.00s/it]


In [83]:
# Column-wise totals over every column except UMI and seq.
df.loc[:, ~df.columns.isin(["UMI", "seq"])].sum()

L10_MNX1_NOM1_chr7:156798304-156798426            0
L11_MNX1_NOM1_chr7:156798407-156798535            1
L12_MNX1_NOM1_chr7:156798515-156798624            0
L13_MNX1_NOM1_chr7:156798603-156798720            0
L14_IFFO1_chr12:6664808-6664941                   0
L15_PTGER4_PRKAA1_chr5:40681146-40681273          0
L16_PTGER4_PRKAA1_chr5:40681178-40681323          0
L17_PTGER4_PRKAA1_chr5:40681281-40881401          0
L1_HEPCAM_chr11:124805885-124806030         1233183
L2_HOXA7_chr7:27196325-27196469                   0
L3_HOXA7_chr7:27196448-27196598                   0
L4_HOXA7_chr7:27196567-27196695                   0
L5_RASSF1A_chr3:50378015-50378114                 0
L6_RASSF1A_chr3:50378080-50378216                 0
L7_VEPH1_SHOX2_chr3:157812110-157812257           0
L8_VEPH1_SHOX2_chr3:157812219-157812358           0
L9_VEPH1_SHOX2_chr3:157812363-157812500           0
dtype: int64

In [93]:
# Rows with UMI "CCCCCC" whose L1_HEPCAM column is not 1.
df.loc[df["UMI"].eq("CCCCCC") & df["L1_HEPCAM_chr11:124805885-124806030"].ne(1)]

Unnamed: 0,UMI,seq,L10_MNX1_NOM1_chr7:156798304-156798426,L11_MNX1_NOM1_chr7:156798407-156798535,L12_MNX1_NOM1_chr7:156798515-156798624,L13_MNX1_NOM1_chr7:156798603-156798720,L14_IFFO1_chr12:6664808-6664941,L15_PTGER4_PRKAA1_chr5:40681146-40681273,L16_PTGER4_PRKAA1_chr5:40681178-40681323,L17_PTGER4_PRKAA1_chr5:40681281-40881401,L1_HEPCAM_chr11:124805885-124806030,L2_HOXA7_chr7:27196325-27196469,L3_HOXA7_chr7:27196448-27196598,L4_HOXA7_chr7:27196567-27196695,L5_RASSF1A_chr3:50378015-50378114,L6_RASSF1A_chr3:50378080-50378216,L7_VEPH1_SHOX2_chr3:157812110-157812257,L8_VEPH1_SHOX2_chr3:157812219-157812358,L9_VEPH1_SHOX2_chr3:157812363-157812500
16061,CCCCCC,TGTATTTAGAGGGAAGTTGGTAGGGTTGTGTTTGGGTAGAGAAATT...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
18374,CCCCCC,GTGTATTTAGAGGGAACTTGGTAGGGTTGTGTTTGGGTAGAGAAAT...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
18687,CCCCCC,GTGTATTTAGAGGGAACTTGGTAGGGTTGTGTTTGGGTAGAGAAAT...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
30938,CCCCCC,GTGTATTTAGACGGAAGTTGGTAGGGTTGTGTTTGGGTAGAGAAAT...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
31556,CCCCCC,GTGTATTTACAGGGAAGTTGGTAGGGTTGTGTTTGGGTAGAGAAAT...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305529,CCCCCC,GTGTATTTAGAGGCAAGTTGGTAGGGTTGTGTTTGGGTAGAGGAAT...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1308095,CCCCCC,ATGTATTTAGAGGGAAGTTGGTAGGGTTGTGTTTGGGTAGAGAAAT...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1310146,CCCCCC,GTGTATTTAGAGCGAAGTTGGTAGGGTTGTGTTTGGGTAGAGAAAT...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1314438,CCCCCC,GTGTATTTAGAGGCAAGTTGGTAGGGTTGTGTTTGGGTAGAGAAAT...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [92]:
# Rows with UMI "CCCCCC" whose L1_HEPCAM column equals 1.
df.loc[df["UMI"].eq("CCCCCC") & df["L1_HEPCAM_chr11:124805885-124806030"].eq(1)]

Unnamed: 0,UMI,seq,L10_MNX1_NOM1_chr7:156798304-156798426,L11_MNX1_NOM1_chr7:156798407-156798535,L12_MNX1_NOM1_chr7:156798515-156798624,L13_MNX1_NOM1_chr7:156798603-156798720,L14_IFFO1_chr12:6664808-6664941,L15_PTGER4_PRKAA1_chr5:40681146-40681273,L16_PTGER4_PRKAA1_chr5:40681178-40681323,L17_PTGER4_PRKAA1_chr5:40681281-40881401,L1_HEPCAM_chr11:124805885-124806030,L2_HOXA7_chr7:27196325-27196469,L3_HOXA7_chr7:27196448-27196598,L4_HOXA7_chr7:27196567-27196695,L5_RASSF1A_chr3:50378015-50378114,L6_RASSF1A_chr3:50378080-50378216,L7_VEPH1_SHOX2_chr3:157812110-157812257,L8_VEPH1_SHOX2_chr3:157812219-157812358,L9_VEPH1_SHOX2_chr3:157812363-157812500
3118,CCCCCC,GTGTATTTAGAGGGAAGTTGGTAGGGTTGTGTTTGGGTAGAGAAAT...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
4303,CCCCCC,GTGTATTTAGAGGGAAGTTGGTAGGGTTGTGTTTGGGTAGAGAAAT...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
5941,CCCCCC,GTGTATTTAGAGGGAAGTTGGTAGGGTTGTGTTTGGGTAGAGAAAT...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
6828,CCCCCC,GTGTATTTAGAGGGAAGTTGGTAGGGTTGTGTTTGGGTAGAGAAAT...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
7435,CCCCCC,GTGTATTTAGAGGGAAGTTGGTAGGGTTGTGTTTGGGTAGAGAAAT...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1329938,CCCCCC,GTGTATTTAGAGGGAAGTTGGTAGGGTTGTGTTTGGGTAGAGAAAT...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1330631,CCCCCC,GTGTATTTAGAGGGAAGTTGGTAGGGTTGTGTTTGGGTAGAGAAAT...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1331451,CCCCCC,GTGTATTTAGAGGGAAGTTGGTAGGGTTGTGTTTGGGTAGAGAAAT...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1332134,CCCCCC,GTGTATTTAGAGGGAAGTTGGTAGGGTTGTGTTTGGGTAGAGAAAT...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
