In [1]:
import pandas as pd
import re
from pathlib import Path
from collections import defaultdict

In [16]:
# Load the table that maps each "Id Sample" to its guid (alongside source, extraction date and plate position).
df = pd.read_csv("mada_illumina_guuids.csv")

In [17]:
df

Unnamed: 0,Id Sample,Source,Date extraction,guid,Reference Genome,Plate name,Row ID,Column ID,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,Mada-P201S,Sputum,26/11/19,a7563adf-562d-458e-959a-1d7de9cb16d1,R00000039,PR2832_CRyPTIC_Peru,G,2,,,
1,Mada-P202S,Sputum,26/11/19,c8903173-fa62-40ef-938d-b2f3bc2ded7a,R00000039,PR2832_CRyPTIC_Peru,H,2,,,
2,Mada-P203S,Sputum,7/10/19,6b45dd89-fd62-4c7c-8a11-3ba342f49484,R00000039,PR2832_CRyPTIC_Peru,A,3,,,
3,Mada-P204S,Sputum,26/11/19,58c59cd2-6749-4508-8d10-b8d9117f1c0c,R00000039,PR2832_CRyPTIC_Peru,B,3,,,
4,Mada-P205S,Sputum,26/11/19,1533e371-e622-41fe-a253-d90ef46c15cd,R00000039,PR2832_CRyPTIC_Peru,C,3,,,
...,...,...,...,...,...,...,...,...,...,...,...
77,Mada-P227S_r_1,Sputum,26/9/19,9798bdc3-b73b-4f68-87c5-efb2d346a733,R00000039,PR2832_CRyPTIC_Peru,D,12,,,
78,Mada-P228S_r_1,Sputum,26/9/19,0c62a97f-591f-4f6a-9730-e02a5a7fb03a,R00000039,PR2832_CRyPTIC_Peru,E,12,,,
79,Mada-P229S_r_1,Sputum,8/10/19,cae8c334-44a0-45e6-8ed8-fd41b3d8662a,R00000039,PR2832_CRyPTIC_Peru,F,12,,,
80,Mada-P206S_r_1,Sputum,17/10/19,af661c80-aa1e-4f83-8759-ac87e8e94ccc,R00000039,PR2832_CRyPTIC_Peru,G,12,,,


In [18]:
def validate_sample(sample, src):
    """Check that the sample ID's source letter agrees with the source column.

    The letter is the last character of the part of ``sample`` before the
    first underscore; it is compared case-insensitively with the first
    character of ``src``.

    Raises:
        AssertionError: when the two letters differ. The message is the
            lower-cased letter and ``src`` separated by a tab.
    """
    base_id, *_ = sample.split("_")
    source_letter = base_id[-1].lower()
    assert source_letter == src.lower()[0], f"{source_letter}\t{src}"
    
def transform(s):
    """Convert a raw ``Id Sample`` value into the short sample name.

    Steps, in order:
      1. drop the ``Mada-`` prefix;
      2. rewrite ``Pctrl`` as ``PCtrl-``;
      3. turn a trailing ``_r`` into ``_rep1`` and a trailing ``_r_1`` into
         ``_rep2``;
      4. remove an ``S`` or ``C`` that directly follows a digit.

    Args:
        s: raw sample identifier, e.g. ``"Mada-P227S_r_1"``.

    Returns:
        The normalised name, e.g. ``"P227_rep2"``.
    """
    s = s.replace("Mada-", "")
    s = s.replace("Pctrl", "PCtrl-")
    if s.endswith("_r"):
        s += "ep1"
    elif s.endswith("_r_1"):
        # Rewrite only the suffix that was just matched, not every "r_1" in the name.
        s = s[: -len("_r_1")] + "_rep2"
    # Raw replacement string: "\g" is not a valid escape in a normal string literal.
    return re.sub(r"(?P<num>\d)[SC]", r"\g<num>", s)

In [19]:
# Collect one (sample name, lower-cased source, guid) tuple per row of `df`.
# Each raw sample ID is validated against its source before being renamed.
data = []
i = 0  # number of rows processed
for raw_id, raw_source, raw_guid in zip(df["Id Sample"], df["Source"], df["guid"]):
    sample = raw_id.strip()
    src = raw_source.strip()
    guuid = raw_guid.strip()
    validate_sample(sample, src)
    sample = transform(sample)
    data.append((sample, src.lower(), guuid))
    i += 1

In [22]:
# Sample sheet: one guid per row, indexed by the (sample, source) pair.
odf = pd.DataFrame(
    data,
    columns=["sample", "source", "guuid"],
).set_index(["sample", "source"])

In [23]:
odf

Unnamed: 0_level_0,Unnamed: 1_level_0,guuid
sample,source,Unnamed: 2_level_1
P201,sputum,a7563adf-562d-458e-959a-1d7de9cb16d1
P202,sputum,c8903173-fa62-40ef-938d-b2f3bc2ded7a
P203,sputum,6b45dd89-fd62-4c7c-8a11-3ba342f49484
P204,sputum,58c59cd2-6749-4508-8d10-b8d9117f1c0c
P205,sputum,1533e371-e622-41fe-a253-d90ef46c15cd
...,...,...
P227_rep2,sputum,9798bdc3-b73b-4f68-87c5-efb2d346a733
P228_rep2,sputum,0c62a97f-591f-4f6a-9730-e02a5a7fb03a
P229_rep2,sputum,cae8c334-44a0-45e6-8ed8-fd41b3d8662a
P206_rep2,sputum,af661c80-aa1e-4f83-8759-ac87e8e94ccc


In [58]:
# Write the sample sheet to disk; index=True also writes the frame's index to the file.
odf.to_csv("illumina_samplesheet.csv", index=True)

In [86]:
# Look up the sample named Test_38 in the sample sheet (the saved output has no matching rows).
odf.query("sample=='Test_38'")

Unnamed: 0,sample,source,guuid


In [None]:
# guids whose <guid>.bam file samtools could not open
# (these match the "No such file or directory" lines in the conversion log further down).
missing_guuids = [
    "c8903173-fa62-40ef-938d-b2f3bc2ded7a", 
    "fad80bbf-50e2-403f-beed-d40fafc5202f", 
    "02b01033-8310-4187-8007-92992893079f"
]

In [95]:
# Print a shell script that converts each sample's BAM (named <guid>.bam) into
# paired FASTQ files at <root>/<source>/<sample>_R{1,2}.fq.gz.
d = Path("/nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina")
prefixes = set()
# `sample` and `source` are index levels once set_index has been applied to `odf`,
# so row["sample"] / row["source"] would raise KeyError. reset_index() turns them
# back into columns, and is harmless if they are already ordinary columns.
for _, row in odf.reset_index().iterrows():
    outdir = d / row["source"]
    prefix = outdir / row["sample"]
    # Two rows mapping to the same prefix would overwrite each other's FASTQs.
    if prefix in prefixes:
        raise ValueError(f"Seen {prefix}")
    prefixes.add(prefix)
    cmd = f"samtools fastq -1 {prefix}_R1.fq.gz -2 {prefix}_R2.fq.gz {row['guuid']}.bam"
    print(f"echo \"Processing {prefix} ...\"")
    print(cmd)

echo "Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P201 ..."
samtools fastq -1 /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P201_R1.fq.gz -2 /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P201_R2.fq.gz a7563adf-562d-458e-959a-1d7de9cb16d1.bam
echo "Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P202 ..."
samtools fastq -1 /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P202_R1.fq.gz -2 /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P202_R2.fq.gz c8903173-fa62-40ef-938d-b2f3bc2ded7a.bam
echo "Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P203 ..."
samtools fastq -1 /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P203_R1.fq.gz -2 /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P203_R2.fq.gz 6b45dd89-fd62-4c7c-8a11-3ba342f49484.bam
echo "Processing /nfs/research/zi/zi/projects/

In [77]:
# Pure-culture FASTQ file names to rename (whitespace-separated listing)
s = """SO_9092_SET1_FMR_NP_15_R1.fastq.gz  SO_9092_SET2_FMR_NP_32_R1.fastq.gz  SO_9092_SET2_FMR_NP_41_R1.fastq.gz  SO_9092_SET2_FMR_NP_48_R1.fastq.gz  SO_9092_SET2_FMR_NP_56_R1.fastq.gz
SO_9092_SET1_FMR_NP_15_R2.fastq.gz  SO_9092_SET2_FMR_NP_32_R2.fastq.gz  SO_9092_SET2_FMR_NP_41_R2.fastq.gz  SO_9092_SET2_FMR_NP_48_R2.fastq.gz  SO_9092_SET2_FMR_NP_56_R2.fastq.gz
SO_9092_SET1_FMR_NP_16_R1.fastq.gz  SO_9092_SET2_FMR_NP_34_R1.fastq.gz  SO_9092_SET2_FMR_NP_42_R1.fastq.gz  SO_9092_SET2_FMR_NP_51_R1.fastq.gz  SO_9092_SET2_FMR_NP_60_R1.fastq.gz
SO_9092_SET1_FMR_NP_16_R2.fastq.gz  SO_9092_SET2_FMR_NP_34_R2.fastq.gz  SO_9092_SET2_FMR_NP_42_R2.fastq.gz  SO_9092_SET2_FMR_NP_51_R2.fastq.gz  SO_9092_SET2_FMR_NP_60_R2.fastq.gz
SO_9092_SET1_FMR_NP_19_R1.fastq.gz  SO_9092_SET2_FMR_NP_36_R1.fastq.gz  SO_9092_SET2_FMR_NP_43_R1.fastq.gz  SO_9092_SET2_FMR_NP_52_R1.fastq.gz  SO_9092_SET2_FMR_NP_65_R1.fastq.gz
SO_9092_SET1_FMR_NP_19_R2.fastq.gz  SO_9092_SET2_FMR_NP_36_R2.fastq.gz  SO_9092_SET2_FMR_NP_43_R2.fastq.gz  SO_9092_SET2_FMR_NP_52_R2.fastq.gz  SO_9092_SET2_FMR_NP_65_R2.fastq.gz
SO_9092_SET1_FMR_NP_20_R1.fastq.gz  SO_9092_SET2_FMR_NP_37_R1.fastq.gz  SO_9092_SET2_FMR_NP_45_R1.fastq.gz  SO_9092_SET2_FMR_NP_53_R1.fastq.gz  SO_9092_SET3_FMR_NP_22_R1.fq.gz
SO_9092_SET1_FMR_NP_20_R2.fastq.gz  SO_9092_SET2_FMR_NP_37_R2.fastq.gz  SO_9092_SET2_FMR_NP_45_R2.fastq.gz  SO_9092_SET2_FMR_NP_53_R2.fastq.gz  SO_9092_SET3_FMR_NP_22_R2.fq.gz
SO_9092_SET1_FMR_NP_28_R1.fastq.gz  SO_9092_SET2_FMR_NP_38_R1.fastq.gz  SO_9092_SET2_FMR_NP_46_R1.fastq.gz  SO_9092_SET2_FMR_NP_54_R1.fastq.gz  SO_9092_SET3_FMR_NP_50_R1.fq.gz
SO_9092_SET1_FMR_NP_28_R2.fastq.gz  SO_9092_SET2_FMR_NP_38_R2.fastq.gz  SO_9092_SET2_FMR_NP_46_R2.fastq.gz  SO_9092_SET2_FMR_NP_54_R2.fastq.gz  SO_9092_SET3_FMR_NP_50_R2.fq.gz
SO_9092_SET2_FMR_NP_23_R1.fastq.gz  SO_9092_SET2_FMR_NP_40_R1.fastq.gz  SO_9092_SET2_FMR_NP_47_R1.fastq.gz  SO_9092_SET2_FMR_NP_55_R1.fastq.gz
SO_9092_SET2_FMR_NP_23_R2.fastq.gz  SO_9092_SET2_FMR_NP_40_R2.fastq.gz  SO_9092_SET2_FMR_NP_47_R2.fastq.gz  SO_9092_SET2_FMR_NP_55_R2.fastq.gz"""

In [78]:
# Print one `mv` command per pure-culture FASTQ, renaming it to Test_<NN>_R<1|2>.fq.gz.
# <NN> is the two-digit number that precedes the _R1/_R2 suffix in the original name.
dests = set()
for fname in s.split():
    m = re.search(r"_(?P<num>\d\d)(?P<suf>_R[12])\.f", fname)
    if m is None:
        raise ValueError(fname)
    sample = "Test_" + m.group("num")
    dest = sample + m.group("suf") + ".fq.gz"
    # Refuse to emit two renames that target the same destination file.
    if dest in dests:
        raise ValueError(f"Seen {dest} before")
    dests.add(dest)
    print(f"mv {fname} {dest}")

mv SO_9092_SET1_FMR_NP_15_R1.fastq.gz Test_15_R1.fq.gz
mv SO_9092_SET2_FMR_NP_32_R1.fastq.gz Test_32_R1.fq.gz
mv SO_9092_SET2_FMR_NP_41_R1.fastq.gz Test_41_R1.fq.gz
mv SO_9092_SET2_FMR_NP_48_R1.fastq.gz Test_48_R1.fq.gz
mv SO_9092_SET2_FMR_NP_56_R1.fastq.gz Test_56_R1.fq.gz
mv SO_9092_SET1_FMR_NP_15_R2.fastq.gz Test_15_R2.fq.gz
mv SO_9092_SET2_FMR_NP_32_R2.fastq.gz Test_32_R2.fq.gz
mv SO_9092_SET2_FMR_NP_41_R2.fastq.gz Test_41_R2.fq.gz
mv SO_9092_SET2_FMR_NP_48_R2.fastq.gz Test_48_R2.fq.gz
mv SO_9092_SET2_FMR_NP_56_R2.fastq.gz Test_56_R2.fq.gz
mv SO_9092_SET1_FMR_NP_16_R1.fastq.gz Test_16_R1.fq.gz
mv SO_9092_SET2_FMR_NP_34_R1.fastq.gz Test_34_R1.fq.gz
mv SO_9092_SET2_FMR_NP_42_R1.fastq.gz Test_42_R1.fq.gz
mv SO_9092_SET2_FMR_NP_51_R1.fastq.gz Test_51_R1.fq.gz
mv SO_9092_SET2_FMR_NP_60_R1.fastq.gz Test_60_R1.fq.gz
mv SO_9092_SET1_FMR_NP_16_R2.fastq.gz Test_16_R2.fq.gz
mv SO_9092_SET2_FMR_NP_34_R2.fastq.gz Test_34_R2.fq.gz
mv SO_9092_SET2_FMR_NP_42_R2.fastq.gz Test_42_R2.fq.gz
mv SO_9092

In [80]:
# Sputum FASTQ file names to rename (whitespace-separated listing)
s = """SO_9090_SET4_FMR_IS_15_R1.fastq.gz  SO_9090_SET4_FMR_IS_28_R2.fastq.gz  SO_9090_SET4_FMR_IS_45_R1.fastq.gz  SO_9090_SET4_FMR_IS_60_R2.fastq.gz  SO_9092_SET1_FMR_IS_41_R1.fastq.gz
SO_9090_SET4_FMR_IS_15_R2.fastq.gz  SO_9090_SET4_FMR_IS_32_R1.fastq.gz  SO_9090_SET4_FMR_IS_45_R2.fastq.gz  SO_9090_SET4_FMR_IS_N1_R1.fastq.gz  SO_9092_SET1_FMR_IS_41_R2.fastq.gz
SO_9090_SET4_FMR_IS_16_R1.fastq.gz  SO_9090_SET4_FMR_IS_32_R2.fastq.gz  SO_9090_SET4_FMR_IS_47_R1.fastq.gz  SO_9090_SET4_FMR_IS_N1_R2.fastq.gz  SO_9092_SET1_FMR_IS_46_R1.fastq.gz
SO_9090_SET4_FMR_IS_16_R2.fastq.gz  SO_9090_SET4_FMR_IS_34_R1.fastq.gz  SO_9090_SET4_FMR_IS_47_R2.fastq.gz  SO_9090_SET4_FMR_IS_N2_R1.fastq.gz  SO_9092_SET1_FMR_IS_46_R2.fastq.gz
SO_9090_SET4_FMR_IS_18_R1.fastq.gz  SO_9090_SET4_FMR_IS_34_R2.fastq.gz  SO_9090_SET4_FMR_IS_48_R1.fastq.gz  SO_9090_SET4_FMR_IS_N2_R2.fastq.gz  SO_9092_SET1_FMR_IS_51_R1.fastq.gz
SO_9090_SET4_FMR_IS_18_R2.fastq.gz  SO_9090_SET4_FMR_IS_36_R1.fastq.gz  SO_9090_SET4_FMR_IS_48_R2.fastq.gz  SO_9090_SET4_FMR_IS_N3_R1.fastq.gz  SO_9092_SET1_FMR_IS_51_R2.fastq.gz
SO_9090_SET4_FMR_IS_19_R1.fastq.gz  SO_9090_SET4_FMR_IS_36_R2.fastq.gz  SO_9090_SET4_FMR_IS_50_R1.fastq.gz  SO_9090_SET4_FMR_IS_N3_R2.fastq.gz  SO_9092_SET1_FMR_IS_53_R1.fastq.gz
SO_9090_SET4_FMR_IS_19_R2.fastq.gz  SO_9090_SET4_FMR_IS_37_R1.fastq.gz  SO_9090_SET4_FMR_IS_50_R2.fastq.gz  SO_9090_SET4_FMR_IS_N4_R1.fastq.gz  SO_9092_SET1_FMR_IS_53_R2.fastq.gz
SO_9090_SET4_FMR_IS_20_R1.fastq.gz  SO_9090_SET4_FMR_IS_37_R2.fastq.gz  SO_9090_SET4_FMR_IS_52_R1.fastq.gz  SO_9090_SET4_FMR_IS_N4_R2.fastq.gz  SO_9092_SET1_FMR_IS_56_R1.fastq.gz
SO_9090_SET4_FMR_IS_20_R2.fastq.gz  SO_9090_SET4_FMR_IS_40_R1.fastq.gz  SO_9090_SET4_FMR_IS_52_R2.fastq.gz  SO_9090_SET4_FMR_IS_N5_R1.fastq.gz  SO_9092_SET1_FMR_IS_56_R2.fastq.gz
SO_9090_SET4_FMR_IS_22_R1.fastq.gz  SO_9090_SET4_FMR_IS_40_R2.fastq.gz  SO_9090_SET4_FMR_IS_54_R1.fastq.gz  SO_9090_SET4_FMR_IS_N5_R2.fastq.gz  SO_9092_SET1_FMR_IS_65_R1.fastq.gz
SO_9090_SET4_FMR_IS_22_R2.fastq.gz  SO_9090_SET4_FMR_IS_42_R1.fastq.gz  SO_9090_SET4_FMR_IS_54_R2.fastq.gz  SO_9090_SET4_FMR_IS_N6_R1.fastq.gz  SO_9092_SET1_FMR_IS_65_R2.fastq.gz
SO_9090_SET4_FMR_IS_23_R1.fastq.gz  SO_9090_SET4_FMR_IS_42_R2.fastq.gz  SO_9090_SET4_FMR_IS_55_R1.fastq.gz  SO_9090_SET4_FMR_IS_N6_R2.fastq.gz
SO_9090_SET4_FMR_IS_23_R2.fastq.gz  SO_9090_SET4_FMR_IS_43_R1.fastq.gz  SO_9090_SET4_FMR_IS_55_R2.fastq.gz  SO_9092_SET1_FMR_IS_38_R1.fastq.gz
SO_9090_SET4_FMR_IS_28_R1.fastq.gz  SO_9090_SET4_FMR_IS_43_R2.fastq.gz  SO_9090_SET4_FMR_IS_60_R1.fastq.gz  SO_9092_SET1_FMR_IS_38_R2.fastq.gz"""

In [81]:
# Print one `mv` command per sputum FASTQ, renaming it to Test_<id>_R<1|2>.fq.gz.
# <id> is the two-character token after "IS_"; an id beginning with "N" (e.g. N1)
# is written as NC<digit>.
dests = set()
for fname in s.split():
    m = re.search(r"IS_(?P<num>[N0-9]\d)(?P<suf>_R[12])\.f", fname)
    if m is None:
        raise ValueError(fname)
    sid = m.group("num")
    if sid.startswith("N"):
        sid = "NC" + sid[1:]

    sample = "Test_" + sid
    dest = sample + m.group("suf") + ".fq.gz"
    # Refuse to emit two renames that target the same destination file.
    if dest in dests:
        raise ValueError(f"Seen {dest} before")
    dests.add(dest)
    print(f"mv {fname} {dest}")

mv SO_9090_SET4_FMR_IS_15_R1.fastq.gz Test_15_R1.fq.gz
mv SO_9090_SET4_FMR_IS_28_R2.fastq.gz Test_28_R2.fq.gz
mv SO_9090_SET4_FMR_IS_45_R1.fastq.gz Test_45_R1.fq.gz
mv SO_9090_SET4_FMR_IS_60_R2.fastq.gz Test_60_R2.fq.gz
mv SO_9092_SET1_FMR_IS_41_R1.fastq.gz Test_41_R1.fq.gz
mv SO_9090_SET4_FMR_IS_15_R2.fastq.gz Test_15_R2.fq.gz
mv SO_9090_SET4_FMR_IS_32_R1.fastq.gz Test_32_R1.fq.gz
mv SO_9090_SET4_FMR_IS_45_R2.fastq.gz Test_45_R2.fq.gz
mv SO_9090_SET4_FMR_IS_N1_R1.fastq.gz Test_NC1_R1.fq.gz
mv SO_9092_SET1_FMR_IS_41_R2.fastq.gz Test_41_R2.fq.gz
mv SO_9090_SET4_FMR_IS_16_R1.fastq.gz Test_16_R1.fq.gz
mv SO_9090_SET4_FMR_IS_32_R2.fastq.gz Test_32_R2.fq.gz
mv SO_9090_SET4_FMR_IS_47_R1.fastq.gz Test_47_R1.fq.gz
mv SO_9090_SET4_FMR_IS_N1_R2.fastq.gz Test_NC1_R2.fq.gz
mv SO_9092_SET1_FMR_IS_46_R1.fastq.gz Test_46_R1.fq.gz
mv SO_9090_SET4_FMR_IS_16_R2.fastq.gz Test_16_R2.fq.gz
mv SO_9090_SET4_FMR_IS_34_R1.fastq.gz Test_34_R1.fq.gz
mv SO_9090_SET4_FMR_IS_47_R2.fastq.gz Test_47_R2.fq.gz
mv SO_90

Which Illumina replicates have the most reads?

In [1]:
log = """Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P201 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 512952 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P202 ...
[E::hts_open_format] Failed to open file "c8903173-fa62-40ef-938d-b2f3bc2ded7a.bam" : No such file or directory
samtools bam2fq: Cannot read file "c8903173-fa62-40ef-938d-b2f3bc2ded7a.bam": No such file or directory
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P203 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 1506850 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P204 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 7628062 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P205 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 4865832 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/PCtrl-1 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 3704058 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P206 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 843978 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P207 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 8686918 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P208 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 7044004 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P209 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 5375586 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P210 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 5541282 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/PCtrl-2 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 5995356 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P211 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 6204742 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P212 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 5392074 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P213 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 335906 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P214 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 5249570 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P215 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 4562012 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/PCtrl-3 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 5729924 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P216 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 4261300 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P217 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 4754716 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P218 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 4127046 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P219 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 6320304 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P220 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 7913642 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/PCtrl-4 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 3508394 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P221 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 5506098 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P222 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 3602582 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P223 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 3868806 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P224 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 16259172 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P225 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 358508 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/PCtrl-5 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 4893646 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P226 ...
[E::hts_open_format] Failed to open file "fad80bbf-50e2-403f-beed-d40fafc5202f.bam" : No such file or directory
samtools bam2fq: Cannot read file "fad80bbf-50e2-403f-beed-d40fafc5202f.bam": No such file or directory
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P227 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 6836632 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P228 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 3947488 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P229 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 6165866 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P230 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 8675034 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/PCtrl-6 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 3537368 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P202 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 7637918 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P203 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 7369748 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P204 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 8931860 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P205 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 6973944 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P209 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 6467238 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P210 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 8136382 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P211 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 6738760 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P212 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 8501868 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P214 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 7568100 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P215 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 7355188 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P216 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 6860014 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P217 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 8785700 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P218 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 7132708 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P221 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 8635980 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P222 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 7432384 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P223 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 10314772 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P224 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 6996926 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P225 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 7302992 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P226 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 7413090 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P227 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 6214300 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P228 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 7635494 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P229 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 8224008 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P230 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 8433420 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P206_rep1 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 976840 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P219_rep1 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 7823882 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P220_rep1 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 8345412 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P222_rep1 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 3144502 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P225_rep1 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 363184 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P227_rep1 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 7525428 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P228_rep1 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 4407484 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P229_rep1 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 6651428 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P228_rep1 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 7566494 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P229_rep1 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 8276894 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/culture/P230_rep1 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 7385096 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P201_rep1 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 643760 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P202_rep1 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 4579114 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P203_rep1 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 1590352 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P204_rep1 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 9164788 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P205_rep1 ...
[E::hts_open_format] Failed to open file "02b01033-8310-4187-8007-92992893079f.bam" : No such file or directory
samtools bam2fq: Cannot read file "02b01033-8310-4187-8007-92992893079f.bam": No such file or directory
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P210_rep1 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 6656830 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P211_rep1 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 6124074 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P227_rep2 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 8051014 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P228_rep2 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 4381138 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P229_rep2 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 5148456 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P206_rep2 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 851262 reads
Processing /nfs/research/zi/zi/projects/tb/mada-mumbai-sputum/illumina/sputum/P222_rep2 ...
[M::bam2fq_mainloop] discarded 0 singletons
[M::bam2fq_mainloop] processed 4016770 reads"""

In [5]:
# Map sample name -> number of reads reported by samtools in `log`.
# A "Processing <path> ..." line names the current sample (last path component);
# the following "processed <N> reads" line supplies its count. Samples whose BAM
# was missing ("No such file") get no entry.
# NOTE(review): the key is the bare sample name, so a sputum and a culture entry
# with the same name share one key and the later line wins. Confirm this is intended.
d = {}
for line in log.splitlines():
    if line.startswith("Proce"):
        sample = Path(line.split()[1]).name
        continue
    if "No such" in line:
        continue
    if "processed " in line:
        n_reads = int(line.split()[2])
        d[sample] = n_reads

In [8]:
# Show the (sample, read count) pairs ordered by sample name.
sorted(d.items())

[('P201', 512952),
 ('P201_rep1', 643760),
 ('P202', 7637918),
 ('P202_rep1', 4579114),
 ('P203', 7369748),
 ('P203_rep1', 1590352),
 ('P204', 8931860),
 ('P204_rep1', 9164788),
 ('P205', 6973944),
 ('P206', 843978),
 ('P206_rep1', 976840),
 ('P206_rep2', 851262),
 ('P207', 8686918),
 ('P208', 7044004),
 ('P209', 6467238),
 ('P210', 8136382),
 ('P210_rep1', 6656830),
 ('P211', 6738760),
 ('P211_rep1', 6124074),
 ('P212', 8501868),
 ('P213', 335906),
 ('P214', 7568100),
 ('P215', 7355188),
 ('P216', 6860014),
 ('P217', 8785700),
 ('P218', 7132708),
 ('P219', 6320304),
 ('P219_rep1', 7823882),
 ('P220', 7913642),
 ('P220_rep1', 8345412),
 ('P221', 8635980),
 ('P222', 7432384),
 ('P222_rep1', 3144502),
 ('P222_rep2', 4016770),
 ('P223', 10314772),
 ('P224', 6996926),
 ('P225', 7302992),
 ('P225_rep1', 363184),
 ('P226', 7413090),
 ('P227', 6214300),
 ('P227_rep1', 7525428),
 ('P227_rep2', 8051014),
 ('P228', 7635494),
 ('P228_rep1', 7566494),
 ('P228_rep2', 4381138),
 ('P229', 8224008),
 

In [9]:
# sputum fastqs
sputum_fqs = """P201_R1.fq.gz       P206_rep1_R1.fq.gz  P213_R1.fq.gz       P221_R1.fq.gz       P227_rep2_R1.fq.gz  PCtrl-3_R1.fq.gz  Test_23_R1.fq.gz  Test_43_R1.fq.gz  Test_55_R1.fq.gz
P201_R2.fq.gz       P206_rep1_R2.fq.gz  P213_R2.fq.gz       P221_R2.fq.gz       P227_rep2_R2.fq.gz  PCtrl-3_R2.fq.gz  Test_23_R2.fq.gz  Test_43_R2.fq.gz  Test_55_R2.fq.gz
P201_rep1_R1.fq.gz  P206_rep2_R1.fq.gz  P214_R1.fq.gz       P222_R1.fq.gz       P228_R1.fq.gz       PCtrl-4_R1.fq.gz  Test_28_R1.fq.gz  Test_45_R1.fq.gz  Test_56_R1.fq.gz
P201_rep1_R2.fq.gz  P206_rep2_R2.fq.gz  P214_R2.fq.gz       P222_R2.fq.gz       P228_R2.fq.gz       PCtrl-4_R2.fq.gz  Test_28_R2.fq.gz  Test_45_R2.fq.gz  Test_56_R2.fq.gz
P202_R1.fq.gz       P207_R1.fq.gz       P215_R1.fq.gz       P222_rep1_R1.fq.gz  P228_rep1_R1.fq.gz  PCtrl-5_R1.fq.gz  Test_32_R1.fq.gz  Test_46_R1.fq.gz  Test_60_R1.fq.gz
P202_R2.fq.gz       P207_R2.fq.gz       P215_R2.fq.gz       P222_rep1_R2.fq.gz  P228_rep1_R2.fq.gz  PCtrl-5_R2.fq.gz  Test_32_R2.fq.gz  Test_46_R2.fq.gz  Test_60_R2.fq.gz
P202_rep1_R1.fq.gz  P208_R1.fq.gz       P216_R1.fq.gz       P222_rep2_R1.fq.gz  P228_rep2_R1.fq.gz  PCtrl-6_R1.fq.gz  Test_34_R1.fq.gz  Test_47_R1.fq.gz  Test_65_R1.fq.gz
P202_rep1_R2.fq.gz  P208_R2.fq.gz       P216_R2.fq.gz       P222_rep2_R2.fq.gz  P228_rep2_R2.fq.gz  PCtrl-6_R2.fq.gz  Test_34_R2.fq.gz  Test_47_R2.fq.gz  Test_65_R2.fq.gz
P203_R1.fq.gz       P209_R1.fq.gz       P217_R1.fq.gz       P223_R1.fq.gz       P229_R1.fq.gz       Test_15_R1.fq.gz  Test_36_R1.fq.gz  Test_48_R1.fq.gz  Test_NC1_R1.fq.gz
P203_R2.fq.gz       P209_R2.fq.gz       P217_R2.fq.gz       P223_R2.fq.gz       P229_R2.fq.gz       Test_15_R2.fq.gz  Test_36_R2.fq.gz  Test_48_R2.fq.gz  Test_NC1_R2.fq.gz
P203_rep1_R1.fq.gz  P210_R1.fq.gz       P218_R1.fq.gz       P224_R1.fq.gz       P229_rep1_R1.fq.gz  Test_16_R1.fq.gz  Test_37_R1.fq.gz  Test_50_R1.fq.gz  Test_NC2_R1.fq.gz
P203_rep1_R2.fq.gz  P210_R2.fq.gz       P218_R2.fq.gz       P224_R2.fq.gz       P229_rep1_R2.fq.gz  Test_16_R2.fq.gz  Test_37_R2.fq.gz  Test_50_R2.fq.gz  Test_NC2_R2.fq.gz
P204_R1.fq.gz       P210_rep1_R1.fq.gz  P219_R1.fq.gz       P225_R1.fq.gz       P229_rep2_R1.fq.gz  Test_18_R1.fq.gz  Test_38_R1.fq.gz  Test_51_R1.fq.gz  Test_NC3_R1.fq.gz
P204_R2.fq.gz       P210_rep1_R2.fq.gz  P219_R2.fq.gz       P225_R2.fq.gz       P229_rep2_R2.fq.gz  Test_18_R2.fq.gz  Test_38_R2.fq.gz  Test_51_R2.fq.gz  Test_NC3_R2.fq.gz
P204_rep1_R1.fq.gz  P211_R1.fq.gz       P219_rep1_R1.fq.gz  P225_rep1_R1.fq.gz  P230_R1.fq.gz       Test_19_R1.fq.gz  Test_40_R1.fq.gz  Test_52_R1.fq.gz  Test_NC4_R1.fq.gz
P204_rep1_R2.fq.gz  P211_R2.fq.gz       P219_rep1_R2.fq.gz  P225_rep1_R2.fq.gz  P230_R2.fq.gz       Test_19_R2.fq.gz  Test_40_R2.fq.gz  Test_52_R2.fq.gz  Test_NC4_R2.fq.gz
P205_R1.fq.gz       P211_rep1_R1.fq.gz  P220_R1.fq.gz       P227_R1.fq.gz       PCtrl-1_R1.fq.gz    Test_20_R1.fq.gz  Test_41_R1.fq.gz  Test_53_R1.fq.gz  Test_NC5_R1.fq.gz
P205_R2.fq.gz       P211_rep1_R2.fq.gz  P220_R2.fq.gz       P227_R2.fq.gz       PCtrl-1_R2.fq.gz    Test_20_R2.fq.gz  Test_41_R2.fq.gz  Test_53_R2.fq.gz  Test_NC5_R2.fq.gz
P206_R1.fq.gz       P212_R1.fq.gz       P220_rep1_R1.fq.gz  P227_rep1_R1.fq.gz  PCtrl-2_R1.fq.gz    Test_22_R1.fq.gz  Test_42_R1.fq.gz  Test_54_R1.fq.gz  Test_NC6_R1.fq.gz
P206_R2.fq.gz       P212_R2.fq.gz       P220_rep1_R2.fq.gz  P227_rep1_R2.fq.gz  PCtrl-2_R2.fq.gz    Test_22_R2.fq.gz  Test_42_R2.fq.gz  Test_54_R2.fq.gz  Test_NC6_R2.fq.gz""".split()
len(sputum_fqs)

180

In [10]:
# FASTQ file names of the culture-derived samples, kept as a whitespace-separated
# listing (presumably pasted from a directory listing — TODO confirm) and split
# into one file name per list element. Names follow `<sample>_R1.fq.gz` /
# `<sample>_R2.fq.gz`; the R1/R2 pairing is asserted when the names are parsed.
culture_fqs = """P202_R1.fq.gz  P210_R1.fq.gz  P216_R1.fq.gz  P223_R1.fq.gz  P228_R1.fq.gz       P230_rep1_R1.fq.gz  Test_22_R1.fq.gz  Test_36_R1.fq.gz  Test_42_R1.fq.gz  Test_48_R1.fq.gz  Test_54_R1.fq.gz
P202_R2.fq.gz  P210_R2.fq.gz  P216_R2.fq.gz  P223_R2.fq.gz  P228_R2.fq.gz       P230_rep1_R2.fq.gz  Test_22_R2.fq.gz  Test_36_R2.fq.gz  Test_42_R2.fq.gz  Test_48_R2.fq.gz  Test_54_R2.fq.gz
P203_R1.fq.gz  P211_R1.fq.gz  P217_R1.fq.gz  P224_R1.fq.gz  P228_rep1_R1.fq.gz  Test_15_R1.fq.gz    Test_23_R1.fq.gz  Test_37_R1.fq.gz  Test_43_R1.fq.gz  Test_50_R1.fq.gz  Test_55_R1.fq.gz
P203_R2.fq.gz  P211_R2.fq.gz  P217_R2.fq.gz  P224_R2.fq.gz  P228_rep1_R2.fq.gz  Test_15_R2.fq.gz    Test_23_R2.fq.gz  Test_37_R2.fq.gz  Test_43_R2.fq.gz  Test_50_R2.fq.gz  Test_55_R2.fq.gz
P204_R1.fq.gz  P212_R1.fq.gz  P218_R1.fq.gz  P225_R1.fq.gz  P229_R1.fq.gz       Test_16_R1.fq.gz    Test_28_R1.fq.gz  Test_38_R1.fq.gz  Test_45_R1.fq.gz  Test_51_R1.fq.gz  Test_56_R1.fq.gz
P204_R2.fq.gz  P212_R2.fq.gz  P218_R2.fq.gz  P225_R2.fq.gz  P229_R2.fq.gz       Test_16_R2.fq.gz    Test_28_R2.fq.gz  Test_38_R2.fq.gz  Test_45_R2.fq.gz  Test_51_R2.fq.gz  Test_56_R2.fq.gz
P205_R1.fq.gz  P214_R1.fq.gz  P221_R1.fq.gz  P226_R1.fq.gz  P229_rep1_R1.fq.gz  Test_19_R1.fq.gz    Test_32_R1.fq.gz  Test_40_R1.fq.gz  Test_46_R1.fq.gz  Test_52_R1.fq.gz  Test_60_R1.fq.gz
P205_R2.fq.gz  P214_R2.fq.gz  P221_R2.fq.gz  P226_R2.fq.gz  P229_rep1_R2.fq.gz  Test_19_R2.fq.gz    Test_32_R2.fq.gz  Test_40_R2.fq.gz  Test_46_R2.fq.gz  Test_52_R2.fq.gz  Test_60_R2.fq.gz
P209_R1.fq.gz  P215_R1.fq.gz  P222_R1.fq.gz  P227_R1.fq.gz  P230_R1.fq.gz       Test_20_R1.fq.gz    Test_34_R1.fq.gz  Test_41_R1.fq.gz  Test_47_R1.fq.gz  Test_53_R1.fq.gz  Test_65_R1.fq.gz
P209_R2.fq.gz  P215_R2.fq.gz  P222_R2.fq.gz  P227_R2.fq.gz  P230_R2.fq.gz       Test_20_R2.fq.gz    Test_34_R2.fq.gz  Test_41_R2.fq.gz  Test_47_R2.fq.gz  Test_53_R2.fq.gz  Test_65_R2.fq.gz""".split()
# Number of culture FASTQ files in the listing.
len(culture_fqs)

110

In [25]:
def _records_from_fastqs(fastqs, src):
    """Build one ``(sample, src, guuid)`` record per FASTQ file name.

    Parameters
    ----------
    fastqs : iterable of str
        File names of the form ``<sample>_R<digit>.fq...``.
    src : str
        Source label (e.g. ``"sputum"`` or ``"culture"``) stored in every
        record and used, together with the sample name, to look up the guuid
        in the notebook-level ``odf`` frame.

    Returns
    -------
    list of tuple
        ``(sample, src, guuid)`` in input order, one entry per file name, so
        each sample appears once per read file. ``guuid`` is ``None`` when
        ``odf`` has no entry for ``(sample, src)``.

    Raises
    ------
    ValueError
        If a file name does not match the expected pattern.
    AssertionError
        If a sample does not have exactly one R1 and one R2 file.
    """
    reads_by_sample = defaultdict(list)
    records = []
    for fname in fastqs:
        # The dot before "fq" is escaped so only a literal ".fq" is accepted.
        m = re.search(r"^(?P<sample>\S+)_(?P<num>R\d)\.fq", fname)
        if not m:
            raise ValueError(fname)
        sample = m.group("sample")
        reads_by_sample[sample].append(m.group("num"))
        try:
            guuid = odf.at[(sample, src), "guuid"]
        except KeyError:
            # Deliberate best-effort: samples missing from `odf` are kept
            # with an empty guuid rather than dropped.
            guuid = None
        records.append((sample, src, guuid))

    # Every sample must come as a complete R1/R2 pair.
    for sample, reads in reads_by_sample.items():
        assert sorted(reads) == ["R1", "R2"], sample

    return records


# Sputum records first, then culture, matching the order the files are listed in.
data = _records_from_fastqs(sputum_fqs, "sputum") + _records_from_fastqs(culture_fqs, "culture")

In [31]:
# R1 and R2 of a sample produce the same (sample, source, guuid) tuple, so the
# records are de-duplicated here. dict.fromkeys keeps first-seen order, whereas
# iterating a set of string-containing tuples can change order between kernel
# restarts (hash randomisation), which made the row order non-reproducible.
illumina_df = pd.DataFrame(list(dict.fromkeys(data)), columns=["sample", "source", "guuid"])

In [32]:
# Key the table by (sample, source); the name is rebound to the indexed frame.
illumina_df = illumina_df.set_index(["sample", "source"])

In [33]:
# Display only: the sorted copy is not assigned, so `illumina_df` itself keeps
# its current row order after this cell.
illumina_df.sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,guuid
sample,source,Unnamed: 2_level_1
P201,sputum,a7563adf-562d-458e-959a-1d7de9cb16d1
P201_rep1,sputum,71f4d2e6-1f73-4ada-9735-0f8768a865c3
P202,culture,f4c4a9b5-28b4-41b0-ab90-e587bd165918
P202,sputum,c8903173-fa62-40ef-938d-b2f3bc2ded7a
P202_rep1,sputum,da795113-966f-41a7-82f1-359b1159833a
...,...,...
Test_NC2,sputum,
Test_NC3,sputum,
Test_NC4,sputum,
Test_NC5,sputum,


In [34]:
# Sort by the (sample, source) index before writing so that the row order of
# the CSV is stable across runs and independent of how the frame was assembled.
illumina_df.sort_index().to_csv("illumina_samples.csv")

## Adding the smear results to the existing sample CSV files

In [48]:
# Reload the Illumina sample table from disk. NOTE: this rebinds `illumina_df`;
# no index column is specified here, and "sample" is accessed below as an
# ordinary column (illumina_df["sample"]).
illumina_df = pd.read_csv("illumina_samples.csv")
# The ONT sample table is not produced in the visible part of this notebook —
# presumably written by a separate step; TODO confirm where it comes from.
ont_df = pd.read_csv("ont_samples.csv")

In [49]:
# Mada smear results: one whitespace-separated "<sample> <result>" pair per line ("-" for the PCtrl controls)
s="""P201	  +
P202	  +
P203	  +
P204	  +
P205	  +
PCtrl-1	 -
P206	  +
P207	  +
P208	  +
P209	  +
P210	  +
PCtrl-2	 -
P211	  ++
P212	  ++
P213	  ++
P214	  ++
P215	  ++
PCtrl-3	 -
P216	  ++
P217	  ++
P218	  ++
P219	  ++
P220	  ++
PCtrl-4	 -
P221	  +++
P222	  +++
P223	  +++
P224	  +++
P225	  +++
PCtrl-5	 -
P226	  +++
P227	  +++
P228	  +++
P229	  +++
P230	  +++
PCtrl-6	 -"""

In [50]:
# Parse the "<sample> <result>" lines held in `s` into a lookup table.
# Each line must split into exactly two fields, and no sample may repeat.
smear_results = {}
for sample, result in map(str.split, s.splitlines()):
    assert sample not in smear_results
    smear_results[sample] = result

In [51]:
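# fmr smear results: one "<sample number> <grade>" pair per line, grades written as "N+" or "-"; some samples are listed more than once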
s="""15	2+
16	3+
18	3+
19	1+
20	1+
22	1+
23	2+
28	2+
32	1+
34	1+
36	3+
37	3+
38	2+
40	2+
41	2+
42	1+
43	3+
45	3+
46	2+
47	3+
48	2+
50	1+
51	3+
52	1+
53	2+
54	1+
55	2+
56	3+
60	1+
65	3+
NC1	-
NC2	-
NC3	-
NC4	-
NC5	-
NC6	-
36	3+
37	3+
43	3+
45	3+
47	3+
NC3	-
15	2+
28	2+
40	2+
48	2+
55	2+
NC2	-
42	1+
54	1+
50	1+
52	1+
60	1+
NC1	-"""

In [52]:
# Fold the fmr results into `smear_results` under "Test_<number>" keys.
for number, grade in map(str.split, s.splitlines()):
    sample = f"Test_{number}"
    # "N+" is rewritten as N plus signs, matching the "+"/"++"/"+++" notation
    # already stored in `smear_results`; anything without a "+" is kept as is.
    result = "+" * int(grade[0]) if "+" in grade else grade
    # First occurrence is stored; a repeated sample must agree with it.
    recorded = smear_results.setdefault(sample, result)
    assert recorded == result, f"Smear results for {sample} don't match"

In [53]:
smear_results

{'P201': '+',
 'P202': '+',
 'P203': '+',
 'P204': '+',
 'P205': '+',
 'PCtrl-1': '-',
 'P206': '+',
 'P207': '+',
 'P208': '+',
 'P209': '+',
 'P210': '+',
 'PCtrl-2': '-',
 'P211': '++',
 'P212': '++',
 'P213': '++',
 'P214': '++',
 'P215': '++',
 'PCtrl-3': '-',
 'P216': '++',
 'P217': '++',
 'P218': '++',
 'P219': '++',
 'P220': '++',
 'PCtrl-4': '-',
 'P221': '+++',
 'P222': '+++',
 'P223': '+++',
 'P224': '+++',
 'P225': '+++',
 'PCtrl-5': '-',
 'P226': '+++',
 'P227': '+++',
 'P228': '+++',
 'P229': '+++',
 'P230': '+++',
 'PCtrl-6': '-',
 'Test_15': '++',
 'Test_16': '+++',
 'Test_18': '+++',
 'Test_19': '+',
 'Test_20': '+',
 'Test_22': '+',
 'Test_23': '++',
 'Test_28': '++',
 'Test_32': '+',
 'Test_34': '+',
 'Test_36': '+++',
 'Test_37': '+++',
 'Test_38': '++',
 'Test_40': '++',
 'Test_41': '++',
 'Test_42': '+',
 'Test_43': '+++',
 'Test_45': '+++',
 'Test_46': '++',
 'Test_47': '+++',
 'Test_48': '++',
 'Test_50': '+',
 'Test_51': '+++',
 'Test_52': '+',
 'Test_53': '++'

In [54]:
# Distinct sample names found in the ONT table.
ont_samples = {*ont_df["sample"]}

In [55]:
# Distinct sample names found in the Illumina table.
illumina_samples = {*illumina_df["sample"]}

In [56]:
# Every sample name seen by either technology.
samples = ont_samples | illumina_samples

In [57]:
# Reduce each name to its base sample ID by removing the run/replicate tags,
# applied in this fixed order, so that variants of one sample collapse together.
base_names = set()
for name in samples:
    for tag in ("-singleplex", "-multiplex", "_rep1", "_rep2"):
        name = name.replace(tag, "")
    base_names.add(name)
samples = base_names

In [58]:
len(samples)

72

In [59]:
len(smear_results)

72

In [60]:
# Samples that have a smear result but no matching base sample name; an empty
# set means every smear entry corresponds to a known sample.
smear_results.keys() - samples

set()

In [64]:
# Two-column table of the smear lookup, one row per sample in insertion order.
smear_df = pd.DataFrame(
    {
        "sample": list(smear_results.keys()),
        "smear_result": list(smear_results.values()),
    }
)

In [65]:
smear_df

Unnamed: 0,sample,smear_result
0,P201,+
1,P202,+
2,P203,+
3,P204,+
4,P205,+
...,...,...
67,Test_NC2,-
68,Test_NC3,-
69,Test_NC4,-
70,Test_NC5,-


In [66]:
# Persist the smear table; index=False leaves the positional row index out of
# the file, so only the frame's columns are written.
smear_df.to_csv("smear_results.csv", index=False)