# concat_fastq
This notebook takes a list of FASTQ files that reprsent multiple sequencing runs of the same samples. It groups them by sample and tile, concatenates them, and saves the concatenated FASTQ.

Notebook Setup:

In [1]:
import os
import pandas as pd

Paths:

In [2]:
datadir = '/fh/fast/bloom_j/computational_notebooks/ckikawa/2022/ZIKV_MAP_GooLab/data/'
samplelist = 'concat_fastq_samplelist.csv'
outdir = '/fh/fast/bloom_j/computational_notebooks/ckikawa/2022/ZIKV_MAP_GooLab/data/concat_fastq/concat_fastq_out/'
pseudo_date = '220622-230109'

os.makedirs(outdir, exist_ok=True)

Load samplelist:

In [3]:
samples = pd.read_csv(samplelist)[['library','selection','antibody','percent-infectivity','date','R1']]

# We are going to drop ZKA-64 from future analysis as it was previously analyzed in Sourisseau et al
samples = samples.query('antibody != "ZKA-64"')

samples.style.hide_index()

library,selection,antibody,percent-infectivity,date,R1
lib1,C8-1800,EDE1-C8,0.01%,220622,/shared/ngs/illumina/ckikawa/220622_M00492_0049_000000000-KD83K/Unaligned/Project_ckikawa/lib1_C8_1600_S1_R1_001.fastq.gz
lib1,C10-300,EDE1-C10,0.11%,220622,/shared/ngs/illumina/ckikawa/220622_M00492_0049_000000000-KD83K/Unaligned/Project_ckikawa/lib1_C10_300_S3_R1_001.fastq.gz
lib1,MZ4-4800,MZ4,0.46%,220622,/shared/ngs/illumina/ckikawa/220622_M00492_0049_000000000-KD83K/Unaligned/Project_ckikawa/lib1_MZ4_4800_S5_R1_001.fastq.gz
lib1,ZV67-40000,ZV-67,0.84%,220622,/shared/ngs/illumina/ckikawa/220622_M00492_0049_000000000-KD83K/Unaligned/Project_ckikawa/lib1_ZV67_4000_S10_R1_001.fastq.gz
lib1,no-antibody,no-antibody,100,220622,/shared/ngs/illumina/ckikawa/220622_M00492_0049_000000000-KD83K/Unaligned/Project_ckikawa/lib1_unsel_S11_R1_001.fastq.gz
lib1,C8-1800,EDE1-C8,0.01%,220627,/shared/ngs/illumina/ckikawa/220627_M00492_0050_000000000-KFMCM/Unaligned/Project_ckikawa/lib1_C8_1600_S1_R1_001.fastq.gz
lib1,C10-300,EDE1-C10,0.11%,220627,/shared/ngs/illumina/ckikawa/220627_M00492_0050_000000000-KFMCM/Unaligned/Project_ckikawa/lib1_C10_300_S3_R1_001.fastq.gz
lib1,MZ4-4800,MZ4,0.46%,220627,/shared/ngs/illumina/ckikawa/220627_M00492_0050_000000000-KFMCM/Unaligned/Project_ckikawa/lib1_MZ4_4800_S5_R1_001.fastq.gz
lib1,ZV67-40000,ZV-67,0.84%,220627,/shared/ngs/illumina/ckikawa/220627_M00492_0050_000000000-KFMCM/Unaligned/Project_ckikawa/lib1_ZV67_4000_S10_R1_001.fastq.gz
lib1,no-antibody,no-antibody,100,220627,/shared/ngs/illumina/ckikawa/220627_M00492_0050_000000000-KFMCM/Unaligned/Project_ckikawa/lib1_unsel_S11_R1_001.fastq.gz


Generate list of R1 files grouped by sample information. Ignore date, since we are combining sequencing runs. Sample information will be:
* library
* selection

In [4]:
R1_files = (
    pd.DataFrame(
        samples.groupby(['library', 'antibody', 'selection', 'percent-infectivity'])
        ['R1']
        .apply(list))
     .reset_index()
     .rename(columns={'R1': 'R1_list'})
    )

R1_files.style.hide_index()

library,antibody,selection,percent-infectivity,R1_list
lib1,EDE1-C10,C10-300,0.11%,"['/shared/ngs/illumina/ckikawa/220622_M00492_0049_000000000-KD83K/Unaligned/Project_ckikawa/lib1_C10_300_S3_R1_001.fastq.gz', '/shared/ngs/illumina/ckikawa/220627_M00492_0050_000000000-KFMCM/Unaligned/Project_ckikawa/lib1_C10_300_S3_R1_001.fastq.gz', '/shared/ngs/illumina/ckikawa/220919_M04866_0609_000000000-KHNRY/Unaligned/Project_ckikawa/lib1_C10_300_S3_R1_001.fastq.gz']"
lib1,EDE1-C8,C8-1800,0.01%,"['/shared/ngs/illumina/ckikawa/220622_M00492_0049_000000000-KD83K/Unaligned/Project_ckikawa/lib1_C8_1600_S1_R1_001.fastq.gz', '/shared/ngs/illumina/ckikawa/220627_M00492_0050_000000000-KFMCM/Unaligned/Project_ckikawa/lib1_C8_1600_S1_R1_001.fastq.gz', '/shared/ngs/illumina/ckikawa/220919_M04866_0609_000000000-KHNRY/Unaligned/Project_ckikawa/lib1_C8_1600_S1_R1_001.fastq.gz']"
lib1,MZ4,MZ4-4800,0.46%,"['/shared/ngs/illumina/ckikawa/220622_M00492_0049_000000000-KD83K/Unaligned/Project_ckikawa/lib1_MZ4_4800_S5_R1_001.fastq.gz', '/shared/ngs/illumina/ckikawa/220627_M00492_0050_000000000-KFMCM/Unaligned/Project_ckikawa/lib1_MZ4_4800_S5_R1_001.fastq.gz', '/shared/ngs/illumina/ckikawa/220919_M04866_0609_000000000-KHNRY/Unaligned/Project_ckikawa/lib1_MZ4_4800_S5_R1_001.fastq.gz']"
lib1,SIgN-3C,SIgN-20000,0.21%,['/shared/ngs/illumina/ckikawa/230607_VH01189_121_AACVMNWM5/Unaligned/Project_ckikawa/lib1_sign_20000_S2_R1_001.fastq.gz']
lib1,ZV-67,ZV67-40000,0.84%,"['/shared/ngs/illumina/ckikawa/220622_M00492_0049_000000000-KD83K/Unaligned/Project_ckikawa/lib1_ZV67_4000_S10_R1_001.fastq.gz', '/shared/ngs/illumina/ckikawa/220627_M00492_0050_000000000-KFMCM/Unaligned/Project_ckikawa/lib1_ZV67_4000_S10_R1_001.fastq.gz', '/shared/ngs/illumina/ckikawa/220919_M04866_0609_000000000-KHNRY/Unaligned/Project_ckikawa/lib1_ZV67_4000_S10_R1_001.fastq.gz']"
lib1,no-antibody,no-antibody,100,"['/shared/ngs/illumina/ckikawa/220622_M00492_0049_000000000-KD83K/Unaligned/Project_ckikawa/lib1_unsel_S11_R1_001.fastq.gz', '/shared/ngs/illumina/ckikawa/220627_M00492_0050_000000000-KFMCM/Unaligned/Project_ckikawa/lib1_unsel_S11_R1_001.fastq.gz', '/shared/ngs/illumina/ckikawa/220919_M04866_0609_000000000-KHNRY/Unaligned/Project_ckikawa/lib1_unsel_S11_R1_001.fastq.gz']"
lib2,EDE1-C10,C10-300,0.23%,['/shared/ngs/illumina/ckikawa/230607_VH01189_121_AACVMNWM5/Unaligned/Project_ckikawa/lib2_C10_250_S4_R1_001.fastq.gz']
lib2,EDE1-C8,C8-1800,0.13%,"['/shared/ngs/illumina/ckikawa/220915_M04866_0608_000000000-KHT5Y/Unaligned/Project_ckikawa/lib2_C8_1800_S1_R1_001.fastq.gz', '/shared/ngs/illumina/ckikawa/220916_M00492_0090_000000000-KHT4G/Unaligned/Project_ckikawa/lib2_C8_1800_S1_R1_001.fastq.gz', '/shared/ngs/illumina/ckikawa/221010_M04866_0616_000000000-KKW9K/Unaligned/Project_ckikawa/lib2_C8_1800_S1_R1_001.fastq.gz', '/shared/ngs/illumina/ckikawa/221011_M00492_0099_000000000-KL69Y/Unaligned/Project_ckikawa/lib2_C8_1800_S1_R1_001.fastq.gz', '/shared/ngs/illumina/ckikawa/221011_M08474_0004_000000000-KKPGV/Unaligned/Project_ckikawa/lib2_C8_1800_S1_R1_001.fastq.gz']"
lib2,MZ4,MZ4-4800,0.02%,"['/shared/ngs/illumina/ckikawa/220915_M04866_0608_000000000-KHT5Y/Unaligned/Project_ckikawa/lib2_MZ4_4800_S7_R1_001.fastq.gz', '/shared/ngs/illumina/ckikawa/220916_M00492_0090_000000000-KHT4G/Unaligned/Project_ckikawa/lib2_MZ4_4800_S7_R1_001.fastq.gz', '/shared/ngs/illumina/ckikawa/221010_M04866_0616_000000000-KKW9K/Unaligned/Project_ckikawa/lib2_MZ4_4800_S7_R1_001.fastq.gz', '/shared/ngs/illumina/ckikawa/221011_M00492_0099_000000000-KL69Y/Unaligned/Project_ckikawa/lib2_MZ4_4800_S7_R1_001.fastq.gz', '/shared/ngs/illumina/ckikawa/221011_M08474_0004_000000000-KKPGV/Unaligned/Project_ckikawa/lib2_MZ4_4800_S7_R1_001.fastq.gz']"
lib2,SIgN-3C,SIgN-10000,0.35%,"['/shared/ngs/illumina/ckikawa/230105_M04866_0630_000000000-KNBVD/Unaligned/Project_ckikawa/lib2_sign_20000_S3_R1_001.fastq.gz ', '/shared/ngs/illumina/ckikawa/230106_M00492_0117_000000000-KNCB5/Unaligned/Project_ckikawa/lib2_sign_20000_S3_R1_001.fastq.gz ', '/shared/ngs/illumina/ckikawa/230109_M04866_0631_000000000-KNBR4/Unaligned/Project_ckikawa/lib2_sign_20000_S3_R1_001.fastq.gz ', '/shared/ngs/illumina/ckikawa/230110_M00492_0119_000000000-KP7C8/Unaligned/Project_ckikawa/lib2_sign_20000_S3_R1_001.fastq.gz ']"


Generate output directory with name "concat_fastq_out"

In [5]:
os.system('mkdir concat_fastq_out -p')

0

Make `cat` command string for each sample in R1_files

In [6]:
# Keep track of output files
output_samples = list()

for index, row in R1_files.iterrows():
    print(f"Generating command for {row['library']} {row['selection']}.\n")
    
    # make strings with all R1_files and R2_files for each sample
    print("The R1 and R2 files for this sample are:")
    R1_ls = []
    R2_ls = []

    for r1 in row["R1_list"]:
        print(r1)
        R1_ls.append(r1)
        assert r1.count('_R1') == 1, ("Can't guess R2 file for R1 "
                        "file {0}".format(r1))
        r2 = r1.replace('_R1', '_R2')
        print(r2)
        R2_ls.append(r2)
    
    R1_string = " ".join(map(str, R1_ls))
    R2_string = " ".join(map(str, R2_ls))
    
    # make R1 strings with output file name
    r1_output_file = (outdir +
                   row['library'] + '_' +
                   row['selection'].replace('-','_') + '_' +
                   "R1" +
                   '.fastq.gz')
    print(f"\nThe R1 output file will be: {r1_output_file}\n")

    r1_command = "cat " + R1_string + " > " + r1_output_file

    print("The command we run to concatentate these R1 files will be:")
    print(r1_command)
    
    # make R2 strings with output file name
    r2_output_file = (outdir +
                   row['library'] + '_' +
                   row['selection'].replace('-','_') + '_' +
                   "R2" +
                   '.fastq.gz')
    print(f"\nThe R2 output file will be: {r2_output_file}\n")

    r2_command = "cat " + R2_string + " > " + r2_output_file

    print("The command we run to concatentate these R2 files will be:")
    print(r2_command)
    print("\n")
    
    # run cat strings in the command line
    print("\nRunning.")
    os.system(r1_command)
    os.system(r2_command)

    # make csv formatted lists of sample library, selection, R1
    # do not need to include paths to R2
    # dms_tools2 identifies R2 file paths from R1 file paths
    print("Adding CSV data to output_samples list.")
    csv_string = ','.join([row['library'],
                           row['antibody'],
                           row['selection'],
                           row['percent-infectivity'],
                           pseudo_date,
                           str(r1_output_file)])
    output_samples.append(csv_string)
    print("Done.\n\n")


Generating command for lib1 C10-300.

The R1 and R2 files for this sample are:
/shared/ngs/illumina/ckikawa/220622_M00492_0049_000000000-KD83K/Unaligned/Project_ckikawa/lib1_C10_300_S3_R1_001.fastq.gz
/shared/ngs/illumina/ckikawa/220622_M00492_0049_000000000-KD83K/Unaligned/Project_ckikawa/lib1_C10_300_S3_R2_001.fastq.gz
/shared/ngs/illumina/ckikawa/220627_M00492_0050_000000000-KFMCM/Unaligned/Project_ckikawa/lib1_C10_300_S3_R1_001.fastq.gz
/shared/ngs/illumina/ckikawa/220627_M00492_0050_000000000-KFMCM/Unaligned/Project_ckikawa/lib1_C10_300_S3_R2_001.fastq.gz
/shared/ngs/illumina/ckikawa/220919_M04866_0609_000000000-KHNRY/Unaligned/Project_ckikawa/lib1_C10_300_S3_R1_001.fastq.gz
/shared/ngs/illumina/ckikawa/220919_M04866_0609_000000000-KHNRY/Unaligned/Project_ckikawa/lib1_C10_300_S3_R2_001.fastq.gz

The R1 output file will be: /fh/fast/bloom_j/computational_notebooks/ckikawa/2022/ZIKV_MAP_GooLab/data/concat_fastq/concat_fastq_out/lib1_C10_300_R1.fastq.gz

The command we run to concate

cat: /shared/ngs/illumina/ckikawa/2230106_M00492_0117_000000000-KNCB5/Unaligned/Project_ckikawa/lib3_sign_20000_S9_R1_001.fastq.gz: No such file or directory
cat: /shared/ngs/illumina/ckikawa/2230106_M00492_0117_000000000-KNCB5/Unaligned/Project_ckikawa/lib3_sign_20000_S9_R2_001.fastq.gz: No such file or directory


Adding CSV data to output_samples list.
Done.


Generating command for lib3 SIgN-20000.

The R1 and R2 files for this sample are:
/shared/ngs/illumina/ckikawa/230105_M04866_0630_000000000-KNBVD/Unaligned/Project_ckikawa/lib3_sign_40000_S8_R1_001.fastq.gz 
/shared/ngs/illumina/ckikawa/230105_M04866_0630_000000000-KNBVD/Unaligned/Project_ckikawa/lib3_sign_40000_S8_R2_001.fastq.gz 
/shared/ngs/illumina/ckikawa/230106_M00492_0117_000000000-KNCB5/Unaligned/Project_ckikawa/lib3_sign_40000_S8_R1_001.fastq.gz 
/shared/ngs/illumina/ckikawa/230106_M00492_0117_000000000-KNCB5/Unaligned/Project_ckikawa/lib3_sign_40000_S8_R2_001.fastq.gz 
/shared/ngs/illumina/ckikawa/230109_M04866_0631_000000000-KNBR4/Unaligned/Project_ckikawa/lib3_sign_40000_S8_R1_001.fastq.gz 
/shared/ngs/illumina/ckikawa/230109_M04866_0631_000000000-KNBR4/Unaligned/Project_ckikawa/lib3_sign_40000_S8_R2_001.fastq.gz 
/shared/ngs/illumina/ckikawa/230110_M00492_0119_000000000-KP7C8/Unaligned/Project_ckikawa/lib3_sign_40000_S8_R1_00

Generate CSV formatted list of output files to copy into samplelists:

In [7]:
samplesfile = os.path.join(datadir, 'samplelist.csv')
print(samplesfile)

with open(samplesfile, 'w') as f:
    f.write('library,antibody,selection,percent_infectivity,date,R1\n')
    for line in output_samples:
        f.write(f'{line}\n')
        print(line)
        
# print('\n'.join(output_samples))

/fh/fast/bloom_j/computational_notebooks/ckikawa/2022/ZIKV_MAP_GooLab/data/samplelist.csv
lib1,EDE1-C10,C10-300,0.11%,220622-230109,/fh/fast/bloom_j/computational_notebooks/ckikawa/2022/ZIKV_MAP_GooLab/data/concat_fastq/concat_fastq_out/lib1_C10_300_R1.fastq.gz
lib1,EDE1-C8,C8-1800,0.01%,220622-230109,/fh/fast/bloom_j/computational_notebooks/ckikawa/2022/ZIKV_MAP_GooLab/data/concat_fastq/concat_fastq_out/lib1_C8_1800_R1.fastq.gz
lib1,MZ4,MZ4-4800,0.46%,220622-230109,/fh/fast/bloom_j/computational_notebooks/ckikawa/2022/ZIKV_MAP_GooLab/data/concat_fastq/concat_fastq_out/lib1_MZ4_4800_R1.fastq.gz
lib1,SIgN-3C,SIgN-20000,0.21%,220622-230109,/fh/fast/bloom_j/computational_notebooks/ckikawa/2022/ZIKV_MAP_GooLab/data/concat_fastq/concat_fastq_out/lib1_SIgN_20000_R1.fastq.gz
lib1,ZV-67,ZV67-40000,0.84%,220622-230109,/fh/fast/bloom_j/computational_notebooks/ckikawa/2022/ZIKV_MAP_GooLab/data/concat_fastq/concat_fastq_out/lib1_ZV67_40000_R1.fastq.gz
lib1,no-antibody,no-antibody,100,220622-230109