# concat_fastqs
This notebook takes a list of FASTQ files that represent multiple sequencing runs of the same samples. It groups them by sample (library and selection), concatenates them, and saves the concatenated FASTQs.

Notebook Setup:

In [1]:
import os
import subprocess

import pandas as pd

Paths:

In [2]:
# Path to the samplelist CSV that is loaded with pandas below.
samplelist = "concat_fastq_samplelist.csv"

# Prefix for every concatenated FASTQ. The trailing slash matters because
# output file names are appended to it by plain string concatenation.
out_dir = "concat_fastq_out/"

# Value written in the date position of each output CSV row, since a
# concatenated file no longer corresponds to a single sequencing date.
pseudo_date = "multi"

Load samplelist:

In [3]:
# Read the samplelist CSV into a DataFrame and render it in the notebook.
samples = pd.read_csv(filepath_or_buffer=samplelist)
display(samples)

Unnamed: 0,library,selection,date,R1
0,wt,C6-36,220225,/shared/ngs/illumina/bloom_lab/220225_M04866_0...
1,lib1,C6-36,220225,/shared/ngs/illumina/bloom_lab/220225_M04866_0...
2,lib2,C6-36,220225,/shared/ngs/illumina/bloom_lab/220225_M04866_0...
3,lib3,C6-36,220225,/shared/ngs/illumina/bloom_lab/220225_M04866_0...
4,wt,C6-36,220415,/shared/ngs/illumina/bloom_lab/220415_M00492_0...
5,lib1,C6-36,220415,/shared/ngs/illumina/bloom_lab/220415_M00492_0...
6,lib2,C6-36,220415,/shared/ngs/illumina/bloom_lab/220415_M00492_0...
7,lib3,C6-36,220415,/shared/ngs/illumina/bloom_lab/220415_M00492_0...


Generate list of R1 files grouped by sample information. Ignore date, since we are combining sequencing runs. Sample information will be:
* library
* selection

In [4]:
# One row per (library, selection) pair; `R1_list` holds the R1 paths from
# every row of `samples` that shares that pair, in their original order.
R1_files = (
    samples
    .groupby(['library', 'selection'])['R1']
    .apply(list)
    .to_frame()
    .reset_index()
    .rename(columns={'R1': 'R1_list'})
)
display(R1_files)

Unnamed: 0,library,selection,R1_list
0,lib1,C6-36,[/shared/ngs/illumina/bloom_lab/220225_M04866_...
1,lib2,C6-36,[/shared/ngs/illumina/bloom_lab/220225_M04866_...
2,lib3,C6-36,[/shared/ngs/illumina/bloom_lab/220225_M04866_...
3,wt,C6-36,[/shared/ngs/illumina/bloom_lab/220225_M04866_...


Generate output directory with name "concat_fastq_out"

In [5]:
# Create the output directory named by `out_dir` (including any missing
# parents). Unlike shelling out to `mkdir`, this raises on failure instead of
# returning an unchecked exit status, and it is a no-op if the directory
# already exists.
os.makedirs(out_dir, exist_ok=True)

0

Build and run the `cat` commands for each sample in `R1_files`:

In [6]:
# Absolute directory prepended to each R1 output path in the CSV rows.
# NOTE(review): this is hard-coded to one user's project directory; confirm it
# matches where the notebook is run before copying rows into a samplelist.
concat_fastq_root = '/fh/fast/bloom_j/computational_notebooks/dbacsik/2022/ZIKV_DMS_NS5_EvansLab/data/concat_fastqs/'


def _guess_r2_path(r1):
    """Return the R2 path for `r1` by swapping its single '_R1' for '_R2'.

    Raises AssertionError when '_R1' does not occur exactly once in the path,
    because the replacement would then be ambiguous.
    """
    assert r1.count('_R1') == 1, ("Can't guess R2 file for R1 "
                    "file {0}".format(r1))
    return r1.replace('_R1', '_R2')


def _concat_output_path(row, read):
    """Build the output FASTQ path for one sample `row` and `read` ('R1'/'R2').

    The selection has its '-' characters removed, e.g. 'C6-36' -> 'C636'.
    """
    return (out_dir +
            row['library'] + '_' +
            row['selection'].replace('-', '') + '_' +
            read +
            '.fastq.gz')


def _run_cat(input_files, output_file):
    """Concatenate `input_files` into `output_file` with `cat`.

    Raises FileNotFoundError if any input is missing and
    subprocess.CalledProcessError if `cat` exits non-zero. A partially written
    output is removed on failure so a bad file is never mistaken for a result.
    """
    missing = [path for path in input_files if not os.path.isfile(path)]
    if missing:
        raise FileNotFoundError(
            f"Cannot build {output_file}; missing input file(s): {missing}")
    try:
        with open(output_file, 'wb') as handle:
            # Argument list + explicit stdout: no shell, so paths containing
            # spaces or shell metacharacters are passed through untouched.
            subprocess.run(['cat', *input_files], stdout=handle, check=True)
    except Exception:
        if os.path.exists(output_file):
            os.remove(output_file)
        raise


# Keep track of output files
output_samples = list()

for _, row in R1_files.iterrows():
    print(f"Generating command for {row['library']} {row['selection']}.\n")

    # collect all R1 files and their paired R2 files for this sample
    print("The R1 and R2 files for this sample are:")
    inputs = {'R1': [], 'R2': []}
    for r1 in row["R1_list"]:
        print(r1)
        inputs['R1'].append(r1)
        r2 = _guess_r2_path(r1)
        print(r2)
        inputs['R2'].append(r2)

    # report the output file and the equivalent shell command for each read
    outputs = {}
    for read in ('R1', 'R2'):
        outputs[read] = _concat_output_path(row, read)
        print(f"\nThe {read} output file will be: {outputs[read]}\n")

        command = ("cat " + " ".join(map(str, inputs[read])) +
                   " > " + outputs[read])
        print(f"The command we run to concatentate these {read} files will be:")
        print(command)
    print("\n")

    # run the concatenations; any failure stops the notebook here
    print("\nRunning.")
    for read in ('R1', 'R2'):
        _run_cat(inputs[read], outputs[read])

    # make csv formatted lists of sample library, selection, R1
    # do not need to include paths to R2
    # dms_tools2 identifies R2 file paths from R1 file paths
    print("Adding CSV data to output_samples list.")
    csv_string = ','.join([row['library'],
                           row['selection'],
                           pseudo_date,
                           # trailing comma kept: it is part of the row format
                           # this notebook has always emitted
                           concat_fastq_root + outputs['R1'] + ','])
    output_samples.append(csv_string)
    print("Done.\n\n")


Generating command for lib1 C6-36.

The R1 and R2 files for this sample are:
/shared/ngs/illumina/bloom_lab/220225_M04866_0515_000000000-K5JDB/Unaligned/Project_bloom_lab/Lib1_S1_R1_001.fastq.gz
/shared/ngs/illumina/bloom_lab/220225_M04866_0515_000000000-K5JDB/Unaligned/Project_bloom_lab/Lib1_S1_R2_001.fastq.gz
/shared/ngs/illumina/bloom_lab/220415_M00492_0013_000000000-KC5RK/Unaligned/Project_bloom_lab/Lib1_S1_R1_001.fastq.gz
/shared/ngs/illumina/bloom_lab/220415_M00492_0013_000000000-KC5RK/Unaligned/Project_bloom_lab/Lib1_S1_R2_001.fastq.gz

The R1 output file will be: concat_fastq_out/lib1_C636_R1.fastq.gz

The command we run to concatentate these R1 files will be:
cat /shared/ngs/illumina/bloom_lab/220225_M04866_0515_000000000-K5JDB/Unaligned/Project_bloom_lab/Lib1_S1_R1_001.fastq.gz /shared/ngs/illumina/bloom_lab/220415_M00492_0013_000000000-KC5RK/Unaligned/Project_bloom_lab/Lib1_S1_R1_001.fastq.gz > concat_fastq_out/lib1_C636_R1.fastq.gz

The R2 output file will be: concat_fastq_

Generate CSV formatted list of output files to copy into samplelists:

In [7]:
# Emit one CSV row per line, ready to paste into a samplelist.
print(*output_samples, sep="\n")

lib1,C6-36,multi,/fh/fast/bloom_j/computational_notebooks/dbacsik/2022/ZIKV_DMS_NS5_EvansLab/data/concat_fastqs/concat_fastq_out/lib1_C636_R1.fastq.gz,
lib2,C6-36,multi,/fh/fast/bloom_j/computational_notebooks/dbacsik/2022/ZIKV_DMS_NS5_EvansLab/data/concat_fastqs/concat_fastq_out/lib2_C636_R1.fastq.gz,
lib3,C6-36,multi,/fh/fast/bloom_j/computational_notebooks/dbacsik/2022/ZIKV_DMS_NS5_EvansLab/data/concat_fastqs/concat_fastq_out/lib3_C636_R1.fastq.gz,
wt,C6-36,multi,/fh/fast/bloom_j/computational_notebooks/dbacsik/2022/ZIKV_DMS_NS5_EvansLab/data/concat_fastqs/concat_fastq_out/wt_C636_R1.fastq.gz,
