# concat_fastqs
This notebook takes a list of FASTQ files that represent multiple sequencing runs of the same samples. It groups them by sample and tile, concatenates them, and saves the concatenated FASTQ.

Notebook Setup:

In [None]:
import pandas as pd

Paths:

In [None]:
# CSV sample sheet read by the loading cell below.
samplelist = "concat_fastq_samplelist.csv"
# Output location for the concatenated FASTQs. The trailing slash matters:
# output paths are built by plain string concatenation onto this value.
out_dir = "concat_fastq_out/"

Load samplelist:

In [None]:
# Read the sample sheet at `samplelist` into a DataFrame and render it so the
# listed sequencing runs can be inspected before they are grouped.
samples = pd.read_csv(filepath_or_buffer=samplelist)
display(samples)

Generate list of R1 files grouped by sample information. Ignore date, since we are combining sequencing runs. Sample information will be:
* library
* selection

In [None]:
# Validate the sample sheet before grouping. `groupby` silently drops rows
# whose key is NaN, so a blank `library`/`selection` would leave that FASTQ
# out of the concatenation without any warning; a blank `R1` would end up as
# the literal text "nan" in the `cat` command.
required_cols = ['library', 'selection', 'R1']
missing_cols = [col for col in required_cols if col not in samples.columns]
if missing_cols:
    raise ValueError(f"samplelist is missing required columns: {missing_cols}")

incomplete_rows = samples[samples[required_cols].isna().any(axis=1)]
if not incomplete_rows.empty:
    raise ValueError(
        "samplelist has rows with missing library/selection/R1 values:\n"
        f"{incomplete_rows}"
    )

# One row per (library, selection) sample; `R1_list` holds every R1 path for
# that sample, in the order the rows appear in the sample sheet.
R1_files = (
    samples
    .groupby(['library', 'selection'])['R1']
    .apply(list)
    .reset_index(name='R1_list')
)
display(R1_files)

Generate output directory with name "concat_fastq_out"

In [None]:
! mkdir concat_fastq_out -p

Make `cat` command string for each sample in R1_files

In [None]:
# For every sample (one row of R1_files), report its input FASTQs, derive the
# output file name, and build and print the shell `cat` command that
# concatenates the inputs into that output file. Nothing is executed here.
for index, row in R1_files.iterrows():
    print(f"Generating command for {row['library']} {row['selection']}.\n")

    # Collect all R1 files for this sample. str() guards against non-string
    # entries so the join below cannot fail.
    print("The files for this sample are:")
    r1_paths = [str(path) for path in row["R1_list"]]
    for path in r1_paths:
        print(path)

    R1_string = " ".join(r1_paths)

    # Output name: <out_dir><library>_<selection without hyphens>_R1.fastq.gz
    # Formatting via str()/f-string keeps this working when pandas parsed a
    # numeric-looking library or selection column as a number, where plain
    # `+` concatenation or `.replace` would raise.
    selection_tag = str(row['selection']).replace('-', '')
    output_file = f"{out_dir}{row['library']}_{selection_tag}_R1.fastq.gz"
    print(f"\nThe output file will be: {output_file}\n")

    # NOTE(review): paths are interpolated unquoted, so a path containing
    # spaces or shell metacharacters would break the command when it is run.
    command = "cat " + R1_string + " > " + output_file

    print("The command we run to concatentate these files will be:")
    print(command)
    print("\n")