In [1]:
%%bash

#1. Pair, merge and trim raw reads. 

# User must input paths to in and out directories. 
# User must input length of expected amplicon 

# 1. Pair and merge raw reads

/Users/katbraun/anaconda3/bin/bbmerge.sh \
in=/Users/katbraun/Documents/research/kat_braun/projects/UMI_method/data_derived/run_467/UMI_rep1/group1/ZIKV_UMI_rep1_1,1_R1.fastq.gz \
in2=/Users/katbraun/Documents/research/kat_braun/projects/UMI_method/data_derived/run_467/UMI_rep1/group1/ZIKV_UMI_rep1_1,1_R2.fastq.gz \
out=/Users/katbraun/Documents/research/kat_braun/projects/UMI_method/data_derived/run_467/UMI_rep1/group1/out_1/ZIKV_UMI_rep1_1,1_merged.fastq \
ow=t

#trim the merged reads to 143 bps

/Users/katbraun/anaconda3/bin/reformat.sh \
in=/Users/katbraun/Documents/research/kat_braun/projects/UMI_method/data_derived/run_467/UMI_rep1/group1/out_1/ZIKV_UMI_rep1_1,1_merged.fastq \
out=/Users/katbraun/Documents/research/kat_braun/projects/UMI_method/data_derived/run_467/UMI_rep1/group1/out_1/ZIKV_UMI_rep1_1,1_merged_trimmed.fastq \
minlength=143 \
maxlength=143 \
ow=t

java -Djava.library.path=/Users/katbraun/anaconda3/opt/bbmap-38.22-0/jni/ -ea -Xmx1000m -Xms1000m -cp /Users/katbraun/anaconda3/opt/bbmap-38.22-0/current/ jgi.BBMerge in=/Users/katbraun/Documents/research/kat_braun/projects/UMI_method/data_derived/run_467/UMI_rep1/group1/ZIKV_UMI_rep1_1,1_R1.fastq.gz in2=/Users/katbraun/Documents/research/kat_braun/projects/UMI_method/data_derived/run_467/UMI_rep1/group1/ZIKV_UMI_rep1_1,1_R2.fastq.gz out=/Users/katbraun/Documents/research/kat_braun/projects/UMI_method/data_derived/run_467/UMI_rep1/group1/out_1/ZIKV_UMI_rep1_1,1_merged.fastq ow=t
Executing jgi.BBMerge [in=/Users/katbraun/Documents/research/kat_braun/projects/UMI_method/data_derived/run_467/UMI_rep1/group1/ZIKV_UMI_rep1_1,1_R1.fastq.gz, in2=/Users/katbraun/Documents/research/kat_braun/projects/UMI_method/data_derived/run_467/UMI_rep1/group1/ZIKV_UMI_rep1_1,1_R2.fastq.gz, out=/Users/katbraun/Documents/research/kat_braun/projects/UMI_method/data_derived/run_467/UMI_rep1/group1/out_1/ZIKV_U

In [2]:
# 2. Creating a TSV file. Column 1 = UMI. Column 2 = fastqID. Quality_raw_reads = reads that were successfully paired, merged, and trimmed. 

# User must input location of UMI
# User must input the length of the expected amplicon. This should be the same number of base pairs that were input in cell #2

from Bio import SeqIO
import csv

input_file = '/Users/katbraun/Documents/research/kat_braun/projects/UMI_method/data_derived/run_467/UMI_rep1/group1/out_1/ZIKV_UMI_rep1_1,1_merged_trimmed.fastq' 
fastq_sequences = SeqIO.parse(open(input_file),'fastq')

UMI_list = []

for fastq in fastq_sequences: 
    sequence = str(fastq.seq)
    fastqID = str(fastq.id)
    # print(sequence[131:143]) #UMI, 12 bps
    # print(sequence[:143]) #entire amplicon
    
    UMI = str(sequence[131:143])
    amplicon = str(sequence[:143])
    fastqID = str(fastqID[29:43])
    
    UMI_list.append([UMI, amplicon, fastqID])
    
with open('UMI_list_ZIKV_rep1_group1_1,1.tsv','w') as f:
    for i in UMI_list:
        f.write(i[0] + '\t' + i[1] + '\t' + i[2] + '\n')
        
with open('UMI_list_ZIKV_rep1_group1_1,1.tsv') as f: 
    quality_raw_reads = 0 
    for line in f: 
        quality_raw_reads += 1
        
print('Number of quality raw reads is:', quality_raw_reads)

Number of quality raw reads is: 348547


In [3]:
# 3. Creating a dictionary. Key = UMI. Values = fastqIDs. 

# User must input location of UMI. This should be the same as in cell number 3. 
# User must input the length of the expected amplicon. This should be the same number of base pairs that were input in cell #2

from Bio import SeqIO

input_file = '/Users/katbraun/Documents/research/kat_braun/projects/UMI_method/data_derived/run_467/UMI_rep1/group1/out_1/ZIKV_UMI_rep1_1,1_merged_trimmed.fastq'
fastq_sequences = SeqIO.parse(open(input_file),'fastq')

UMI_dict = {}

for fastq in fastq_sequences: 
    sequence = fastq.seq.tostring()
    fastqID = str(fastq.id)
    # print(sequence[131:143]) #UMI, 12 bps, key 
    # print(sequence[:143]) #FastqID
    
    fastqID = str(fastqID[0:])
    UMI = str(sequence[131:143])
    
    if not UMI in UMI_dict:
        UMI_dict[UMI] = [fastqID]
    else:
        UMI_dict[UMI].append(fastqID)

#We won't need to print the dictionary when working with non-downsampled files. It won't print the dictionary when working with full datasets.  
#print(UMI_dict)



In [4]:
# 4. Converts the UMI dictionary to a pandas dataframe. Index = UMI. Values = fastqID, which are listed in columns.
# The last column is equal to the number of reads per unique UMI.

import pandas as pd
#from numpy import nan

UMI_DF = pd.DataFrame.from_dict(UMI_dict, orient='index')
#UMI_DF.fillna(value=nan, inplace=True)
UMI_DF['read_count'] = UMI_DF.count(axis=1)

UMI_DF

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,read_count
GGACATCTTTGA,M01472:143:000000000-BTDJR:1:1101:18682:5058,M01472:143:000000000-BTDJR:1:1109:8718:9995,M01472:143:000000000-BTDJR:1:1109:18065:25490,M01472:143:000000000-BTDJR:1:2102:18035:27937,M01472:143:000000000-BTDJR:1:2103:27429:14719,M01472:143:000000000-BTDJR:1:2104:15659:8207,M01472:143:000000000-BTDJR:1:2105:18820:23315,M01472:143:000000000-BTDJR:1:2108:6164:16442,M01472:143:000000000-BTDJR:1:2113:17353:12675,,...,,,,,,,,,,9
TGAAATCCGTGT,M01472:143:000000000-BTDJR:1:1101:23221:5062,M01472:143:000000000-BTDJR:1:1106:9062:20235,M01472:143:000000000-BTDJR:1:1110:26857:19310,M01472:143:000000000-BTDJR:1:1111:20287:18602,M01472:143:000000000-BTDJR:1:2101:24456:7203,M01472:143:000000000-BTDJR:1:2103:12926:11239,M01472:143:000000000-BTDJR:1:2103:12908:11250,M01472:143:000000000-BTDJR:1:2106:26082:19578,M01472:143:000000000-BTDJR:1:2107:12803:9903,M01472:143:000000000-BTDJR:1:2111:7488:8187,...,,,,,,,,,,12
GAACAGTCAAAA,M01472:143:000000000-BTDJR:1:1101:12596:5066,,,,,,,,,,...,,,,,,,,,,1
CAGGTCTTGACT,M01472:143:000000000-BTDJR:1:1101:8177:5068,M01472:143:000000000-BTDJR:1:1104:24849:13078,M01472:143:000000000-BTDJR:1:1105:6897:14660,M01472:143:000000000-BTDJR:1:1106:16534:7230,M01472:143:000000000-BTDJR:1:1107:11005:23464,M01472:143:000000000-BTDJR:1:1110:8583:17597,M01472:143:000000000-BTDJR:1:1110:27177:21690,M01472:143:000000000-BTDJR:1:1111:21374:6519,M01472:143:000000000-BTDJR:1:2101:25159:6995,M01472:143:000000000-BTDJR:1:2111:9654:3849,...,,,,,,,,,,10
AAGCAGAGCAGA,M01472:143:000000000-BTDJR:1:1101:18572:5071,M01472:143:000000000-BTDJR:1:1104:16954:11340,M01472:143:000000000-BTDJR:1:1107:20551:26764,M01472:143:000000000-BTDJR:1:1114:12187:4645,M01472:143:000000000-BTDJR:1:2101:11872:11687,M01472:143:000000000-BTDJR:1:2101:18937:17051,M01472:143:000000000-BTDJR:1:2107:14803:14027,M01472:143:000000000-BTDJR:1:2108:8975:3289,M01472:143:000000000-BTDJR:1:2112:17928:9761,M01472:143:000000000-BTDJR:1:2112:17937:9777,...,,,,,,,,,,12
GTGGTGGGGTTA,M01472:143:000000000-BTDJR:1:1101:15003:5078,M01472:143:000000000-BTDJR:1:1103:17948:7797,M01472:143:000000000-BTDJR:1:1103:9819:19761,M01472:143:000000000-BTDJR:1:2102:22540:19606,M01472:143:000000000-BTDJR:1:2104:25563:9334,,,,,,...,,,,,,,,,,5
AAGCATGAAATC,M01472:143:000000000-BTDJR:1:1101:7854:5079,M01472:143:000000000-BTDJR:1:2101:10227:8863,M01472:143:000000000-BTDJR:1:2111:26603:17824,,,,,,,,...,,,,,,,,,,3
TTGGCATGAAAA,M01472:143:000000000-BTDJR:1:1101:12993:5079,M01472:143:000000000-BTDJR:1:1103:25968:17305,M01472:143:000000000-BTDJR:1:1104:8078:5833,M01472:143:000000000-BTDJR:1:1105:13320:3876,M01472:143:000000000-BTDJR:1:1105:10334:17383,M01472:143:000000000-BTDJR:1:1114:14494:12337,M01472:143:000000000-BTDJR:1:1114:14494:12358,M01472:143:000000000-BTDJR:1:2101:14363:9193,M01472:143:000000000-BTDJR:1:2102:19107:25520,M01472:143:000000000-BTDJR:1:2109:11898:8468,...,,,,,,,,,,11
GATAGCTTCGAC,M01472:143:000000000-BTDJR:1:1101:6709:5088,,,,,,,,,,...,,,,,,,,,,1
AAAAACCCCATT,M01472:143:000000000-BTDJR:1:1101:24409:5091,,,,,,,,,,...,,,,,,,,,,1


In [5]:
#5. Determine m. m = maximum number of reads associated with any unique UMI.
#6. Apply m to read number cutoff model to determine c. c = minimum number of reads per UMI in order for an UMI to be considered 'real'. 

import pandas as pd
from math import exp, expm1

UMI_DF = pd.DataFrame.from_dict(UMI_dict, orient='index')
UMI_DF['read_count'] = UMI_DF.count(axis=1)
m = UMI_DF['read_count'].max()

c = ((-1.24e-21)*(m**6)) + ((3.53e-17)*(m**5)) - ((3.9e-13)*(m**4)) + ((2.12e-9)*(m**3)) - ((6.06e-6)*(m**2)) + (.018*m) + 3.15

print('Maximum number of reads associated with any unique UMI, m =', m)
print('Minimum number of reads required per UMI to pass quality cutoff, c =', c)

Maximum number of reads associated with any unique UMI, m = 28
Minimum number of reads required per UMI to pass quality cutoff, c = 3.6492952591310885


In [6]:
# 7. Apply read number cutoff model to UMI_dataframe. Discard all UMIs with </ c. 

import math 

c_up = math.ceil(c)
print('Read number cutoff is:', c_up)

UMI_DF_cutoff = UMI_DF[UMI_DF['read_count'] >= c_up]

number_of_consensus_sequences = len(UMI_DF_cutoff)
print('Number of consensus sequences passing cutoff is:', number_of_consensus_sequences)
UMI_DF_cutoff

Read number cutoff is: 4
Number of consensus sequences passing cutoff is: 37573


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,read_count
GGACATCTTTGA,M01472:143:000000000-BTDJR:1:1101:18682:5058,M01472:143:000000000-BTDJR:1:1109:8718:9995,M01472:143:000000000-BTDJR:1:1109:18065:25490,M01472:143:000000000-BTDJR:1:2102:18035:27937,M01472:143:000000000-BTDJR:1:2103:27429:14719,M01472:143:000000000-BTDJR:1:2104:15659:8207,M01472:143:000000000-BTDJR:1:2105:18820:23315,M01472:143:000000000-BTDJR:1:2108:6164:16442,M01472:143:000000000-BTDJR:1:2113:17353:12675,,...,,,,,,,,,,9
TGAAATCCGTGT,M01472:143:000000000-BTDJR:1:1101:23221:5062,M01472:143:000000000-BTDJR:1:1106:9062:20235,M01472:143:000000000-BTDJR:1:1110:26857:19310,M01472:143:000000000-BTDJR:1:1111:20287:18602,M01472:143:000000000-BTDJR:1:2101:24456:7203,M01472:143:000000000-BTDJR:1:2103:12926:11239,M01472:143:000000000-BTDJR:1:2103:12908:11250,M01472:143:000000000-BTDJR:1:2106:26082:19578,M01472:143:000000000-BTDJR:1:2107:12803:9903,M01472:143:000000000-BTDJR:1:2111:7488:8187,...,,,,,,,,,,12
CAGGTCTTGACT,M01472:143:000000000-BTDJR:1:1101:8177:5068,M01472:143:000000000-BTDJR:1:1104:24849:13078,M01472:143:000000000-BTDJR:1:1105:6897:14660,M01472:143:000000000-BTDJR:1:1106:16534:7230,M01472:143:000000000-BTDJR:1:1107:11005:23464,M01472:143:000000000-BTDJR:1:1110:8583:17597,M01472:143:000000000-BTDJR:1:1110:27177:21690,M01472:143:000000000-BTDJR:1:1111:21374:6519,M01472:143:000000000-BTDJR:1:2101:25159:6995,M01472:143:000000000-BTDJR:1:2111:9654:3849,...,,,,,,,,,,10
AAGCAGAGCAGA,M01472:143:000000000-BTDJR:1:1101:18572:5071,M01472:143:000000000-BTDJR:1:1104:16954:11340,M01472:143:000000000-BTDJR:1:1107:20551:26764,M01472:143:000000000-BTDJR:1:1114:12187:4645,M01472:143:000000000-BTDJR:1:2101:11872:11687,M01472:143:000000000-BTDJR:1:2101:18937:17051,M01472:143:000000000-BTDJR:1:2107:14803:14027,M01472:143:000000000-BTDJR:1:2108:8975:3289,M01472:143:000000000-BTDJR:1:2112:17928:9761,M01472:143:000000000-BTDJR:1:2112:17937:9777,...,,,,,,,,,,12
GTGGTGGGGTTA,M01472:143:000000000-BTDJR:1:1101:15003:5078,M01472:143:000000000-BTDJR:1:1103:17948:7797,M01472:143:000000000-BTDJR:1:1103:9819:19761,M01472:143:000000000-BTDJR:1:2102:22540:19606,M01472:143:000000000-BTDJR:1:2104:25563:9334,,,,,,...,,,,,,,,,,5
TTGGCATGAAAA,M01472:143:000000000-BTDJR:1:1101:12993:5079,M01472:143:000000000-BTDJR:1:1103:25968:17305,M01472:143:000000000-BTDJR:1:1104:8078:5833,M01472:143:000000000-BTDJR:1:1105:13320:3876,M01472:143:000000000-BTDJR:1:1105:10334:17383,M01472:143:000000000-BTDJR:1:1114:14494:12337,M01472:143:000000000-BTDJR:1:1114:14494:12358,M01472:143:000000000-BTDJR:1:2101:14363:9193,M01472:143:000000000-BTDJR:1:2102:19107:25520,M01472:143:000000000-BTDJR:1:2109:11898:8468,...,,,,,,,,,,11
ATTATAGCTACA,M01472:143:000000000-BTDJR:1:1101:15436:5093,M01472:143:000000000-BTDJR:1:1102:9329:8597,M01472:143:000000000-BTDJR:1:1107:13608:8887,M01472:143:000000000-BTDJR:1:1109:14715:17702,M01472:143:000000000-BTDJR:1:1111:24987:12001,M01472:143:000000000-BTDJR:1:1111:6901:18881,M01472:143:000000000-BTDJR:1:1113:2298:11598,,,,...,,,,,,,,,,7
ATCCTAGATACA,M01472:143:000000000-BTDJR:1:1101:18984:5097,M01472:143:000000000-BTDJR:1:1110:17963:23723,M01472:143:000000000-BTDJR:1:1112:26334:15613,M01472:143:000000000-BTDJR:1:1112:26331:15631,M01472:143:000000000-BTDJR:1:1113:21200:22233,M01472:143:000000000-BTDJR:1:2101:20334:21265,M01472:143:000000000-BTDJR:1:2107:17983:22218,M01472:143:000000000-BTDJR:1:2109:22816:25825,,,...,,,,,,,,,,8
GTGAGTGTTTTG,M01472:143:000000000-BTDJR:1:1101:19447:5100,M01472:143:000000000-BTDJR:1:1105:11177:28459,M01472:143:000000000-BTDJR:1:1113:27125:14862,M01472:143:000000000-BTDJR:1:2108:14759:27563,M01472:143:000000000-BTDJR:1:2109:21186:8891,M01472:143:000000000-BTDJR:1:2110:18501:24457,,,,,...,,,,,,,,,,6
TAGAGCAATTCA,M01472:143:000000000-BTDJR:1:1101:17379:5101,M01472:143:000000000-BTDJR:1:1112:13893:11783,M01472:143:000000000-BTDJR:1:2102:9335:6323,M01472:143:000000000-BTDJR:1:2102:9320:6335,M01472:143:000000000-BTDJR:1:2103:20278:21351,,,,,,...,,,,,,,,,,5


In [7]:
# 8. Determine quality coverage and conversion rate. 

# User must input the number of input templates as 'x' in order to determine coverage and conversion rate. 

import math

input_templates = 109000


quality_coverage = quality_raw_reads/input_templates
conversion_rate = (number_of_consensus_sequences/input_templates)*100

print('Quality coverage is:', quality_coverage)
print('Conversion rate is:', conversion_rate, '%')

Quality coverage is: 3.1976788990825686
Conversion rate is: 34.47064220183486 %


In [8]:
# 9. Convert dataframe indices, which are the UMIs (as identified by fastq ID) passing filter into a list. 

import pandas as pd
import csv

UMI_DF2_cutoff = UMI_DF_cutoff[UMI_DF_cutoff.columns.difference(['read_count'])]
UMIs_passing_cutoff = UMI_DF2_cutoff.values.tolist()
UMIs_passing_cutoff_filtered = [[i for i in x if i != None] for x in UMIs_passing_cutoff]

#print(UMIs_passing_cutoff_filtered)

with open ("UMIs_passing_cutoff_filtered_rep1_group1_1,1.csv","w") as f: 
    write = csv.writer(f)
    for i in UMIs_passing_cutoff_filtered:
        write.writerows(i)
        
with open ("UMIs_passing_cutoff_filtered_rep1_group1_1,1.csv") as infile, open("UMIs_passing_cutoff_filtered2_rep1_group1_1,1.lst","w") as outfile:
    for line in infile:
        outfile.write(line.replace(",", ""))
       
    
#UMIs_passing_cutoff_UMIs = UMI_DF_cutoff.index.tolist()
#print(UMIs_passing_cutoff_UMIs)

#UMIs_passing_cutoff_fastqID = UMI_DF_cutoff.column(0).tolist()
#print(UMIs_passing_cutoff_fastqID)

In [9]:
%%bash
#10. Search merged and trimmed fastq for all reads with fastQ-IDs passing cutoff. 

# User must input path to downsampled-merged-trimmed.fastq exists
# User must input path to the list of UMIs, as identified by fastq ID, whcih was the final output file in cell 9 
# User must input desired path to output fastQ file 

seqtk subseq /Users/katbraun/Documents/research/kat_braun/projects/UMI_method/data_derived/run_467/UMI_rep1/group1/out_1/ZIKV_UMI_rep1_1,1_merged_trimmed.fastq /Users/katbraun/Documents/research/kat_braun/projects/UMI_method/data_derived/run_467/UMI_rep1/group1/out_1/UMIs_passing_cutoff_filtered2_rep1_group1_1,1.lst > /Volumes/KMB_hard_drive_1/SNP_reproducibility/UMI_1/group_1/ZIKV_reads_passing_cutoff_rep1_group1_1,1.fastq
ow=t

In [10]:
#11. Pass the UMI of each read into the fastQ header.  

# User must input the path to output fastQ from cell #9
# User must input desired path to output fasta file 
# User must input location of UMI sequence, which should be identical to cell #2

from Bio import SeqIO
import csv

input_file = '/Volumes/KMB_hard_drive_1/SNP_reproducibility/UMI_1/group_1/ZIKV_reads_passing_cutoff_rep1_group1_1,1.fastq' 
fastq_sequences = SeqIO.parse(open(input_file),'fastq')
corrected_file = '/Volumes/KMB_hard_drive_1/SNP_reproducibility/UMI_1/group_1/ZIKV_reads_passing_cutoff_newheader_rep1_group1_1,1.fastq'
    
with open(corrected_file, 'a') as corrected: 
    fastq_sequences = SeqIO.parse(input_file, 'fastq')
    for fastq in fastq_sequences:
        sequence = str(fastq.seq)
        #print(fastq.seq)
        UMI = sequence[131:143]
        #print(UMI)
        fastq.id = UMI
        #fastq.description = UMI + fastq.id
        SeqIO.write(fastq, corrected, 'fasta')


In [13]:
%%bash

#delete part of the fasta header so it only contains the UMI

sed 's/M.*//' '/Volumes/KMB_hard_drive_1/SNP_reproducibility/UMI_1/group_1/ZIKV_reads_passing_cutoff_newheader_rep1_group1_1,1.fastq' > '/Volumes/KMB_hard_drive_1/SNP_reproducibility/UMI_1/group_1/ZIKV_reads_passing_cutoff_newheader2_rep1_group1_1,1.fasta'

#seqkit to split fasta files according to the header

seqkit split --by-id --id-regexp "\[(.+)\]" "/Volumes/KMB_hard_drive_1/SNP_reproducibility/UMI_1/group_1/ZIKV_reads_passing_cutoff_newheader2_rep1_group1_1,1.fasta"

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [14]:
#12. Group fastQ sequences by header and create one consensus sequence for each quality UMI.

from Bio import AlignIO
from Bio.Align import AlignInfo
from Bio import SeqIO
import os 
import glob

directory='/Volumes/KMB_hard_drive_1/SNP_reproducibility/UMI_1/group_1/ZIKV_reads_passing_cutoff_newheader2_rep1_group1_1,1.fasta.split/'

files_in_direct=glob.glob(directory+"*.fasta")

print("Script has detected "+str(len(files_in_direct))+" files in the directory "+directory)

for file in files_in_direct:
    align=AlignIO.read(file, "fasta")
    summary_align=AlignInfo.SummaryInfo(align)
    consensus=summary_align.dumb_consensus(threshold=0.5, ambiguous='N')
    str_con=str(consensus)
    #print(consensus)
    
    filesplit = file.split("/")
    filesplitfurther=filesplit[-1].split("_")
    filesplitevenfurther=filesplitfurther[-1].split(".")
    ID=filesplitevenfurther[0]
    
#     output_con = open('/Users/katbraun/Desktop/raw_data/out/concensus_seqs/'+ID+".consensus.fasta","w")
    
#     output_con.write(">"+ID+"\n")
#     output_con.write(str_con)
    
#     output_con.close()
    
    
    output_con= open('/Users/katbraun/Documents/research/kat_braun/projects/UMI_method/data_derived/run_467/UMI_rep1/group1/out_1/consensus_rep1_group1_1,1.fasta',"a")
    
    output_con.write(">"+ID+"\n")
    output_con.write(str_con+"\n")
    
    output_con.close()

Script has detected 37573 files in the directory /Volumes/KMB_hard_drive_1/SNP_reproducibility/UMI_1/group_1/ZIKV_reads_passing_cutoff_newheader2_rep1_group1_1,1.fasta.split/
