# Extract barcodes and write GenBank files for plasmid log
I have sequence-confirmed plasmids. I now want to extract the 3 verified barcodes for each construct and save this information along with a plasmid map as a GenBank file. I will use this file to submit to the Bloom lab plasmid log. 

Author: Caroline Kikawa

In [1]:
# Import relevant packages
import os
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import pandas as pd
import glob
import re

First, annotate the library ID file (`barcode_ID_sheet.csv`) with a column called 'clone' and write each unique clone identifier in this column. Then, put all Primordium files in the 'data' directory (the code below looks for `.gbk` files matching `../data/plasmidsaurus_sequencing/*/*_genbank_files/*_{clone}_*.gbk`). If a given clone has 2 sequences for any reason, upload only 1 of them to the data directory.

If a given library sequence doesn't have 3 (or any) chosen clones, a GenBank file with barcode ID 'NNNNNNNNNNNNNNNN' will be generated for each plasmid that has no clone.

In [2]:
# Input and output locations, all relative to the notebook's directory
datadir = '../data'
resultsdir = '../results'
genbankdir = '../plasmids'
for directory in (datadir, resultsdir, genbankdir):
    os.makedirs(directory, exist_ok=True)

# Sheet listing each plasmid with its chosen clone and expected sequence
clone_ID_file = os.path.join(datadir, 'barcode_ID_sheet.csv')
clone_ID_df = pd.read_csv(clone_ID_file)

# Table that maps each accession ('GISAID_id') to its new strain name;
# the columns are renamed so the table can be joined to the sheet on 'epi'
strain_map_file = os.path.join(datadir, 'strain_to_id_map.csv')
strain_to_id_map = pd.read_csv(strain_map_file).rename(
    columns={'strain': 'GISAID_strain', 'GISAID_id': 'epi'}
)

# Replace the sheet's original 'strain-name' column with the mapped name.
# NOTE: `merge` defaults to an inner join, so sheet rows whose 'epi' is
# absent from the map do not appear in the result.
merged_df = clone_ID_df.merge(strain_to_id_map, on='epi')
merged_df = merged_df.drop(columns='strain-name')
clone_ID_df = merged_df.rename(columns={'GISAID_strain': 'strain-name'})

clone_ID_df

Unnamed: 0,lib-id,plasmid,epi,clone,expected-seq,strain-name
0,1,4142_phh_A-Switzerland-9715293-2013_HA_WSNflan...,EPI_ISL_166310,1E,actcttcctttttcaatattattgaagcatttatcagggttattgt...,A/Switzerland/9715293/2013
1,1,4143_phh_A-Switzerland-9715293-2013_HA_WSNflan...,EPI_ISL_166310,1K,actcttcctttttcaatattattgaagcatttatcagggttattgt...,A/Switzerland/9715293/2013
2,1,4144_phh_A-Switzerland-9715293-2013_HA_WSNflan...,EPI_ISL_166310,1L,actcttcctttttcaatattattgaagcatttatcagggttattgt...,A/Switzerland/9715293/2013
3,2,4145_phh_A-Switzerland-9715293-2013-NIB-88_HA_...,EPI_ISL_198223,2E,actcttcctttttcaatattattgaagcatttatcagggttattgt...,A/Switzerland/9715293/2013 NIB-88
4,2,4146_phh_A-Switzerland-9715293-2013-NIB-88_HA_...,EPI_ISL_198223,2F,actcttcctttttcaatattattgaagcatttatcagggttattgt...,A/Switzerland/9715293/2013 NIB-88
...,...,...,...,...,...,...
340,78,4375_phh_A-South-Dakota-22-2023_HA_WSNflank_bc3,EPI_ISL_17391847,78C,actcttcctttttcaatattattgaagcatttatcagggttattgt...,A/South Dakota/22/2023
341,78,4375_phh_A-South-Dakota-22-2023_HA_WSNflank_bc3,EPI_ISL_17391847,78C,actcttcctttttcaatattattgaagcatttatcagggttattgt...,A/South Dakota/22/2023
342,79,4636_phh_A-Thailand-8-2022_HA_WSNflank_bc1,EPI_ISL_16014504,79B,actcttcctttttcaatattattgaagcatttatcagggttattgt...,A/Thailand/8/2022
343,79,4637_phh_A-Thailand-8-2022_HA_WSNflank_bc2,EPI_ISL_16014504,79E,actcttcctttttcaatattattgaagcatttatcagggttattgt...,A/Thailand/8/2022


In [3]:
# Constant endodomain sequence; the 16N barcode sits immediately 3'
# (downstream) of its last base
endodomain = 'gttgagctgaagtcaggatacaaagattggatcctatggatttcctttgccATGtcTtgCttCCtActGtgCgtAgcACtACtAggCttTatTatgtgggcGtgTcaGaaAggCtcCCtAcaAtgTCgGatTtgTatTTAATAG'
# Uppercased final 20 nt of the endodomain, used as the search anchor
# when locating the barcode in sequencing results
last20endodomain = endodomain.upper()[-20:]

In [7]:
# Imported once here rather than on every loop iteration
from Bio.SeqFeature import SeqFeature, FeatureLocation

# Placeholder that marks the barcode position in every expected sequence
BARCODE_PLACEHOLDER = 'NNNNNNNNNNNNNNNN'
BARCODE_LENGTH = len(BARCODE_PLACEHOLDER)


def _find_barcode(record, anchor, barcode_length):
    """Return the barcode immediately 3' of `anchor` in a sequencing record.

    Both the forward sequence and its reverse complement are searched, in
    that order, and the first full-length hit wins.

    Args:
        record: SeqRecord read from the sequencing GenBank file.
        anchor: Uppercase sequence expected directly 5' of the barcode.
        barcode_length: Number of bases to take after the anchor.

    Returns:
        The barcode string, or None when the anchor is absent from both
        strands or fewer than `barcode_length` bases follow it (e.g. the
        read is linearised inside the barcode).
    """
    for strand in (record.seq, record.seq.reverse_complement()):
        # Normalise case so the comparison against the uppercase anchor
        # does not depend on how the file was written
        seq = str(strand).upper()
        anchor_start = seq.find(anchor)
        if anchor_start == -1:
            continue
        barcode_start = anchor_start + len(anchor)
        candidate = seq[barcode_start:barcode_start + barcode_length]
        if len(candidate) == barcode_length:
            return candidate
    return None


def _plasmid_features(strain_name):
    """Build the fixed feature annotations for one plasmid map.

    Coordinates are hard-coded for this construct layout; only the label
    of the HA gene feature depends on `strain_name`.
    """
    # (start, end, strand, feature type, label)
    layout = [
        (543, 709, -1, 'terminator', 'mouse PolI terminator'),
        (709, 721, -1, 'misc_feature', 'U12'),
        (709, 741, -1, 'misc_feature', "3' NCR"),
        (741, 798, +1, 'misc_feature', 'WSN first 19 aa'),
        (798, 2310, +1, 'misc_feature', f'HA gene from {strain_name}'),
        (2310, 2454, +1, 'misc_feature', 'constant H3 endodomain'),
        (2322, 2420, +1, 'misc_feature', 'HK19 coded packaging signal with I193M'),
        (2421, 2454, +1, 'misc_feature', 'WSN mutated packaging signal'),
        (2454, 2470, +1, 'misc_feature', 'barcode'),
        (2470, 2503, +1, 'misc_feature', 'IlluminaBarcodeRegion'),
        (2512, 2617, +1, 'misc_feature', 'packaging signal'),
        (2617, 2662, -1, 'misc_feature', "5' NCR"),
        (2650, 2662, -1, 'misc_feature', 'U13'),
        # This feature was defined but left out of the written list before
        (2662, 3065, -1, 'misc_feature', 'Human PolI promoter'),
        (3932, 4790, -1, 'CDS', 'AmpR'),
    ]
    return [
        SeqFeature(FeatureLocation(start, end, strand),
                   type=feature_type,
                   qualifiers={'label': label})
        for start, end, strand, feature_type, label in layout
    ]


# Initialize empty list for all barcodes and strains
barcode_strain_list = []

# Iterate through plasmids
# Identify unique 16N barcode
# Write GenBank files
for plasmid in clone_ID_df.plasmid.unique():

    # Subset clone ID df
    plasmid_df = clone_ID_df[clone_ID_df['plasmid'] == plasmid]

    # Define basic info for each plasmid (taken from its first row)
    raw_clone = plasmid_df['clone'].values[0]
    clone = str(raw_clone)
    expected_sequence = plasmid_df['expected-seq'].values[0]
    strain_name = plasmid_df['strain-name'].values[0]
    epi = plasmid_df['epi'].values[0]
    library_id = plasmid_df['lib-id'].values[0]

    # Defaults for plasmids with no confirmed clone: keep the expected
    # sequence with its N placeholder so the written map is still a real
    # plasmid sequence
    barcoded_sequence = expected_sequence
    barcode = BARCODE_PLACEHOLDER

    # Get barcode from the sequencing result, when a clone was chosen
    if not pd.isna(raw_clone):
        pattern = os.path.join(datadir, 'plasmidsaurus_sequencing', '*',
                               '*_genbank_files', f'*_{clone}_*.gbk')
        sequence_files = glob.glob(pattern)
        # Explicit raise instead of assert, so the check survives `python -O`
        if len(sequence_files) != 1:
            raise ValueError(
                f'Expected exactly 1 sequence file for clone {clone} but found '
                f'{len(sequence_files)} (missing file, redundant clone IDs or '
                f'multiple sequences), see {sequence_files}'
            )

        with open(sequence_files[0]) as handle:
            sequencing_record = SeqIO.read(handle, format='gb')

        # Barcode is directly 3' of the endodomain anchor and 16N long
        found_barcode = _find_barcode(sequencing_record, last20endodomain, BARCODE_LENGTH)
        if found_barcode is None:
            print(f'WARNING: no {BARCODE_LENGTH}-nt barcode found after the endodomain '
                  f'anchor for clone {clone} ({sequence_files[0]}); writing placeholder barcode')
        else:
            barcode = found_barcode
            # Write new sequence with barcode
            barcoded_sequence = expected_sequence.replace(BARCODE_PLACEHOLDER, barcode)

    # Now write the GenBank file
    # Write detailed description
    if library_id <= 16:
        variant_type = '2012-2021 vaccine strain'
    else:
        variant_type = 'late 2022 or 2023 strain'
    definition = (
        f"This pHH plasmid contains the HA sequence for a H3N2 variant from a {variant_type}. "
        f"Signal peptide and 3'NCR from WSN, ectodomain from H3N2 HA {strain_name} with accession {epi}, "
        "and last 46 aa recoded WSN transmembrane and c-terminal domain. "
        "With duplicated 5' packaging signals from WSN with a single stop codon in the duplicated packaging signal, "
        f"with the barcode {barcode}. "
        "This plasmid was cloned and sequence confirmed by Caroline Kikawa."
    )

    # Write sequence record and save
    record = SeqRecord(Seq(barcoded_sequence),
                       id='.',
                       name=plasmid,
                       description=definition,
                       features=_plasmid_features(strain_name),
                       annotations={'source': 'synthetic DNA construct',
                                    'organism': 'synthetic DNA construct',
                                    'molecule_type': 'ds-DNA',
                                    'topology': 'circular',
                                    'date': '14-DEC-2023'})

    outfile = os.path.join(genbankdir, f'{plasmid}.gb')
    with open(outfile, 'w') as f:
        SeqIO.write(record, f, 'genbank')

    # If barcode has been identified,
    # Add barcode and strain to list
    if barcode != BARCODE_PLACEHOLDER:
        barcode_strain_list.append([barcode, strain_name])
    




## Save a barcode-to-strain map for use in `seqneut-pipeline`

In [5]:
# Assemble the barcode-to-strain table from the collected pairs
barcode_strain_df = pd.DataFrame(barcode_strain_list, columns=['barcode', 'strain'])

# Remove egg-passaged Kansas strain
barcode_strain_df = barcode_strain_df.query('strain != "A/Kansas/14/2017 X-327"')

# Rename A/Singapore/NUH0526/2023 as A/Massachusetts/18/2022
strain_renames = {'A/Singapore/NUH0526/2023': 'A/Massachusetts/18/2022'}
barcode_strain_df = barcode_strain_df.replace({'strain': strain_renames})

# The phylogenetic tree has spaces stripped out of all strain names,
# so strip them here too and the names match
barcode_strain_df = barcode_strain_df.assign(
    strain=barcode_strain_df['strain'].str.replace(' ', '')
)

# Write the table without the index column, then display it
outfile = os.path.join(resultsdir, 'barcode_to_strain.csv')
barcode_strain_df.to_csv(outfile, index=False)
barcode_strain_df

Unnamed: 0,barcode,strain
0,TCGATTACTAGCCGGA,A/Switzerland/9715293/2013
1,AGCTGAATTAAGTATG,A/Switzerland/9715293/2013
2,CCAATCCCAGCCTTTA,A/Switzerland/9715293/2013
3,CGGGAAATGTAAATGA,A/Switzerland/9715293/2013NIB-88
4,ATAGGATATATGGCTG,A/Switzerland/9715293/2013NIB-88
...,...,...
232,TCAACCCTTCGATGTA,A/SouthDakota/22/2023
233,GAAGTAACAAACTATG,A/SouthDakota/22/2023
234,CGTACGTATGTCCCAG,A/Thailand/8/2022
235,AGACCATCGCACCCAA,A/Thailand/8/2022


In [6]:
# # Save list of all library strains
# library_strains_df = barcode_strain_df[['strain']].drop_duplicates().reset_index(drop=True)
# outfile = os.path.join(resultsdir, 'H3N2library_2023-2024_strain_names.csv')
# library_strains_df.to_csv(outfile, index=False)
# library_strains_df