# Design HA inserts
These will be added into our custom onboarded HA expression vector with Twist. In the Bloom lab, this plasmid has the ID# 2851. 

In [1]:
# Import relevant packages
import os
import Bio
from Bio import SeqIO
import pandas as pd
import random


# Input and output locations (relative paths)
datadir = '../data'
resultsdir = '../results'

# Ordersheets live in their own sub-folder of the results directory
ordersheetsdir = os.path.join(resultsdir, 'ordersheets')

# Create every directory that is still missing; existing ones are left alone
for directory in (datadir, resultsdir, ordersheetsdir):
    os.makedirs(directory, exist_ok=True)

There is a fixed upstream WSN packaging signal (19 amino acids) to which we need to append the HA ectodomain of each of our sequences.

The HA endodomain is kept constant for H3s and H1s.
* For H3s, this represents amino acids 521-560 (when numbering from methionine start codon as 1)
* For H1s, Andrea uses WSN sequence from amino acids 521 to end (position 568)

Downstream, we need to add the mutated WSN packaging signal and a 16 nucleotide barcode. 

In [2]:
# Codons for the first 19 amino acids of WSN HA
wsn_upstream_19aa = 'atgaaggcaaaactactggtcctgttatatgcatttgtagctacagatgcagacaca'

# Codons for the first 20 amino acids of WSN HA (the 19 above plus one more codon)
wsn_upstream_20aa = wsn_upstream_19aa + 'ata'

# Downstream mutated WSN packaging signal: 11 codons, the last two being the
# double stop codon (TAA TAG)
wsn_downstream = 'ggCtcCCtAcaAtgTCgGatTtgTatTTAATAG'

# H1 endodomain taken from WSN (amino acid 521 to the end). The sequence
# already finishes with the mutated downstream packaging signal and its double
# stop codon, so it is written here as HA codons + `wsn_downstream`.
h1_last_46aa_from_WSN = (
    'aaattggaatcaatgggagtgtatcagattctggcgatatattctacagtggcaagctccttagtactgctagtttctttaggagcgattagcttttggatgtgctccaac'
    + wsn_downstream
)

# H3 endodomain, representing amino acids 521 to 560 (40 codons); unlike the H1
# constant above it does NOT include `wsn_downstream`
h3_endodomain = 'atcaagggagttgagctgaagtcaggatacaaagattggatcctatggatttcctttgccATGtcTtgCttCCtActGtgCgtAgcACtACtAggCttTatTatgtgggcGtgTcaGaaA'

# Placeholder barcode (16 nucleotides) and the alphabet random barcodes are drawn from
barcode = 'NNNNNNNNNNNNNNNN'
nucleotides = list('acgt')

# Barcodes already claimed by earlier viral libraries (the 'barcode' column of each sheet)
kikawa2023 = pd.read_csv(os.path.join(datadir, 'viral_libraries/2023_H3N2_Kikawa.csv'))['barcode'].tolist()
loes2023 = pd.read_csv(os.path.join(datadir, 'viral_libraries/pdmH1N1_lib2023_loes.csv'))['barcode'].tolist()

# Barcode index: every barcode that is already taken by a construct.
# Seeded with the prior libraries (Loes first, then Kikawa).
barcode_index = [*loes2023, *kikawa2023]

Define a function for designing inserts

In [7]:
def design_inserts(subtype, 
                   insert_filepath, 
                   library_nucleotide_sequences,
                   upstream_signalpep,
                   ectodomain_start,
                   ectodomain_length,
                   endodomain_sequence,
                   start_codon = "ATGAAG",
                   append_additional_upstream_sequence = '',
                   append_additional_downstream_sequence = '',
                  ):
    """Design barcoded HA inserts for every strain in a FASTA file and save them as a CSV.

    For each FASTA record, two constructs are designed. Each construct is
    ``append_additional_upstream_sequence + ectodomain + endodomain_sequence +
    barcode + append_additional_downstream_sequence``, where the barcode is a
    random 16-mer that does not start with ``gg`` and is not yet present in the
    module-level ``barcode_index``. Accepted barcodes are appended to
    ``barcode_index``; the barcode alphabet is the module-level ``nucleotides``.

    If ``insert_filepath`` already exists, nothing is designed and the file is
    left untouched, so previously ordered barcodes are never regenerated.

    Parameters
    ----------
    subtype : str
        Prefix of the construct name (``{subtype}_{virus number}_bc{barcode number}``).
    insert_filepath : str
        CSV file to write (columns: strain, genbank, name, sequence).
    library_nucleotide_sequences : str
        Path of the FASTA file holding the nucleotide sequences of the strains.
    upstream_signalpep : str
        Fixed upstream signal peptide sequence. Kept for interface
        compatibility; it does not contribute to the written insert sequence.
    ectodomain_start : int
        Offset in NUCLEOTIDES from the start-codon match at which the
        ectodomain slice begins.
    ectodomain_length : int
        Offset in CODONS (multiplied by 3 internally) from the start-codon
        match at which the ectodomain slice ends.
    endodomain_sequence : str
        Sequence appended directly after the ectodomain.
    start_codon : str
        Motif whose first occurrence in each record anchors the slicing
        (the search is case-sensitive).
    append_additional_upstream_sequence : str
        Optional sequence prepended to the insert.
    append_additional_downstream_sequence : str
        Optional sequence appended after the barcode.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        If ``start_codon`` is not found in a record.
    RuntimeError
        If no acceptable barcode is found within the allowed number of attempts.
    """
    # Only design if the ordersheet hasn't been generated
    if os.path.exists(insert_filepath):
        print(f"File '{insert_filepath}' exists, reading that file and NOT regenerating barcodes.")
        return

    # Number of barcoded constructs per strain, and attempts allowed per barcode
    n_barcodes = 2
    max_barcode_attempts = 100

    # Optional leading text of the FASTA description that precedes the Genbank ID
    genbank_prefix = 'protein identical to '

    # Rows of the ordersheet: [strain, genbank, name, sequence]
    inserts = []

    # Open FASTA file and design constructs for each entry
    with open(library_nucleotide_sequences, "r") as handle:
        for virus_id, record in enumerate(SeqIO.parse(handle, "fasta"), start=1):

            # Anchor on the first instance of the start codon motif
            start_position = record.seq.find(start_codon)
            assert start_position != -1, f"For {record.id} - no start codon {start_codon} found"

            # Slice the ectodomain once per strain (it does not depend on the barcode)
            insert_start = start_position + ectodomain_start
            insert_end = start_position + (ectodomain_length*3)
            ectodomain_insert_seq = str(record.seq[insert_start:insert_end])

            # Get Genbank ID (and additional mutations) from FASTA header.
            # Remove the prefix as a whole string: str.strip() with an argument
            # strips a *set of characters* and could eat part of the annotation.
            genbank_id = record.description[len(record.id):].strip()
            if genbank_id.startswith(genbank_prefix):
                genbank_id = genbank_id[len(genbank_prefix):].strip()

            # Each strain needs barcodes designed
            for barcode_number in range(1, n_barcodes + 1):

                for _ in range(max_barcode_attempts):
                    candidate = ''.join(random.choices(nucleotides, k=16))
                    if candidate[0:2] == 'gg': # Don't use barcodes that start with GG
                        continue
                    if candidate in barcode_index: # Don't reuse a barcode
                        continue
                    barcode_index.append(candidate)
                    break
                else:
                    # Every attempt was rejected: fail loudly instead of silently
                    # ordering a construct with a rejected barcode.
                    raise RuntimeError(
                        f"Could not generate an unused barcode for {record.id} after "
                        f"{max_barcode_attempts} attempts; something really rare happened, "
                        "try resetting barcode_index"
                    )

                # Insert sequence we need to order: optional upstream overlap,
                # ectodomain, endodomain, barcode, optional downstream overlap
                insert_seq = (
                    append_additional_upstream_sequence
                    + ectodomain_insert_seq
                    + endodomain_sequence
                    + candidate
                    + append_additional_downstream_sequence
                )

                # Make a strain name with barcode info
                name_barcoded = f'{subtype}_{virus_id}_bc{barcode_number}'

                # Add to inserts list
                inserts.append([record.id, genbank_id, name_barcoded, insert_seq])

    inserts_df = pd.DataFrame(inserts, columns = ['strain', 'genbank', 'name', 'sequence'])
    inserts_df.to_csv(insert_filepath, index=False)
        


## Design H1 inserts

In [8]:
# Ordersheet for the H1 constructs; it is only (re)generated when this file is missing
h1_inserts_path = os.path.join(ordersheetsdir, 'h1_inserts.csv')

design_inserts(
    subtype='H1N1',
    insert_filepath=h1_inserts_path,
    library_nucleotide_sequences='../results/strains_for_library/h1_nt_seqs_for_library.fasta',
    upstream_signalpep=wsn_upstream_20aa,
    ectodomain_start=20 * 3,  # offset in nucleotides (20 codons)
    ectodomain_length=520,  # offset in codons
    endodomain_sequence=h1_last_46aa_from_WSN,
    # Overlap with WSN signal peptide
    append_additional_upstream_sequence='catttgtagctacagatgcagacaca' + 'ata',
    # Overlap with Illumina R1 priming sequence
    append_additional_downstream_sequence='AGATCGGAAGAGCGTCGTGT',
)

# Always load the ordersheet from disk so the displayed table matches what is saved
h1_inserts_df = pd.read_csv(h1_inserts_path)
h1_inserts_df

File '../results/ordersheets/h1_inserts.csv' exists, reading that file and NOT regenerating barcodes.


Unnamed: 0,strain,name,sequence
0,A/Maryland/64/2024_H1N1,H1N1_1_bc1,catttgtagctacagatgcagacacaataTGTATAGGTTATCATGC...
1,A/Maryland/64/2024_H1N1,H1N1_1_bc2,catttgtagctacagatgcagacacaataTGTATAGGTTATCATGC...
2,A/Qinghai-Chengzhong/SWL1410/2024_H1N1,H1N1_2_bc1,catttgtagctacagatgcagacacaataTGTATAGGTTATCATGC...
3,A/Qinghai-Chengzhong/SWL1410/2024_H1N1,H1N1_2_bc2,catttgtagctacagatgcagacacaataTGTATAGGTTATCATGC...
4,A/Ulsan/492/2025_H1N1,H1N1_3_bc1,catttgtagctacagatgcagacacaataTGTATAGGTTATCATGC...
...,...,...,...
71,A/Utah/39/2025_H1N1,H1N1_36_bc2,catttgtagctacagatgcagacacaataTGTATAGGTTATCATGC...
72,A/Hawaii/ISC-1140/2025_H1N1,H1N1_37_bc1,catttgtagctacagatgcagacacaataTGTATAGGTTATCATGC...
73,A/Hawaii/ISC-1140/2025_H1N1,H1N1_37_bc2,catttgtagctacagatgcagacacaataTGTATAGGTTATCATGC...
74,A/NovaScotia/ET1801CP00018S/2025_H1N1,H1N1_38_bc1,catttgtagctacagatgcagacacaataTGCATAGGTTATCATGC...


## Design H3 inserts

In [9]:
# Ordersheet for the H3 constructs; it is only (re)generated when this file is missing
h3_inserts_path = os.path.join(ordersheetsdir, 'h3_inserts.csv')

design_inserts(
    subtype='H3N2',
    insert_filepath=h3_inserts_path,
    library_nucleotide_sequences='../results/strains_for_library/h3_nt_seqs_for_library.fasta',
    upstream_signalpep=wsn_upstream_19aa,
    ectodomain_start=16 * 3,  # offset in nucleotides (16 codons)
    ectodomain_length=517,  # offset in codons
    # The H3 endodomain constant carries no packaging signal, so add it here
    endodomain_sequence=h3_endodomain + wsn_downstream,
    # Overlap with WSN signal peptide
    append_additional_upstream_sequence='catttgtagctacagatgcagacaca',
    # Overlap with Illumina R1 priming sequence
    append_additional_downstream_sequence='AGATCGGAAGAGCGTCGTGT',
)

# Always load the ordersheet from disk so the displayed table matches what is saved
h3_inserts_df = pd.read_csv(h3_inserts_path)
h3_inserts_df

File '../results/ordersheets/h3_inserts.csv' exists, reading that file and NOT regenerating barcodes.


Unnamed: 0,strain,name,sequence
0,A/Washington/284/2024_H3N2,H3N2_1_bc1,catttgtagctacagatgcagacacaCAAAAAATACCTGGAAATGA...
1,A/Washington/284/2024_H3N2,H3N2_1_bc2,catttgtagctacagatgcagacacaCAAAAAATACCTGGAAATGA...
2,A/Wisconsin/NIRC-IS-1028/2024_H3N2,H3N2_2_bc1,catttgtagctacagatgcagacacaCAAAAAATACCTGGAAATGA...
3,A/Wisconsin/NIRC-IS-1028/2024_H3N2,H3N2_2_bc2,catttgtagctacagatgcagacacaCAAAAAATACCTGGAAATGA...
4,A/Maldives/2147/2024_H3N2,H3N2_3_bc1,catttgtagctacagatgcagacacaCAAAAAATACCTGGAAATGA...
...,...,...,...
147,A/Amapa/021563-IEC/2024_H3N2,H3N2_74_bc2,catttgtagctacagatgcagacacaCAAAAAATACCTGGAAATGA...
148,A/New_York/39/2025_H3N2,H3N2_75_bc1,catttgtagctacagatgcagacacaCAAAAAATACCTGGAAATGA...
149,A/New_York/39/2025_H3N2,H3N2_75_bc2,catttgtagctacagatgcagacacaCAAAAAATACCTGGAAATGA...
150,A/New_York/GKISBBBE61555/2025_H3N2,H3N2_76_bc1,catttgtagctacagatgcagacacaCAAAAAATACCTGGAAATGA...


## Write ordersheet

In [34]:
# Stack the H1 and H3 tables (each keeps its own row index)
inserts_df = pd.concat([h1_inserts_df, h3_inserts_df])

# Save ordersheet: only the construct name and the sequence to order
ordersheet_path = os.path.join(ordersheetsdir, 'ordersheet.csv')
inserts_df.to_csv(ordersheet_path, columns=['name', 'sequence'], index=False)
inserts_df

Unnamed: 0,strain,genbank,name,sequence
0,A/Maryland/64/2024_H1N1,PV283376,H1N1_1_bc1,catttgtagctacagatgcagacacaataTGTATAGGTTATCATGC...
1,A/Maryland/64/2024_H1N1,PV283376,H1N1_1_bc2,catttgtagctacagatgcagacacaataTGTATAGGTTATCATGC...
2,A/Qinghai-Chengzhong/SWL1410/2024_H1N1,PQ850248,H1N1_2_bc1,catttgtagctacagatgcagacacaataTGTATAGGTTATCATGC...
3,A/Qinghai-Chengzhong/SWL1410/2024_H1N1,PQ850248,H1N1_2_bc2,catttgtagctacagatgcagacacaataTGTATAGGTTATCATGC...
4,A/Ulsan/492/2025_H1N1,PV100011,H1N1_3_bc1,catttgtagctacagatgcagacacaataTGTATAGGTTATCATGC...
...,...,...,...,...
147,A/Amapa/021563-IEC/2024_H3N2,PV507313,H3N2_74_bc2,catttgtagctacagatgcagacacaCAAAAAATACCTGGAAATGA...
148,A/New_York/39/2025_H3N2,PV509521,H3N2_75_bc1,catttgtagctacagatgcagacacaCAAAAAATACCTGGAAATGA...
149,A/New_York/39/2025_H3N2,PV509521,H3N2_75_bc2,catttgtagctacagatgcagacacaCAAAAAATACCTGGAAATGA...
150,A/New_York/GKISBBBE61555/2025_H3N2,PV271403,H3N2_76_bc1,catttgtagctacagatgcagacacaCAAAAAATACCTGGAAATGA...
