# Design base library

In [1]:
import os
import subprocess
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from collections import Counter

In [2]:
# Output locations: per-alignment summary tables and the final selected-strain files
summarydir = '../results/summary_tables'

for output_dir in (summarydir, "../results/selected_library_strains"):
    os.makedirs(output_dir, exist_ok=True)

## Load summary tables of strains from the GISAID download

In [3]:
# One candidate summary table per subtype
h3_candidates_csv = '../data/accessions_to_download/H3_candidate_summary.csv'
h1_candidates_csv = '../data/accessions_to_download/H1_candidate_summary.csv'

all_h3 = pd.read_csv(h3_candidates_csv)
all_h1 = pd.read_csv(h1_candidates_csv)

## Process aligned sequences

In [4]:
alignments = [
    'h1n1pdm', 'h3n2',
]

# First 6 bp of HA coding sequence
motif = "atgaag"

# Slice of the translated window that is kept as HA1, per alignment
ha1_slices = {
    'h1n1pdm': slice(17, 345),  # Trim for H1
    'h3n2': slice(20, 348),     # Trim for H3
}

# Iterate through each alignment and write one HA1 table per alignment
for alignment in alignments:

    ha1_slice = ha1_slices[alignment]
    sequences = []

    for record in SeqIO.parse(f'../results/alignments/2025-04-07_{alignment}_ha_aligned.fasta', "fasta"):
        sequence = str(record.seq).lower()

        # The motif marks the start of the coding window; without it there is
        # nothing sensible to slice (find() returns -1, which would silently
        # index from the end of the sequence)
        start_index = sequence.find(motif)
        if start_index == -1:
            print(f"Warning: {record.id} in {alignment} was skipped (no '{motif}' found)")
            continue

        # Keep 1500 nt (500 codons) from the motif onward and translate them.
        # Seq takes only the sequence data; alphabets no longer exist in Biopython.
        trimmed_ha_nuc = sequence[start_index:start_index + 1500]
        trimmed_ha_ectodomain_prot = Seq(trimmed_ha_nuc).translate()

        # Store a plain string so the table holds text rather than Seq objects
        trimmed_ha_ha1_prot = str(trimmed_ha_ectodomain_prot[ha1_slice])

        # Record ids are expected to be "<strain>|<id>"
        strain, gisaid_id = record.id.split('|')

        sequences.append([strain, gisaid_id, trimmed_ha_ha1_prot])

    outfile = os.path.join(summarydir, f'{alignment}_aligned_ha1.csv')
    pd.DataFrame(sequences, columns=['name', 'accession_ha', 'ha1_sequence']).to_csv(outfile, index=False)
        

In [5]:
# ID maps: translate between HA segment IDs and isolate IDs, one table per subtype
h1_map_csv = '../data/sequences/h1_id_map.csv'
h3_map_csv = '../data/sequences/h3_id_map.csv'

h1_map = pd.read_csv(h1_map_csv)
h3_map = pd.read_csv(h3_map_csv)

For each unique HA1 sequence, see which methods picked it. We want some convergence but not total convergence, since each of the methods looks at slightly different metrics.

In [6]:
# Read in alignment
h3 = pd.read_csv(os.path.join(summarydir, 'h3n2_aligned_ha1.csv'))

# Attach the HA1 sequence to every candidate row.
# The ID map supplies the isolate ID for each HA accession; the aligned-HA1
# table keeps its ID under 'accession_ha', so that column is renamed to
# 'isolate_id' to line up with the map before merging.
all_h3 = (all_h3
          .merge(h3_map.rename(columns={'ha_id': 'accession_ha'}))  # get HA ID to isolate ID
          .merge(h3.rename(columns={'accession_ha': 'isolate_id'}))
         )

# For each unique HA1 sequence, list the methods that picked it.
# A single groupby replaces one string-built query per sequence; sort=False
# keeps first-appearance order and dropna=False keeps rows whose HA1 is missing.
h3_methods_by_ha1 = (all_h3
                     .groupby('ha1_sequence', sort=False, dropna=False)['method']
                     .agg(list)
                     .rename('methods')
                     .reset_index()
                    )

# One row per unique HA1 sequence (the first candidate carrying it), annotated
# with every method that selected that sequence
all_h3_tidy = (all_h3
               .merge(h3_methods_by_ha1, on='ha1_sequence')
               .drop(columns=['method', 'mutations', 'haplotype'])
               .drop_duplicates(subset=['ha1_sequence'], keep='first')
               .reset_index(drop=True)
              )

all_h3_tidy.to_csv(os.path.join(summarydir, '2025-04-07_h3_sequences.csv'), index=False)

In [7]:
# Read in alignment
h1 = pd.read_csv(os.path.join(summarydir, 'h1n1pdm_aligned_ha1.csv'))

# Attach the HA1 sequence to every candidate row.
# The ID map supplies the isolate ID for each HA accession; the aligned-HA1
# table keeps its ID under 'accession_ha', so that column is renamed to
# 'isolate_id' to line up with the map before merging.
all_h1 = (all_h1
          .merge(h1_map.rename(columns={'ha_id': 'accession_ha'}))  # get HA ID to isolate ID
          .merge(h1.rename(columns={'accession_ha': 'isolate_id'}))
         )

# For each unique HA1 sequence, list the methods that picked it.
# A single groupby replaces one string-built query per sequence; sort=False
# keeps first-appearance order and dropna=False keeps rows whose HA1 is missing.
h1_methods_by_ha1 = (all_h1
                     .groupby('ha1_sequence', sort=False, dropna=False)['method']
                     .agg(list)
                     .rename('methods')
                     .reset_index()
                    )

# One row per unique HA1 sequence (the first candidate carrying it), annotated
# with every method that selected that sequence
all_h1_tidy = (all_h1
               .merge(h1_methods_by_ha1, on='ha1_sequence')
               .drop(columns=['method', 'mutations', 'haplotype'])
               .drop_duplicates(subset=['ha1_sequence'], keep='first')
               .reset_index(drop=True)
              )

all_h1_tidy.to_csv(os.path.join(summarydir, '2025-04-07_h1_sequences.csv'), index=False)

## Compare strains in H3 and H1 libraries to list of strains used to vaccinate ferrets
We want HA1 exact matches to these strains. If they don't exist in the library we should add them. 

In [8]:
alignments = [
    'ferret_strains_h3',
    'ferret_strains_h1'
]

# First 6 bp of HA coding sequence
motif = "atgaag"

# Slice of the translated window that is kept as HA1, per alignment
ha1_slices = {
    'ferret_strains_h1': slice(17, 345),  # Trim for H1
    'ferret_strains_h3': slice(20, 348),  # Trim for H3
}

# Iterate through each alignment and write one HA1 table per alignment
for alignment in alignments:

    ha1_slice = ha1_slices[alignment]
    sequences = []

    for record in SeqIO.parse(f'../results/alignments/2025-04-08_{alignment}.fasta', "fasta"):
        sequence = str(record.seq).lower()

        # The motif marks the start of the coding window; without it there is
        # nothing sensible to slice (find() returns -1, which would silently
        # index from the end of the sequence)
        start_index = sequence.find(motif)
        if start_index == -1:
            print(f"Warning: {record.id} in {alignment} was skipped (no '{motif}' found)")
            continue

        # Keep 1500 nt (500 codons) from the motif onward and translate them.
        # Seq takes only the sequence data; alphabets no longer exist in Biopython.
        trimmed_ha_nuc = sequence[start_index:start_index + 1500]
        trimmed_ha_ectodomain_prot = Seq(trimmed_ha_nuc).translate()

        # Store a plain string so the table holds text rather than Seq objects
        trimmed_ha_ha1_prot = str(trimmed_ha_ectodomain_prot[ha1_slice])

        # Record ids are expected to be "<strain>|<id>"
        strain, gisaid_id = record.id.split('|')

        sequences.append([strain, gisaid_id, trimmed_ha_ha1_prot])

    outfile = os.path.join(summarydir, f'{alignment}_aligned_ha1.csv')
    pd.DataFrame(sequences, columns=['name', 'accession_ha', 'ha1_sequence']).to_csv(outfile, index=False)
        

In [9]:
# HA1 tables for the ferret strains, written by the previous cell
ferret_h3_csv = os.path.join(summarydir, 'ferret_strains_h3_aligned_ha1.csv')
ferret_h1_csv = os.path.join(summarydir, 'ferret_strains_h1_aligned_ha1.csv')
ferret_h3 = pd.read_csv(ferret_h3_csv)
ferret_h1 = pd.read_csv(ferret_h1_csv)

# Every HA1 sequence present in the H3 and H1 library alignments
library_sequences = pd.concat([h3, h1])
library_ha1s = library_sequences['ha1_sequence'].to_list()

ferret_h1

Unnamed: 0,name,accession_ha,ha1_sequence
0,A/Norway/07606/2024,EPI_ISL_19723534,DTLCIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDKHNGKLCKLRG...
1,A/Lisboa/188/2023,EPI_ISL_18950107,DTLCIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDKHNGKLCKLRG...
2,A/Tajikistan/02-1057/2024,EPI_ISL_19440507,DTLCIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDKHNGKLCKLKG...


In [10]:
# Distinct ferret-strain HA1 sequences that have no exact match in the library
ferret_ha1s = pd.concat([ferret_h3, ferret_h1]).ha1_sequence.unique()
missing_ha1s = [ha1 for ha1 in ferret_ha1s if ha1 not in library_ha1s]

In [11]:
# Combined library and ferret tables across both subtypes
library_df = pd.concat([h3, h1])
ferret_df = pd.concat([ferret_h3, ferret_h1])

# Show the ferret strains whose HA1 already has an exact match in the library
is_missing_from_library = ferret_df['ha1_sequence'].isin(missing_ha1s)
ferret_df[~is_missing_from_library]

Unnamed: 0,name,accession_ha,ha1_sequence
3,A/Netherlands/10685/2024,EPI_ISL_19654875,GNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGK...
0,A/Norway/07606/2024,EPI_ISL_19723534,DTLCIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDKHNGKLCKLRG...
2,A/Tajikistan/02-1057/2024,EPI_ISL_19440507,DTLCIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDKHNGKLCKLKG...


In [12]:
# Ferret-strain HA1 sequences with no exact match in the library alignments
missing_ha1s

['GNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGKICNSPHQILDGGNCTLIDALLGDPQCDGFQNKEWDLFVERSRANSSCYPYDVPDYASLRSLVASSGTLEFKNESFNWTGVKQNGTSSACKRGSSSSFFSRLNWLTSLNNIYPAQNVTMPNKERFDKLYIWGVHHPDTDRNQFSLFAQSSGRITVSTKRSQQAVIPNIGSRPRVRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGECKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGMRNVPEKQTRGIF',
 'GNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGKICNSPHQILDGGNCTLIDALLGDPQCDGFQNKEWDLFVERSRANSSCYPYDVPDYASLRSLVASSGTLEFKDENFNWTGVKQNGTSSACKRGSSSSFFSRLNWLTSLNNIYPAQNVTMPNKEQFDKLYIWGVHHPDTDKNQFSLFAQSSGRITVSTKRSQQAVIPNIGSRPRVRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGECKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGMRNVPEKQTRGIF',
 'GNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGKICNSPHQILDGGNCTLIDALLGDPQCDGFQNKEWDLFVERSRANSSCYPYDVPDYASLRSLVASSGTLEFKNESFNWTGVKQNGTSSACKRRSSSSFFSRLNWLTSLNNIYPAQNVTMPNKERFDKLYIWGVHHPDTDKNQFSLFAQSSGRITVSTKRSQQAVIPNIGSRPRVRDIPSRISIYWTIVKPGDILLINSTGNLIAPRGYFKIRSGKSSIMRSDAPIGECKSECITPNGSIPNDKPFQNVNRITYGACPRYVKQSTLKLATGMRNVPEKQTRGIF',
 

## Haplotype caller
I want to call mutations from consensus so I have an easier way to look at all my strains. 

In [13]:
def extract_coding_window(seq, start_pattern="ATGAAG", window_size=1500):
    """Return the upper-cased window of `seq` that begins at the first `start_pattern`.

    The search is case-insensitive. At most `window_size` characters are
    returned (fewer if the sequence ends first). Returns None when the
    pattern does not occur.
    """
    upper_seq = seq.upper()
    window_start = upper_seq.find(start_pattern.upper())
    if window_start < 0:
        # Pattern absent: there is no window to extract
        return None
    window_end = window_start + window_size
    return upper_seq[window_start:window_end]

def translate_sequences(dna_seqs):
    """Translate each DNA string in frame 1 and return the proteins as plain strings.

    Translation does not stop at stop codons (`to_stop=False`).
    """
    proteins = []
    for dna in dna_seqs:
        protein = Seq(dna).translate(to_stop=False)
        proteins.append(str(protein))
    return proteins

def get_consensus_sequence(protein_seqs):
    """Compute the consensus of a list of aligned protein sequences.

    Each position takes the most frequent residue among the sequences. The
    consensus is as long as the shortest input sequence; positions beyond
    that are ignored.

    Args:
        protein_seqs: iterable of protein strings, assumed to be aligned so
            that equal indices are comparable.

    Returns:
        The consensus string, or "" when no sequences are given.
    """
    # zip(*seqs) yields one tuple of residues per position and stops at the
    # shortest sequence; with no sequences it yields nothing, so the result
    # is an empty string instead of an error.
    return "".join(
        Counter(column).most_common(1)[0][0]
        for column in zip(*protein_seqs)
    )

def get_mutations(seq, consensus, offset=16):
    """List the amino-acid differences between `seq` and `consensus`.

    Index `offset` is reported as position 1, and indices before `offset`
    are not compared. Each difference is formatted as
    "<consensus residue><position><sequence residue>". Positions where
    `seq` has a stop ('*') are not reported.
    """
    shared_length = min(len(seq), len(consensus))
    return [
        f"{consensus[idx]}{idx - offset + 1}{seq[idx]}"
        for idx in range(offset, shared_length)
        if seq[idx] != consensus[idx] and seq[idx] != '*'
    ]


def annotate_aa_mutations_from_dna_fastas(fasta_paths, offset=None):
    """Annotate FASTA records with amino-acid mutations relative to a pooled consensus.

    Every record from every file is trimmed to its coding window, translated,
    and compared against one consensus computed across all usable records
    from all files together.

    Args:
        fasta_paths: a single FASTA path or a list of paths. Record ids are
            expected to look like "<strain>|<accession>".
        offset: optional numbering offset forwarded to `get_mutations`. When
            None (the default) `get_mutations` is called without it, so its
            own default applies.

    Returns:
        A list of `[strain_name, source_file_name, accession, mutations]`
        entries, one per usable record, or [] when no record has a window.

    Raises:
        IndexError: if a record id has no '|'-separated accession field.
    """
    if isinstance(fasta_paths, str):
        fasta_paths = [fasta_paths]

    # (source file name, record id, trimmed DNA) for every usable record.
    # Kept as separate fields instead of being packed into one '|'-joined
    # string, so a '|' in a file name cannot shift the id fields.
    windows = []
    for path in fasta_paths:
        source = os.path.basename(path)
        for rec in SeqIO.parse(path, "fasta"):
            trimmed = extract_coding_window(str(rec.seq))
            if trimmed and len(trimmed) >= 3:
                windows.append((source, rec.id, trimmed))
            else:
                print(f"Warning: {rec.id} in {path} was skipped (no valid window)")

    if not windows:
        print("No valid sequences found.")
        return []

    protein_seqs = translate_sequences([dna for _, _, dna in windows])
    consensus = get_consensus_sequence(protein_seqs)

    # Only pass an offset when the caller asked for one
    mutation_kwargs = {} if offset is None else {"offset": offset}

    annotated = []
    for (source, record_id, _), prot_seq in zip(windows, protein_seqs):
        muts = get_mutations(prot_seq, consensus, **mutation_kwargs)
        id_fields = record_id.split('|')
        strain_name, accession = id_fields[0], id_fields[1]
        annotated.append([strain_name, source, accession, muts])

    return annotated


## Remove some H3 sequences on recommendation by ST

In [14]:
# H3 strain names to exclude from the library
ST_sequences_to_remove = [
    'A/Netherlands/01500/2025',
    'A/Wisconsin/NIRC-IS-1125/2025',
    'A/Minnesota/141/2024',
    'A/Indonesia/BIOKES-IMDN985/2024',
    # Personal removes
    'A/Texas/15527/2024',
    'A/Tennessee/95/2024',
    'A/Hungary/335/2024',
    'A/Texas/ISC-1316/2025',
]

# Isolate IDs recommended for addition
ST_sequences_to_add = [
    'EPI_ISL_19754645',
    'EPI_ISL_19755351',
    'EPI_ISL_19789808',
    'EPI_ISL_19266713',
    'EPI_ISL_19769038',
    'EPI_ISL_19720065',
    'EPI_ISL_19708803',
    'EPI_ISL_19754053',
    'EPI_ISL_19775350',
    'EPI_ISL_19790412',
    'EPI_ISL_19777744',
    'EPI_ISL_19731877',
]

# Show the library rows that are about to be removed
flagged_for_removal = all_h3_tidy['name'].isin(ST_sequences_to_remove)
all_h3_tidy[flagged_for_removal]

Unnamed: 0,name,div,num_date,clade_membership,subclade,accession_ha,country,lbi,date,isolate_id,ha1_sequence,methods
7,A/Indonesia/BIOKES-IMDN985/2024,0.0307,2024.78,2a.3a.1,J.2.2,EPI4071556,Indonesia,0.464,,EPI_ISL_19769191,GNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGK...,"[CK, JH]"
12,A/Texas/ISC-1316/2025,0.0347,2025.084,2a.3a.1,J.2,EPI4049461,Usa,0.442,,EPI_ISL_19759737,GNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGK...,[CK]
14,A/Minnesota/141/2024,0.0352,2024.862,2a.3a.1,J.2.2,EPI3758592,Usa,0.437,,EPI_ISL_19646927,GNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSISK...,[CK]
25,A/Texas/15527/2024,0.033,2024.933,2a.3a.1,J.2,EPI4096405,Usa,,,EPI_ISL_19776814,GNDNSTATLCLGHHAVPNGTVVKTITNDRIEVTNATELVQNSSIGK...,[AL]
33,A/Hungary/335/2024,0.0296,2024.96,2a.3a.1,J.2,EPI3861657,Hungary,,,EPI_ISL_19695255,GNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGK...,[AL]
48,A/Tennessee/95/2024,,,,J.2,EPI3731297,Usa,,2024-11-02,EPI_ISL_19628046,GNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGK...,[JH]
50,A/Wisconsin/NIRC-IS-1125/2025,,,,J.2,EPI4150712,Usa,,2025-03-01,EPI_ISL_19792606,GNDNSTATLCLGHHAVPNGTIVKTITNARIEVTNATELVQNSSMGK...,[JH]
52,A/Netherlands/01500/2025,,,,J.2,EPI4120795,Netherlands,,2025-03-10,EPI_ISL_19786780,GNGNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNSSIGK...,[JH]


Make trimmed alignments of H3 sequences

In [15]:
# Get ferret haplotypes
# Compare to library and Sam's recommended strains
# Prioritize those strains with overlap

output = annotate_aa_mutations_from_dna_fastas([
    '../results/alignments/2025-04-07_h3n2_ha_aligned.fasta',
    '../results/alignments/2025-04-08_ferret_strains_h3.fasta',
    '../results/alignments/ST_h3n2_metadata.fasta'
])

# One row per annotated record, with library metadata attached where the
# strain name is present in the tidy library table
all_h3_tidy_mutations = pd.DataFrame(output, columns=['name', 'fasta', 'accession', 'mutations']).merge(
    all_h3_tidy,
    on='name',
    how='left'
)

# Remove Sam's recommended eliminations
all_h3_tidy_mutations = all_h3_tidy_mutations[~all_h3_tidy_mutations['name'].isin(ST_sequences_to_remove)]

# Choose single strain for each mutation set with multiple strains
strains_to_keep = ['A/Washington/284/2024', 'A/Washington/15245/2025', 'A/Canberra/613/2024', 'A/Texas/15550/2024',
                   'A/Texas/ISC-1148/2025', 'A/Colombia/1851/2024', 'A/Netherlands/01502/2025', 'A/Tasmania/836/2024',
                   'A/France/PAC-RELAB-HCL024172122101/2024',
                   'A/Switzerland/860423897313/2023', 'A/Queensland/IN000692/2024', 'A/Massachusetts/93/2024',
                   'A/Texas/ISC-1274/2025',
                   'A/Indiana/46/2024'
                  ]

# Each mutation list becomes a tuple so it can be counted and compared as one
# haplotype key (lists are not hashable)
strain_names = all_h3_tidy_mutations['name'].to_list()
haplotype_keys = [tuple(muts) for muts in all_h3_tidy_mutations['mutations']]
strains_per_haplotype = Counter(haplotype_keys)

selected_strains = []
represented_haplotypes = set()
for name, haplotype in zip(strain_names, haplotype_keys):
    # A mutation set seen once keeps its only strain; a shared mutation set
    # keeps only the hand-picked strains listed in strains_to_keep
    if strains_per_haplotype[haplotype] == 1 or name in strains_to_keep:
        selected_strains.append(name)
        represented_haplotypes.add(haplotype)

# A shared mutation set with no hand-picked strain would otherwise vanish
# from the library without any trace, so report it
for haplotype in strains_per_haplotype:
    if haplotype not in represented_haplotypes:
        dropped = [n for n, h in zip(strain_names, haplotype_keys) if h == haplotype]
        print(f"Warning: no strain in strains_to_keep for mutation set {'_'.join(haplotype) or '(consensus)'}; "
              f"not selected: {dropped}")

# Save dataframes of the strain names, accessions, and haplotypes
all_h3_tidy_mutations = all_h3_tidy_mutations[all_h3_tidy_mutations['name'].isin(selected_strains)].reset_index(drop=True)

all_h3_tidy_mutations[['name', 'mutations']].to_csv('../results/selected_library_strains/h3_strains.csv', index=False)



# Save the list of FASTA files for library
input_fastas = [
    '../results/alignments/2025-04-07_h3n2_ha_aligned.fasta',
    '../results/alignments/2025-04-08_ferret_strains_h3.fasta',
    '../results/alignments/ST_h3n2_metadata.fasta'
]

# Initialize list of targets
target_accessions = all_h3_tidy_mutations.accession.to_list()
target_accession_set = set(target_accessions)  # constant-time membership checks
found_accessions = set()

# Store all matching sequences
matched_nucleotide_seqs = []
matched_protein_seqs = []

# Define the custom start codon to search for
start_codon = "atgaag"

# Loop through each specified FASTA file
for fasta_file in input_fastas:
    with open(fasta_file, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            # Find the first instance of the motif. The search is done on a
            # lower-cased copy so upper-case FASTA files are handled the same
            # way as in the cells that build the HA1 tables.
            start_position = str(record.seq).lower().find(start_codon)

            if start_position == -1:
                print(f"Skipping {record.id} - no 'atgaag' found")
                continue  # Skip if the motif is not found

            # Only selected library strains are extracted; the accession is
            # the second '|'-separated field of the record id
            accession = record.id.split('|')[1]
            if accession not in target_accession_set:
                continue

            # Keep 1602 nt (534 codons) from the motif onward and translate them
            end_position = start_position + 1602
            nucleotide_seq = record.seq[start_position:end_position]
            translated_seq = nucleotide_seq.translate()

            # Make separate SeqRecords for nucleotide and protein
            matched_nucleotide_seqs.append(
                SeqRecord(nucleotide_seq, id=record.id, description=record.description)
            )
            matched_protein_seqs.append(
                SeqRecord(translated_seq, id=record.id, description=record.description)
            )
            found_accessions.add(accession)

# Every selected strain should have produced a sequence; say so if not
missing_accessions = target_accession_set - found_accessions
if missing_accessions:
    print(f"Warning: {len(missing_accessions)} selected accession(s) not found in the input FASTA files: "
          f"{sorted(missing_accessions)}")

# NOTE(review): a record present in more than one input FASTA is written once
# per occurrence - confirm duplicate entries are acceptable downstream.

# Write all matched protein sequences to a new FASTA file, remove GISAID IDs
h3_output_file = "../results/selected_library_strains/h3_strains_prot.fasta"
with open(h3_output_file, "w") as output_handle:
    for seq in matched_protein_seqs:
        output_handle.write(f">{seq.id.split('|')[0]}_H3N2\n{str(seq.seq)}\n")

print(f"Matched sequences saved to {h3_output_file}")


Matched sequences saved to ../results/selected_library_strains/h3_strains_prot.fasta


## Repeat the above analysis for H1s
Compare our list of H1s to the list of strains used to vaccinate ferrets. ST made no recommendations for H1s so we can skip that part. 

### Add recent high frequency haplotype H1s from circulating analysis

My analysis of circulating sequences showed that we were missing a lot of H1 HA1 haplotypes present in the past few months. To rectify this, I added 4 additional strains. Those strains were downloaded and placed in `../data/sequences/2025-04-25_additional_H1s.fasta` and aligned in `../results/alignments/2025-04-25_additional_H1s.fasta`

In [16]:
# Use custom offset for the H1s
# NOTE: this rebinds the notebook-level get_mutations, so any later call
# (including a re-run of the H3 cell) uses offset 17 by default.
def get_mutations(seq, consensus, offset=17):
    """List the amino-acid differences between `seq` and `consensus`.

    Index `offset` is reported as position 1, and indices before `offset`
    are not compared. Each difference is formatted as
    "<consensus residue><position><sequence residue>". Positions where
    `seq` has a stop ('*') are not reported.
    """
    shared_length = min(len(seq), len(consensus))
    return [
        f"{consensus[idx]}{idx - offset + 1}{seq[idx]}"
        for idx in range(offset, shared_length)
        if seq[idx] != consensus[idx] and seq[idx] != '*'
    ]

# NOTE(review): the third file here is '2025-04-25_missing_h1s_nuc.fasta', while
# the text above and the FASTA-extraction cell below refer to
# '2025-04-25_additional_H1s.fasta' - confirm both hold the same records.
h1_haplotype_fastas = [
    '../results/alignments/2025-04-07_h1n1pdm_ha_aligned.fasta',
    '../results/alignments/2025-04-08_ferret_strains_h1.fasta',
    '../results/alignments/2025-04-25_missing_h1s_nuc.fasta'
]
output = annotate_aa_mutations_from_dna_fastas(h1_haplotype_fastas)

# One row per annotated record, with library metadata attached where the
# strain name is present in the tidy library table
h1_annotations = pd.DataFrame(output, columns=['name', 'fasta', 'accession', 'mutations'])
all_h1_tidy_mutations = h1_annotations.merge(all_h1_tidy, on='name', how='left')

# Keep every strain in the tidy H1 library plus the four additional strains
h1_to_keep = all_h1_tidy.name.tolist() + [
    'A/Saint-Petersburg/RII-04/2025',
    'A/Utah/39/2025',
    'A/Hawaii/ISC-1140/2025',
    'A/NovaScotia/ET1801CP00018S/2025',
]

is_kept = all_h1_tidy_mutations['name'].isin(h1_to_keep)
all_h1_tidy_mutations = all_h1_tidy_mutations[is_kept].reset_index(drop=True)

h1_strain_table = all_h1_tidy_mutations[['name', 'mutations']]
h1_strain_table.to_csv('../results/selected_library_strains/h1_strains.csv', index=False)


In [17]:
# Save the list of FASTA files for library
input_fastas = [
    '../results/alignments/2025-04-07_h1n1pdm_ha_aligned.fasta',
    '../results/alignments/2025-04-08_ferret_strains_h1.fasta',
    '../results/alignments/2025-04-25_additional_H1s.fasta'
]

# Initialize list of targets
target_accessions = all_h1_tidy_mutations.accession.to_list()
target_accession_set = set(target_accessions)  # constant-time membership checks
found_accessions = set()

# Store all matching sequences
matched_nucleotide_seqs = []
matched_protein_seqs = []

# Define the custom start codon to search for
start_codon = "atgaag"

# Loop through each specified FASTA file
for fasta_file in input_fastas:
    with open(fasta_file, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            # Find the first instance of the motif. The search is done on a
            # lower-cased copy so upper-case FASTA files are handled the same
            # way as in the cells that build the HA1 tables.
            start_position = str(record.seq).lower().find(start_codon)

            if start_position == -1:
                print(f"Skipping {record.id} - no 'atgaag' found")
                continue  # Skip if the motif is not found

            # Only selected library strains are extracted; the accession is
            # the second '|'-separated field of the record id
            accession = record.id.split('|')[1]
            if accession not in target_accession_set:
                continue

            # Keep 1602 nt (534 codons) from the motif onward and translate them
            end_position = start_position + 1602
            nucleotide_seq = record.seq[start_position:end_position]
            translated_seq = nucleotide_seq.translate()

            # Make separate SeqRecords for nucleotide and protein
            matched_nucleotide_seqs.append(
                SeqRecord(nucleotide_seq, id=record.id, description=record.description)
            )
            matched_protein_seqs.append(
                SeqRecord(translated_seq, id=record.id, description=record.description)
            )
            found_accessions.add(accession)

# Every selected strain should have produced a sequence; say so if not
missing_accessions = target_accession_set - found_accessions
if missing_accessions:
    print(f"Warning: {len(missing_accessions)} selected accession(s) not found in the input FASTA files: "
          f"{sorted(missing_accessions)}")

# NOTE(review): a record present in more than one input FASTA is written once
# per occurrence - confirm duplicate entries are acceptable downstream.

# Write all matched protein sequences to a new FASTA file, writing only strain names
h1_output_file = "../results/selected_library_strains/h1_strains_prot.fasta"
with open(h1_output_file, "w") as output_handle:
    for seq in matched_protein_seqs:
        output_handle.write(f">{seq.id.split('|')[0]}_H1N1\n{str(seq.seq)}\n")

print(f"Matched sequences saved to {h1_output_file}")

Matched sequences saved to ../results/selected_library_strains/h1_strains_prot.fasta


## Concatenate the H1 and H3 protein sequences:

In [18]:
# Concatenate the H3 and H1 protein FASTA files, H3 first.
# Done in Python rather than by calling `cat`, so the cell does not depend on
# an external program; binary mode copies the bytes unchanged.
with open("../results/selected_library_strains/h3_and_h1_prots.fasta", "wb") as combined:
    for part_path in (h3_output_file, h1_output_file):
        with open(part_path, "rb") as part:
            combined.write(part.read())