# Write library sequences to various file formats
We want all the library sequences and their associated barcodes in a few output files:
* FASTA file of all *actual* HA ectodomain protein sequences (containing none of the chimeric WSN sequence!)
* Barcode-to-strain mapping CSV file (containing only columns `barcode` and `strain`)
* CSV for logging purposes (containing columns `strain`, `HA_ectodomain_protein_sequence`, `HA_ectodomain_nuc_sequence`, `associated_barcodes`, `plasmid_log_ID`)

In [1]:
# Import relevant packages
import os
import Bio
from Bio import SeqIO
from Bio.Seq import Seq
import pandas as pd
import random


# Input and output locations (relative to the notebook's working directory)
datadir = '../data'
resultsdir = '../results'
ordersheetsdir = os.path.join(resultsdir, 'ordersheets')

# Make sure every directory exists before anything is read or written
for directory in (datadir, resultsdir, ordersheetsdir):
    os.makedirs(directory, exist_ok=True)

## Generate FASTA file of actual HA ectodomain sequences

In [2]:
# Load every insert ordersheet and stack them into one table
ordersheet_files = [
    'h1_inserts.csv',
    'h3_inserts.csv',
    'vaccine_inserts.csv',
    'lisboa_inserts.csv',
]
ordersheet_tables = [
    pd.read_csv(os.path.join(ordersheetsdir, fname)) for fname in ordersheet_files
]
inserts_df = pd.concat(ordersheet_tables).reset_index(drop=True)

# The subtype is the part of the construct name before the first underscore
inserts_df = inserts_df.assign(subtype=inserts_df['name'].str.split('_').str[0])

# For each construct, trim the insert down to the HA ectodomain nucleotide
# sequence (NO CHIMERIC SEQUENCE AT ALL!!) and pull out its barcode.

# Number of nucleotides removed from the start and from the end of the insert,
# keyed by subtype.
ecto_trim_by_subtype = {
    'H1N1': (29, 180),
    'H3N2': (26, 189),
}

seq_list = []

# One row per construct name, in order of first appearance; if a name occurs
# more than once, its first row is the one used.
first_row_per_name = inserts_df.drop_duplicates(subset='name', keep='first')

for n, seq, subtype in zip(
    first_row_per_name['name'],
    first_row_per_name['sequence'],
    first_row_per_name['subtype'],
):
    # Fail loudly on an unrecognised subtype instead of silently reusing the
    # trimmed sequence left over from the previous construct.
    if subtype not in ecto_trim_by_subtype:
        raise ValueError(
            f"Unrecognized subtype {subtype!r} for insert {n!r}; "
            f"expected one of {sorted(ecto_trim_by_subtype)}"
        )

    trim_start, trim_end = ecto_trim_by_subtype[subtype]
    ha_ecto_seq = seq[trim_start:-trim_end]

    # The barcode is the 16 nt window that ends 20 nt before the end of the insert
    barcode = seq[-36:-20]
    seq_list.append([n, ha_ecto_seq, barcode])
        

# Attach the trimmed ectodomain sequence and barcode to every insert row
trimmed_df = pd.DataFrame(
    seq_list,
    columns=['name', 'trimmed_ha_ecto_sequence', 'barcode'],
)
inserts_df = inserts_df.merge(trimmed_df)


def _translate_to_protein(nt_seq):
    """Translate a nucleotide string to protein, stopping at the first stop codon."""
    return str(Seq(nt_seq).translate(to_stop=True))


# Protein sequence of the trimmed HA ectodomain
inserts_df['trimmed_ha_ecto_sequence_protein'] = (
    inserts_df['trimmed_ha_ecto_sequence'].map(_translate_to_protein)
)
inserts_df

Unnamed: 0,strain,genbank,name,sequence,subtype,trimmed_ha_ecto_sequence,barcode,trimmed_ha_ecto_sequence_protein
0,A/Maryland/64/2024_H1N1,PV283376,H1N1_1_bc1,catttgtagctacagatgcagacacaataTGTATAGGTTATCATGC...,H1N1,TGTATAGGTTATCATGCGAACAATTCAACAGACACTGTGGACACAG...,ctggaggcctggcccc,CIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDKHNGKLCKLRGVAP...
1,A/Maryland/64/2024_H1N1,PV283376,H1N1_1_bc2,catttgtagctacagatgcagacacaataTGTATAGGTTATCATGC...,H1N1,TGTATAGGTTATCATGCGAACAATTCAACAGACACTGTGGACACAG...,aggtggacgggcatgg,CIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDKHNGKLCKLRGVAP...
2,A/Qinghai-Chengzhong/SWL1410/2024_H1N1,PQ850248,H1N1_2_bc1,catttgtagctacagatgcagacacaataTGTATAGGTTATCATGC...,H1N1,TGTATAGGTTATCATGCGAACAATTCAACAGACACTGTGGACACAG...,gcatggaactaactcc,CIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDKHNGKLCKLRGVAP...
3,A/Qinghai-Chengzhong/SWL1410/2024_H1N1,PQ850248,H1N1_2_bc2,catttgtagctacagatgcagacacaataTGTATAGGTTATCATGC...,H1N1,TGTATAGGTTATCATGCGAACAATTCAACAGACACTGTGGACACAG...,aatttatccgagagcg,CIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDKHNGKLCKLRGVAP...
4,A/Ulsan/492/2025_H1N1,PV100011,H1N1_3_bc1,catttgtagctacagatgcagacacaataTGTATAGGTTATCATGC...,H1N1,TGTATAGGTTATCATGCGAACAATTCAACAGACACTGTGGACACAG...,tcgagttaatatgcgc,CIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDKHNGKLCKLRGVAP...
...,...,...,...,...,...,...,...,...
227,A/New_York/GKISBBBE61555/2025_H3N2,PV271403,H3N2_76_bc2,catttgtagctacagatgcagacacaCAAAAAATACCTGGAAATGA...,H3N2,CAAAAAATACCTGGAAATGACGATAGCACGGCAACGCTGTGCCTTG...,aacagaagtccatgta,QKIPGNDDSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNS...
228,A/Croatia/10136RV/2023_H3N2,PV262858_D202A,H3N2_77_bc1,catttgtagctacagatgcagacacacaaaaaatacctggaaatga...,H3N2,caaaaaatacctggaaatgacaatagcacggcaacgctgtgccttg...,gaagtgctgctgaagt,QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNS...
229,A/Croatia/10136RV/2023_H3N2,PV262858_D202A,H3N2_77_bc2,catttgtagctacagatgcagacacacaaaaaatacctggaaatga...,H3N2,caaaaaatacctggaaatgacaatagcacggcaacgctgtgccttg...,gtcgccgctaatccga,QKIPGNDNSTATLCLGHHAVPNGTIVKTITNDRIEVTNATELVQNS...
230,A/Lisboa/188/2023_H1N1,PV286748_V520I,H1N1_39_bc1,CATTTGTAGCTACAGATGCAGACACAATATGTATAGGTTATCATGC...,H1N1,TGTATAGGTTATCATGCGAACAATTCAACAGACACTGTGGACACAG...,ACCCCCGGAGCTTGGC,CIGYHANNSTDTVDTVLEKNVTVTHSVNLLEDKHNGKLCKLRGVAP...


In [3]:
# Now actually write FASTA

# Initialize output directory
output_dir = os.path.join(resultsdir, "inputs_for_seqneut-pipeline")
os.makedirs(output_dir, exist_ok=True)

# Write one FASTA file for all protein sequences
output_fasta_path = os.path.join(output_dir, "library_2025_HA_ectodomain_protein_sequences.fasta")

# Each strain appears once per barcoded construct, so collapse to one record per
# strain. De-duplicate on exactly the fields that end up in the FASTA record:
# differences in columns that are not written (e.g. letter case of the
# nucleotide sequence) must not produce repeated records.
fasta_record_columns = ['strain', 'genbank', 'trimmed_ha_ecto_sequence_protein']
fasta_df = (
    inserts_df
    .drop(columns=['name', 'sequence', 'barcode'])
    .drop_duplicates(subset=fasta_record_columns)
    .reset_index(drop=True)
)

with open(output_fasta_path, "w") as f:
    for strain, genbank, protein in fasta_df[fasta_record_columns].itertuples(index=False, name=None):
        # Header is ">{strain}_{genbank}", followed by the protein on one line
        f.write(f">{strain}_{genbank}\n{protein}\n")

## Generate barcode-to-strain mapping CSV

In [4]:
# One row per barcoded construct, mapping its barcode to the strain it carries
barcode_to_strain_df = inserts_df[['barcode', 'strain']]
output_path = os.path.join(output_dir, "flu-seqneut-2025-barcode-to-strain.csv")

# index=False so the file contains only the `barcode` and `strain` columns;
# the default would prepend the DataFrame index as an extra unnamed column.
barcode_to_strain_df.to_csv(output_path, index=False)

## Generate logging CSV

In [5]:
# Build the logging table: one row per distinct (strain, nucleotide sequence,
# protein sequence) combination, each marked as not yet logged.
log_source_columns = ['strain', 'trimmed_ha_ecto_sequence', 'trimmed_ha_ecto_sequence_protein']
log_column_renames = {
    'trimmed_ha_ecto_sequence': 'nt_sequence',
    'trimmed_ha_ecto_sequence_protein': 'protein_sequence',
}

log_df = inserts_df[log_source_columns]
log_df = log_df.rename(columns=log_column_renames)
log_df = log_df.assign(status='unlogged')
log_df = log_df.drop_duplicates()
log_df = log_df.reset_index(drop=True)

# Written with pandas' default settings, so the row index is included as the first column
output_path = os.path.join(output_dir, "flu-seqneut-2025-library.csv")
log_df.to_csv(output_path)