# Pull EPI ISL identifiers for all sequences used in library design and MLR analysis
We need to generate an EPISET identifier to properly acknowledge all the contributing authors who deposited the sequences we used in our analysis. I'll scrape those together here from across non-pipeline analyses notebooks. I can then generate a unique DOI for the sequences per the instructions here: [https://gisaid.org/episet/](https://gisaid.org/episet/)

Author: Caroline Kikawa

In [1]:
import os
import pandas as pd

In [36]:
library_strains = []

library_epi = pd.concat(
    [
        pd.read_csv('../../nextstrain_tree_build/data/download/library_ha_accessions.csv', names = ['epi']), # 2023-circulating strains
        pd.read_csv('../data/gisaid_query/vaccine_accession_numbers.csv', names = ['epi']) # Vaccine strains
    ]
)


Unnamed: 0,epi
0,EPI_ISL_18108949
1,EPI_ISL_18303933
2,EPI_ISL_18374389
3,EPI_ISL_17391841
4,EPI_ISL_18108925
...,...
12,EPI_ISL_944639
13,EPI_ISL_3534319
14,EPI_ISL_806547
15,EPI_ISL_2233240


In [37]:
mlr_strains = []

gisaid_fasta = '../../../../flu_H3_2023_seqneut_vs_growth/data/gisaid_flu_h3_prots.fa'

with open(gisaid_fasta) as f:
    for line in f:
        if '>' in line:
            line = line.strip('>')
            epi = (line.split('|'))[0]
            strain_name = (line.split('|'))[1]

            mlr_strains.append([epi, strain_name])

mlr_epi = pd.DataFrame(mlr_strains, columns = ['epi', 'name'])

In [39]:
all_epi = pd.concat(
    [
        library_epi,
        mlr_epi[['epi']]
    ]
)

all_epi.to_csv('epi_for_episet.csv', header=False, index=False)