In [1]:
import numpy as np
import csv

# Use biopython library to process fasta files
from Bio import SeqIO

In [3]:
# Open the FASTA file for parsing.
# NOTE: fasta_entries is iterated once by the loop in the next cell;
# presumably it is a one-shot iterator, so re-run this cell before re-running that one (confirm).

fasta_filename = "data/spikeprot0309.fasta"
fasta_entries = SeqIO.parse(handle=fasta_filename, format="fasta")

In [4]:
# Turn every FASTA record into a plain dictionary.
# fasta_samples maps EPI_ISL ID -> dictionary holding the header metadata and the protein sequence.

fasta_samples = {}
for entry in fasta_entries:
    # The description line is a "|"-separated list of metadata fields
    metadata_list = entry.description.split("|")

    # Pad short headers with empty strings so that indices 0..10 always exist
    missing_fields = 11 - len(metadata_list)
    if missing_fields > 0:
        metadata_list.extend([""] * missing_fields)

    # Normalise the strain name: drop a leading "hCoV-19/" and strip all spaces
    strain = metadata_list[1]
    if strain.startswith("hCoV-19/"):
        strain = strain[len("hCoV-19/"):]
    strain = strain.replace(" ", "")

    epi_isl = metadata_list[3]

    # Collect the fields of interest (indices 4 and 6 are not used)
    sample = dict(
        [
            ("Strain", strain),
            ("Submission Date", metadata_list[2]),
            ("EPI_ISL", epi_isl),
            ("Division of Exposure", metadata_list[5]),
            ("Originating Lab", metadata_list[7]),
            ("Submitting Lab", metadata_list[8]),
            ("Author", metadata_list[9]),
            ("Country of Exposure", metadata_list[10]),
            ("Sequence", str(entry.seq)),
        ]
    )

    # Every record must carry a well-formed ID before it is stored
    assert epi_isl.startswith("EPI_ISL")
    fasta_samples[epi_isl] = sample

print("Number of FASTA entries: ", len(fasta_samples))
print("")
print("Example FASTA entry: ")
if fasta_samples:
    # Show the first stored sample only
    print(next(iter(fasta_samples.values())))

Number of FASTA entries:  702408

Example FASTA entry: 
{'Strain': 'Wuhan/WIV04/2019', 'Submission Date': '2019-12-30', 'EPI_ISL': 'EPI_ISL_402124', 'Division of Exposure': 'hCoV-19^^Hubei', 'Originating Lab': 'Wuhan Jinyintan Hospital', 'Submitting Lab': 'Wuhan Institute of Virology', 'Author': 'Shi', 'Country of Exposure': 'China', 'Sequence': 'MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLI

In [5]:
# Process nextstrain global metadata into a dictionary
# The key is the EPI_ISL ID, the value maps every TSV column name to that row's value

metadata_filename = "data/nextstrain_ncov_global_metadata.tsv"
metadata_samples = {}
# newline="" hands line-ending handling to the csv module, as its documentation requires;
# the explicit encoding matches how the other text files in this notebook are opened
with open(metadata_filename, "r", encoding="utf8", newline="") as f:
    tsv_reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
    header = next(tsv_reader)

    # Find the ID column by name rather than by a hard-coded position,
    # so a reordered or extended header does not silently pick the wrong field
    epi_isl_column = header.index("gisaid_epi_isl")

    for row in tsv_reader:
        # csv.reader yields an empty list for blank lines; there is nothing to store for those
        if not row:
            continue

        # Pair each value with its column name; zip stops at the shorter of the two,
        # so a row with extra trailing fields no longer raises IndexError
        sample = dict(zip(header, row))

        # Add sample to metadata_samples
        epi_isl = row[epi_isl_column]
        assert epi_isl[:7] == "EPI_ISL", f"unexpected gisaid_epi_isl value: {epi_isl!r}"
        metadata_samples[epi_isl] = sample

print("Number of metadata entries: ", len(metadata_samples))
print("")
print("Example metadata entry: ")
if metadata_samples:
    # Show the first stored sample only
    print(next(iter(metadata_samples.values())))

Number of metadata entries:  3860

Example metadata entry: 
{'Strain': 'Wuhan/WH01/2019', 'GISAID Clade': 'L', 'S1 mutations': '', 'Age': '44', 'Clade': '19A', 'Country': 'China', 'Country of Exposure': 'China', 'Admin Division': 'Hubei', 'Division of Exposure': 'Hubei', 'genbank_accession': 'LR757998.1', 'gisaid_epi_isl': 'EPI_ISL_406798', 'Host': 'Human', 'Location': 'Wuhan', 'Originating Lab': "General Hospital of Central Theater Command of People's Liberation Army of China", 'PANGO Lineage': 'B', 'Submission Date': 'Older', 'Region': 'Asia', 'Sex': 'Male', 'Emerging Clade': '19A', 'Submitting Lab': "BGI & Institute of Microbiology, Chinese Academy of Sciences & Shandong First Medical University & Shandong Academy of Medical Sciences & General Hospital of Central Theater Command of People's Liberation Army of China", 'url': '', 'Collection Data': '2019-12-26', 'Author': 'Weijun Chen et al (https://dx.doi.org/10.1016/S0140-6736(20)30251-8)', 'Region of Exposure': ''}


In [6]:
# Keep the EPI_ISL IDs that are present in both the metadata and the FASTA samples
# (order follows metadata_samples)

metadata_fasta_matches = [
    sample_id for sample_id in metadata_samples if sample_id in fasta_samples
]

print("Number of matching EPI_ISL ID's between NextStrain metadata and FASTA: ", len(metadata_fasta_matches))

Number of matching EPI_ISL ID's between NextStrain metadata and FASTA:  2488


In [7]:
# Load the nextstrain global timetree file as one string

timetree_filename = "data/nextstrain_ncov_global_timetree.nexus"
with open(timetree_filename, "r") as f:
    raw_timetree = f.read()

# Drop the first 33 and the last 6 characters
# (presumably the NEXUS wrapper around the tree itself -- confirm against the file)
timetree_str = raw_timetree[33:-6]

In [8]:
# Separate nextstrain global timetree data into a tree structure, and an array of individual entries
#
# timetree_entries collects the raw text of every entry ("name[description]:length").
# tree_structure is the input with each entry replaced by its index in timetree_entries;
# "(", ")" and "," outside entries are copied through unchanged.

tree_structure = []
timetree_entries = []
entry_start = 0

in_strain_name = False
in_description = False
in_tree_length = False

for (i, char) in enumerate(timetree_str):
    # Advance parser state, keeping track of when each tree entry starts and ends
    # Parser states advances from none -> in_strain_name -> in_description -> in_tree_length
    # Separately keep track of the tree structure, using each entry's ID in timetree_entries
    if in_strain_name:
        # The name runs until the opening bracket of the description
        if char == '[':
            in_strain_name = False
            in_description = True

    elif in_description:
        # The description runs until the first closing bracket
        if char == ']':
            in_description = False
            in_tree_length = True

    elif in_tree_length:
        # The length is made of digits, ':' and '.'; anything else ends the entry.
        # NOTE(review): a length in exponent notation (e.g. 1e-05) would be cut at the 'e' --
        # confirm the input never uses it.
        if not char.isdigit() and char != ':' and char != '.':
            in_tree_length = False

            # Write the ID before appending, so it equals the entry's 0-based position
            # in timetree_entries (the same numbering enumerate() gives as timetree_id).
            # Previously the length after appending was used, which is one too high.
            tree_structure.append(str(len(timetree_entries)))
            timetree_entries.append(timetree_str[entry_start:i])

            # The terminating character belongs to the tree structure, not to the entry
            tree_structure.append(char)

    else:
        if char != ',' and char != '(' and char != ')':
            # Any other character starts a new entry
            in_strain_name = True
            entry_start = i
        else:
            tree_structure.append(char)

# NOTE(review): an entry still open when the input ends is not added to timetree_entries.
tree_structure = "".join(tree_structure)

In [9]:
# Parse timetree entries into a dictionary
# The key is the strain name; the value holds the path length, the entry's index in
# timetree_entries, and every "name=value" attribute found in the entry's description

timetree_samples = {}
for (timetree_id, entry) in enumerate(timetree_entries):
    # Each entry has the form "<strain>[&<attributes>]:<path length>"
    (strain, remainder) = entry.split("[&", 1)
    (attributes, path_length) = remainder.rsplit("]:", 1)

    # Create dictionary from entry
    sample = {
        "strain": strain,
        "path_length": path_length,
        "timetree_id": timetree_id,
    }

    last_name = None
    for token in attributes.split(","):
        # Split on the first "=" only, so a value that itself contains "=" stays intact
        (name, separator, value) = token.partition("=")
        if separator:
            sample[name] = value
            last_name = name
        elif last_name is not None:
            # A token without "=" is the tail of a value that contained a comma
            # (e.g. num_date_CI={a,b}); glue it back onto the attribute it belongs to
            sample[last_name] += "," + token
        elif token:
            # A leading token with no "=" cannot be attached to any attribute
            raise ValueError(f"cannot parse attribute {token!r} in timetree entry {timetree_id}")
        # An empty token before any attribute (empty description) is ignored

    # Add sample to timetree_samples
    timetree_samples[strain] = sample

print("Number of global timetree entries: ", len(timetree_entries))
print("")
print("Example global timetree entry: ")
if timetree_samples:
    # Show the first stored sample only
    print(next(iter(timetree_samples.values())))

Number of global timetree entries:  7354

Example global timetree entry: 
{'strain': 'Wuhan/WH01/2019', 'path_length': '0.0020887245693756995', 'timetree_id': 0, 'clade_membership': '19A', 'num_date': '2019.9849315068493', 'num_date_CI': '{2019.9849315068493,2019.9849315068493}', 'subclade_membership': '19A', 'pango_lineage': 'B', 'GISAID_clade': 'L', 'location': 'Wuhan', 'division': 'Hubei', 'country': 'China', 'region': 'Asia', 'host': 'Human', 'age': '44', 'sex': 'Male', 'recency': 'Older', 'country_exposure': 'China', 'division_exposure': 'Hubei', 'div': '2'}


In [10]:
# Of the IDs matched above, keep those whose metadata strain name is also a key of the global timetree

matches = [
    sample_id
    for sample_id in metadata_fasta_matches
    if metadata_samples[sample_id]["Strain"] in timetree_samples
]

print("Number of matches between EPI_ISL ID's and NextStrain global timetree: ", len(matches))

Number of matches between EPI_ISL ID's and NextStrain global timetree:  683


In [11]:
# Build dictionary of entries that match between all files
# The key is the EPI_ISL ID; the value combines the metadata strain name, the timetree
# divergence / clade / subclade and the FASTA protein sequence

matched_samples = {}
for epi_isl in matches:
    fasta = fasta_samples[epi_isl]
    metadata = metadata_samples[epi_isl]
    # The timetree is keyed by strain name, not by EPI_ISL ID
    timetree = timetree_samples[metadata["Strain"]]

    matched_samples[epi_isl] = {
        "strain": metadata["Strain"],
        # "div" is stored as text by the timetree parser; convert it to an int here
        "divergence": int(timetree["div"]),
        "clade": timetree["clade_membership"],
        "subclade": timetree["subclade_membership"],
        "sequence": fasta["Sequence"],
    }

print("Example matched sample: ")
if matched_samples:
    # Show the first stored sample only
    print(next(iter(matched_samples.values())))

Example matched sample: 
{'strain': 'Wuhan/WH01/2019', 'divergence': 2, 'clade': '19A', 'subclade': '19A', 'sequence': 'MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAG

In [43]:
# Map from amino acids and clades to ints
# amino_list / clade_list give the int -> symbol direction (position = code),
# amino_codes / clade_codes give the symbol -> int direction

with open("data/amino_list.txt", encoding="utf8") as f:
    # The file holds a single comma-separated list of symbols
    amino_list = f.read().strip().split(',')
amino_codes = {amino: code for (code, amino) in enumerate(amino_list)}

# Sort the distinct clades: iterating a bare set of strings can give a different order
# on every kernel start, which would change the clade -> int assignment between runs
clade_list = sorted({sample["clade"] for sample in matched_samples.values()})
clade_codes = {clade: code for (code, clade) in enumerate(clade_list)}

In [44]:
# Convert training and validation data from string to numerical format
    
def amino_to_num(data_list, amino_codes):
    """Encode each sequence as an array of integer codes.

    Args:
        data_list: iterable of sequences; each sequence is iterated item by item
            (character by character for strings).
        amino_codes: mapping from each item to its integer code.

    Returns:
        A NumPy array with dtype=object built from one encoded array per sequence.
        With sequences of differing lengths this is a 1-D array whose elements are
        the per-sequence arrays.

    Raises:
        KeyError: if a sequence contains an item that is not a key of amino_codes.
    """
    encoded = [
        np.array([amino_codes[residue] for residue in seq])
        for seq in data_list
    ]

    # The np.object alias was removed in NumPy 1.24; the builtin `object` is the same dtype
    # and works on every NumPy version
    return np.array(encoded, dtype=object)

def clade_to_num(data, clade_codes):
    """Translate clade names into their integer codes.

    Args:
        data: iterable of clade names.
        clade_codes: mapping from clade name to integer code.

    Returns:
        A NumPy array holding the code of every entry of data, in order.
    """
    codes = []
    for clade_name in data:
        codes.append(clade_codes[clade_name])
    return np.array(codes)

In [45]:
# Generate training and validation datasets
# Samples at positions 0, 10, 20, ... of matched_samples go to validation, all others to training

train_data = []
train_data_numerical = []
train_label = []

validation_data = []
validation_data_numerical = []
validation_label = []

for (position, matched) in enumerate(matched_samples.values()):
    # Pick the pair of target lists for this sample, then append to both
    if position % 10 == 0:
        (sequences, labels) = (validation_data, validation_label)
    else:
        (sequences, labels) = (train_data, train_label)
    sequences.append(matched["sequence"])
    labels.append(matched["clade"])

# Integer-encoded versions of the sequences and labels
train_data_numerical = amino_to_num(train_data, amino_codes)
train_label_numerical = clade_to_num(train_label, clade_codes)

validation_data_numerical = amino_to_num(validation_data, amino_codes)
validation_label_numerical = clade_to_num(validation_label, clade_codes)

In [49]:
# Save all to numpy file
# Each (path, object) pair below is written with np.save, in the order listed

outputs_to_save = [
    # Full sample dictionaries
    ("data/matched_samples.npy", matched_samples),
    ("data/fasta_samples.npy", fasta_samples),
    ("data/metadata_samples.npy", metadata_samples),
    ("data/timetree_samples.npy", timetree_samples),
    # Code -> symbol lookup lists
    ("data/amino_mapping.npy", amino_list),
    ("data/clade_mapping.npy", clade_list),
    # Training set
    ("data/train_data.npy", train_data),
    ("data/train_data_numerical.npy", train_data_numerical),
    ("data/train_label.npy", train_label),
    ("data/train_label_numerical.npy", train_label_numerical),
    # Validation set
    ("data/validation_data.npy", validation_data),
    ("data/validation_data_numerical.npy", validation_data_numerical),
    ("data/validation_label.npy", validation_label),
    ("data/validation_label_numerical.npy", validation_label_numerical),
]

for (output_path, payload) in outputs_to_save:
    np.save(output_path, payload, allow_pickle=True)