In [99]:
import numpy as np
import csv

# Use biopython library to process fasta files
from Bio import SeqIO

In [100]:
# Read in fasta file

fasta_filename = "data/spikeprot0309.fasta"
fasta_entries = SeqIO.parse(fasta_filename, "fasta")

In [101]:
# Process each fasta sample into a dictionary
# The key is the EPI_ISL ID, the value is a dictionary with metadata and the protein sequence

fasta_samples = {}
for entry in fasta_entries:
    metadata_list = entry.description.split("|")
    protein_sequence = str(entry.seq)
    
    # Account for rows with missing data
    while len(metadata_list) < 11:
        metadata_list.append("")
        
    # Remove "hCoV-19/" prefix and remove spaces from fasta strain
    strain = metadata_list[1]
    if strain[:8] == "hCoV-19/":
        strain = strain[8:]
    strain = strain.replace(" ", "")
        
    # Create dictionary from entry
    sample = {
        "Strain": strain,
        "Submission Date": metadata_list[2],
        "EPI_ISL": metadata_list[3],
        "Division of Exposure": metadata_list[5],
        "Originating Lab": metadata_list[7],
        "Submitting Lab": metadata_list[8],
        "Author": metadata_list[9],
        "Country of Exposure": metadata_list[10],
        "Sequence": protein_sequence,
    }
    
    # Add sample to fasta_samples
    epi_isl = metadata_list[3]
    assert(epi_isl[:7] == "EPI_ISL")
    fasta_samples[epi_isl] = sample
    
print("Number of FASTA entries: ", len(fasta_samples.keys()))
print("")
print("Example FASTA entry: ")
for (_, value) in fasta_samples.items():
    print(value)
    break

Number of FASTA entries:  702408

Example FASTA entry: 
{'Strain': 'Wuhan/WIV04/2019', 'Submission Date': '2019-12-30', 'EPI_ISL': 'EPI_ISL_402124', 'Division of Exposure': 'hCoV-19^^Hubei', 'Originating Lab': 'Wuhan Jinyintan Hospital', 'Submitting Lab': 'Wuhan Institute of Virology', 'Author': 'Shi', 'Country of Exposure': 'China', 'Sequence': 'MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLTPTWRVYSTGSNVFQTRAGCLI

In [102]:
# Process nextstrain global metadata into a dictionary

metadata_filename = "data/nextstrain_ncov_global_metadata.tsv"
metadata_samples = {}
with open(metadata_filename, "r") as f:
    tsv_reader = csv.reader(f, delimiter="\t", quoting=csv.QUOTE_NONE)
    header = next(tsv_reader)
    
    for row in tsv_reader:
        # Create dictionary from row
        sample = {}
        for i in range(len(row)):
            sample[header[i]] = row[i]
            
        # if s1 mutation is empty, means no mutation
        if sample['S1 mutations'] == '':
            sample['S1 mutations'] = 0
        sample['S1 mutations']  = int(float(sample['S1 mutations']))
            
            
        # Add sample to metadata_samples
        epi_isl = row[10]
        assert(epi_isl[:7] == "EPI_ISL")
        metadata_samples[epi_isl] = sample
        
    print("Number of metadata entries: ", len(metadata_samples.keys()))
    print("")
    print("Example metadata entry: ")
    for (_, value) in metadata_samples.items():
        print(value)
        break

Number of metadata entries:  3860

Example metadata entry: 
{'Strain': 'Wuhan/WH01/2019', 'GISAID Clade': 'L', 'S1 mutations': 0, 'Age': '44', 'Clade': '19A', 'Country': 'China', 'Country of Exposure': 'China', 'Admin Division': 'Hubei', 'Division of Exposure': 'Hubei', 'genbank_accession': 'LR757998.1', 'gisaid_epi_isl': 'EPI_ISL_406798', 'Host': 'Human', 'Location': 'Wuhan', 'Originating Lab': "General Hospital of Central Theater Command of People's Liberation Army of China", 'PANGO Lineage': 'B', 'Submission Date': 'Older', 'Region': 'Asia', 'Sex': 'Male', 'Emerging Clade': '19A', 'Submitting Lab': "BGI & Institute of Microbiology, Chinese Academy of Sciences & Shandong First Medical University & Shandong Academy of Medical Sciences & General Hospital of Central Theater Command of People's Liberation Army of China", 'url': '', 'Collection Data': '2019-12-26', 'Author': 'Weijun Chen et al (https://dx.doi.org/10.1016/S0140-6736(20)30251-8)', 'Region of Exposure': ''}


In [103]:
# Find matches between metadata and fasta

metadata_fasta_matches = []
for epi_isl in metadata_samples.keys():
    if epi_isl in fasta_samples:
        metadata_fasta_matches.append(epi_isl)
        
print("Number of matching EPI_ISL ID's between NextStrain metadata and FASTA: ", len(metadata_fasta_matches))

Number of matching EPI_ISL ID's between NextStrain metadata and FASTA:  2488


In [105]:
# Map from amino acids and clades to ints

with open("data/amino_list.txt", encoding="utf8") as f:
    amino_list = f.read().strip().split(',')
amino_codes = {}
for (i, v) in enumerate(amino_list):
    amino_codes[v] = i
    
clade_list = list(set([value["clade"] for (_, value) in matched_samples.items()]))
clade_codes = {}
for (i, v) in enumerate(clade_list):
    clade_codes[v] = i
    
gisaid_clade_list = list(set([value["gisaid_clade"] for (_, value) in matched_samples.items()]))
gisaid_clade_codes = {}
for (i, v) in enumerate(gisaid_clade_list):
    gisaid_clade_codes[v] = i

In [106]:
print('clade codes:', clade_codes)
print('gisaid clade codes', gisaid_clade_codes)

clade codes: {'20B': 0, '20H/501Y.V2': 1, '20F': 2, '20C': 3, '20J/501Y.V3': 4, '20G': 5, '20D': 6, '19A': 7, '20I/501Y.V1': 8, '20A': 9, '20E (EU1)': 10, '19B': 11}
gisaid clade codes {'GV': 0, 'O': 1, 'GRY': 2, 'GH': 3, 'S': 4, 'GR': 5, 'L': 6, 'G': 7, 'V': 8}


In [107]:
# Convert training and validation data from string to numerical format

def amino_to_num(data_list, amino_codes):
    new_data = []
    
    for seq in data_list:
        new_seq = np.array([amino_codes[char] for char in seq])
        new_data.append(new_seq)
        
    return np.array(new_data, dtype=np.object)

def clade_to_num(data, clade_codes):
    new_data = [clade_codes[clade] for clade in data]
    return np.array(new_data)

In [108]:
# set up list to store train and val data

train_data_metadata_fasta_match = []
train_label_clade = []
train_label_gisaid_clade = []
train_label_mutation = []

validation_data_metadata_fasta_match = []
validation_label_clade = []
validation_label_gisaid_clade = []
validation_label_mutation = []

In [113]:
# Generate training and validation datasets

for (i, (_, value)) in enumerate(matched_samples.items()):
    if i % 10 == 0:
        validation_data_metadata_fasta_match.append(value["sequence"])
        validation_label_clade.append(value["clade"])
        validation_label_gisaid_clade.append(value["gisaid_clade"])
        validation_label_mutation.append(value["s1_mutation"])
    else:
        train_data_metadata_fasta_match.append(value["sequence"])
        train_label_clade.append(value["clade"])
        train_label_gisaid_clade.append(value["gisaid_clade"])
        train_label_mutation.append(value["s1_mutation"])
        
train_data_metadata_fasta_match_num = amino_to_num(train_data_metadata_fasta_match, amino_codes)
train_label_clade_num = clade_to_num(train_label_clade, clade_codes)
train_label_gisaid_clade_num = clade_to_num(train_label_gisaid_clade, gisaid_clade_codes)
train_label_mutation = np.array(train_label_mutation)

validation_data_metadata_fasta_match_num = amino_to_num(validation_data_metadata_fasta_match, amino_codes)
validation_label_clade_num = clade_to_num(validation_label_clade, clade_codes)
validation_label_gisaid_clade_num = clade_to_num(validation_label_gisaid_clade, gisaid_clade_codes)
validation_label_mutation = np.array(validation_label_mutation)

In [116]:
train_label_mutation

array([0, 0, 0, ..., 2, 2, 2])

In [95]:
np.save("data_updated/matched_samples_metadata_fasta.npy", matched_samples, allow_pickle=True)

np.save("data_updated/amino_mapping_new.npy", amino_list, allow_pickle=True)
np.save("data_updated/clade_mapping_new.npy", clade_list, allow_pickle=True)
np.save("data_updated/gisaid_clade_mapping_new.npy", gisaid_clade_list, allow_pickle=True)

np.save("data_updated/train_data_metadata_fasta_match.npy", train_data_metadata_fasta_match, allow_pickle=True)
np.save("data_updated/train_data_metadata_fasta_match_num.npy", train_data_metadata_fasta_match_num, allow_pickle=True)
np.save("data_updated/train_label_clade.npy", train_label_clade, allow_pickle=True)
np.save("data_updated/train_label_clade_num.npy", train_label_clade_num, allow_pickle=True)
np.save("data_updated/train_label_gisaid_clade.npy", train_label_gisaid_clade, allow_pickle=True)
np.save("data_updated/train_label_gisaid_clade_num.npy", train_label_gisaid_clade_num, allow_pickle=True)
np.save("data_updated/train_label_mutation.npy", train_label_mutation, allow_pickle=True)

np.save("data_updated/validation_data_metadata_fasta_match.npy", validation_data_metadata_fasta_match, allow_pickle=True)
np.save("data_updated/validation_data_metadata_fasta_match_num.npy", validation_data_metadata_fasta_match_num, allow_pickle=True)
np.save("data_updated/validation_label_clade.npy", validation_label_clade, allow_pickle=True)
np.save("data_updated/validation_label_clade_num.npy", validation_label_clade_num, allow_pickle=True)
np.save("data_updated/validation_label_gisaid_clade.npy", validation_label_gisaid_clade, allow_pickle=True)
np.save("data_updated/validation_label_gisaid_clade_num.npy", validation_label_gisaid_clade_num, allow_pickle=True)
np.save("data_updated/validation_label_mutation.npy", validation_label_mutation, allow_pickle=True)