In [1]:
from collections import defaultdict, Counter
import pandas as pd
from itertools import islice

# Parsing amino acid FASTA files

In [2]:
def parse_fasta_amino_acid(file_path):
    """
    Parse a protein FASTA file into sequence lines grouped by virus and protein.

    Headers are expected to look like ``>ACCESSION |protein name [virus name]``.
    The text after the first ``|`` is split at its LAST `` [``: everything before
    it is the protein name, and the bracket content is the virus name. Splitting
    at the last bracket keeps protein names that themselves contain a bracketed
    part intact.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        A nested ``defaultdict`` of the form
        ``{virus_type: {protein_type: [sequence lines]}}``. Lines of every record
        that shares the same virus/protein pair are appended to the same list,
        in file order. Headers without a bracketed virus name are filed under
        the virus label ``"Unknown Virus"``.
    """
    sequences = defaultdict(lambda: defaultdict(list))  # {virus_type: {protein_type: [sequences]}}
    current_virus = None
    current_protein = None
    with open(file_path, "r") as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # Blank lines carry no residues; storing them would also create
                # a bogus None/None entry when one precedes the first header.
                continue
            if line.startswith(">"):
                fields = line[1:].split("|")
                # Fall back to the whole header when there is no "|" separator.
                description = fields[1] if len(fields) > 1 else fields[0]
                protein_part, bracket, virus_part = description.rpartition(" [")
                if bracket:
                    current_protein = protein_part.strip()
                    # Keep only the text up to the closing bracket.
                    current_virus = virus_part.partition("]")[0].strip()
                else:
                    current_protein = description.strip()
                    current_virus = "Unknown Virus"
            else:
                # Sequence line: file it under the most recent header.
                sequences[current_virus][current_protein].append(line)
    return sequences

In [4]:
def compute_ngrams(sequence, n, is_amino_acid=True):
    """
    Return every length-n window of ``sequence``, in order of occurrence,
    skipping windows that contain a symbol this notebook treats as ambiguous.

    Args:
        sequence: The sequence to slide over.
        n: Window (n-gram) length.
        is_amino_acid: Selects the amino-acid exclusion set when True and the
            nucleotide exclusion set when False.

    Returns:
        A list of the retained n-grams; repeated n-grams appear repeatedly.
    """
    excluded = (
        {"B", "J", "O", "U", "X", "Z"}  # amino-acid codes to exclude
        if is_amino_acid
        else {"B", "D", "H", "K", "M", "R", "S", "V", "W", "Y", "N"}  # nucleotide codes to exclude
    )

    kept = []
    for start in range(len(sequence) - n + 1):
        window = sequence[start:start + n]
        # Keep the window only if it shares no symbol with the exclusion set.
        if excluded.isdisjoint(window):
            kept.append(window)
    return kept

In [10]:
def generate_ngram_matrix_amino_acid(fasta_path, n):
    """
    Build an n-gram count matrix with one row per (virus, protein) pair.

    Args:
        fasta_path: Path of the protein FASTA file, passed to
            ``parse_fasta_amino_acid``.
        n: Length of the n-grams to count.

    Returns:
        A pandas DataFrame with a default integer index, the columns
        ``virus_type`` and ``protein_type`` first, followed by one integer count
        column per observed n-gram in sorted order.

    Note:
        All sequence lines stored for a (virus, protein) pair are concatenated
        into one string before windows are taken, so windows that straddle the
        junction between two stored lines are counted too.
    """
    sequences = parse_fasta_amino_acid(fasta_path)

    # Keep the two label parts as a tuple instead of joining them into one
    # string and regex-splitting it back apart afterwards.
    labels = []    # [(virus_type, protein_type), ...] in row order
    counters = []  # Counter of n-grams for the row at the same position
    for virus_type, proteins in sequences.items():
        for protein_type, seq_list in proteins.items():
            combined_sequence = "".join(seq_list)  # Combine all sequence fragments
            ngrams = compute_ngrams(combined_sequence, n, is_amino_acid=True)
            labels.append((virus_type, protein_type))
            counters.append(Counter(ngrams))

    all_ngrams = sorted(set().union(*counters))

    # Build every column up front; Counter returns 0 for n-grams a row lacks,
    # so no NaN placeholder frame or per-cell assignment is needed.
    data = {
        "virus_type": [virus for virus, _ in labels],
        "protein_type": [protein for _, protein in labels],
    }
    for ngram in all_ngrams:
        data[ngram] = [counter[ngram] for counter in counters]

    return pd.DataFrame(data)

In [21]:
# Ebola protein sequences: build the 3-gram count matrix, write it to CSV
# without the row index, and show the frame as the cell output.
fasta_file = "data/ebola-amino-acid.fasta"
n = 3
ngram_matrix = generate_ngram_matrix_amino_acid(fasta_file, n)
ngram_matrix.to_csv("csv_data/ngram_matrix_amino_acid_ebola.csv", index=False)
ngram_matrix

Unnamed: 0,virus_type,protein_type,AAA,AAD,AAE,AAF,AAG,AAI,AAK,AAL,...,YYH,YYI,YYK,YYL,YYM,YYN,YYP,YYS,YYW,YYY
0,Bombali ebolavirus,nucleoprotein,7,0,0,0,0,0,0,7,...,7,0,0,0,0,0,0,0,0,0
1,Bombali ebolavirus,polymerase complex protein,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Bombali ebolavirus,matrix protein,0,0,0,0,0,4,0,0,...,0,0,0,0,0,0,4,0,0,0
3,Bombali ebolavirus,spike glycoprotein,5,4,4,4,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Bombali ebolavirus,small secreted glycoprotein,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,"Ebola virus - Mayinga, Zaire, 1976",ssGP,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
280,"Ebola virus - Mayinga, Zaire, 1976",polymerase complex protein,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
281,"Ebola virus - Mayinga, Zaire, 1976",VP24,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
282,"Ebola virus - Mayinga, Zaire, 1976",polymerase,0,1,0,1,1,0,0,1,...,0,0,0,3,0,1,0,0,0,0


In [22]:
# MERS protein sequences: build the 3-gram count matrix, write it to CSV
# without the row index, and show the frame as the cell output.
# Rebinds the notebook-level names fasta_file, n and ngram_matrix.
fasta_file = "data/mers-amino-acid.fasta"
n = 3
ngram_matrix = generate_ngram_matrix_amino_acid(fasta_file, n)
ngram_matrix.to_csv("csv_data/ngram_matrix_amino_acid_mers.csv", index=False)
ngram_matrix

Unnamed: 0,virus_type,protein_type,AAA,AAC,AAD,AAE,AAF,AAG,AAH,AAI,...,YYL,YYM,YYN,YYP,YYQ,YYR,YYS,YYT,YYV,YYY
0,Betacoronavirus England 1,ORF8b,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Betacoronavirus England 1,nsp1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Betacoronavirus England 1,nsp2,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
3,Betacoronavirus England 1,nsp3,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2,0,2,0
4,Betacoronavirus England 1,nsp4,2,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,Human betacoronavirus 2c EMC/2012,ORF5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
206,Human betacoronavirus 2c EMC/2012,E protein,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
207,Human betacoronavirus 2c EMC/2012,M protein,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
208,Human betacoronavirus 2c EMC/2012,N protein,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [23]:
# Marburg protein sequences: build the 3-gram count matrix, write it to CSV
# without the row index, and show the frame as the cell output.
# Rebinds the notebook-level names fasta_file, n and ngram_matrix.
fasta_file = "data/marburg-amino-acid.fasta"
n = 3
ngram_matrix = generate_ngram_matrix_amino_acid(fasta_file, n)
ngram_matrix.to_csv("csv_data/ngram_matrix_amino_acid_marburg.csv", index=False)
ngram_matrix

Unnamed: 0,virus_type,protein_type,AAA,AAD,AAE,AAF,AAG,AAH,AAI,AAL,...,YWF,YWG,YWL,YWR,YWT,YYA,YYD,YYG,YYN,YYW
0,Orthomarburgvirus marburgense,nucleoprotein,0,0,62,0,0,62,0,0,...,0,0,0,0,0,0,0,0,62,0
1,Orthomarburgvirus marburgense,polymerase complex protein,3,3,0,3,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
2,Orthomarburgvirus marburgense,matrix protein,0,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
3,Orthomarburgvirus marburgense,glycoprotein,0,0,0,0,55,0,0,0,...,0,0,0,0,55,0,0,0,0,0
4,Orthomarburgvirus marburgense,minor nucleoprotein,0,0,0,0,0,0,0,9,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,"Marburg virus - Musoke, Kenya, 1980",VP40,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
86,"Marburg virus - Musoke, Kenya, 1980",glycoprotein,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
87,"Marburg virus - Musoke, Kenya, 1980",VP30,0,0,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
88,"Marburg virus - Musoke, Kenya, 1980",VP24,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Parsing nucleotide FASTA files


In [24]:
def parse_fasta_nucleotide(file_path):
    """
    Parse a nucleotide FASTA file into one concatenated sequence per virus label.

    The virus label is derived from each header line as follows:
      1. Take the header up to and including the first occurrence of the
         lowercase text "virus", and drop any ">" characters.
      2. If that text contains "|", keep only the field after the first "|".
      3. If it contains "UNVERIFIED:", keep only what follows that marker.
    Headers that do not contain "virus" are labelled "Unknown Virus".

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        A ``defaultdict(str)`` mapping each virus label to the concatenation, in
        file order, of all sequence lines found under headers with that label.
        Labels whose records contain no sequence text are not included.
    """
    # Collect pieces per label and join once at the end; appending to a growing
    # string stored in a dict copies the whole string on every line.
    pieces = defaultdict(list)
    current_virus = None

    with open(file_path, 'r') as file:
        for raw_line in file:
            if raw_line.startswith('>'):
                header = raw_line.strip()
                virus_index = header.find("virus")
                if virus_index == -1:
                    current_virus = "Unknown Virus"
                    continue
                label = header[:virus_index + len("virus")].replace(">", "").strip()
                if '|' in label:
                    label = label.split('|')[1].strip()
                if "UNVERIFIED:" in label:
                    label = label.split('UNVERIFIED:')[1].strip()
                current_virus = label
            else:
                chunk = raw_line.strip()
                # Skip blank lines so they cannot create empty entries (for
                # example a None key when one precedes the first header).
                if chunk:
                    pieces[current_virus].append(chunk)

    virus_sequences = defaultdict(str)
    for virus, parts in pieces.items():
        virus_sequences[virus] = "".join(parts)

    return virus_sequences

In [25]:
def generate_ngram_matrix_nucleotide(fasta_path, n):
    """
    Build an n-gram count matrix with one row per virus label.

    Args:
        fasta_path: Path of the nucleotide FASTA file, passed to
            ``parse_fasta_nucleotide``.
        n: Length of the n-grams to count.

    Returns:
        A pandas DataFrame indexed by virus label, with one integer count column
        per observed n-gram in sorted order.
    """
    # Read the file named by the argument, not a notebook-level global.
    virus_sequences = parse_fasta_nucleotide(fasta_path)

    # Count n-grams for each virus type
    ngram_counts = {
        virus: Counter(compute_ngrams(sequence, n, is_amino_acid=False))
        for virus, sequence in virus_sequences.items()
    }
    all_ngrams = sorted(set().union(*ngram_counts.values()))

    # Build every column up front; Counter returns 0 for n-grams a virus lacks,
    # so no NaN placeholder frame or per-cell assignment is needed.
    columns = {
        ngram: [counts[ngram] for counts in ngram_counts.values()]
        for ngram in all_ngrams
    }
    return pd.DataFrame(columns, index=list(ngram_counts), columns=all_ngrams)

In [26]:
# Ebola genomes: build the 5-gram count matrix, write it to CSV (row index
# included), and show the frame as the cell output.
# Rebinds the notebook-level names fasta_file, n and ngram_matrix.
fasta_file = "data/ebola-nucleotide.fasta"
n = 5 
ngram_matrix = generate_ngram_matrix_nucleotide(fasta_file, n)

ngram_matrix.to_csv("csv_data/ngram_matrix_nucleotide_ebola.csv")
ngram_matrix

Unnamed: 0,AAAAA,AAAAC,AAAAG,AAAAT,AAACA,AAACC,AAACG,AAACT,AAAGA,AAAGC,...,TTTCG,TTTCT,TTTGA,TTTGC,TTTGG,TTTGT,TTTTA,TTTTC,TTTTG,TTTTT
Bombali ebolavirus,407,228,266,327,215,290,45,163,337,103,...,34,202,229,140,184,181,169,179,222,225
Bundibugyo ebolavirus,848,486,447,761,569,448,84,268,467,176,...,116,430,353,231,204,306,372,254,470,492
Tai Forest ebolavirus,217,150,119,159,150,132,33,72,148,78,...,27,89,117,66,60,68,63,82,113,112
Sudan ebolavirus,1433,807,994,1490,918,517,225,632,886,405,...,234,752,637,302,721,480,653,702,723,688
Reston ebolavirus,1401,798,817,1093,780,435,244,478,750,291,...,216,954,582,512,468,573,631,718,825,857
Zaire ebolavirus,33534,22311,20319,25559,19896,12224,6292,16972,19797,7776,...,6135,16300,15016,4374,7980,12812,15069,14857,15553,17351
Mutant Zaire ebolavirus,874,588,561,695,551,333,171,382,523,210,...,132,452,381,125,217,340,422,441,426,518
Mutant Bombali ebolavirus,50,28,37,40,26,37,5,22,43,15,...,4,25,28,17,25,23,22,23,29,29
Ebola virus,6952,5174,4695,5864,4369,2735,1644,3937,4960,1851,...,1375,3616,3483,1184,1954,2841,3514,3490,3884,4270
Bundibugyo virus,70,40,37,64,47,39,7,23,39,14,...,9,36,29,19,17,26,31,21,38,40


In [27]:
# Print the first column of whichever frame ngram_matrix is currently bound to,
# across all of its rows.
print(ngram_matrix.iloc[:, 0])

Bombali ebolavirus             407
Bundibugyo ebolavirus          848
Tai Forest ebolavirus          217
Sudan ebolavirus              1433
Reston ebolavirus             1401
Zaire ebolavirus             33534
Mutant Zaire ebolavirus        874
Mutant Bombali ebolavirus       50
Ebola virus                   6952
Bundibugyo virus                70
Cote d'Ivoire ebolavirus        72
Zaire Ebola virus              217
Reston Ebola virus              66
Name: AAAAA, dtype: int64


In [28]:
# Marburg genomes: build the 5-gram count matrix, write it to CSV (row index
# included), and show the frame as the cell output.
# Rebinds the notebook-level names fasta_file, n and ngram_matrix.
fasta_file = "data/marburg-nucleotide.fasta"
n = 5 
ngram_matrix = generate_ngram_matrix_nucleotide(fasta_file, n)
ngram_matrix.to_csv("csv_data/ngram_matrix_nucleotide_marburg.csv")
ngram_matrix

Unnamed: 0,AAAAA,AAAAC,AAAAG,AAAAT,AAACA,AAACC,AAACG,AAACT,AAAGA,AAAGC,...,TTTCG,TTTCT,TTTGA,TTTGC,TTTGG,TTTGT,TTTTA,TTTTC,TTTTG,TTTTT
Marburg marburgvirus,2945,2620,1704,2653,2093,1348,564,1513,1978,686,...,249,1658,1602,779,1209,1366,1938,1804,1532,2165
Orthomarburgvirus,934,733,671,958,581,435,184,433,665,259,...,144,531,618,269,397,400,580,544,583,801
Mutant Orthomarburgvirus,71,69,36,59,53,33,14,38,47,13,...,6,37,36,17,29,33,47,41,35,54
Mutant Marburg marburgvirus,213,207,108,177,159,99,42,114,141,39,...,18,111,108,51,87,99,141,123,105,162
Lake Victoria marburgvirus,1914,1614,1246,1906,1248,921,395,938,1394,490,...,208,1075,1162,488,863,815,1274,1103,1105,1435
Marburg virus,151,113,86,134,90,61,24,65,90,35,...,15,81,86,43,59,54,93,83,74,97


In [29]:
# MERS genomes: build the 5-gram count matrix, write it to CSV (row index
# included), and show the frame as the cell output.
# Rebinds the notebook-level names fasta_file, n and ngram_matrix.
fasta_file = "data/mers-nucleotide.fasta"
n = 5 
ngram_matrix = generate_ngram_matrix_nucleotide(fasta_file, n)
ngram_matrix.to_csv("csv_data/ngram_matrix_nucleotide_mers.csv")
ngram_matrix

Unnamed: 0,AAAAA,AAAAC,AAAAG,AAAAT,AAACA,AAACC,AAACG,AAACT,AAAGA,AAAGC,...,TTTCG,TTTCT,TTTGA,TTTGC,TTTGG,TTTGT,TTTTA,TTTTC,TTTTG,TTTTT
Betacoronavirus,85,76,66,101,88,76,22,98,82,56,...,36,132,130,130,112,168,154,88,128,90
Middle East respiratory syndrome-related coronavirus,19124,16830,14966,22601,19739,16846,5011,21760,18953,12568,...,7999,29567,29040,28498,24943,37349,35282,19264,28494,21262
MAG: Middle East respiratory syndrome-related coronavirus,82,67,80,105,91,56,26,91,90,62,...,32,125,124,114,96,205,180,91,165,120
Mutant Middle East respiratory syndrome-related coronavirus,61,38,33,50,44,38,11,49,42,28,...,18,65,64,64,55,83,78,42,63,50
Middle East respiratory syndrome coronavirus,7645,7294,6393,9584,8459,7304,2107,9382,8038,5335,...,3445,12579,12475,12331,10652,15944,15104,8176,12158,9038
Unknown Virus,29,15,27,42,33,27,13,43,38,37,...,15,55,68,55,53,103,63,41,67,45
Human betacoronavirus,153,152,133,203,176,152,44,196,165,112,...,72,267,262,265,222,326,313,172,251,181
Coronavirus,41,31,34,55,40,40,14,56,39,28,...,16,65,66,61,68,111,92,44,89,46
