In [1]:
from collections import defaultdict, Counter
import pandas as pd
from itertools import islice

# Parsing amino acid FASTA files

In [2]:
def parse_fasta_amino_acid(file_path):
    """
    Parse an amino-acid FASTA file into sequence lines grouped by virus and protein.

    Each header line (starting with ``>``) is split on ``|``; its second field is
    split on ``" ["``. The text before that separator becomes the protein type and
    the text after it, minus its final character (the closing bracket), becomes
    the virus type. Every following non-blank line is appended to the list stored
    for that (virus, protein) pair until the next header.

    Blank lines are skipped, and lines that appear before the first header are
    ignored, so the result never contains a ``None`` virus/protein entry.

    Parameters
    ----------
    file_path : str or path-like
        Location of the FASTA file to read.

    Returns
    -------
    defaultdict
        ``{virus_type: {protein_type: [sequence_line, ...]}}``. Lines from every
        record sharing the same virus and protein end up in the same list, in
        file order.

    Raises
    ------
    IndexError
        If a header has no second ``|``-separated field, or that field does not
        contain ``" ["``.
    """
    sequences = defaultdict(lambda: defaultdict(list))  # {virus_type: {protein_type: [sequences]}}
    current_virus = None
    current_protein = None
    seen_header = False

    with open(file_path, "r") as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # Blank separators carry no sequence data.
                continue

            if line.startswith(">"):
                # Header layout: "<accession> |<protein> [<virus>]"
                fields = line[1:].split("|")
                protein_info = fields[1].split(" [")
                current_protein = protein_info[0].strip()
                current_virus = protein_info[1][:-1].strip()  # drop the trailing "]"
                seen_header = True
            elif seen_header:
                # Sequence data only counts once a header has named its owner;
                # otherwise it would be filed under a (None, None) key.
                sequences[current_virus][current_protein].append(line)

    return sequences

In [3]:
def compute_ngrams(sequence, n):
    """
    Slide a window of width n across the sequence and collect every slice.

    The slices are returned in left-to-right order; a sequence shorter than n
    yields an empty list.
    """
    windows = []
    window_starts = range(len(sequence) - n + 1)
    for start in window_starts:
        stop = start + n
        windows.append(sequence[start:stop])
    return windows

In [4]:
def generate_ngram_matrix_amino_acid(fasta_path, n):
    """
    Generate an n-gram frequency matrix for clustering, organized by virus and protein type.

    The FASTA file is read with ``parse_fasta_amino_acid``; for every
    (virus, protein) pair all of its sequence fragments are concatenated and the
    n-grams of the combined string are counted.

    Parameters
    ----------
    fasta_path : str or path-like
        FASTA file passed through to ``parse_fasta_amino_acid``.
    n : int
        Length of each n-gram.

    Returns
    -------
    pandas.DataFrame
        One row per (virus, protein) pair, in the order the parser yields them,
        with a default integer index. The first two columns are ``virus_type``
        and ``protein_type``; the remaining columns are the observed n-grams in
        sorted order, holding integer counts (0 where an n-gram is absent).

    Notes
    -----
    Fragments of all records that share a (virus, protein) pair are joined into
    one string before counting, so n-grams that straddle the boundary between
    two records are counted as well.
    """
    sequences = parse_fasta_amino_acid(fasta_path)

    # Keep the labels as tuples rather than joining them into "virus | protein"
    # and splitting later: names containing " | " would not survive that trip.
    row_labels = []
    row_counts = []
    for virus_type, proteins in sequences.items():
        for protein_type, seq_list in proteins.items():
            combined_sequence = "".join(seq_list)  # Combine all sequence fragments
            row_labels.append((virus_type, protein_type))
            row_counts.append(Counter(compute_ngrams(combined_sequence, n)))

    all_ngrams = sorted(set().union(*row_counts))

    # Build every column in one go; Counter returns 0 for n-grams a row lacks.
    column_data = {
        ngram: [counts[ngram] for counts in row_counts] for ngram in all_ngrams
    }
    ngram_matrix = pd.DataFrame(column_data, index=range(len(row_labels))).astype(int)

    # Label columns go first, ahead of the sorted n-gram columns.
    ngram_matrix.insert(0, "virus_type", [virus for virus, _ in row_labels])
    ngram_matrix.insert(1, "protein_type", [protein for _, protein in row_labels])

    return ngram_matrix

In [5]:
# Build the n-gram count matrix for the Ebola amino-acid FASTA file.
fasta_file = "data/ebola-amino-acid.fasta"
# Window length used for the amino-acid n-grams.
n = 3
ngram_matrix = generate_ngram_matrix_amino_acid(fasta_file, n)
# Optional CSV export, currently disabled:
#ngram_matrix.to_csv("ngram_matrix_amino_acid.csv")
# Last expression of the cell: display the resulting DataFrame.
ngram_matrix

Unnamed: 0,virus_type,protein_type,AAA,AAD,AAE,AAF,AAG,AAI,AAK,AAL,...,YYH,YYI,YYK,YYL,YYM,YYN,YYP,YYS,YYW,YYY
0,Bombali ebolavirus,nucleoprotein,7,0,0,0,0,0,0,7,...,7,0,0,0,0,0,0,0,0,0
1,Bombali ebolavirus,polymerase complex protein,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Bombali ebolavirus,matrix protein,0,0,0,0,0,4,0,0,...,0,0,0,0,0,0,4,0,0,0
3,Bombali ebolavirus,spike glycoprotein,5,4,4,4,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Bombali ebolavirus,small secreted glycoprotein,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279,"Ebola virus - Mayinga, Zaire, 1976",ssGP,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
280,"Ebola virus - Mayinga, Zaire, 1976",polymerase complex protein,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
281,"Ebola virus - Mayinga, Zaire, 1976",VP24,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
282,"Ebola virus - Mayinga, Zaire, 1976",polymerase,0,1,0,1,1,0,0,1,...,0,0,0,3,0,1,0,0,0,0


In [6]:
# Distinct values in the first column of the amino-acid matrix (virus_type).
ngram_matrix.iloc[:, 0].unique()

array(['Bombali ebolavirus', 'Reston ebolavirus', 'Sudan ebolavirus',
       'Bundibugyo ebolavirus', 'Tai Forest ebolavirus',
       'Zaire ebolavirus', 'Bombali virus', 'Ebola virus',
       'Sudan ebolavirus - Nakisamata', 'Reston ebolavirus - Reston',
       'Ebola virus - Mayinga, Zaire, 1976'], dtype=object)

# Parsing nucleotide FASTA files


In [7]:
def parse_fasta_nucleotide(file_path):
    """
    Parse a nucleotide FASTA file into one concatenated sequence per virus name.

    The virus name is taken from each header line (starting with ``>``): the
    header is cut right after the first occurrence of ``"virus"``; if the
    remaining text contains ``|`` the second ``|``-separated field is used, and
    anything up to and including ``"UNVERIFIED:"`` is dropped. Headers that do
    not contain ``"virus"`` are grouped under ``"Unknown Virus"``.

    Blank lines are skipped, and lines that appear before the first header are
    ignored, so the result never contains a ``None`` key.

    Parameters
    ----------
    file_path : str or path-like
        Location of the FASTA file to read.

    Returns
    -------
    defaultdict(str)
        ``{virus_name: sequence}`` where the sequence is the concatenation, in
        file order, of every sequence line belonging to records with that virus
        name. Keys appear in the order their first sequence line was read.
    """
    # Collect fragments in lists and join once at the end; growing a string
    # stored in a dict with ``+=`` re-copies it for every line of a genome.
    fragments = defaultdict(list)
    current_virus = None
    seen_header = False

    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('>'):
                seen_header = True
                header = line.strip()
                virus_index = header.find("virus")
                if virus_index != -1:
                    # Keep everything up to and including the word "virus".
                    current_virus = header[:virus_index + len("virus")].replace(">", "").strip()
                    if '|' in current_virus:
                        # Drop the leading accession field.
                        current_virus = current_virus.split('|')[1].strip()
                    if "UNVERIFIED:" in current_virus:
                        current_virus = current_virus.split('UNVERIFIED:')[1].strip()
                else:
                    current_virus = "Unknown Virus"
            else:
                chunk = line.strip()
                if chunk and seen_header:
                    fragments[current_virus].append(chunk)

    virus_sequences = defaultdict(str)
    for virus, chunks in fragments.items():
        virus_sequences[virus] = "".join(chunks)

    return virus_sequences

In [8]:
def generate_ngram_matrix_nucleotide(fasta_path, n):
    """
    Generate an n-gram frequency matrix for clustering, organized by virus name.

    The FASTA file is read with ``parse_fasta_nucleotide``, which yields one
    concatenated sequence per virus name; the n-grams of each sequence are then
    counted.

    Parameters
    ----------
    fasta_path : str or path-like
        FASTA file passed through to ``parse_fasta_nucleotide``.
    n : int
        Length of each n-gram.

    Returns
    -------
    pandas.DataFrame
        One row per virus name (the index), in the order the parser yields them,
        and one column per observed n-gram in sorted order. Cells hold integer
        counts, 0 where an n-gram does not occur for that virus.

    Notes
    -----
    Each virus's sequence is the concatenation of all of its records, so n-grams
    that straddle the boundary between two records are counted as well.
    """
    # Parse the file named by the argument, not whatever a notebook-level
    # ``fasta_file`` variable happens to hold.
    virus_sequences = parse_fasta_nucleotide(fasta_path)

    # Count n-grams for each virus type
    ngram_counts = {
        virus: Counter(compute_ngrams(sequence, n))
        for virus, sequence in virus_sequences.items()
    }
    all_ngrams = sorted(set().union(*ngram_counts.values()))

    # Build every column in one go; Counter returns 0 for n-grams a row lacks.
    column_data = {
        ngram: [counts[ngram] for counts in ngram_counts.values()]
        for ngram in all_ngrams
    }
    ngram_matrix = pd.DataFrame(
        column_data, index=list(ngram_counts), columns=all_ngrams
    ).astype(int)

    return ngram_matrix

In [9]:
# Build the n-gram count matrix for the Ebola nucleotide FASTA file.
# Rebinds `fasta_file`, `n` and `ngram_matrix` from the amino-acid section above.
fasta_file = "data/ebola-nucleotide.fasta"
# Window length used for the nucleotide n-grams.
n = 5 
ngram_matrix = generate_ngram_matrix_nucleotide(fasta_file, n)

# Optional CSV export, currently disabled:
#ngram_matrix.to_csv("ngram_matrix_nucleotide.csv")
# Last expression of the cell: display the resulting DataFrame.
ngram_matrix

Unnamed: 0,AAAAA,AAAAC,AAAAG,AAAAT,AAACA,AAACC,AAACG,AAACT,AAACY,AAAGA,...,YCGCA,YCTCA,YGATT,YGGGG,YTAGC,YTCTG,YTTAT,YTTCT,YTTTT,YYTTT
Bombali ebolavirus,407,228,266,327,215,290,45,163,0,337,...,0,0,0,0,0,0,0,0,0,0
Bundibugyo ebolavirus,848,486,447,761,569,448,84,268,0,467,...,0,0,0,0,0,0,0,0,0,0
Tai Forest ebolavirus,217,150,119,159,150,132,33,72,0,148,...,0,0,0,0,0,0,0,0,0,0
Sudan ebolavirus,1433,807,994,1490,918,517,225,632,0,886,...,0,0,0,0,0,2,0,0,0,0
Reston ebolavirus,1401,798,817,1093,780,435,244,478,0,750,...,0,0,0,0,0,0,0,0,0,0
Zaire ebolavirus,33534,22311,20319,25559,19896,12224,6292,16972,1,19797,...,1,1,1,1,1,1,1,2,1,1
Mutant Zaire ebolavirus,874,588,561,695,551,333,171,382,0,523,...,0,0,0,0,0,0,0,0,0,0
Mutant Bombali ebolavirus,50,28,37,40,26,37,5,22,0,43,...,0,0,0,0,0,0,0,0,0,0
Ebola virus,6952,5174,4695,5864,4369,2735,1644,3937,0,4960,...,0,0,0,0,0,0,0,0,0,0
Bundibugyo virus,70,40,37,64,47,39,7,23,0,39,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Print the first column of the nucleotide matrix: the counts of the
# first n-gram in sorted order, one value per virus row.
print(ngram_matrix.iloc[:, 0])

Bombali ebolavirus             407
Bundibugyo ebolavirus          848
Tai Forest ebolavirus          217
Sudan ebolavirus              1433
Reston ebolavirus             1401
Zaire ebolavirus             33534
Mutant Zaire ebolavirus        874
Mutant Bombali ebolavirus       50
Ebola virus                   6952
Bundibugyo virus                70
Cote d'Ivoire ebolavirus        72
Zaire Ebola virus              217
Reston Ebola virus              66
Name: AAAAA, dtype: int64


In [11]:
# Same analysis for the Marburg nucleotide FASTA file.
# Rebinds `fasta_file`, `n` and `ngram_matrix`, replacing the Ebola matrix.
fasta_file = "data/marburg-nucleotide.fasta"
n = 5 
ngram_matrix = generate_ngram_matrix_nucleotide(fasta_file, n)
# Last expression of the cell: display the resulting DataFrame.
ngram_matrix

Unnamed: 0,AAAAA,AAAAC,AAAAG,AAAAT,AAACA,AAACC,AAACG,AAACT,AAAGA,AAAGC,...,TTTGC,TTTGG,TTTGT,TTTTA,TTTTC,TTTTG,TTTTT,TTYGG,TYGGA,YGGAG
Marburg marburgvirus,2945,2620,1704,2653,2093,1348,564,1513,1978,686,...,779,1209,1366,1938,1804,1532,2165,1,1,1
Orthomarburgvirus,934,733,671,958,581,435,184,433,665,259,...,269,397,400,580,544,583,801,0,0,0
Mutant Orthomarburgvirus,71,69,36,59,53,33,14,38,47,13,...,17,29,33,47,41,35,54,0,0,0
Mutant Marburg marburgvirus,213,207,108,177,159,99,42,114,141,39,...,51,87,99,141,123,105,162,0,0,0
Lake Victoria marburgvirus,1914,1614,1246,1906,1248,921,395,938,1394,490,...,488,863,815,1274,1103,1105,1435,0,0,0
Marburg virus,151,113,86,134,90,61,24,65,90,35,...,43,59,54,93,83,74,97,0,0,0


In [12]:
# Same analysis for the MERS nucleotide FASTA file.
# Rebinds `fasta_file`, `n` and `ngram_matrix`, replacing the Marburg matrix.
fasta_file = "data/mers-nucleotide.fasta"
n = 5 
ngram_matrix = generate_ngram_matrix_nucleotide(fasta_file, n)
# Last expression of the cell: display the resulting DataFrame.
ngram_matrix

Unnamed: 0,AAAAA,AAAAC,AAAAG,AAAAK,AAAAT,AAAAY,AAACA,AAACC,AAACG,AAACR,...,YTCTA,YTCTT,YTGCA,YTGCC,YTTAC,YTTAG,YTTCT,YTTTC,YTTTG,YTTTT
Betacoronavirus,85,76,66,0,101,0,88,76,22,0,...,0,0,0,0,0,0,0,0,0,0
Middle East respiratory syndrome-related coronavirus,19124,16830,14966,2,22601,1,19739,16846,5011,0,...,2,1,2,1,0,0,1,1,1,4
MAG: Middle East respiratory syndrome-related coronavirus,82,67,80,0,105,0,91,56,26,0,...,0,0,0,0,0,0,0,0,0,0
Mutant Middle East respiratory syndrome-related coronavirus,61,38,33,0,50,0,44,38,11,0,...,0,0,0,0,0,0,0,0,0,0
Middle East respiratory syndrome coronavirus,7645,7294,6393,0,9584,0,8459,7304,2107,1,...,0,0,0,0,1,1,0,0,0,2
Unknown Virus,29,15,27,0,42,0,33,27,13,0,...,0,0,0,0,0,0,0,0,0,0
Human betacoronavirus,153,152,133,0,203,0,176,152,44,0,...,0,0,0,0,0,0,0,0,0,0
Coronavirus,41,31,34,0,55,0,40,40,14,0,...,0,0,0,0,0,0,0,0,0,0
