In [9]:
import os
from collections import defaultdict, Counter
from glob import glob
from itertools import islice

import pandas as pd

# Parsing amino acid FASTA files

In [10]:
def parse_fasta_amino_acid(file_paths):
    """
    Parse protein FASTA files and group their sequence lines by virus and protein.

    Every header is expected to look like
    ``>accession |protein name [virus name]``: the second ``|``-separated
    field holds the protein name followed by the virus name in square brackets.
    The *last* bracket group of that field is taken as the virus name, so
    protein names that themselves contain brackets are kept intact.

    Parameters
    ----------
    file_paths : iterable of str
        Paths of the FASTA files to read.

    Returns
    -------
    defaultdict
        ``{virus_type: {protein_type: [sequence_line, ...]}}``. Each list
        element is one stripped, non-empty line of sequence text. Lines of all
        records that share the same virus/protein pair are appended to the
        same list, in file order.

    Raises
    ------
    IndexError
        If a header has no second ``|`` field or no `` [virus name]`` part.
    """
    sequences = defaultdict(lambda: defaultdict(list))  # {virus_type: {protein_type: [lines]}}
    for file_path in file_paths:
        with open(file_path, "r") as file:
            # Reset per file so data preceding a file's first header is never
            # attributed to the last record of the previous file.
            current_virus = None
            current_protein = None
            for line in file:
                line = line.strip()
                if not line:
                    # Blank lines carry no sequence data.
                    continue
                if line.startswith(">"):
                    description = line[1:].split("|")[1].rstrip()
                    # Split on the last " [" so "protein [x] [virus]" keeps
                    # "protein [x]" as the protein name.
                    protein_info = description.rsplit(" [", 1)
                    current_protein = protein_info[0].strip()
                    organism = protein_info[1]
                    if organism.endswith("]"):
                        organism = organism[:-1]
                    current_virus = organism.strip()
                elif current_virus is not None:
                    sequences[current_virus][current_protein].append(line)
                # else: text before the first header belongs to no record and
                # is ignored instead of being filed under a (None, None) key.
    return sequences

In [11]:
def compute_ngrams(sequence, n, is_amino_acid=True):
    """
    Slide a window of width ``n`` over ``sequence`` and return, in order, every
    window that contains none of the letters treated as ambiguous.

    ``is_amino_acid`` selects which alphabet's ambiguity letters are excluded:
    the amino-acid set when truthy, the nucleotide set otherwise. Matching is
    case-sensitive (only the upper-case letters listed below are excluded).
    """
    excluded = (
        {"B", "J", "O", "U", "X", "Z"}  # amino-acid letters to exclude
        if is_amino_acid
        else {"B", "D", "H", "K", "M", "R", "S", "V", "W", "Y", "N"}  # nucleotide letters to exclude
    )

    kept = []
    for start in range(len(sequence) - n + 1):
        window = sequence[start:start + n]
        # Keep the window only when it shares no character with the excluded set.
        if excluded.isdisjoint(window):
            kept.append(window)
    return kept

In [12]:
def generate_ngram_matrix_amino_acid(fasta_paths, n):
    """
    Build an n-gram count matrix with one row per (virus, protein) pair.

    Parameters
    ----------
    fasta_paths : iterable of str
        Protein FASTA files, passed to ``parse_fasta_amino_acid``.
    n : int
        Length of the n-grams to count.

    Returns
    -------
    pandas.DataFrame
        Columns ``virus_type`` and ``protein_type`` followed by one integer
        column per observed n-gram (sorted alphabetically). Rows follow the
        order in which the pairs were first seen while parsing and use a
        default ``RangeIndex``. With no input sequences the frame is empty but
        still has the two identifier columns.
    """
    sequences = parse_fasta_amino_acid(fasta_paths)

    # Keep the (virus, protein) pair as a tuple instead of encoding it into an
    # "a | b" string and splitting it back with a regex afterwards: names that
    # contain " | " then cannot corrupt the identifier columns.
    row_keys = []    # (virus_type, protein_type) for each matrix row
    row_counts = []  # Counter of n-grams for the matching row

    for virus_type, proteins in sequences.items():
        for protein_type, seq_list in proteins.items():
            # NOTE: every fragment stored for this pair is concatenated, so if
            # several FASTA records share the pair, n-grams spanning the
            # junction between two records are counted as well.
            combined_sequence = "".join(seq_list)
            ngrams = compute_ngrams(combined_sequence, n, is_amino_acid=True)
            row_keys.append((virus_type, protein_type))
            row_counts.append(Counter(ngrams))

    all_ngrams = sorted(set().union(*row_counts))

    # One vectorised construction instead of per-cell ``.at`` writes: n-grams
    # missing from a row come out as NaN and are turned into 0 counts.
    counts = (
        pd.DataFrame(row_counts)
        .reindex(columns=all_ngrams)
        .fillna(0)
        .astype(int)
    )
    identifiers = pd.DataFrame(row_keys, columns=["virus_type", "protein_type"])

    return pd.concat([identifiers, counts], axis=1)

In [13]:
# glob() returns paths in arbitrary, filesystem-dependent order; sort them so
# the row order of the exported matrices is the same on every run/machine.
fasta_files = sorted(glob("data/*-amino-acid.fasta"))
ns = [3, 4, 5]

# DataFrame.to_csv() does not create missing directories.
os.makedirs("csv_data", exist_ok=True)

for n in ns:
    ngram_matrix = generate_ngram_matrix_amino_acid(fasta_files, n)
    ngram_matrix.to_csv(f"csv_data/{n}gram_matrix_amino_acid_combined.csv", index=False)

# Display the matrix of the last iteration (largest n in `ns`).
ngram_matrix

Unnamed: 0,virus_type,protein_type,AAAAK,AAACH,AAACY,AAADG,AAADL,AAAFD,AAAFH,AAAFS,...,YYVTF,YYVWK,YYWGM,YYWTA,YYYAT,YYYCI,YYYDA,YYYLM,YYYMM,YYYVT
0,Severe acute respiratory syndrome coronavirus 2,leader protein,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Severe acute respiratory syndrome coronavirus 2,nsp2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Severe acute respiratory syndrome coronavirus 2,nsp3,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
3,Severe acute respiratory syndrome coronavirus 2,nsp4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Severe acute respiratory syndrome coronavirus 2,3C-like proteinase,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,"Ebola virus - Mayinga, Zaire, 1976",ssGP,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
613,"Ebola virus - Mayinga, Zaire, 1976",polymerase complex protein,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
614,"Ebola virus - Mayinga, Zaire, 1976",VP24,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
615,"Ebola virus - Mayinga, Zaire, 1976",polymerase,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Parsing nucleotide FASTA files


In [14]:
def _virus_name_from_header(header):
    """Derive the virus label from a stripped FASTA header line (leading '>' included)."""
    virus_index = header.find("virus")
    if virus_index == -1:
        return "Unknown Virus"
    # Keep everything up to and including the FIRST occurrence of "virus";
    # whatever follows it in the header is dropped.
    name = header[:virus_index + len("virus")].replace(">", "").strip()
    if '|' in name:
        name = name.split('|')[1].strip()
    if "UNVERIFIED:" in name:
        name = name.split('UNVERIFIED:')[1].strip()
    return name


def parse_fasta_nucleotide(file_paths):
    """
    Parse nucleotide FASTA files and concatenate the sequences per virus label.

    The virus label of a record is derived from its header by cutting the
    header after the first occurrence of ``"virus"``, then keeping the second
    ``|``-separated field (if any) and the text after ``"UNVERIFIED:"`` (if
    present). Headers that do not contain ``"virus"`` are labelled
    ``"Unknown Virus"``.

    Parameters
    ----------
    file_paths : iterable of str
        Paths of the FASTA files to read.

    Returns
    -------
    defaultdict(str)
        ``{virus_label: sequence}`` where ``sequence`` is the concatenation, in
        file order, of all sequence lines of every record carrying that label.
        Labels appear in the order their first sequence line was read.
    """
    # Collect fragments in lists and join once at the end; repeated
    # ``dict[key] += line`` copies the whole accumulated string for every line.
    fragments = defaultdict(list)

    for file_path in file_paths:
        with open(file_path, 'r') as file:
            current_virus = None
            for raw_line in file:
                line = raw_line.strip()
                if not line:
                    # Blank lines carry no sequence data.
                    continue
                if line.startswith('>'):
                    current_virus = _virus_name_from_header(line)
                elif current_virus is not None:
                    fragments[current_virus].append(line)
                # else: text before the first header belongs to no record and
                # is ignored instead of being stored under a ``None`` key.

    virus_sequences = defaultdict(str)
    for virus, parts in fragments.items():
        virus_sequences[virus] = "".join(parts)

    return virus_sequences

In [15]:
def generate_ngram_matrix_nucleotide(fasta_paths, n):
    """
    Build an n-gram count matrix with one row per virus label.

    Parameters
    ----------
    fasta_paths : iterable of str
        Nucleotide FASTA files, passed to ``parse_fasta_nucleotide``.
    n : int
        Length of the n-grams to count.

    Returns
    -------
    pandas.DataFrame
        Indexed by virus label (in the order returned by the parser), with one
        integer column per observed n-gram, sorted alphabetically. N-grams
        containing a nucleotide ambiguity letter are not counted.
    """
    virus_sequences = parse_fasta_nucleotide(fasta_paths)

    # Count n-grams for each virus label
    ngram_counts = {
        virus: Counter(compute_ngrams(sequence, n, is_amino_acid=False))
        for virus, sequence in virus_sequences.items()
    }
    all_ngrams = sorted(set().union(*ngram_counts.values()))

    # One vectorised construction instead of per-cell ``.at`` writes: n-grams
    # missing for a virus come out as NaN and are turned into 0 counts.
    ngram_matrix = pd.DataFrame(list(ngram_counts.values()), index=list(ngram_counts))

    return ngram_matrix.reindex(columns=all_ngrams).fillna(0).astype(int)

In [16]:
# glob() returns paths in arbitrary, filesystem-dependent order; sort them so
# the row order of the exported matrices is the same on every run/machine.
fasta_files = sorted(glob("data/*-nucleotide.fasta"))
ns = [6, 7, 8, 9]

# DataFrame.to_csv() does not create missing directories.
os.makedirs("csv_data", exist_ok=True)

for n in ns:
    ngram_matrix = generate_ngram_matrix_nucleotide(fasta_files, n)
    # The index (virus label) is written as the first CSV column.
    ngram_matrix.to_csv(f"csv_data/{n}gram_matrix_nucleotide_combined.csv")

# Display the matrix of the last iteration (largest n in `ns`).
ngram_matrix

Unnamed: 0,AAAAAAAAA,AAAAAAAAC,AAAAAAAAG,AAAAAAAAT,AAAAAAACA,AAAAAAACC,AAAAAAACG,AAAAAAACT,AAAAAAAGA,AAAAAAAGC,...,TTTTTTTCG,TTTTTTTCT,TTTTTTTGA,TTTTTTTGC,TTTTTTTGG,TTTTTTTGT,TTTTTTTTA,TTTTTTTTC,TTTTTTTTG,TTTTTTTTT
Bombali ebolavirus,0,0,0,2,0,8,0,0,0,0,...,0,0,0,8,0,1,0,0,0,0
Bundibugyo ebolavirus,0,0,0,0,1,0,0,12,0,0,...,0,0,1,0,0,0,0,0,1,0
Tai Forest ebolavirus,0,0,0,0,0,1,0,3,0,0,...,3,0,0,0,0,0,0,0,0,0
Sudan ebolavirus,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,10,0,0,0,0
Reston ebolavirus,0,0,0,0,21,0,0,23,0,0,...,0,18,3,0,0,0,0,0,0,0
Zaire ebolavirus,15,10,1,2,459,462,0,1,2,2,...,120,0,0,0,1,0,1,0,0,0
Mutant Zaire ebolavirus,0,0,0,0,12,12,0,1,0,0,...,12,0,0,0,0,0,0,0,0,0
Mutant Bombali ebolavirus,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
Ebola virus,0,0,0,0,108,108,0,0,0,0,...,107,0,0,0,0,0,0,0,0,0
Bundibugyo virus,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Print the first column of `ngram_matrix` for all rows.
# NOTE(review): relies on `ngram_matrix` left over from an earlier cell —
# presumably the nucleotide matrix of the last loop iteration; confirm cell order.
print(ngram_matrix.iloc[:, 0])

Bombali ebolavirus                                                0
Bundibugyo ebolavirus                                             0
Tai Forest ebolavirus                                             0
Sudan ebolavirus                                                  0
Reston ebolavirus                                                 0
Zaire ebolavirus                                                 15
Mutant Zaire ebolavirus                                           0
Mutant Bombali ebolavirus                                         0
Ebola virus                                                       0
Bundibugyo virus                                                  0
Cote d'Ivoire ebolavirus                                          0
Zaire Ebola virus                                                 0
Reston Ebola virus                                                0
Betacoronavirus                                                   9
Middle East respiratory syndrome-related coronav