In [42]:
import os 
import pandas as pd
from subprocess import call 
from collections import defaultdict
from Bio import SeqIO, AlignIO, SeqRecord, Seq

In [43]:
# Path to output directory
outpath = "../../results/phylogeny"

if not os.path.exists(outpath):
    os.mkdir(outpath)

## Spruce Tree

This data below represent the phylogenetic relationship of the haplotypes to one another. This is the final tree validated using bridging reads. 

In [44]:
# Final tree from SPRUCE/MACHINA
cluster_relationships = pd.read_csv("../../results/phylogeny/filtered_spruce_tree.csv")
cluster_relationships['from'] = cluster_relationships['from'].replace('_', ' ', regex=True)
cluster_relationships['to'] = cluster_relationships['to'].replace('_', ' ', regex=True)
rename_clusters = {"Anc": "SSPE ancestor"}
cluster_relationships['from'] = cluster_relationships['from'].replace(rename_clusters)
cluster_relationships.drop(['tree', 'treenum'], axis=1, inplace=True)
cluster_relationships

Unnamed: 0,from,to
0,G-2,cluster 8
1,G-2,cluster 10
2,G-2,cluster 9
3,G-2,cluster 11
4,G-2,cluster 7
5,G-1,cluster 4
6,G-1,cluster 2
7,G-1,cluster 1
8,G-1,cluster 6
9,G-1,cluster 5


## Haplotype Mutations

The following dataframe contains all of the variants in the 15 SSPE samples annotated by their haplotype.

In [45]:
# Import the mutations
haplotypes_df = pd.read_csv("../../results/variants/validated_variants.csv")

# Rename to standardize the naming scheme
rename_haplotypes = {"genome-1": "G-01", 
                     "genome-1-1": "G-1",
                     "genome-2": "G-2",
                     "cluster 6": "G-FC2",
                     "cluster 1": "cluster 1", 
                     "cluster 2": "cluster 2", 
                     "cluster 3": "cluster 3", 
                     "cluster 4": "cluster 4", 
                     "cluster 5": "cluster 1a", 
                     "cluster 7": "cluster 5", 
                     "cluster 8": "cluster 6", 
                     "cluster 9": "cluster 7", 
                     "cluster 10": "cluster 8", 
                     "cluster 11": "cluster 9", 
                     "cluster 12": "cluster 10", 
                     "cluster 13": "cluster 11"}

haplotypes_df['Haplotype'] = haplotypes_df['Haplotype'].replace(rename_haplotypes)

# We only need the mutations that have been assigned to a haplotype
haplotypes_df = haplotypes_df[~haplotypes_df.Haplotype.isin(['subclonal', 'both', 'fixed'])]
haplotypes_df = haplotypes_df.loc[:, ['POS', 'REF', 'ALT', 'Haplotype']].drop_duplicates()

haplotypes_df.head()

Unnamed: 0,POS,REF,ALT,Haplotype
0,537,T,C,G-2
1,684,C,T,cluster 4
5,1328,T,C,G-1
7,1632,G,A,cluster 8
9,2139,T,C,G-01


## Make Haplotypes 

Iterate through the tree structure and add in the mutations to generate a dictionary of haplotype sequences. 

In [46]:
# Function to recursively apply mutations
def apply_mutations(haplotype, tree, haplotype_df, haplotype_dict, reference):
    
    if haplotype in haplotype_dict:
        return haplotype_dict[haplotype]
    
    ancestor = tree[haplotype]
    ancestor_sequence = apply_mutations(ancestor, tree, haplotype_df, haplotype_dict, reference)
    
    haplotype_sequence = list(ancestor_sequence)  # Create a copy of the ancestor's sequence
    
    haplotype_mutations = haplotype_df[haplotype_df['Haplotype'] == haplotype]
    
    for _, mutation in haplotype_mutations.iterrows():
        pos, ref, alt = mutation['POS'], mutation['REF'], mutation['ALT']
        if haplotype_sequence[pos-1] == ref:
            haplotype_sequence[pos-1] = alt
        else:
            raise ValueError(f"Reference allele mismatch at position {pos} for {haplotype}")
    
    haplotype_sequence = ''.join(haplotype_sequence)
    haplotype_dict[haplotype] = haplotype_sequence
    
    return haplotype_sequence


In [47]:
# Import the reference sequence 
reference = [base for record in SeqIO.parse("../../config/ref/MeVChiTok-SSPE.fa", "fasta") for base in record.seq]

# Build the tree-like structure
tree = {}
for _, row in cluster_relationships.iterrows():
    tree[row['to']] = row['from']

# Initialize the haplotype dictionary with the known ancestor sequence
haplotype_dict = {'SSPE ancestor': ''.join(reference)}

# Create the DNA haplotypes_df for each haplotype based on the tree structure
for haplotype in haplotypes_df['Haplotype'].unique():
    if haplotype not in haplotype_dict:
        apply_mutations(haplotype, tree, haplotypes_df, haplotype_dict, reference)


## Build a Tree

Now, I'm going to take the haplotype sequences and build a tree using `iqtree`. 

In [48]:
# Write out to a fasta file
haplotype_records = [SeqRecord.SeqRecord(Seq.Seq(seq), id = hap) for hap, seq in haplotype_dict.items()]
SeqIO.write(haplotype_records, os.path.join(outpath, "haplotype-sequences.fa"), "fasta")

17

In [49]:
# Building the phylogeny with IQtree with 1000 bootstrap iterations with GTR+I+G (Invariable site plus discrete Gamma model) with asr
alignfasta = os.path.join(outpath, "haplotype-sequences.fa")
call(f"iqtree -s {alignfasta} -m GTR+I+G -bb 1000 -redo", shell=True)


IQ-TREE multicore version 2.1.4-beta COVID-edition for Linux 64-bit built Jun 24 2021
Developed by Bui Quang Minh, James Barbetti, Nguyen Lam Tung,
Olga Chernomor, Heiko Schmidt, Dominik Schrempf, Michael Woodhams.

Host:    rhino02 (AVX512, FMA3, 754 GB RAM)
Command: iqtree -s ../../results/phylogeny/haplotype-sequences.fa -m GTR+I+G -bb 1000 -redo
Seed:    336298 (Using SPRNG - Scalable Parallel Random Number Generator)
Time:    Fri Apr  7 10:14:06 2023
Kernel:  AVX+FMA - 1 threads (72 CPU cores detected)

HINT: Use -nt option to specify number of threads because your CPU has 72 cores!
HINT: -nt AUTO will automatically determine the best number of threads to use.

Reading alignment file ../../results/phylogeny/haplotype-sequences.fa ... Fasta format detected
NOTE: Change sequence name 'SSPE ancestor <unknown description>' -> SSPE_ancestor
NOTE: Change sequence name 'G-2 <unknown description>' -> G-2_<unknown
NOTE: Change sequence name 'G-01 <unknown description>' -> G-01_<unknown
NOT

Iteration 280 / LogL: -23042.129 / Time: 0h:0m:2s (0h:0m:0s left)
Iteration 290 / LogL: -23042.191 / Time: 0h:0m:2s (0h:0m:0s left)
Iteration 300 / LogL: -23042.182 / Time: 0h:0m:2s (0h:0m:0s left)
Log-likelihood cutoff on original alignment: -23066.257
NOTE: Bootstrap correlation coefficient of split occurrence frequencies: 0.973
NOTE: UFBoot does not converge, continue at least 100 more iterations
Iteration 310 / LogL: -23042.146 / Time: 0h:0m:2s (0h:0m:0s left)
BETTER TREE FOUND at iteration 313: -23042.129
Iteration 320 / LogL: -23042.156 / Time: 0h:0m:2s (0h:0m:0s left)
Iteration 330 / LogL: -23042.129 / Time: 0h:0m:2s (0h:0m:0s left)
Iteration 340 / LogL: -23042.129 / Time: 0h:0m:2s (0h:0m:0s left)
Iteration 350 / LogL: -23156.154 / Time: 0h:0m:3s (0h:0m:0s left)
Log-likelihood cutoff on original alignment: -23066.257
Iteration 360 / LogL: -23042.191 / Time: 0h:0m:3s (0h:0m:0s left)
Iteration 370 / LogL: -23042.129 / Time: 0h:0m:3s (0h:0m:0s left)
Iteration 380 / LogL: -23042.129

0