In [25]:
import os 
import pandas as pd
from subprocess import call 
from collections import defaultdict
from Bio import SeqIO, AlignIO, SeqRecord, Seq

In [26]:
# Path to output directory
outpath = "../../results/phylogeny"

# Create the directory together with any missing parents; exist_ok makes the
# cell safe to re-run and avoids the check-then-create race of exists()+mkdir().
os.makedirs(outpath, exist_ok=True)

## Spruce Tree

The data below represent the phylogenetic relationships among the haplotypes. This is the final tree, validated using bridging reads.

In [27]:
# Final tree from SPRUCE/MACHINA, as a parent ('from') -> child ('to') edge table.
# Only the 'from' column is passed through this mapping.
rename_clusters = {"Anc": "SSPE ancestor"}

cluster_relationships = (
    pd.read_csv("../../results/phylogeny/filtered_spruce_tree.csv")
    .assign(**{
        # Underscores become spaces in both columns; the rename is applied afterwards to 'from'
        'from': lambda edges: edges['from'].replace('_', ' ', regex=True).replace(rename_clusters),
        'to': lambda edges: edges['to'].replace('_', ' ', regex=True),
    })
    # The tree bookkeeping columns are not needed downstream
    .drop(['tree', 'treenum'], axis=1)
)
cluster_relationships

Unnamed: 0,from,to
0,genome 2,cluster 10
1,genome 2,cluster 12
2,genome 2,cluster 11
3,genome 2,cluster 13
4,genome 2,cluster 9
5,genome 1,cluster 4
6,genome 1,cluster 2
7,genome 1,cluster 1
8,genome 1,cluster 8
9,genome 1,cluster 7


## Haplotype Mutations

The following dataframe contains all of the variants in the 15 SSPE samples annotated by their haplotype.

In [28]:
# Map the haplotype labels used in the variant table onto the standardized naming scheme
rename_haplotypes = {"genome-1": "genome 01", 
                     "genome-1-1": "genome 1",
                     "genome-2": "genome 2"}

# Load the validated variants and relabel their haplotypes
validated_variants = pd.read_csv("../../results/variants/validated_variants.csv")
validated_variants['Haplotype'] = validated_variants['Haplotype'].replace(rename_haplotypes)

# Keep only mutations assigned to a specific haplotype, reduced to the
# columns needed to apply them, with repeated rows collapsed
is_assigned = ~validated_variants['Haplotype'].isin(['subclonal', 'both', 'fixed'])
haplotypes_df = validated_variants.loc[is_assigned, ['POS', 'REF', 'ALT', 'Haplotype']].drop_duplicates()

haplotypes_df.head()

Unnamed: 0,POS,REF,ALT,Haplotype
0,537,T,C,genome 2
1,684,C,T,cluster 4
5,1328,T,C,genome 1
7,1632,G,A,cluster 10
9,2139,T,C,genome 01


## Make Haplotypes 

Iterate through the tree structure and add in the mutations to generate a dictionary of haplotype sequences. 

In [29]:
# Function to recursively apply mutations
def apply_mutations(haplotype, tree, haplotype_df, haplotype_dict, reference):
    """Return the sequence of `haplotype`, building ancestor sequences on demand.

    The sequence is the ancestor's sequence (looked up through `tree`) with the
    rows of `haplotype_df` belonging to `haplotype` substituted in. Every
    sequence that gets built is stored in `haplotype_dict`, which therefore acts
    both as a cache and as the place where the root sequence must be pre-seeded.

    Parameters
    ----------
    haplotype : hashable
        Name of the haplotype whose sequence is wanted.
    tree : dict
        Maps each haplotype name to the name of its direct ancestor.
    haplotype_df : pandas.DataFrame
        Must have the columns 'POS' (1-based position), 'REF', 'ALT' and
        'Haplotype'.
    haplotype_dict : dict
        Maps haplotype name -> sequence string. Mutated in place.
    reference : object
        Not used by this function; it is only forwarded to the recursive call
        and kept so existing callers do not break.

    Returns
    -------
    str
        The sequence of `haplotype`.

    Raises
    ------
    KeyError
        If `haplotype` is neither cached in `haplotype_dict` nor present in `tree`.
    IndexError
        If a mutation position lies outside 1..len(sequence).
    ValueError
        If the base at a mutation position differs from the mutation's REF allele.
    """
    # Already built (or pre-seeded root): this is also what ends the recursion
    if haplotype in haplotype_dict:
        return haplotype_dict[haplotype]

    if haplotype not in tree:
        raise KeyError(
            f"Haplotype {haplotype!r} has no ancestor in the tree and no sequence in haplotype_dict"
        )

    ancestor_sequence = apply_mutations(tree[haplotype], tree, haplotype_df, haplotype_dict, reference)

    # Work on a mutable copy so the ancestor's cached string is left untouched
    haplotype_sequence = list(ancestor_sequence)

    haplotype_mutations = haplotype_df[haplotype_df['Haplotype'] == haplotype]

    for pos, ref, alt in zip(haplotype_mutations['POS'],
                             haplotype_mutations['REF'],
                             haplotype_mutations['ALT']):
        # Without this check a position <= 0 would wrap around through Python's
        # negative indexing and silently edit a base near the end of the sequence.
        if not 1 <= pos <= len(haplotype_sequence):
            raise IndexError(
                f"Position {pos} for {haplotype} is outside the sequence (length {len(haplotype_sequence)})"
            )
        if haplotype_sequence[pos-1] != ref:
            raise ValueError(f"Reference allele mismatch at position {pos} for {haplotype}")
        haplotype_sequence[pos-1] = alt

    haplotype_sequence = ''.join(haplotype_sequence)
    haplotype_dict[haplotype] = haplotype_sequence

    return haplotype_sequence


In [30]:
# Read the reference as a flat list of bases (every record in the FASTA, in file order)
reference = []
for record in SeqIO.parse("../../config/ref/MeVChiTok-SSPE.fa", "fasta"):
    reference.extend(record.seq)

# Child -> parent lookup derived from the edge table ('to' is the child, 'from' its ancestor)
tree = dict(zip(cluster_relationships['to'], cluster_relationships['from']))

# The ancestor carries the unmodified reference sequence and anchors the recursion
haplotype_dict = {'SSPE ancestor': ''.join(reference)}

# Build a sequence for every haplotype that has mutations; apply_mutations also
# fills in any ancestors it needs along the way, so those are skipped here
for haplotype in haplotypes_df['Haplotype'].unique():
    if haplotype in haplotype_dict:
        continue
    apply_mutations(haplotype, tree, haplotypes_df, haplotype_dict, reference)


## Build a Tree

Now, I'm going to take the haplotype sequences and build a tree using `iqtree`. 

In [31]:
# Wrap every haplotype sequence in a SeqRecord whose id is the haplotype name
haplotype_records = [
    SeqRecord.SeqRecord(Seq.Seq(sequence), id=name)
    for name, sequence in haplotype_dict.items()
]

# Save all records to one FASTA file in the output directory
haplotype_fasta = os.path.join(outpath, "haplotype-sequences.fa")
SeqIO.write(haplotype_records, haplotype_fasta, "fasta")

17

In [32]:
# Build the phylogeny with IQ-TREE using the GTR+I+G model (invariable sites plus
# discrete Gamma) and 1000 ultrafast bootstrap replicates (-bb 1000); -redo
# overwrites the results of a previous run.
# NOTE(review): ancestral state reconstruction is not requested here (no -asr flag) — confirm
# whether it is needed.
alignfasta = os.path.join(outpath, "haplotype-sequences.fa")

# Pass the command as an argument list so the path is never re-parsed by a
# shell (safe for paths containing spaces or shell metacharacters).
# The cell value is IQ-TREE's exit status (0 on success).
call(["iqtree", "-s", alignfasta, "-m", "GTR+I+G", "-bb", "1000", "-redo"])


IQ-TREE multicore version 2.1.4-beta COVID-edition for Linux 64-bit built Jun 24 2021
Developed by Bui Quang Minh, James Barbetti, Nguyen Lam Tung,
Olga Chernomor, Heiko Schmidt, Dominik Schrempf, Michael Woodhams.

Host:    rhino02 (AVX512, FMA3, 754 GB RAM)
Command: iqtree -s ../../results/phylogeny/haplotype-sequences.fa -m GTR+I+G -bb 1000 -redo
Seed:    941483 (Using SPRNG - Scalable Parallel Random Number Generator)
Time:    Mon Apr  3 11:22:01 2023
Kernel:  AVX+FMA - 1 threads (72 CPU cores detected)

HINT: Use -nt option to specify number of threads because your CPU has 72 cores!
HINT: -nt AUTO will automatically determine the best number of threads to use.

Reading alignment file ../../results/phylogeny/haplotype-sequences.fa ... Fasta format detected
NOTE: Change sequence name 'SSPE ancestor <unknown description>' -> SSPE_ancestor
NOTE: Change sequence name 'genome 2 <unknown description>' -> genome_2
NOTE: Change sequence name 'genome 01 <unknown description>' -> genome_01
N

Iteration 260 / LogL: -23042.212 / Time: 0h:0m:2s (0h:0m:0s left)
Iteration 270 / LogL: -23057.952 / Time: 0h:0m:2s (0h:0m:0s left)
Iteration 280 / LogL: -23042.207 / Time: 0h:0m:2s (0h:0m:0s left)
Iteration 290 / LogL: -23049.855 / Time: 0h:0m:2s (0h:0m:0s left)
Iteration 300 / LogL: -23042.142 / Time: 0h:0m:2s (0h:0m:0s left)
Log-likelihood cutoff on original alignment: -23066.308
NOTE: Bootstrap correlation coefficient of split occurrence frequencies: 0.943
NOTE: UFBoot does not converge, continue at least 100 more iterations
Iteration 310 / LogL: -23042.158 / Time: 0h:0m:2s (0h:0m:0s left)
Iteration 320 / LogL: -23042.142 / Time: 0h:0m:2s (0h:0m:0s left)
Iteration 330 / LogL: -23042.192 / Time: 0h:0m:2s (0h:0m:0s left)
UPDATE BEST LOG-LIKELIHOOD: -23042.142
Iteration 340 / LogL: -23042.182 / Time: 0h:0m:2s (0h:0m:0s left)
Iteration 350 / LogL: -23042.185 / Time: 0h:0m:2s (0h:0m:0s left)
Log-likelihood cutoff on original alignment: -23066.308
Iteration 360 / LogL: -23156.159 / Time:

0