Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update theme1 branch and add visualization data #4

Open
wants to merge 45 commits into
base: theme1
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
4b544e7
H3N2rooted
MarHayat Oct 19, 2019
44605d8
fixed bug in function in larger scale features
carolinecolijn Oct 19, 2019
f267e31
Add my scratchpad file
dfornika Oct 19, 2019
de34c20
Fixed bug where list does not have a name for trees
qpmnguyen Oct 19, 2019
4c22bd0
Merge branch 'master' of https://github.com/hackseq/hs19-flu
qpmnguyen Oct 19, 2019
9ad5a73
Added graph neural networks approach
saraswatmanu Oct 19, 2019
63953c6
Update README.md
carolinecolijn Oct 19, 2019
2b7dbec
Update my scratchpad
dfornika Oct 19, 2019
79e930e
Update my scratchpad
dfornika Oct 19, 2019
c8b9fcd
adding R notes for using and plotting functions for clades through time
carolinecolijn Oct 19, 2019
501458e
fix in gen dist
carolinecolijn Oct 19, 2019
1e94c6a
fix in gen dist
carolinecolijn Oct 19, 2019
af9e7e4
fix in gen dist
carolinecolijn Oct 19, 2019
84f9127
fix in gen dist
carolinecolijn Oct 19, 2019
3f7436b
GNN
MarHayat Oct 19, 2019
592a73b
function to create 2 panels of ggplots for larger clades
carolinecolijn Oct 19, 2019
65c3897
bug fix in getManyPlots.R
carolinecolijn Oct 19, 2019
1c34dda
bug fix in getManyPlots.R
carolinecolijn Oct 19, 2019
ed5d345
Return a data frame of growth ratios
dfornika Oct 19, 2019
7edab95
Merge branch 'master' of github.com:hackseq/hs19-flu
dfornika Oct 19, 2019
ebbb234
Update README.md
carolinecolijn Oct 19, 2019
cb6453f
Truncate at end of Feb and change growth_ratio function to take two t…
dfornika Oct 19, 2019
3c1d696
Merge branch 'master' of github.com:hackseq/hs19-flu
dfornika Oct 19, 2019
aa48f6c
Merge branch 'theme1'
bricoletc Oct 20, 2019
89b386d
adding notes on modelling larger clades
carolinecolijn Oct 20, 2019
daf5386
Rename labels_clades_dates.csv files to match alignment files
dfornika Oct 20, 2019
7afdb81
why have the .tree files moved? adding a nwk for h3n2 ha
carolinecolijn Oct 20, 2019
e9d554c
new version of plotClades for larger clades
carolinecolijn Oct 20, 2019
296c9ec
Remove unnecessary source() command
dfornika Oct 20, 2019
4f59cb0
Add three functions to larger_scale_features.R
dfornika Oct 20, 2019
1fd5b45
Remove functions that were moved to larger_scale_features.R
dfornika Oct 20, 2019
5eddda1
tidy up notes_plots_largerclades.R
dfornika Oct 20, 2019
821c7b1
Fill in info for notes_plots_largerclades.R
dfornika Oct 20, 2019
033ab4c
Add WHO clades data file
dfornika Oct 20, 2019
6d3c5f7
HA to NA mapping file, R script getting GC content and hamming distan…
qpmnguyen Oct 20, 2019
b8f2bc3
Joined all epitope and non-epitpe features with NA features into one …
qpmnguyen Oct 20, 2019
d95bb23
Update README.md
MarHayat Oct 20, 2019
dca3aad
adding build_map.sh and Pak's version of GC calculation result
sfpacman Oct 20, 2019
00deaf2
Merge https://github.com/hackseq/hs19-flu
sfpacman Oct 20, 2019
92b6720
Graph based features for subtrees from tree constructed from NA trees
qpmnguyen Oct 20, 2019
ae365f3
H1N1labels
MarHayat Oct 20, 2019
e732806
Adding README for processed_dat folder
qpmnguyen Oct 20, 2019
5d7f638
Merge pull request #5 from hackseq/theme1
FrankWhoee Oct 20, 2019
5012afc
Add logo
FrankWhoee Oct 20, 2019
7f90091
Merge branch 'theme1'
ivanpmartell Oct 22, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12,752 changes: 12,752 additions & 0 deletions Aux_H1N1.csv

Large diffs are not rendered by default.

File renamed without changes.
File renamed without changes.
21,326 changes: 21,326 additions & 0 deletions Data/H3N2_WHO_clades.csv

Large diffs are not rendered by default.

Binary file added GNN.zip
Binary file not shown.
9,322 changes: 9,322 additions & 0 deletions HA_NA_map.txt

Large diffs are not rendered by default.

122 changes: 122 additions & 0 deletions NA_mining_utils.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
library(ape)
library(phangorn)
library(seqinr)
library(stringr)
library(e1071)

# Extracting subtrees in tree form with pruning using Maryam's code
#' @param flutree Tree full tree file
#' @param myclades Output from getClades2() function
#' @return List of phylogenetic trees, named by their root node id
tree_trim <- function(flutree, myclades){
  # Clades that passed the rejection filter, and their root node ids
  Final_trimmedClades <- myclades$trimclades[myclades$rejected == 0]
  Final_trimmedClades_root <- as.numeric(names(which(myclades$rejected == 0)))
  # Root-to-node edge lengths; the first length(tip.label) entries are the tips
  allHeights <- node.depth.edgelength(flutree)
  hdata <- data.frame(tiplab = flutree$tip.label,
                      height = allHeights[seq_along(flutree$tip.label)])
  # Drop subtrees without proper growth: roots within 1.4 time units of the
  # tree's maximum height have not had enough time to grow.
  res <- which((max(allHeights) - allHeights[Final_trimmedClades_root]) <= 1.4)
  # NOTE: x[-integer(0)] returns an EMPTY vector, so only negative-index when
  # there is something to drop (the original silently discarded every clade
  # when no root was filtered out).
  if (length(res) > 0) {
    Final_root <- Final_trimmedClades_root[-res]
  } else {
    Final_root <- Final_trimmedClades_root
  }
  result <- list()
  for (i in match(Final_root, Final_trimmedClades_root)){
    if (i %% 5 == 0){print(i)}  # progress indicator
    tr1 <- extract.clade(flutree, Final_trimmedClades_root[i]) # get all from root
    # Keep only the tips belonging to the trimmed clade
    tr <- drop.tip(tr1, setdiff(tr1$tip.label, hdata$tiplab[Final_trimmedClades[[i]]]), trim.internal = TRUE)
    root <- Final_trimmedClades_root[i]
    result[[as.character(root)]] <- tr
  }
  return(result)
}

#' Function to map an HA sample name to its NA sample name
#' @param HA HA sample name, as a string (a tip label of the HA tree)
#' @param map_file Data frame mapping samples between HA and NA
#'   (NA names in column 1, HA names in column 2)
#' @return The matching NA sample name, or NA if there is no match
HA_to_NA <- function(HA, map_file){
  idx <- which(map_file[,2] == paste0(">",HA,"_A"))
  # str_match() returns the full match in column 1 and the capture group in
  # column 2; we want the captured name. regex() must be namespaced because
  # stringr is only referenced via :: in this function.
  output <- stringr::str_match(map_file[idx,1], stringr::regex(">(.*?)_A"))[2]
  return(output)
}

#' Extracting features from NA sequences using subtrees
#' @param subtree The subtree as a phylo object
#' @param alignment_file The alignment file in fasta format and loaded as read.fasta from the seqinr package
#' @param alignment_key The clade key csv file for HA alignments with sample names as the first column.
#' Sequence of samples should be the same as that of the HA alignment fasta file
#' @param map_file The mapping file of samples between HA and NA, generated courtesy of Pak. HA second column, NA first column
#' @return A vector of length 6: min, max, median of hamming distance followed by
#' min, max, mean of GC content; all NA when no HA tip maps to an NA sequence
get_subtree_NA_feat<- function(subtree, alignment_file, alignment_key, map_file){
  labs <- subtree$tip.label
  # Index of each tip's NA sequence in the alignment key (0 = no match)
  match_idx <- numeric(length(labs))
  for (i in seq_along(labs)){
    # Look the mapping up once and reuse it (original called HA_to_NA twice)
    NA_lab <- HA_to_NA(HA = labs[i], map_file = map_file)
    if (!is.na(NA_lab)){
      hit <- which(alignment_key[,1] == NA_lab)
      # Guard: a mapped name absent from the key yields integer(0), and
      # assigning that into match_idx[i] is a runtime error.
      if (length(hit) > 0){
        match_idx[i] <- hit[1]
      }
    }
  }
  if (length(which(match_idx != 0)) == 0){ # means that there is no match between HA and NA
    return(rep(NA,6))
  } else {
    match_idx <- match_idx[match_idx != 0] # only grab non-zero
    # generating a matrix of sequence where each letter is it's own element in a vector
    seq_mat <- c()
    for (i in seq_along(match_idx)){
      seq_mat <- rbind(seq_mat, getSequence(alignment_file[match_idx[i]])[[1]])
    }
    if (nrow(seq_mat) == 1){ # there is only one match: no pairwise distances
      gc_content <- rep(seqinr::GC(seq_mat[1,]),3)
      distance <- c(0,0,0)
      output <- c(distance, gc_content)
      return(output)
    } else {
      # Per-sequence GC content
      gc_content <- apply(seq_mat, 1, seqinr::GC)
      distance <- as.vector(e1071::hamming.distance(seq_mat))
      distance <- distance[distance != 0] # remove 0s since they are identical sequences non informative
      if (length(distance) == 0){
        # All sequences identical: report 0 distance instead of min/max of an
        # empty vector (which would return Inf/-Inf with warnings).
        distance <- 0
      }
      output <- c(min(distance), max(distance), median(distance), min(gc_content), max(gc_content), mean(gc_content))
      return(output)
    }
  }
}

#' Modified clade feature function from Maryam's script
#' @param tr The NA tree of interest
#' @return vector of 27 features similar to all subtrees
Clade_features_modified <- function(tr) {
  feats <- numeric()
  # Imbalance / depth statistics
  feats[1] <- sackin(as.treeshape(tr), "pda")
  feats[2] <- colless(as.treeshape(tr), "pda")
  feats[3] <- var(node.depth.edgelength(tr)[1:length(tr$tip.label)])
  feats[4] <- computeI2(tr)
  feats[5] <- computeB1(tr)
  feats[6] <- computeB2(tr)
  # Normalised topology counts
  feats[7] <- avgLadder(tr, normalise = TRUE)
  feats[8] <- ILnumber(tr, normalise = TRUE)
  feats[9] <- pitchforks(tr, normalise = TRUE)
  feats[10] <- maxHeight(tr, normalise = TRUE)
  # Width and staircase statistics
  feats[11] <- computeMaxWidth(tr)
  feats[12] <- computeDelW(tr)
  feats[13] <- computeStairs1(tr)
  feats[14] <- computeStairs2(tr)
  feats[15] <- computeCherries(tr, DOUBLE = FALSE)
  feats[16] <- BS(tr)
  feats[17] <- descinm(tr, m = 2)$W
  feats[18] <- getstattest(tr)$W
  # Moments of the internal branch-length distribution
  feats[19] <- skewness(i_bl(tr))
  feats[20] <- kurtosis(i_bl(tr))
  # Pairwise tip distances
  feats[21] <- tips_pairwise_distance(tr)
  feats[22] <- tips_pairwise_distance_max(tr)
  # Five network statistics fill the remaining slots
  feats[23:27] <- computeNetworkStats(tr, weight = FALSE, meanpath = FALSE, maxOnly = TRUE)
  return(feats)
}


44 changes: 44 additions & 0 deletions NA_trees.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Script to generate features from NA sequences
# Pak Yu and Quang Nguyen
# Updated 10/20

library(ape)
library(phangorn)
library(seqinr)
library(stringr)
library(e1071)

# Utility functions (tree_trim, HA_to_NA, get_subtree_NA_feat, ...)
source("./NA_mining_utils.R")


# Load files ----
HA_tree <- read.tree(file = "./flutreeH3N2_HA.tree") # tree file
HA_dat <- readRDS(file = "./processed_dat/H3N2_HA_processed_nonepitope.rds") # processed file
alignment_file <- read.fasta(file = "./Data/alignH3N2NA.fa") # NA aligned sequences

# Clade key for the NA alignment; sample names in the first column
alignment_key <- read.csv(file = "./Data/alignH3N2NA_labels_clades_dates.csv",
                          stringsAsFactors = FALSE, header = FALSE)

# HA <-> NA sample-name mapping (see build_map.sh)
map_file <- read.table(file = "./HA_NA_map.txt", sep = ',')
HA_subtrees <- HA_dat$subtrees


# getting all phylogenetic trees from subtrees
tree_clades <- tree_trim(HA_tree, HA_subtrees)

# One row of hamming-distance / GC-content features per subtree
feat_mat <- t(sapply(seq_along(tree_clades), function(x){
  print(x) # progress indicator
  get_subtree_NA_feat(tree_clades[[x]], alignment_file = alignment_file,
                      alignment_key = alignment_key, map_file = map_file)
}))
colnames(feat_mat) <- c("min_hamming_distance", "max_hamming_distance", "median_hamming_distance",
                        "min_gc_content", "max_gc_content", "mean_gc_content")
feat_mat <- as.data.frame(feat_mat)
# Prepend each subtree's root node id so rows can be joined back to the tree
feat_mat <- cbind(as.numeric(names(tree_clades)), feat_mat)
colnames(feat_mat)[1] <- "root_node_id"

write.csv(feat_mat, file = "./processed_dat/NA_hamming_gc.csv")



14 changes: 13 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# hs19-flu
![Logo](logo.png)

## Summary and outline
Project summary: We aim to predict which variants of influenza virus will succeed and circulate in the next 1-2 years.
Expand All @@ -21,16 +21,28 @@ Next, use those alignments together with phylogenetic trees to determine which W

This is a key next step for the flu prediction project, because it will allow us to compare our predictions to the current and recent seasonal influenza vaccines and determine whether our designed vaccines would have been a good match to circulating strains in the next season.

Current status: Theme 1 has successfully completed the WHO clade assignments. Yay! They are moving on to linking those clade assignments to the predictions from Theme 2 so that we can select strains for putative vaccines and then see how our choice of vaccine would perform in our data.

## Theme 2 - improve on the central model in the preprint

There are a number of relatively straightforward improvements to make on the code we already used in the preprint. These include using regression instead of classification, modifying the threshold used for defining "success" and expanding previous predictions in to 2019.

Current status: Theme 2 folks have tested regression, checked the quality of predictions on 2019 data, explored lasso regression (for feature selection) and are linking with Theme 1 re WHO clades and vaccine strain selection.

Our prediction for the success of the subtrees in 2019 was correct in 70% of the cases when we consider the majority vote between our general model and 10 models from 10-fold cross-validation. In 82% of the cases, at least one of our models could correctly predict the success of the subtrees. We also found the paper "Antigenic sites of H1N1 influenza virus hemagglutinin revealed by natural isolates and inhibition assays", which identified the epitope sites of the H1N1 sequences.

## Theme 3 - explore alternative learning methods

Maryam suggested exploring whether we could use graph neural networks instead of creating data frames with features that summarise the structure of the subtrees. We think that solving this problem is not feasible during this weekend, but perhaps setting up the problem is. We have team members reading about the required inputs for graph neural networks and thinking through how we would set up our problem in that kind of structure.

There are two ways to employ graph neural networks to solve this problem. The first approach performs graph-level inference on the phylogenetic subtrees defined previously. The model would learn high level features from the nodes and structure of the graphs and use them to predict the success of the subtree. The second approach learns higher level representations for each node in the phylogenetic tree (the one, big, global tree) based on the node's features and features of its neighbours. The model then uses these representations to predict the success of the node.

Setting up the first of these structures should be reasonably straightforward: obtain subtrees from Theme 2, represent them as graphs; use the success outcomes from Theme 2 and train GNNs to predict these outcomes. It would be pretty straightforward, but a bit more time-consuming, to develop a suite of features to place on the nodes of the subtrees. These should be derived from the NA and HA sequences for the tips in the subtree -- for example, the epitope features would be a great starting point.

## Theme 4 - use simpler models and larger clades of trees

Here, instead of dividing the phylogeny into many small subtrees of size approximately 10-35 we will focus on fewer, larger clades. Now, we won't have sufficient data to train machine learning models, but we can use the number of lineages in the trees per unit time and other simple summaries of tree growth to see if we can build a predictive model that signifies whether a clade is going to grow into the future.

Current status: We have functions that extract larger clades from the global phylogeny and profile how these have evolved over time. These profiles look at the diversification of the tree in time and in genetic distance space. We have also implemented a definition of "success" in this broader context. The next step is to combine visualisations and get intuition for whether there are signatures of which clades will be highly successful.


6 changes: 6 additions & 0 deletions build_map.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Build a mapping between HA and NA sample names by matching the strain
# identifier in the two alignments' fasta headers.
# Output: HA_NA_map.txt with the NA header name in column 1, HA in column 2.

# Extract headers; replace spaces with '_' then split on the first '/' so the
# shared strain identifier becomes field 2.
grep "^>" Data/alignH3N2NA.fa | sed 's@ @_@g ; s@/@ @' > alignH3N2NA.fa.header
grep "^>" Data/alignH3N2.fa | sed 's@ @_@g ; s@/@ @'> alignH3N2.fa.header

# Join the two header lists on the strain identifier (field 2).
# Fixed output name: downstream R code reads HA_NA_map.txt, not HA_NA_map.txt.txt.
# NOTE(review): output is space-separated but NA_trees.R reads it with
# sep = ',' — confirm which separator the committed map file actually uses.
awk -F" " '(NR==FNR){ h[$2]=$1;next; }( $2 in h){print h[$2],$1}' alignH3N2NA.fa.header alignH3N2.fa.header > HA_NA_map.txt
62 changes: 62 additions & 0 deletions dfornika_scratchpad.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/usr/bin/env Rscript

# Scratchpad: truncate the H3N2 HA tree at successive season end dates,
# pick large clades, plot them with node labels, and compute between-season
# growth ratios for those clades.

library(igraph)
library(phangorn)
library(ape)
library(readr)
library(dplyr)
library(tibbletime)
library(treeio)
library(ggplot2)
library(phytools)

source("getClades_size.R")
source("larger_scale_features.R")


# Only the H3N2 HA labels are used below; the others are kept for reference.
# alignB_labels_clades_dates <- load_labels_clades_dates("Data/alignB_labels_clades_dates.csv")
# alignBNA_labels_clades_dates <- load_labels_clades_dates("Data/alignBNA_labels_clades_dates.csv")
# alignH1N1_labels_clades_dates <- load_labels_clades_dates("Data/alignH1N1_labels_clades_dates.csv")
# alignH1N1NA_labels_clades_dates <- load_labels_clades_dates("Data/alignH1N1NA_labels_clades_dates.csv")
alignH3N2_labels_clades_dates <- load_labels_clades_dates("Data/alignH3N2_labels_clades_dates.csv")
# alignH3N2NA_labels_clades_dates <- load_labels_clades_dates("Data/alignH3N2NA_labels_clades_dates.csv")
# PH3N2_labels_clades_dates <- load_labels_clades_dates("Data/PH3N2_labels_clades_dates.csv")

# Latest sample date in the dataset
alignH3N2_labels_clades_dates %>% pull(date) %>% max()

flutreeH3N2_HA <- read.tree("flutreeH3N2_HA.tree")


# Truncate the global tree at the end of February for each season
flutreeH3N2_HA_2014 <- truncate_tree_by_date(flutreeH3N2_HA, alignH3N2_labels_clades_dates, as.Date("2014-02-28"))
flutreeH3N2_HA_2015 <- truncate_tree_by_date(flutreeH3N2_HA, alignH3N2_labels_clades_dates, as.Date("2015-02-28"))
flutreeH3N2_HA_2016 <- truncate_tree_by_date(flutreeH3N2_HA, alignH3N2_labels_clades_dates, as.Date("2016-02-28"))
flutreeH3N2_HA_2017 <- truncate_tree_by_date(flutreeH3N2_HA, alignH3N2_labels_clades_dates, as.Date("2017-02-28"))


#trees <- c(flutreeH3N2_HA_2014, flutreeH3N2_HA_2015, flutreeH3N2_HA_2016, flutreeH3N2_HA_2017)
#class(trees) <- "multiPhylo"
#ggtree(trees) + facet_wrap(~.id) + theme_tree2()

# Helper: plot a truncated tree with each picked clade labelled by its node id
plot_tree_with_clades <- function(tree, clades, mrsd) {
  p <- ggtree(tree, mrsd = mrsd)
  for (i in seq_along(clades$nodes)) {
    p <- p + geom_cladelabel(node = clades$nodes[i], label = clades$nodes[i], align = TRUE)
  }
  p + theme_tree2()
}

# Clades of 200-400 tips in each truncated tree
clades_2016 <- pickClades(flutreeH3N2_HA_2016, 200, 400)
clades_2017 <- pickClades(flutreeH3N2_HA_2017, 200, 400)

tree_2016_plot <- plot_tree_with_clades(flutreeH3N2_HA_2016, clades_2016, "2016-02-28")
tree_2016_plot

tree_2017_plot <- plot_tree_with_clades(flutreeH3N2_HA_2017, clades_2017, "2017-02-28")
tree_2017_plot


# Growth ratio of each 2016 clade into 2017
growth_ratios_for_clades(flutreeH3N2_HA_2016, flutreeH3N2_HA_2017, 200, 400)



53 changes: 53 additions & 0 deletions getManyPlots.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
## Compose two multi-panel figures for one clade: the tree plus four
## non-cumulative diversity profiles, and the tree plus four cumulative ones.
## myclade: the clade (phylo); alignment: its alignment; gdtree: genetic-distance tree.
getManyPlots <- function(myclade, alignment, gdtree) {
  # Shared shrunken title styling for every panel
  small_title <- theme(plot.title = element_text(size = 8))

  ## ---- compute ----
  # things that are NOT cumulative
  lttinfo <- as.data.frame(ltt.plot.coords(myclade, backward = FALSE))
  glttd <- lttgendist(timetree = myclade, gdtree = gdtree) # ltt through gen dist
  # gendistprofiles() yields both cumulative and non-cumulative columns
  gg <- gendistprofiles(myclade, timechop(myclade, 30), gdtree = gdtree, mode = "tree")
  ggt <- gendistprofiles(myclade, timechop(myclade, 30), aln = alignment, mode = "alignment")
  # things that ARE cumulative
  d <- divtt(myclade) # tips through time
  dg <- divttgendist(timetree = myclade, gdtree = gdtree, maxdate = NULL) # tips through gen dist space

  ## ---- plot ----
  # the tree itself - requires ggtree
  p1 <- ggtree(myclade)

  # non-cumulative panels
  n1 <- ggplot(data = lttinfo, aes(x = time, y = N)) +
    geom_line() + ggtitle("Lineages through time") + small_title
  n2 <- ggplot(data.frame(glttd), aes(x = time, y = N)) +
    geom_line() + ggtitle("Lineages through gen dist pseudo-time") + small_title
  n3 <- ggplot(data = gg, aes(x = time, y = now.diversity)) +
    geom_line() + ggtitle("Gen. diversity in gd tree") + small_title
  n4 <- ggplot(data = ggt, aes(x = time, y = now.diversity)) +
    geom_point() + ggtitle("Gen. div in alignment") + small_title

  # cumulative panels
  c1 <- ggplot(data = d, aes(x = time, y = Nsplits)) +
    geom_line() + ggtitle("Num tips through time") + small_title
  c2 <- ggplot(dg, aes(x = time, y = Ngdist)) +
    geom_line() + ggtitle("Lineages through genetic distance pseudo-time") + small_title
  c3 <- ggplot(data = gg, aes(x = time, y = cum.diversity)) +
    geom_line() + ggtitle("Cumulative gen diversity in gd tree") + small_title
  c4 <- ggplot(data = ggt, aes(x = time, y = cum.diversity)) +
    geom_point() + ggtitle("Cumulative gen diversity in alignment") + small_title

  # One device per figure: non-cumulative first, then cumulative
  dev.new()
  multiplot(p1, n1, n2, n3, n4, ncol = 1)
  dev.new()
  multiplot(p1, c1, c2, c3, c4, ncol = 1)

}

Loading