Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update theme1 branch and add visualization data #4

Open
wants to merge 45 commits into
base: theme1
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
4b544e7
H3N2rooted
MarHayat Oct 19, 2019
44605d8
fixed bug in function in larger scale features
carolinecolijn Oct 19, 2019
f267e31
Add my scratchpad file
dfornika Oct 19, 2019
de34c20
Fixed bug where list does not have a name for trees
qpmnguyen Oct 19, 2019
4c22bd0
Merge branch 'master' of https://github.com/hackseq/hs19-flu
qpmnguyen Oct 19, 2019
9ad5a73
Added graph neural networks approach
saraswatmanu Oct 19, 2019
63953c6
Update README.md
carolinecolijn Oct 19, 2019
2b7dbec
Update my scratchpad
dfornika Oct 19, 2019
79e930e
Update my scratchpad
dfornika Oct 19, 2019
c8b9fcd
adding R notes for using and plotting functions for clades through time
carolinecolijn Oct 19, 2019
501458e
fix in gen dist
carolinecolijn Oct 19, 2019
1e94c6a
fix in gen dist
carolinecolijn Oct 19, 2019
af9e7e4
fix in gen dist
carolinecolijn Oct 19, 2019
84f9127
fix in gen dist
carolinecolijn Oct 19, 2019
3f7436b
GNN
MarHayat Oct 19, 2019
592a73b
function to create 2 panels of ggplots for larger clades
carolinecolijn Oct 19, 2019
65c3897
bug fix in getManyPlots.R
carolinecolijn Oct 19, 2019
1c34dda
bug fix in getManyPlots.R
carolinecolijn Oct 19, 2019
ed5d345
Return a data frame of growth ratios
dfornika Oct 19, 2019
7edab95
Merge branch 'master' of github.com:hackseq/hs19-flu
dfornika Oct 19, 2019
ebbb234
Update README.md
carolinecolijn Oct 19, 2019
cb6453f
Truncate at end of Feb and change growth_ratio function to take two t…
dfornika Oct 19, 2019
3c1d696
Merge branch 'master' of github.com:hackseq/hs19-flu
dfornika Oct 19, 2019
aa48f6c
Merge branch 'theme1'
bricoletc Oct 20, 2019
89b386d
adding notes on modelling larger clades
carolinecolijn Oct 20, 2019
daf5386
Rename labels_clades_dates.csv files to match alignment files
dfornika Oct 20, 2019
7afdb81
why have the .tree files moved? adding a nwk for h3n2 ha
carolinecolijn Oct 20, 2019
e9d554c
new version of plotClades for larger clades
carolinecolijn Oct 20, 2019
296c9ec
Remove unnecessary source() command
dfornika Oct 20, 2019
4f59cb0
Add three functions to larger_scale_features.R
dfornika Oct 20, 2019
1fd5b45
Remove functions that were moved to larger_scale_features.R
dfornika Oct 20, 2019
5eddda1
tidy up notes_plots_largerclades.R
dfornika Oct 20, 2019
821c7b1
Fill in info for notes_plots_largerclades.R
dfornika Oct 20, 2019
033ab4c
Add WHO clades data file
dfornika Oct 20, 2019
6d3c5f7
HA to NA mapping file, R script getting GC content and hamming distan…
qpmnguyen Oct 20, 2019
b8f2bc3
Joined all epitope and non-epitpe features with NA features into one …
qpmnguyen Oct 20, 2019
d95bb23
Update README.md
MarHayat Oct 20, 2019
dca3aad
adding build_map.sh and Pak's version of GC calculation result
sfpacman Oct 20, 2019
00deaf2
Merge https://github.com/hackseq/hs19-flu
sfpacman Oct 20, 2019
92b6720
Graph based features for subtrees from tree constructed from NA trees
qpmnguyen Oct 20, 2019
ae365f3
H1N1labels
MarHayat Oct 20, 2019
e732806
Adding README for processed_dat folder
qpmnguyen Oct 20, 2019
5d7f638
Merge pull request #5 from hackseq/theme1
FrankWhoee Oct 20, 2019
5012afc
Add logo
FrankWhoee Oct 20, 2019
7f90091
Merge branch 'theme1'
ivanpmartell Oct 22, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12,752 changes: 12,752 additions & 0 deletions Aux_H1N1.csv

Large diffs are not rendered by default.

File renamed without changes.
File renamed without changes.
21,326 changes: 21,326 additions & 0 deletions Data/H3N2_WHO_clades.csv

Large diffs are not rendered by default.

Binary file added GNN.zip
Binary file not shown.
9,322 changes: 9,322 additions & 0 deletions HA_NA_map.txt

Large diffs are not rendered by default.

122 changes: 122 additions & 0 deletions NA_mining_utils.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
library(ape)
library(phangorn)
library(seqinr)
library(stringr)
library(e1071)

# Extracting subtrees in tree form with pruning using Maryam's code
#' @param flutree Tree full tree file
#' @param myclades Output from getClades2() function
#' @return List of phylogenetic trees, named by their root node id
tree_trim <- function(flutree, myclades){
  # Clades that passed the rejection filter, and their root node ids
  Final_trimmedClades <- myclades$trimclades[myclades$rejected == 0]
  Final_trimmedClades_root <- as.numeric(names(which(myclades$rejected == 0)))
  # Root-to-node edge lengths; the first length(tip.label) entries are the tips
  allHeights <- node.depth.edgelength(flutree)
  hdata <- data.frame(tiplab = flutree$tip.label,
                      height = allHeights[seq_along(flutree$tip.label)])
  # Drop subtrees without proper growth: roots within 1.4 time units of the
  # tree's maximum height have not had enough time to grow.
  res <- which((max(allHeights) - allHeights[Final_trimmedClades_root]) <= 1.4)
  # NOTE: x[-integer(0)] returns an EMPTY vector, so only negative-index when
  # there is something to drop (the original silently discarded every clade
  # when no root was filtered out).
  if (length(res) > 0) {
    Final_root <- Final_trimmedClades_root[-res]
  } else {
    Final_root <- Final_trimmedClades_root
  }
  result <- list()
  for (i in match(Final_root, Final_trimmedClades_root)){
    if (i %% 5 == 0){print(i)}  # progress indicator
    tr1 <- extract.clade(flutree, Final_trimmedClades_root[i]) # get all from root
    # Keep only the tips belonging to the trimmed clade
    tr <- drop.tip(tr1, setdiff(tr1$tip.label, hdata$tiplab[Final_trimmedClades[[i]]]), trim.internal = TRUE)
    root <- Final_trimmedClades_root[i]
    result[[as.character(root)]] <- tr
  }
  return(result)
}

#' Function to map an HA sample name to its NA sample name
#' @param HA HA sample name, as a string (a tip label of the HA tree)
#' @param map_file Data frame mapping samples between HA and NA
#'   (NA names in column 1, HA names in column 2)
#' @return The matching NA sample name, or NA if there is no match
HA_to_NA <- function(HA, map_file){
  idx <- which(map_file[,2] == paste0(">",HA,"_A"))
  # str_match() returns the full match in column 1 and the capture group in
  # column 2; we want the captured name. regex() must be namespaced because
  # stringr is only referenced via :: in this function.
  output <- stringr::str_match(map_file[idx,1], stringr::regex(">(.*?)_A"))[2]
  return(output)
}

#' Extracting features from NA sequences using subtrees
#' @param subtree The subtree as a phylo object
#' @param alignment_file The alignment file in fasta format and loaded as read.fasta from the seqinr package
#' @param alignment_key The clade key csv file for HA alignments with sample names as the first column.
#' Sequence of samples should be the same as that of the HA alignment fasta file
#' @param map_file The mapping file of samples between HA and NA, generated courtesy of Pak. HA second column, NA first column
#' @return A vector of length 6: min, max, median of hamming distance followed by
#' min, max, mean of GC content; all NA when no HA tip maps to an NA sequence
get_subtree_NA_feat<- function(subtree, alignment_file, alignment_key, map_file){
  labs <- subtree$tip.label
  # Index of each tip's NA sequence in the alignment key (0 = no match)
  match_idx <- numeric(length(labs))
  for (i in seq_along(labs)){
    # Look the mapping up once and reuse it (original called HA_to_NA twice)
    NA_lab <- HA_to_NA(HA = labs[i], map_file = map_file)
    if (!is.na(NA_lab)){
      hit <- which(alignment_key[,1] == NA_lab)
      # Guard: a mapped name absent from the key yields integer(0), and
      # assigning that into match_idx[i] is a runtime error.
      if (length(hit) > 0){
        match_idx[i] <- hit[1]
      }
    }
  }
  if (length(which(match_idx != 0)) == 0){ # means that there is no match between HA and NA
    return(rep(NA,6))
  } else {
    match_idx <- match_idx[match_idx != 0] # only grab non-zero
    # generating a matrix of sequence where each letter is it's own element in a vector
    seq_mat <- c()
    for (i in seq_along(match_idx)){
      seq_mat <- rbind(seq_mat, getSequence(alignment_file[match_idx[i]])[[1]])
    }
    if (nrow(seq_mat) == 1){ # there is only one match: no pairwise distances
      gc_content <- rep(seqinr::GC(seq_mat[1,]),3)
      distance <- c(0,0,0)
      output <- c(distance, gc_content)
      return(output)
    } else {
      # Per-sequence GC content
      gc_content <- apply(seq_mat, 1, seqinr::GC)
      distance <- as.vector(e1071::hamming.distance(seq_mat))
      distance <- distance[distance != 0] # remove 0s since they are identical sequences non informative
      if (length(distance) == 0){
        # All sequences identical: report 0 distance instead of min/max of an
        # empty vector (which would return Inf/-Inf with warnings).
        distance <- 0
      }
      output <- c(min(distance), max(distance), median(distance), min(gc_content), max(gc_content), mean(gc_content))
      return(output)
    }
  }
}

#' Modified clade feature function from Maryam's script
#' @param tr The NA tree of interest
#' @return vector of 27 features similar to all subtrees
Clade_features_modified <- function(tr) {
  feats <- numeric()
  # Imbalance / depth statistics
  feats[1] <- sackin(as.treeshape(tr), "pda")
  feats[2] <- colless(as.treeshape(tr), "pda")
  feats[3] <- var(node.depth.edgelength(tr)[1:length(tr$tip.label)])
  feats[4] <- computeI2(tr)
  feats[5] <- computeB1(tr)
  feats[6] <- computeB2(tr)
  # Normalised topology counts
  feats[7] <- avgLadder(tr, normalise = TRUE)
  feats[8] <- ILnumber(tr, normalise = TRUE)
  feats[9] <- pitchforks(tr, normalise = TRUE)
  feats[10] <- maxHeight(tr, normalise = TRUE)
  # Width and staircase statistics
  feats[11] <- computeMaxWidth(tr)
  feats[12] <- computeDelW(tr)
  feats[13] <- computeStairs1(tr)
  feats[14] <- computeStairs2(tr)
  feats[15] <- computeCherries(tr, DOUBLE = FALSE)
  feats[16] <- BS(tr)
  feats[17] <- descinm(tr, m = 2)$W
  feats[18] <- getstattest(tr)$W
  # Moments of the internal branch-length distribution
  feats[19] <- skewness(i_bl(tr))
  feats[20] <- kurtosis(i_bl(tr))
  # Pairwise tip distances
  feats[21] <- tips_pairwise_distance(tr)
  feats[22] <- tips_pairwise_distance_max(tr)
  # Five network statistics fill the remaining slots
  feats[23:27] <- computeNetworkStats(tr, weight = FALSE, meanpath = FALSE, maxOnly = TRUE)
  return(feats)
}


44 changes: 44 additions & 0 deletions NA_trees.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Script to generate features from NA sequences
# Pak Yu and Quang Nguyen
# Updated 10/20

library(ape)
library(phangorn)
library(seqinr)
library(stringr)
library(e1071)

# Utility functions (tree_trim, HA_to_NA, get_subtree_NA_feat, ...)
source("./NA_mining_utils.R")


# Load files ----
HA_tree <- read.tree(file = "./flutreeH3N2_HA.tree") # tree file
HA_dat <- readRDS(file = "./processed_dat/H3N2_HA_processed_nonepitope.rds") # processed file
alignment_file <- read.fasta(file = "./Data/alignH3N2NA.fa") # NA aligned sequences

# Clade key for the NA alignment; sample names in the first column
alignment_key <- read.csv(file = "./Data/alignH3N2NA_labels_clades_dates.csv",
                          stringsAsFactors = FALSE, header = FALSE)

# HA <-> NA sample-name mapping (see build_map.sh)
map_file <- read.table(file = "./HA_NA_map.txt", sep = ',')
HA_subtrees <- HA_dat$subtrees


# getting all phylogenetic trees from subtrees
tree_clades <- tree_trim(HA_tree, HA_subtrees)

# One row of hamming-distance / GC-content features per subtree
feat_mat <- t(sapply(seq_along(tree_clades), function(x){
  print(x) # progress indicator
  get_subtree_NA_feat(tree_clades[[x]], alignment_file = alignment_file,
                      alignment_key = alignment_key, map_file = map_file)
}))
colnames(feat_mat) <- c("min_hamming_distance", "max_hamming_distance", "median_hamming_distance",
                        "min_gc_content", "max_gc_content", "mean_gc_content")
feat_mat <- as.data.frame(feat_mat)
# Prepend each subtree's root node id so rows can be joined back to the tree
feat_mat <- cbind(as.numeric(names(tree_clades)), feat_mat)
colnames(feat_mat)[1] <- "root_node_id"

write.csv(feat_mat, file = "./processed_dat/NA_hamming_gc.csv")



14 changes: 13 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# hs19-flu
![Logo](logo.png)

## Summary and outline
Project summary: We aim to predict which variants of influenza virus will succeed and circulate in the next 1-2 years.
Expand All @@ -21,16 +21,28 @@ Next, use those alignments together with phylogenetic trees to determine which W

This is a key next step for the flu prediction project, because it will allow us to compare our predictions to the current and recent seasonal influenza vaccines and determine whether our designed vaccines would have been a good match to circulating strains in the next season.

Current status: Theme 1 has successfully completed the WHO clade assignments. Yay! They are moving on to linking those clade assignments to the predictions from Theme 2 so that we can select strains for putative vaccines and then see how our choice of vaccine would perform in our data.

## Theme 2 - improve on the central model in the preprint

There are a number of relatively straightforward improvements to make on the code we already used in the preprint. These include using regression instead of classification, modifying the threshold used for defining "success" and expanding previous predictions in to 2019.

Current status: Theme 2 folks have tested regression, checked the quality of predictions on 2019 data, explored lasso regression (for feature selection) and are linking with Theme 1 re WHO clades and vaccine strain selection.

Our prediction for the success of the subtrees in 2019 was correct in 70% of the cases when we consider the majority vote between our general model and 10 models from 10-fold cross-validation. In 82% of the cases, at least one of our models could correctly predict the success of the subtrees. We also found the paper "Antigenic sites of H1N1 influenza virus hemagglutinin revealed by natural isolates and inhibition assays", which identified the epitope sites of the H1N1 sequences.

## Theme 3 - explore alternative learning methods

Maryam suggested exploring whether we could use graph neural networks instead of creating data frames with features that summarise the structure of the subtrees. We think that solving this problem is not feasible during this weekend, but perhaps setting up the problem is. We have team members reading about the required inputs for graph neural networks and thinking through how we would set up our problem in that kind of structure.

There are two ways to employ graph neural networks to solve this problem. The first approach performs graph-level inference on the phylogenetic subtrees defined previously. The model would learn high level features from the nodes and structure of the graphs and use them to predict the success of the subtree. The second approach learns higher level representations for each node in the phylogenetic tree (the one, big, global tree) based on the node's features and features of its neighbours. The model then uses these representations to predict the success of the node.

Setting up the first of these structures should be reasonably straightforward: obtain subtrees from Theme 2, represent them as graphs; use the success outcomes from Theme 2 and train GNNs to predict these outcomes. It would be pretty straightforward, but a bit more time-consuming, to develop a suite of features to place on the nodes of the subtrees. These should be derived from the NA and HA sequences for the tips in the subtree -- for example, the epitope features would be a great starting point.

## Theme 4 - use simpler models and larger clades of trees

Here, instead of dividing the phylogeny into many small subtrees of size approximately 10-35 we will focus on fewer, larger clades. Now, we won't have sufficient data to train machine learning models, but we can use the number of lineages in the trees per unit time and other simple summaries of tree growth to see if we can build a predictive model that signifies whether a clade is going to grow into the future.

Current status: We have functions that extract larger clades from the global phylogeny and profile how these have evolved over time. These profiles look at the diversification of the tree in time and in genetic distance space. We have also implemented a definition of "success" in this broader context. The next step is to combine visualisations and get intuition for whether there are signatures of which clades will be highly successful.


6 changes: 6 additions & 0 deletions build_map.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Build a mapping between HA and NA sample names by matching the strain
# identifier in the two alignments' fasta headers.
# Output: HA_NA_map.txt with the NA header name in column 1, HA in column 2.

# Extract headers; replace spaces with '_' then split on the first '/' so the
# shared strain identifier becomes field 2.
grep "^>" Data/alignH3N2NA.fa | sed 's@ @_@g ; s@/@ @' > alignH3N2NA.fa.header
grep "^>" Data/alignH3N2.fa | sed 's@ @_@g ; s@/@ @'> alignH3N2.fa.header

# Join the two header lists on the strain identifier (field 2).
# Fixed output name: downstream R code reads HA_NA_map.txt, not HA_NA_map.txt.txt.
# NOTE(review): output is space-separated but NA_trees.R reads it with
# sep = ',' — confirm which separator the committed map file actually uses.
awk -F" " '(NR==FNR){ h[$2]=$1;next; }( $2 in h){print h[$2],$1}' alignH3N2NA.fa.header alignH3N2.fa.header > HA_NA_map.txt
62 changes: 62 additions & 0 deletions dfornika_scratchpad.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#!/usr/bin/env Rscript

# Scratchpad: truncate the H3N2 HA tree at successive season end dates,
# pick large clades, plot them with node labels, and compute between-season
# growth ratios for those clades.

library(igraph)
library(phangorn)
library(ape)
library(readr)
library(dplyr)
library(tibbletime)
library(treeio)
library(ggplot2)
library(phytools)

source("getClades_size.R")
source("larger_scale_features.R")


# Only the H3N2 HA labels are used below; the others are kept for reference.
# alignB_labels_clades_dates <- load_labels_clades_dates("Data/alignB_labels_clades_dates.csv")
# alignBNA_labels_clades_dates <- load_labels_clades_dates("Data/alignBNA_labels_clades_dates.csv")
# alignH1N1_labels_clades_dates <- load_labels_clades_dates("Data/alignH1N1_labels_clades_dates.csv")
# alignH1N1NA_labels_clades_dates <- load_labels_clades_dates("Data/alignH1N1NA_labels_clades_dates.csv")
alignH3N2_labels_clades_dates <- load_labels_clades_dates("Data/alignH3N2_labels_clades_dates.csv")
# alignH3N2NA_labels_clades_dates <- load_labels_clades_dates("Data/alignH3N2NA_labels_clades_dates.csv")
# PH3N2_labels_clades_dates <- load_labels_clades_dates("Data/PH3N2_labels_clades_dates.csv")

# Latest sample date in the dataset
alignH3N2_labels_clades_dates %>% pull(date) %>% max()

flutreeH3N2_HA <- read.tree("flutreeH3N2_HA.tree")


# Truncate the global tree at the end of February for each season
flutreeH3N2_HA_2014 <- truncate_tree_by_date(flutreeH3N2_HA, alignH3N2_labels_clades_dates, as.Date("2014-02-28"))
flutreeH3N2_HA_2015 <- truncate_tree_by_date(flutreeH3N2_HA, alignH3N2_labels_clades_dates, as.Date("2015-02-28"))
flutreeH3N2_HA_2016 <- truncate_tree_by_date(flutreeH3N2_HA, alignH3N2_labels_clades_dates, as.Date("2016-02-28"))
flutreeH3N2_HA_2017 <- truncate_tree_by_date(flutreeH3N2_HA, alignH3N2_labels_clades_dates, as.Date("2017-02-28"))


#trees <- c(flutreeH3N2_HA_2014, flutreeH3N2_HA_2015, flutreeH3N2_HA_2016, flutreeH3N2_HA_2017)
#class(trees) <- "multiPhylo"
#ggtree(trees) + facet_wrap(~.id) + theme_tree2()

# Helper: plot a truncated tree with each picked clade labelled by its node id
plot_tree_with_clades <- function(tree, clades, mrsd) {
  p <- ggtree(tree, mrsd = mrsd)
  for (i in seq_along(clades$nodes)) {
    p <- p + geom_cladelabel(node = clades$nodes[i], label = clades$nodes[i], align = TRUE)
  }
  p + theme_tree2()
}

# Clades of 200-400 tips in each truncated tree
clades_2016 <- pickClades(flutreeH3N2_HA_2016, 200, 400)
clades_2017 <- pickClades(flutreeH3N2_HA_2017, 200, 400)

tree_2016_plot <- plot_tree_with_clades(flutreeH3N2_HA_2016, clades_2016, "2016-02-28")
tree_2016_plot

tree_2017_plot <- plot_tree_with_clades(flutreeH3N2_HA_2017, clades_2017, "2017-02-28")
tree_2017_plot


# Growth ratio of each 2016 clade into 2017
growth_ratios_for_clades(flutreeH3N2_HA_2016, flutreeH3N2_HA_2017, 200, 400)



53 changes: 53 additions & 0 deletions getManyPlots.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
## Compose two multi-panel figures for one clade: the tree plus four
## non-cumulative diversity profiles, and the tree plus four cumulative ones.
## myclade: the clade (phylo); alignment: its alignment; gdtree: genetic-distance tree.
getManyPlots <- function(myclade, alignment, gdtree) {
  # Shared shrunken title styling for every panel
  small_title <- theme(plot.title = element_text(size = 8))

  ## ---- compute ----
  # things that are NOT cumulative
  lttinfo <- as.data.frame(ltt.plot.coords(myclade, backward = FALSE))
  glttd <- lttgendist(timetree = myclade, gdtree = gdtree) # ltt through gen dist
  # gendistprofiles() yields both cumulative and non-cumulative columns
  gg <- gendistprofiles(myclade, timechop(myclade, 30), gdtree = gdtree, mode = "tree")
  ggt <- gendistprofiles(myclade, timechop(myclade, 30), aln = alignment, mode = "alignment")
  # things that ARE cumulative
  d <- divtt(myclade) # tips through time
  dg <- divttgendist(timetree = myclade, gdtree = gdtree, maxdate = NULL) # tips through gen dist space

  ## ---- plot ----
  # the tree itself - requires ggtree
  p1 <- ggtree(myclade)

  # non-cumulative panels
  n1 <- ggplot(data = lttinfo, aes(x = time, y = N)) +
    geom_line() + ggtitle("Lineages through time") + small_title
  n2 <- ggplot(data.frame(glttd), aes(x = time, y = N)) +
    geom_line() + ggtitle("Lineages through gen dist pseudo-time") + small_title
  n3 <- ggplot(data = gg, aes(x = time, y = now.diversity)) +
    geom_line() + ggtitle("Gen. diversity in gd tree") + small_title
  n4 <- ggplot(data = ggt, aes(x = time, y = now.diversity)) +
    geom_point() + ggtitle("Gen. div in alignment") + small_title

  # cumulative panels
  c1 <- ggplot(data = d, aes(x = time, y = Nsplits)) +
    geom_line() + ggtitle("Num tips through time") + small_title
  c2 <- ggplot(dg, aes(x = time, y = Ngdist)) +
    geom_line() + ggtitle("Lineages through genetic distance pseudo-time") + small_title
  c3 <- ggplot(data = gg, aes(x = time, y = cum.diversity)) +
    geom_line() + ggtitle("Cumulative gen diversity in gd tree") + small_title
  c4 <- ggplot(data = ggt, aes(x = time, y = cum.diversity)) +
    geom_point() + ggtitle("Cumulative gen diversity in alignment") + small_title

  # One device per figure: non-cumulative first, then cumulative
  dev.new()
  multiplot(p1, n1, n2, n3, n4, ncol = 1)
  dev.new()
  multiplot(p1, c1, c2, c3, c4, ncol = 1)

}

Loading