## using treesignal on euk_ToL using sptrees from iGTP and genetrees from RAxML
- iGTP was run on samples of 150 gene trees, using DupLoss cost (100 runs in total)
- the gene trees given to iGTP were the first (MAP?) trees estimated by phylobayes (Miguel data). Notice that the first tree doesn't mean much, since there are many flat surfaces...
- while iGTP will give us the set of reference (species) trees, the samples we will plot are the RAxML gene trees.

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import sys, subprocess, time, dendropy, os, copy, glob
import numpy as np
from sklearn import manifold, metrics, cluster, neighbors, decomposition, preprocessing
import treesignal

### Loading reference trees (sptrees)
100 estimates using the iGTP duploss cost. We also add "noise" (SPR neighbours), since this function also removes duplicated trees. Some iGTP sptrees are missing one species and therefore must be excluded.

In [None]:
# Gather the first tree of each iGTP output file (only the first five paths
# returned by the glob) and keep those that have exactly 155 leaves.
f_dir = "/home/leo/Academic/Projects/164-Katz/12.concat_mbayes_output_201703/201704.igtp_output/result_0407/igtp*"
newick_pieces = []
for file in glob.glob(f_dir)[:5]:
    sptree = dendropy.TreeList.get_from_path(file, "newick", preserve_underscores=True)
    first_tree = sptree[0]
    if len(first_tree.leaf_nodes()) != 155:
        continue  # a tree with any other number of leaves is left out
    newick = first_tree.as_string("newick", suppress_edge_lengths=True)
    # drop trailing whitespace and the single quotes around taxon labels
    newick_pieces.append(newick.rstrip().replace("'", ""))

# The kept trees are passed on as one string, written back to back with no separator added.
spstring = "".join(newick_pieces)

# NOTE(review): presumably returns the reference trees plus SPR-perturbed copies
# as a newick string (it is parsed as newick further down) -- confirm in treesignal.
sptrees = treesignal.lowlevel_randomise_trees_with_spr_string(spstring, n_copies=2, n_spr=2)

### Read gene trees and calculate feature matrix rows 
* Reading only first tree from each file
* the function yield_from_files allows several files to be read at once, with trees being read one by one
* the calculated distances are scaled (using a single scaling factor for all distances, which is the total number of branches on both trees)

In [None]:
# Build the signal calculator from the reference (species) trees held in `sptrees`.
ts = treesignal.TreeSignal(sp_trees = dendropy.TreeList.get( data=sptrees, schema="newick"), replicates=1)

f_dir = "/home/leo/Academic/Projects/164-Katz/12.concat_mbayes_output_201703/201705.raxml/OG5_*"
feat_mat = []  # one spectrum (row) per retained gene tree
genesize = []  # number of leaves of each retained gene tree
geneidx = []   # position (in the glob listing) of the file each retained row came from;
               # the plotting cells use it for colours, but it was never assigned before

for file_idx, file in enumerate(glob.glob(f_dir)[:10]):
    # trees are parsed lazily, so asking for just the first one avoids loading the whole file
    tyield = dendropy.Tree.yield_from_files(files=[file], schema="nexus", preserve_underscores=True)
    tre = next(iter(tyield), None)  # read only the first tree of each file
    if tre is None:  # file without any tree
        continue
    print (tre.as_string("newick",suppress_edge_lengths=True).replace("'",""))
    spectrum = ts(tre)
    if spectrum.max() > -1.: # if genetree is too small etc. there is no spectrum
        feat_mat.append(spectrum)
        genesize.append(len(tre.leaf_nodes()))
        geneidx.append(file_idx)


feat_mat = np.array(feat_mat)
genesize = np.array(genesize)
geneidx = np.array(geneidx)
print ("dimensions: ", feat_mat.shape)

In [None]:
# Rescale every column of the feature matrix by its own mean over all rows,
# then embed the rows in two dimensions with MDS.
column_means = feat_mat.mean(axis=0)
signal = feat_mat / column_means
mds_model = manifold.MDS(n_components=2)
transf = mds_model.fit_transform(signal)

In [None]:
# Scatter plot of the two MDS coordinates, one point per row of feat_mat.
fig, axes = plt.subplots(1)
fig.set_size_inches(10, 8)
fig.subplots_adjust(top=.99, bottom=.01, left=.02, right=.98, wspace=.1, hspace=.2)

# Tiny random offsets so that points with the same coordinates do not overlap completely.
n_points = feat_mat.shape[0]
jit = 0.00001 * transf.max() * np.random.normal(size=n_points)
x_coord = transf[:, 1] + jit
y_coord = transf[:, 0] + jit[::-1]  # the same offsets, in reversed order, for the other axis

# NOTE(review): geneidx is not assigned in this cell; it has to exist already
# in the notebook namespace when the cell runs.
axes.scatter(x_coord, y_coord, c=geneidx, edgecolor="none", cmap="jet", alpha=.9, s=100)
axes.set_title("MDS",  fontsize=18)

In [None]:
# PCA on the transposed matrix, so that each FEATURE (column of signal) gets
# coordinates; the features are then ranked by their first coordinate.
feature_pca = decomposition.PCA(n_components=2)
transfeat = feature_pca.fit_transform(signal.T)
first_component = transfeat[:, 0]
feat_order = np.argsort(first_component)

In [None]:
# Two stacked heatmaps of the rescaled feature matrix: on top with the columns
# reordered by feat_order, below in the original column order.
fig, axes = plt.subplots(2, 1)
fig.set_size_inches(16, 10)
fig.subplots_adjust(top=.99, bottom=.01, left=.03, right=.97, wspace=.1, hspace=.2)

panels = (
    (signal[:, feat_order], "Feature matrix using reference trees reordered by PCA"),
    (signal, "Feature matrix (original order) using reference trees"),
)
for ax, (matrix, title) in zip(axes, panels):
    im = ax.imshow(matrix, aspect='auto', cmap="Spectral_r", interpolation="nearest")
    ax.set_title(title,  fontsize=20)

Add original reference (species) trees 

In [None]:
# Call the TreeSignal object without a gene tree.
# NOTE(review): presumably this returns the spectra of the reference (species)
# trees themselves, one row per reference tree -- confirm against treesignal.
feat_refs = ts()

In [None]:
# Put the gene-tree rows first and the reference rows after them, then rescale
# each column by its mean and compute a fresh two-dimensional MDS embedding.
feat_all = np.concatenate((feat_mat, feat_refs), axis=0)
column_means = feat_all.mean(axis=0)
signal = feat_all / column_means
mds_model = manifold.MDS(n_components=2)
transf = mds_model.fit_transform(signal)

In [None]:
# Colour values: the gene rows keep their geneidx value, and every reference
# row gets one shared value just above the largest geneidx.
# NOTE(review): geneidx is not assigned in this cell; it has to exist already
# in the notebook namespace when the cell runs.
ref_colour = max(geneidx) + 1
n_refs = feat_refs.shape[0]
cols = np.concatenate((np.array(geneidx), np.repeat(ref_colour, n_refs)))

fig, axes = plt.subplots(1)
fig.set_size_inches(10, 8)
fig.subplots_adjust(top=.99, bottom=.01, left=.02, right=.98, wspace=.1, hspace=.2)
x_coord = transf[:, 1]
y_coord = transf[:, 0]
axes.scatter(x_coord, y_coord, c=cols, edgecolor="none", cmap="jet", alpha=.9, s=200)
axes.set_title("MDS",  fontsize=18)