Merge pull request #27 from js51/develop

add lots of new stuff
js51 · Jun 30, 2021 · 6703bba · 6703bba
2 parents 05019e4 + fe96e43
commit 6703bba
Show file tree

Hide file tree

Showing 8 changed files with 194 additions and 24 deletions.
diff --git a/.github/workflows/runpytest.yml b/.github/workflows/runpytest.yml
@@ -9,7 +9,7 @@ jobs:
     strategy:
       max-parallel: 4
       matrix:
-        python-version: [3.8, 3.9]
+        python-version: [3.6, 3.7, 3.8]
 
     steps:
     - uses: actions/checkout@v1

diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
-numpy == 1.16.4
-pandas == 0.24.2
-networkx == 2.4
-scipy == 1.4.1
+numpy == 1.19.2
+pandas == 1.1.2
+networkx == 2.5
+scipy == 1.5.2
diff --git a/setup.py b/setup.py
@@ -22,7 +22,7 @@ def readfile(filename):
         "License :: OSI Approved :: MIT License",
         "Operating System :: OS Independent",
     ],
-    python_requires='>=3.8',
+    python_requires='>=3.6',
     package_data={'splitp': ['*.txt']},
     include_package_data=True,
 )
diff --git a/splitp/__init__.py b/splitp/__init__.py
@@ -2,4 +2,6 @@
 from splitp.nx_tree import *
 from splitp.parsers import *
 from splitp.tree_helper_functions import *
-from splitp.squangles import *
+from splitp.squangles import *
+from splitp.enums import *
+from splitp.tree_reconstruction import *
diff --git a/splitp/enums.py b/splitp/enums.py
@@ -0,0 +1,14 @@
+from enum import Enum, auto
+
+class Model(Enum):
+    def _generate_next_value_(name, start, count, last_values):
+        return name
+    JC      = auto()
+    K2ST    = auto()
+
+class Method(Enum):
+    def _generate_next_value_(name, start, count, last_values):
+        return name
+    flattenings      = auto()
+    subflattenings   = auto()
+    distance         = auto()
diff --git a/splitp/nx_tree.py b/splitp/nx_tree.py
@@ -1,15 +1,28 @@
-import numpy as np
 import copy as cp
+import itertools
+from warnings import warn
+from math import exp
+# Numpy
+import numpy as np
+# Pandas
+import pandas as pd
+# NetworkX
 import networkx as nx
 from networkx.readwrite import json_graph
-import pandas as pd
+from networkx.algorithms.traversal.depth_first_search import dfs_tree, dfs_postorder_nodes
+from networkx.algorithms.traversal.breadth_first_search import bfs_successors
+# Scipy
 import scipy
-import itertools
+from scipy.sparse import coo_matrix
+from scipy.sparse import dok_matrix
+from scipy.sparse.linalg import svds
+# Random
+import random
+from random import choices
+# SplitP
 from splitp import tree_helper_functions as hf
 from splitp import parsers
-from warnings import warn
-from scipy.sparse.linalg import svds
-
+from splitp.enums import Model
 
 class NXTree:
     """A rooted phylogenetic tree.
@@ -68,7 +81,6 @@ def all_splits(self, trivial=False, only_balance=None, randomise=False):
         r = 0 if trivial else 1
         loop_over = range(r, 2**(n-1) - r)
         if randomise:
-            import random
             loop_over = [i for i in loop_over]
             random.shuffle(loop_over)
         for i in loop_over:
@@ -97,7 +109,10 @@ def true_splits(self, include_trivial=False):
         for split in splits:
             yield split
 
+    ## BROKEN because this function doesn't realise that all_splits is a generator (I think!)
+    # TODO: add a warning or someting until it's fixed
     def false_splits(self, only_balance=None, randomise=False):
+        warn("This function is completely broken! Sorry!")
         """Returns set of all false splits in the tree."""
         true_splits = self.true_splits(include_trivial=False)
         for split in hf.all_splits(self.get_num_taxa(), trivial=False, only_balance=only_balance, randomise=randomise):
@@ -112,7 +127,6 @@ def reassign_all_transition_matrices(self, matrix):
                 self.nx_graph.nodes[n].pop('branch_length') # TODO: recompute branch lengths instead
 
     def build_JC_matrix(self, l):
-        from math import exp
         matrix = [[0 for i in range(self.num_bases)] for n in range(self.num_bases)]
         for r, row in enumerate(matrix):
             for c, _ in enumerate(row):
@@ -143,6 +157,52 @@ def build_K2ST_matrix(self, transition, transversion):
                     matrix[r][c] = transversion
         return np.array(matrix).T
 
+    def rate_matrix(self, model):
+        def _JC_rate_matrix(mutation_rate=None):
+            if mutation_rate:
+                a = mutation_rate
+                return [[-3*a, a, a, a],
+                        [a, -3*a, a, a],
+                        [a, a, -3*a, a],
+                        [a, a, a, -3*a]]
+        def _K2ST_rate_matrix(rate_transition=None, rate_transversion=None, ratio=None):
+            if self.num_bases != 4:
+                warn(f"K2ST matrices are 4x4 but your model has {self.num_bases} states!" )
+            purines = ('A', 'G')
+            pyrimidines = ('C', 'T')
+            matrix = [[0 for i in range(self.num_bases)] for n in range(self.num_bases)]
+            if rate_transition and rate_transversion:
+                transition = rate_transition
+                transversion = rate_transversion
+                if transversion > transition: 
+                    warn(f"transitions are known to be more likely than transversions!")
+            elif ratio:
+                transition = ratio
+                transversion = 1
+            for r, row in enumerate(matrix):
+                from_state = self.state_space[r]
+                for c, _ in enumerate(row):
+                    to_state = self.state_space[c]
+                    if from_state == to_state:
+                        # No change
+                        matrix[r][c] = -(transition+2*transversion)
+                    elif from_state in purines and to_state in purines:
+                        matrix[r][c] = transition 
+                    elif from_state in pyrimidines and to_state in pyrimidines:
+                        matrix[r][c] = transition
+                    else:
+                        matrix[r][c] = transversion
+            return np.array(matrix).T
+        if   model is Model.JC:   return _JC_rate_matrix
+        elif model is Model.K2ST: return _K2ST_rate_matrix
+
+    def scale_TR_rate_matrix(self, Q, return_scale_factor=False):
+        scale_factor = 1/(-sum(self.initDist[i]*Q[i][i] for i in range(4)))
+        if return_scale_factor:
+            return scale_factor
+        else:
+            return scale_factor*np.array(Q)
+
     def adjacency_matrix(self):
         return np.array(nx.adjacency_matrix(self.nx_graph).todense())
 
@@ -331,8 +391,6 @@ def get_pattern_probabilities(self):
         return emptyArray
 
     def evolve_pattern(self):
-        from random import choices
-        from networkx.algorithms.traversal.depth_first_search import dfs_tree
         def __evolve_on_subtree(subtree, state):
             root_node = [n for n,d in subtree.in_degree() if d==0][0]
             children = list(subtree.successors(root_node))
@@ -413,11 +471,9 @@ def __subflattening_labels(self, length):
         return patterns
 
     def sparse_flattening(self, split, table, format='dok'):
-        import scipy
         split = split.split('|')
         num_taxa = sum(len(part) for part in split)
         if format == 'coo':
-            from scipy.sparse import coo_matrix
             rows = []
             cols = []
             data = []
@@ -431,7 +487,6 @@ def sparse_flattening(self, split, table, format='dok'):
                     data.append(r[1])
             return coo_matrix((data, (rows, cols)), shape=(4**len(split[0]),4**len(split[1])))
         elif format == 'dok':
-            from scipy.sparse import dok_matrix
             flattening = dok_matrix((4**len(split[0]),4**len(split[1])))
             for r in table.itertuples(index=False, name=None):
                 pattern = r[0]
@@ -469,8 +524,6 @@ def signed_sum_subflattening(self, split, data_table):
         return np.array(subflattening)
 
     def sparse_subflattening(self, split, data_table, as_dense_array=False):
-        import scipy
-        from scipy.sparse import coo_matrix
         split = split.split('|')
         num_taxa = len(split[0]) + len(split[1])
         H = np.array([[1, -1], [1, 1]])
@@ -625,6 +678,38 @@ def subflattening(self, Ft, specialState='T', type=(1, 1)):
                 if row: matrix.append(row)
         return np.asarray(matrix)
 
+    def hartigan_algorithm(self, pattern):
+        score = 0
+        graph = self.nx_graph.copy()
+        nodes = graph.nodes
+        taxa = [t for t in self.taxa]
+        for i, t in enumerate(taxa):
+            nodes[t]['S1'] = set(pattern[i])
+        postorder_nodes = list(dfs_postorder_nodes(graph, source=self.get_root(return_index=False)))
+        for n in postorder_nodes:
+            if not self.is_leaf(n):
+                children = self.get_descendants(n)
+                k={}
+                for state in self.state_space:
+                    k[state] = len(set(child for child in children if state in nodes[child]['S1']))
+                K = max(k.values())
+                nodes[n]['S1'] = {state for state in self.state_space if k[state] == K}
+                nodes[n]['S2'] = {state for state in self.state_space if k[state] == K-1}
+        # Now compute the score
+        top_to_bottom_nodes = [x[0] for x in bfs_successors(graph, source=self.get_root(return_index=False))] + taxa
+        for n in top_to_bottom_nodes:
+            if n == self.get_root(return_index=False):
+                nodes[n]['hart_state'] = list(nodes[n]['S1'])[0]
+            else:
+                parent = nodes[list(graph.predecessors(n))[0]]
+                if parent['hart_state'] not in nodes[n]['S1']:
+                    nodes[n]['hart_state'] = list(nodes[n]['S1'])[0]
+                    score += 1
+                else:
+                    nodes[n]['hart_state'] = parent['hart_state']
+        return score
+
+
     def parsimony_score(self, pattern):
         """Calculate a parsimony score for a site pattern or split
 
@@ -644,7 +729,7 @@ def parsimony_score(self, pattern):
                 i += 1
             pattern = "".join(str(i) for i in pattern2)
 
-        taxa = [n for n in nodes if self.is_leaf(n)]
+        taxa = [t for t in self.taxa]
         for i, t in enumerate(taxa):
             nodes[t]['pars'] = set(pattern[i])
         score = 0

diff --git a/splitp/tree_helper_functions.py b/splitp/tree_helper_functions.py
@@ -19,7 +19,7 @@ def _balanced_newick_subtree(nt, left=False):
                 return f"({_balanced_newick_subtree(floor(nt/2) + int(left), True)},{_balanced_newick_subtree(floor(nt/2) + int(not left))})"
     newick_string = f"({_balanced_newick_subtree(num_taxa/2, True)},{_balanced_newick_subtree(num_taxa/2)});"
     for i in range(0, num_taxa):
-        newick_string = newick_string.replace('_', str(i), 1)
+        newick_string = newick_string.replace('_', str(np.base_repr(i, base=i+3)), 1)
     return newick_string
 
 def get_balance(s, asTuple=False):
@@ -172,6 +172,19 @@ def __orbits(splits, group):
         return sorted(list(set(orbit) for orbit in orbits), key=len)
     return __orbits(splits, group)
 
+def normalised(scores):
+    scores = scores.copy()
+    if type(scores) is dict:
+        minimum = min(scores.values())
+        maximum = max(scores.values())
+        for key, value in scores.items():
+            scores[key] = (value - minimum)/(maximum - minimum)
+    elif type(scores) is list:
+        minimum = min(scores)
+        maximum = max(scores)
+        scores = [(score-minimum)/(maximum-minimum) for score in scores]
+    return scores
+
 def check_splits_representatives(split_reps, orbits):
     represented_orbits = set()
     for orbit in orbits:

diff --git a/splitp/tree_reconstruction.py b/splitp/tree_reconstruction.py
@@ -0,0 +1,56 @@
+""" A collection of tree reconstruction methods. 
+    Each one of these methods accepts a sequence alignment in the form of a dictionary, 
+    and returns a collection of splits which have been chosen as `most likely' to be true.
+    Some methods will return additional information. Not all methods guarantee that the splits returned will be compatible.
+"""
+
+import splitp as sp
+from splitp import tree_helper_functions as hf
+import numpy as np
+
+def erickson_SVD():
+    pass
+
+def all_split_scores():
+    pass
+
+def split_tree_parsimony(alignment, splits=None):
+    if type(alignment) is dict:
+        alignment_dict = alignment
+    else:
+        alignment_dict = {}
+        for table_pattern, value in alignment.itertuples(index=False, name=None):
+            alignment_dict[table_pattern] = value
+    num_taxa = len(list(alignment_dict.keys())[0]) # Length of first pattern
+    all_splits = list(hf.all_splits(num_taxa)) if splits==None else splits
+    scores = {split : 0 for split in all_splits}
+    for split in all_splits:
+        newick_string = []
+        for part in split.split('|'):
+            newick_string.append(f'({",".join(c for c in part)})')
+        newick_string = f"({newick_string[0]},{newick_string[1]});"
+        split_tree = sp.NXTree(newick_string, taxa_ordering='sorted')
+        for pattern, value in alignment_dict.items():
+            scores[split] += value * (split_tree.hartigan_algorithm(pattern)/(num_taxa-1))
+    return scores
+
+def euclidean_split_distance(alignment, splits):
+    print("assuming sorted taxa")
+    states = ('A', 'G', 'C', 'T')
+    alignment_dict = {}
+    for table_pattern, value in alignment.itertuples(index=False, name=None):
+        alignment_dict[table_pattern] = value
+    num_taxa = len(list(alignment_dict.keys())[0]) # Length of first pattern
+    all_splits = list(hf.all_splits(num_taxa)) if splits==None else splits
+    scores = {split : 0 for split in all_splits}
+    for split in all_splits:
+        split_list = split.split('|')
+        for pattern, value in alignment_dict.items():
+            part_a = "".join(pattern[int(s, base=num_taxa+1)] for s in split_list[0])
+            part_b = "".join(pattern[int(s, base=num_taxa+1)] for s in split_list[1])
+            vec_a = np.array([ part_a.count(state) for state in states ])
+            vec_b = np.array([ part_b.count(state) for state in states ])
+            vec_a = vec_a / np.linalg.norm(vec_a)
+            vec_b = vec_b / np.linalg.norm(vec_b)
+            scores[split] += value * (2-np.linalg.norm(vec_a - vec_b))/2
+    return scores