Skip to content

Commit

Permalink
Merge pull request #27 from js51/develop
Browse files Browse the repository at this point in the history
add lots of new stuff
  • Loading branch information
js51 committed Jun 30, 2021
2 parents 05019e4 + fe96e43 commit 6703bba
Show file tree
Hide file tree
Showing 8 changed files with 194 additions and 24 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/runpytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
strategy:
max-parallel: 4
matrix:
python-version: [3.8, 3.9]
python-version: [3.6, 3.7, 3.8]

steps:
- uses: actions/checkout@v1
Expand Down
8 changes: 4 additions & 4 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
numpy == 1.16.4
pandas == 0.24.2
networkx == 2.4
scipy == 1.4.1
numpy == 1.19.2
pandas == 1.1.2
networkx == 2.5
scipy == 1.5.2
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def readfile(filename):
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
python_requires='>=3.8',
python_requires='>=3.6',
package_data={'splitp': ['*.txt']},
include_package_data=True,
)
4 changes: 3 additions & 1 deletion splitp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@
from splitp.nx_tree import *
from splitp.parsers import *
from splitp.tree_helper_functions import *
from splitp.squangles import *
from splitp.squangles import *
from splitp.enums import *
from splitp.tree_reconstruction import *
14 changes: 14 additions & 0 deletions splitp/enums.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from enum import Enum, auto

class Model(Enum):
def _generate_next_value_(name, start, count, last_values):
return name
JC = auto()
K2ST = auto()

class Method(Enum):
def _generate_next_value_(name, start, count, last_values):
return name
flattenings = auto()
subflattenings = auto()
distance = auto()
117 changes: 101 additions & 16 deletions splitp/nx_tree.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,28 @@
import numpy as np
import copy as cp
import itertools
from warnings import warn
from math import exp
# Numpy
import numpy as np
# Pandas
import pandas as pd
# NetworkX
import networkx as nx
from networkx.readwrite import json_graph
import pandas as pd
from networkx.algorithms.traversal.depth_first_search import dfs_tree, dfs_postorder_nodes
from networkx.algorithms.traversal.breadth_first_search import bfs_successors
# Scipy
import scipy
import itertools
from scipy.sparse import coo_matrix
from scipy.sparse import dok_matrix
from scipy.sparse.linalg import svds
# Random
import random
from random import choices
# SplitP
from splitp import tree_helper_functions as hf
from splitp import parsers
from warnings import warn
from scipy.sparse.linalg import svds

from splitp.enums import Model

class NXTree:
"""A rooted phylogenetic tree.
Expand Down Expand Up @@ -68,7 +81,6 @@ def all_splits(self, trivial=False, only_balance=None, randomise=False):
r = 0 if trivial else 1
loop_over = range(r, 2**(n-1) - r)
if randomise:
import random
loop_over = [i for i in loop_over]
random.shuffle(loop_over)
for i in loop_over:
Expand Down Expand Up @@ -97,7 +109,10 @@ def true_splits(self, include_trivial=False):
for split in splits:
yield split

## BROKEN because this function doesn't realise that all_splits is a generator (I think!)
# TODO: add a warning or someting until it's fixed
def false_splits(self, only_balance=None, randomise=False):
warn("This function is completely broken! Sorry!")
"""Returns set of all false splits in the tree."""
true_splits = self.true_splits(include_trivial=False)
for split in hf.all_splits(self.get_num_taxa(), trivial=False, only_balance=only_balance, randomise=randomise):
Expand All @@ -112,7 +127,6 @@ def reassign_all_transition_matrices(self, matrix):
self.nx_graph.nodes[n].pop('branch_length') # TODO: recompute branch lengths instead

def build_JC_matrix(self, l):
from math import exp
matrix = [[0 for i in range(self.num_bases)] for n in range(self.num_bases)]
for r, row in enumerate(matrix):
for c, _ in enumerate(row):
Expand Down Expand Up @@ -143,6 +157,52 @@ def build_K2ST_matrix(self, transition, transversion):
matrix[r][c] = transversion
return np.array(matrix).T

def rate_matrix(self, model):
def _JC_rate_matrix(mutation_rate=None):
if mutation_rate:
a = mutation_rate
return [[-3*a, a, a, a],
[a, -3*a, a, a],
[a, a, -3*a, a],
[a, a, a, -3*a]]
def _K2ST_rate_matrix(rate_transition=None, rate_transversion=None, ratio=None):
if self.num_bases != 4:
warn(f"K2ST matrices are 4x4 but your model has {self.num_bases} states!" )
purines = ('A', 'G')
pyrimidines = ('C', 'T')
matrix = [[0 for i in range(self.num_bases)] for n in range(self.num_bases)]
if rate_transition and rate_transversion:
transition = rate_transition
transversion = rate_transversion
if transversion > transition:
warn(f"transitions are known to be more likely than transversions!")
elif ratio:
transition = ratio
transversion = 1
for r, row in enumerate(matrix):
from_state = self.state_space[r]
for c, _ in enumerate(row):
to_state = self.state_space[c]
if from_state == to_state:
# No change
matrix[r][c] = -(transition+2*transversion)
elif from_state in purines and to_state in purines:
matrix[r][c] = transition
elif from_state in pyrimidines and to_state in pyrimidines:
matrix[r][c] = transition
else:
matrix[r][c] = transversion
return np.array(matrix).T
if model is Model.JC: return _JC_rate_matrix
elif model is Model.K2ST: return _K2ST_rate_matrix

def scale_TR_rate_matrix(self, Q, return_scale_factor=False):
scale_factor = 1/(-sum(self.initDist[i]*Q[i][i] for i in range(4)))
if return_scale_factor:
return scale_factor
else:
return scale_factor*np.array(Q)

def adjacency_matrix(self):
return np.array(nx.adjacency_matrix(self.nx_graph).todense())

Expand Down Expand Up @@ -331,8 +391,6 @@ def get_pattern_probabilities(self):
return emptyArray

def evolve_pattern(self):
from random import choices
from networkx.algorithms.traversal.depth_first_search import dfs_tree
def __evolve_on_subtree(subtree, state):
root_node = [n for n,d in subtree.in_degree() if d==0][0]
children = list(subtree.successors(root_node))
Expand Down Expand Up @@ -413,11 +471,9 @@ def __subflattening_labels(self, length):
return patterns

def sparse_flattening(self, split, table, format='dok'):
import scipy
split = split.split('|')
num_taxa = sum(len(part) for part in split)
if format == 'coo':
from scipy.sparse import coo_matrix
rows = []
cols = []
data = []
Expand All @@ -431,7 +487,6 @@ def sparse_flattening(self, split, table, format='dok'):
data.append(r[1])
return coo_matrix((data, (rows, cols)), shape=(4**len(split[0]),4**len(split[1])))
elif format == 'dok':
from scipy.sparse import dok_matrix
flattening = dok_matrix((4**len(split[0]),4**len(split[1])))
for r in table.itertuples(index=False, name=None):
pattern = r[0]
Expand Down Expand Up @@ -469,8 +524,6 @@ def signed_sum_subflattening(self, split, data_table):
return np.array(subflattening)

def sparse_subflattening(self, split, data_table, as_dense_array=False):
import scipy
from scipy.sparse import coo_matrix
split = split.split('|')
num_taxa = len(split[0]) + len(split[1])
H = np.array([[1, -1], [1, 1]])
Expand Down Expand Up @@ -625,6 +678,38 @@ def subflattening(self, Ft, specialState='T', type=(1, 1)):
if row: matrix.append(row)
return np.asarray(matrix)

def hartigan_algorithm(self, pattern):
score = 0
graph = self.nx_graph.copy()
nodes = graph.nodes
taxa = [t for t in self.taxa]
for i, t in enumerate(taxa):
nodes[t]['S1'] = set(pattern[i])
postorder_nodes = list(dfs_postorder_nodes(graph, source=self.get_root(return_index=False)))
for n in postorder_nodes:
if not self.is_leaf(n):
children = self.get_descendants(n)
k={}
for state in self.state_space:
k[state] = len(set(child for child in children if state in nodes[child]['S1']))
K = max(k.values())
nodes[n]['S1'] = {state for state in self.state_space if k[state] == K}
nodes[n]['S2'] = {state for state in self.state_space if k[state] == K-1}
# Now compute the score
top_to_bottom_nodes = [x[0] for x in bfs_successors(graph, source=self.get_root(return_index=False))] + taxa
for n in top_to_bottom_nodes:
if n == self.get_root(return_index=False):
nodes[n]['hart_state'] = list(nodes[n]['S1'])[0]
else:
parent = nodes[list(graph.predecessors(n))[0]]
if parent['hart_state'] not in nodes[n]['S1']:
nodes[n]['hart_state'] = list(nodes[n]['S1'])[0]
score += 1
else:
nodes[n]['hart_state'] = parent['hart_state']
return score


def parsimony_score(self, pattern):
"""Calculate a parsimony score for a site pattern or split
Expand All @@ -644,7 +729,7 @@ def parsimony_score(self, pattern):
i += 1
pattern = "".join(str(i) for i in pattern2)

taxa = [n for n in nodes if self.is_leaf(n)]
taxa = [t for t in self.taxa]
for i, t in enumerate(taxa):
nodes[t]['pars'] = set(pattern[i])
score = 0
Expand Down
15 changes: 14 additions & 1 deletion splitp/tree_helper_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def _balanced_newick_subtree(nt, left=False):
return f"({_balanced_newick_subtree(floor(nt/2) + int(left), True)},{_balanced_newick_subtree(floor(nt/2) + int(not left))})"
newick_string = f"({_balanced_newick_subtree(num_taxa/2, True)},{_balanced_newick_subtree(num_taxa/2)});"
for i in range(0, num_taxa):
newick_string = newick_string.replace('_', str(i), 1)
newick_string = newick_string.replace('_', str(np.base_repr(i, base=i+3)), 1)
return newick_string

def get_balance(s, asTuple=False):
Expand Down Expand Up @@ -172,6 +172,19 @@ def __orbits(splits, group):
return sorted(list(set(orbit) for orbit in orbits), key=len)
return __orbits(splits, group)

def normalised(scores):
scores = scores.copy()
if type(scores) is dict:
minimum = min(scores.values())
maximum = max(scores.values())
for key, value in scores.items():
scores[key] = (value - minimum)/(maximum - minimum)
elif type(scores) is list:
minimum = min(scores)
maximum = max(scores)
scores = [(score-minimum)/(maximum-minimum) for score in scores]
return scores

def check_splits_representatives(split_reps, orbits):
represented_orbits = set()
for orbit in orbits:
Expand Down
56 changes: 56 additions & 0 deletions splitp/tree_reconstruction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
""" A collection of tree reconstruction methods.
Each one of these methods accepts a sequence alignment in the form of a dictionary,
and returns a collection of splits which have been chosen as `most likely' to be true.
Some methods will return additional information. Not all methods guarantee that the splits returned will be compatible.
"""

import splitp as sp
from splitp import tree_helper_functions as hf
import numpy as np

def erickson_SVD():
pass

def all_split_scores():
pass

def split_tree_parsimony(alignment, splits=None):
if type(alignment) is dict:
alignment_dict = alignment
else:
alignment_dict = {}
for table_pattern, value in alignment.itertuples(index=False, name=None):
alignment_dict[table_pattern] = value
num_taxa = len(list(alignment_dict.keys())[0]) # Length of first pattern
all_splits = list(hf.all_splits(num_taxa)) if splits==None else splits
scores = {split : 0 for split in all_splits}
for split in all_splits:
newick_string = []
for part in split.split('|'):
newick_string.append(f'({",".join(c for c in part)})')
newick_string = f"({newick_string[0]},{newick_string[1]});"
split_tree = sp.NXTree(newick_string, taxa_ordering='sorted')
for pattern, value in alignment_dict.items():
scores[split] += value * (split_tree.hartigan_algorithm(pattern)/(num_taxa-1))
return scores

def euclidean_split_distance(alignment, splits):
print("assuming sorted taxa")
states = ('A', 'G', 'C', 'T')
alignment_dict = {}
for table_pattern, value in alignment.itertuples(index=False, name=None):
alignment_dict[table_pattern] = value
num_taxa = len(list(alignment_dict.keys())[0]) # Length of first pattern
all_splits = list(hf.all_splits(num_taxa)) if splits==None else splits
scores = {split : 0 for split in all_splits}
for split in all_splits:
split_list = split.split('|')
for pattern, value in alignment_dict.items():
part_a = "".join(pattern[int(s, base=num_taxa+1)] for s in split_list[0])
part_b = "".join(pattern[int(s, base=num_taxa+1)] for s in split_list[1])
vec_a = np.array([ part_a.count(state) for state in states ])
vec_b = np.array([ part_b.count(state) for state in states ])
vec_a = vec_a / np.linalg.norm(vec_a)
vec_b = vec_b / np.linalg.norm(vec_b)
scores[split] += value * (2-np.linalg.norm(vec_a - vec_b))/2
return scores

0 comments on commit 6703bba

Please sign in to comment.