# Complete Restart with JOGs using Orthofinder results
JOGs are Josiah's Orthogroups from Orthofinder run using latest assemblies

# HOG file parsing for gene names

In [39]:
def convert_to_gene_id(name):
    if name.startswith('FRAX00'):
        name = name.replace('FRAX00_FRAEX38873_v2_', '')
        return name#[:name.rfind('.')]
    if name.startswith('FRAX'):  # not FRAEX for excelsior
        name = name.split('_')[2] + '_' + name.split('_')[3]
        return name#[:name.rfind('.')]
    else:
        return name
assert convert_to_gene_id("FRAX19_FRAX19_000226960.2_R0") == "000226960.2_R0", convert_to_gene_id("FRAX19_FRAX19_000226960.2_R0")
assert convert_to_gene_id("FRAX00_FRAEX38873_v2_000001220.2") == "000001220.2", convert_to_gene_id("FRAX00_FRAEX38873_v2_000001220.2")
tomato = 'Solyc03g095770.2.1 pacid=36135394 transcript=Solyc03g095770.2.1 locus=Solyc03g095770.2 ID=Solyc03g095770.2.1.ITAG2.4 annot-version=ITAG2.4 [Slycopersicum_390_ITAG2]'
assert convert_to_gene_id(tomato) == tomato


In [40]:
def parse_HOGs_from_fasta_directory():
    directory_HOGs = r"D:\josiah\Documents\Research\Thesis - Genome Symmetry\DNA_Duplications\Ash_Proteome\Results_Jun25\Orthologues_Jul06\Sequences"
    from glob import glob
    from os.path import join, splitext, basename
    file_list = glob(join(directory_HOGs, "*.fa"))
    HOGs = {}
    for filename in file_list:
        assert basename(filename).startswith('OG')
        headers = []
        with open(filename, 'r') as fasta:
            for line in fasta:
                if line.startswith('>'):
                    headers.append(convert_to_gene_id(line[1:-1]))
        #assert len(headers) == len(set(headers)), "There was a redundant gene mention %s" % headers 
        HOGs[basename(splitext(filename)[0])] = headers
    return HOGs
HOGs = parse_HOGs_from_fasta_directory()
len(HOGs)

64650

# GFF parsing for overlapping annotations

TBD

# Greedy merge algorithm
 ( agglomerative clustering? )
* Starting clusters can be HOGs
* clusters = dict{ gene: pointer to cluster }
* Clusters are set(gene)
* First gene gets a new cluster
* If gene A overlaps with gene B anywhere, then go through each cluster to find gene A and add gene B
* If gene B is in any other clusters, find it and merge the entire cluster into gene A cluster
* Assert a gene can only ever be in one cluster at a time, except in the atomic operation of merging two


### Starting from HOGs but no side-effects, using networkx to retreive clusters

[Python Network Graphs](https://www.python.org/doc/essays/graphs/)

[Creating a networkx graph](https://networkx.github.io/documentation/networkx-1.10/tutorial/tutorial.html)  
[Graph Connected Components](https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.algorithms.components.connected.connected_components.html)

In [41]:
def map_genes_to_HOGs(HOGs):
    genes_to_HOGs = {}
    for hog, genes in HOGs.items():
        for gene in genes:
            if gene not in genes_to_HOGs:
                genes_to_HOGs[gene] = set()
            genes_to_HOGs[gene].add(hog)
    return genes_to_HOGs

def test_map_genes_to_HOGs():
    genes_to_HOGs_answer = {'gene1': {'HOG1'},
                     'gene2': {'HOG1', 'HOG2'},
                     'gene3': {'HOG2'}}
    HOGs = {'HOG1': set(('gene1', 'gene2')),
            'HOG2': set(('gene2', 'gene3'))}  # all HOGs start out parsed and non-exlusive
    genes_to_HOGs = map_genes_to_HOGs(HOGs)
    print(genes_to_HOGs, '\n',genes_to_HOGs_answer)
    assert genes_to_HOGs == genes_to_HOGs_answer
test_map_genes_to_HOGs()    

{'gene3': {'HOG2'}, 'gene1': {'HOG1'}, 'gene2': {'HOG1', 'HOG2'}} 
 {'gene3': {'HOG2'}, 'gene1': {'HOG1'}, 'gene2': {'HOG1', 'HOG2'}}


In [42]:
from itertools import combinations
import networkx

def create_cluster_network(genes_to_HOGs, HOGs):
    g = networkx.Graph()
    g.add_nodes_from(HOGs.keys())
    for gene, hogs in genes_to_HOGs.items():
        if len(hogs) > 1:
            # two HOGs have overlap and need to be merged
            g.add_edges_from(combinations(hogs, 2))  # add edge between all hogs
            
    return g

In [43]:
def make_super_HOGs(HOGs, networked_hogs):
    """collect subnetworks together into larger clusters"""
    super_HOGs = {}
    genes_seen = set()
    clusters = [c for c in sorted(networkx.connected_components(networked_hogs), key=len, reverse=True)]
    for cluster in clusters:
        name = '-'.join(cluster)
        super_HOGs[name] = set().union(gene for hog in cluster for gene in HOGs[hog])
        assert super_HOGs[name] not in genes_seen, "You missed a clustering connection.  Genes should only occur once"
        genes_seen.update(super_HOGs[name])
            
    return super_HOGs

def test_make_super_HOGs():
    super_HOGs_answer = {'HOG1-HOG2': {'gene1', 'gene3', 'gene2'}}
    HOGs = {'HOG1': set(('gene1', 'gene2')),
            'HOG2': set(('gene2', 'gene3'))}  # all HOGs start out parsed and non-exlusive
    genes_to_HOGs = map_genes_to_HOGs(HOGs)
    network = create_cluster_network(genes_to_HOGs, HOGs)
    assert str(network.edges()) == "[('HOG1', 'HOG2')]", network.edges()
    super_HOGs = make_super_HOGs(HOGs, network)
    print(super_HOGs)
    assert super_HOGs == super_HOGs_answer, super_HOGs
test_make_super_HOGs()

{'HOG1-HOG2': {'gene1', 'gene3', 'gene2'}}


### Actual Clusters with Real HOGs

In [44]:
actual_genes_to_HOGs = map_genes_to_HOGs(HOGs)
actual_network = create_cluster_network(actual_genes_to_HOGs, HOGs)
super_HOGs = make_super_HOGs(HOGs, actual_network)

"{:,}".format(len(HOGs) - len(super_HOGs)) + "  Reduced count"

'13,306  Reduced count'

## Compose new alignment with Super HOGs

In [45]:
from DNASkittleUtils.DDVUtils import pp
modified_HOGs_only = [c for c in sorted(networkx.connected_components(actual_network), key=len, reverse=True) if len(c) > 1]
pp(len(modified_HOGs_only))

'5,347'

In [25]:
assert all(['-'.join(h) in super_HOGs for h in modified_HOGs_only])

_We could simply exclude all modified HOGs from the data as being ambiguous_

### Concat super HOG fasta files into a new directoy

In [48]:
from os.path import join, basename, splitext
import os
family_out_dir = r"D:\josiah\Documents\Research\Thesis - Genome Symmetry\DNA_Duplications\Ash_Proteome\Results_Jun25\Orthologues_Jul06\super_JOGs__tail"
HOGfasta = r"D:\josiah\Documents\Research\Thesis - Genome Symmetry\DNA_Duplications\Ash_Proteome\Results_Jun25\Orthologues_Jul06\Sequences"
os.makedirs(family_out_dir)
for family in modified_HOGs_only:
    family_name = ('-'.join(family))[:114] + '.fa'
    with open(join(family_out_dir, family_name), 'wb') as concat:  # output file concatenation
        for hog in family:
            with open(join(HOGfasta, hog + '.fa'), 'rb') as hog_file:  # smaller input file
                concat.write(hog_file.read())  # dump the whole file
#                 concat.write('\n')  # just to be safe
            

### Align the new gene family files

In [7]:
import subprocess
import multiprocessing
from datetime import datetime
import os
from os.path import dirname, join, basename
from Bio.Align.Applications import MuscleCommandline
from DNASkittleUtils.CommandLineUtils import just_the_name
from glob import glob

def do_alignment(args):
#     print("starting alignment")
    index, fa, output_folder = args
    target = join(output_folder, just_the_name(fa) + '.fa')
    muscle_exe = 'muscle3.8.31_i86win32.exe'

    if not os.path.exists(target):
        muscle_cline = MuscleCommandline(muscle_exe, input=fa, out=target)
        try:
            stdout, stderr = muscle_cline()
        except subprocess.CalledProcessError as err:
            print(err.stderr)
        print(datetime.now(), just_the_name(fa), '{:%}'.format(index / 5347))

def batch_align_sequences(input_folder, output_folder):
    start = datetime.now()
    input_folder = os.path.abspath(input_folder)
    os.makedirs(output_folder, exist_ok=True)
    files = glob(os.path.join(input_folder, '*.fa'))
    files = sorted(files, key= os.path.getsize)
    args = [(i, ipath, output_folder) for i, ipath in list(enumerate(files))]
    for line in args:
        do_alignment(line)
#     pool.map(do_alignment, args)
        
    return os.path.abspath(output_folder)

#You can't actually do multiprocessing from a notebook
# if __name__ == '__main__':  # https://github.com/jupyter/notebook/issues/2080
#     pool = multiprocessing.Pool(10)


In [6]:
family_aligned_dir = r"D:\Genomes\Ash_Gene_Families\test\aligned"   
family_fasta_dir = r"D:\Genomes\Ash_Gene_Families\test"

batch_align_sequences(input_folder = family_fasta_dir,
                      output_folder = family_aligned_dir)

2018-08-28 13:26:16.300948 OG0038720-OG0030999 0.000000%
2018-08-28 13:26:16.395894 OG0038724-OG0023084 0.018702%
2018-08-28 13:26:17.135477 OG0038726-OG0016384 0.037404%
2018-08-28 13:26:19.598957 OG0038708-OG0030240-OG0001174-OG0037219-OG0038241 0.056106%


'D:\\Genomes\\Ash_Gene_Families\\test\\aligned'

In [8]:
family_aligned_dir = r"D:\Genomes\Ash_Gene_Families\aligned"   
family_fasta_dir = r"D:\Genomes\Ash_Gene_Families"

batch_align_sequences(input_folder = family_fasta_dir,
                      output_folder = family_aligned_dir)

2018-08-28 13:26:48.913771 OG0031144-OG0030007 0.000000%
2018-08-28 13:26:48.939756 OG0031652-OG0037054 0.018702%
2018-08-28 13:26:48.969815 OG0036194-OG0030003 0.037404%
2018-08-28 13:26:48.994801 OG0036397-OG0031191 0.056106%
2018-08-28 13:26:49.022786 OG0036745-OG0032840 0.074808%
2018-08-28 13:26:49.048771 OG0038682-OG0033356 0.093510%
2018-08-28 13:26:49.077020 OG0030513-OG0034452 0.112212%
2018-08-28 13:26:49.103004 OG0032406-OG0029521 0.130915%
2018-08-28 13:26:49.132987 OG0036445-OG0030609 0.149617%
2018-08-28 13:26:49.159972 OG0030152-OG0035697 0.168319%
2018-08-28 13:26:49.190954 OG0036135-OG0031315 0.187021%
2018-08-28 13:26:49.218939 OG0029684-OG0031354 0.205723%
2018-08-28 13:26:49.249921 OG0031223-OG0033271 0.224425%
2018-08-28 13:26:49.279905 OG0037966-OG0030453 0.243127%
2018-08-28 13:26:49.309887 OG0033298-OG0032850 0.261829%
2018-08-28 13:26:49.339869 OG0036431-OG0029918 0.280531%
2018-08-28 13:26:49.377101 OG0030796-OG0034119 0.299233%
2018-08-28 13:26:49.415079 OG00

KeyboardInterrupt: 

In [None]:
family_aligned_dir = r"D:\josiah\Documents\Research\Thesis - Genome Symmetry\DNA_Duplications\data\super_hog_test_viz\aligned"   
family_fasta_dir = r"D:\josiah\Documents\Research\Thesis - Genome Symmetry\DNA_Duplications\data\super_hog_test_viz"

batch_align_sequences(input_folder = family_fasta_dir,
                      output_folder = family_aligned_dir)

In [None]:
family_aligned_dir = r"D:\josiah\Documents\Research\Thesis - Genome Symmetry\DNA_Duplications\data\phylogenetic_guides\aligned"   
family_fasta_dir = r"D:\josiah\Documents\Research\Thesis - Genome Symmetry\DNA_Duplications\data\phylogenetic_guides"

batch_align_sequences(input_folder = family_fasta_dir,
                      output_folder = family_aligned_dir)

In [2]:
do_alignment((1, r"D:\Genomes\Ash_Gene_Families\OG0037028-OG0004348.fa",
             r"D:\Genomes\Ash_Gene_Families\test\aligned"))

starting alignment
2018-08-28 13:10:19.687085 OG0037028-OG0004348 0.018702%


In [10]:
do_alignment((1, r"D:\Genomes\Ash_Gene_Families\test\FRAX04_pair.fa", r'D:\Genomes\Ash_Gene_Families\test\aligned'))

2018-08-28 15:00:47.718385 FRAX04_pair 0.018702%


In [4]:

list(enumerate(glob(join(r'D:\josiah\Documents\Research\Thesis - Genome Symmetry\DNA_Duplications\data\candidates', '*.fa'))))

[(0,
  'D:\\josiah\\Documents\\Research\\Thesis - Genome Symmetry\\DNA_Duplications\\data\\candidates\\HOG10001.fa'),
 (1,
  'D:\\josiah\\Documents\\Research\\Thesis - Genome Symmetry\\DNA_Duplications\\data\\candidates\\HOG10002.fa'),
 (2,
  'D:\\josiah\\Documents\\Research\\Thesis - Genome Symmetry\\DNA_Duplications\\data\\candidates\\HOG10003.fa'),
 (3,
  'D:\\josiah\\Documents\\Research\\Thesis - Genome Symmetry\\DNA_Duplications\\data\\candidates\\HOG10007.fa'),
 (4,
  'D:\\josiah\\Documents\\Research\\Thesis - Genome Symmetry\\DNA_Duplications\\data\\candidates\\HOG10009.fa'),
 (5,
  'D:\\josiah\\Documents\\Research\\Thesis - Genome Symmetry\\DNA_Duplications\\data\\candidates\\HOG10011.fa'),
 (6,
  'D:\\josiah\\Documents\\Research\\Thesis - Genome Symmetry\\DNA_Duplications\\data\\candidates\\HOG10030.fa'),
 (7,
  'D:\\josiah\\Documents\\Research\\Thesis - Genome Symmetry\\DNA_Duplications\\data\\candidates\\HOG10059.fa'),
 (8,
  'D:\\josiah\\Documents\\Research\\Thesis - Genome

* output files with lists of gene names to look for in each HOG
* per species, find those gene names in the annotation
* count presence / absence of a gene name in an annotation
* group them back by super-HOGs
* End result: gene copy count per each gene family defined by a super-HOG

_Should I call this a multigene family?_