# HOG file parsing for gene names

In [1]:
def convert_to_gene_id(name):
    """Reduce a FASTA header to the gene id used elsewhere in this notebook.

    * Headers starting with ``FRAX`` (which does not match ``FRAEX``): return
      the text after the first underscore, up to the last space.
    * Headers starting with ``FRAEX``: return the text up to the last space.
    * Any other header is returned unchanged.

    A header that contains no space at all is kept through to its end.
    """
    # rfind() returns -1 when there is no space; used directly as a slice end
    # that would silently drop the final character of the id.
    end = name.rfind(' ')
    if end == -1:
        end = len(name)
    if name.startswith('FRAX'):  # not FRAEX for excelsior
        return name[name.find('_') + 1:end]
    if name.startswith('FRAEX'):
        return name[:end]
    return name
assert convert_to_gene_id("FRAX01_FRAEX38873_V2_000052990.1_R0 [FRAX27_predicted_proteins_with_species_tag]") == "FRAEX38873_V2_000052990.1_R0"
# a header without the trailing "[source]" annotation must keep its last character
assert convert_to_gene_id("FRAEX38873_V2_000052990.1_R0") == "FRAEX38873_V2_000052990.1_R0"
tomato = 'Solyc03g095770.2.1 pacid=36135394 transcript=Solyc03g095770.2.1 locus=Solyc03g095770.2 ID=Solyc03g095770.2.1.ITAG2.4 annot-version=ITAG2.4 [Slycopersicum_390_ITAG2]'
assert convert_to_gene_id(tomato) == tomato


In [2]:
def parse_HOGs_from_fasta_directory(directory_HOGs=r"D:\josiah\Documents\Research\Thesis - Genome Symmetry\DNA_Duplications\data\HOGFasta"):
    """Collect the gene ids named in every ``*.fa`` file of a directory.

    Parameters
    ----------
    directory_HOGs : str
        Folder to scan for ``*.fa`` files.  Defaults to the path this
        notebook was originally run against.

    Returns
    -------
    dict
        ``{file name without extension: [gene id, ...]}``.  Each list has one
        entry per ``>`` header line, in file order, passed through
        ``convert_to_gene_id``; repeated ids are kept.

    Raises
    ------
    AssertionError
        If a matched file's base name does not start with ``HOG``.
    """
    from glob import glob
    from os.path import join, splitext, basename
    HOGs = {}
    for filename in glob(join(directory_HOGs, "*.fa")):
        assert basename(filename).startswith('HOG')
        headers = []
        with open(filename, 'r') as fasta:
            for line in fasta:
                if line.startswith('>'):
                    # Strip only the line terminator.  Slicing with [1:-1]
                    # also ate the last character of a final header line
                    # that has no trailing newline.
                    headers.append(convert_to_gene_id(line[1:].rstrip('\r\n')))
        #assert len(headers) == len(set(headers)), "There was a redundant gene mention %s" % headers 
        HOGs[basename(splitext(filename)[0])] = headers
    return HOGs
HOGs = parse_HOGs_from_fasta_directory()
len(HOGs)

28794

# GFF parsing for overlapping annotations

TBD

# Greedy merge algorithm
 ( agglomerative clustering? )
* Starting clusters can be HOGs
* clusters = dict{ gene: pointer to cluster }
* Clusters are set(gene)
* First gene gets a new cluster
* If gene A overlaps with gene B anywhere, then go through each cluster to find gene A and add gene B
* If gene B is in any other clusters, find it and merge the entire cluster into gene A cluster
* Assert a gene can only ever be in one cluster at a time, except in the atomic operation of merging two


### Starting from HOGs but no side-effects, using networkx to retrieve clusters

[Python Network Graphs](https://www.python.org/doc/essays/graphs/)

[Creating a networkx graph](https://networkx.github.io/documentation/networkx-1.10/tutorial/tutorial.html)  
[Graph Connected Components](https://networkx.github.io/documentation/networkx-1.10/reference/generated/networkx.algorithms.components.connected.connected_components.html)

In [3]:
def map_genes_to_HOGs(HOGs):
    """Invert a HOG -> genes mapping into a gene -> set-of-HOG-names mapping.

    Every gene found in at least one HOG becomes a key; its value is the set
    of all HOG names whose gene collection contains it.
    """
    membership = {}
    for hog_name, members in HOGs.items():
        for member in members:
            # setdefault creates the empty set the first time a gene is seen
            membership.setdefault(member, set()).add(hog_name)
    return membership

def test_map_genes_to_HOGs():
    """Check map_genes_to_HOGs on two HOGs that share one gene (gene2)."""
    # all HOGs start out parsed and non-exclusive
    HOGs = {'HOG1': {'gene1', 'gene2'},
            'HOG2': {'gene2', 'gene3'}}
    genes_to_HOGs_answer = {
        'gene1': {'HOG1'},
        'gene2': {'HOG1', 'HOG2'},
        'gene3': {'HOG2'},
    }
    genes_to_HOGs = map_genes_to_HOGs(HOGs)
    print(genes_to_HOGs, '\n',genes_to_HOGs_answer)
    assert genes_to_HOGs == genes_to_HOGs_answer
test_map_genes_to_HOGs()

{'gene2': {'HOG2', 'HOG1'}, 'gene1': {'HOG1'}, 'gene3': {'HOG2'}} 
 {'gene2': {'HOG2', 'HOG1'}, 'gene1': {'HOG1'}, 'gene3': {'HOG2'}}


In [4]:
from itertools import combinations
import networkx

def create_cluster_network(genes_to_HOGs, HOGs):
    """Build an undirected graph linking HOGs that have a gene in common.

    Every key of ``HOGs`` becomes a node, so HOGs that share nothing still
    appear as isolated nodes.  For each gene listed under more than one HOG
    in ``genes_to_HOGs``, an edge is added between every pair of those HOGs.
    """
    hog_graph = networkx.Graph()
    hog_graph.add_nodes_from(HOGs.keys())
    for shared_by in genes_to_HOGs.values():
        if len(shared_by) <= 1:
            continue  # gene sits in a single HOG: nothing to link
        # two HOGs have overlap and need to be merged
        hog_graph.add_edges_from(combinations(shared_by, 2))  # add edge between all hogs
    return hog_graph

In [8]:
def make_super_HOGs(HOGs, networked_hogs):
    """collect subnetworks together into larger clusters

    Parameters
    ----------
    HOGs : dict
        ``{HOG name: iterable of gene ids}``.
    networked_hogs : networkx graph
        Graph whose nodes are keys of ``HOGs``; each connected component
        becomes one super HOG.

    Returns
    -------
    dict
        ``{'-'.join(component): set of every gene in the component's HOGs}``.
        The order of HOG names inside a key follows the component's own
        iteration order.

    Raises
    ------
    AssertionError
        If a gene shows up in more than one connected component.
    """
    super_HOGs = {}
    genes_seen = set()
    # largest components first
    clusters = sorted(networkx.connected_components(networked_hogs), key=len, reverse=True)
    for cluster in clusters:
        name = '-'.join(cluster)
        genes = set()
        for hog in cluster:
            genes.update(HOGs[hog])
        # The old check, `genes not in genes_seen`, asked whether the whole
        # set was a single *element* of genes_seen, which is never the case,
        # so it could not fail.  The intended invariant is that no gene of
        # this cluster was already claimed by an earlier cluster.
        assert genes_seen.isdisjoint(genes), "You missed a clustering connection.  Genes should only occur once"
        genes_seen.update(genes)
        super_HOGs[name] = genes

    return super_HOGs

def test_make_super_HOGs():
    """Check that two HOGs sharing gene2 collapse into a single super HOG.

    The HOG names inside an edge tuple and inside the joined super-HOG key
    come out in set iteration order, which can differ between interpreter
    runs, so the checks compare them as sets rather than as fixed strings.
    """
    expected_genes = {'gene2','gene1','gene3'}
    HOGs = {'HOG1': set(('gene1', 'gene2')),
            'HOG2': set(('gene2', 'gene3'))}  # all HOGs start out parsed and non-exclusive
    genes_to_HOGs = map_genes_to_HOGs(HOGs)
    network = create_cluster_network(genes_to_HOGs, HOGs)
    edges = [frozenset(edge) for edge in network.edges()]
    assert edges == [frozenset(('HOG1', 'HOG2'))], network.edges()
    super_HOGs = make_super_HOGs(HOGs, network)
    print(super_HOGs)
    assert len(super_HOGs) == 1, super_HOGs
    name, genes = next(iter(super_HOGs.items()))
    assert set(name.split('-')) == {'HOG1', 'HOG2'}, super_HOGs
    assert genes == expected_genes, super_HOGs
test_make_super_HOGs()

{'HOG2-HOG1': {'gene2', 'gene1', 'gene3'}}


### Actual Clusters with Real HOGs

In [26]:
# Same three steps as the toy test, on the parsed HOGs: invert, link, merge.
actual_genes_to_HOGs = map_genes_to_HOGs(HOGs)
actual_network = create_cluster_network(actual_genes_to_HOGs, HOGs)
super_HOGs = make_super_HOGs(HOGs, actual_network)

# difference between the number of HOGs and the number of super HOGs
hogs_absorbed = len(HOGs) - len(super_HOGs)
"{:,}".format(hogs_absorbed) + "  Reduced count"

'8,119  Reduced count'

## Compose new alignment with Super HOGs

In [29]:
from DNASkittleUtils.DDVUtils import pp
# Connected components of the HOG network, largest first, keeping only those
# that contain more than one HOG.
components_by_size = sorted(networkx.connected_components(actual_network), key=len, reverse=True)
modified_HOGs_only = [component for component in components_by_size if len(component) > 1]
pp(len(modified_HOGs_only))

5,244


'5,244'

In [20]:
# Every multi-HOG component must be present in super_HOGs under its
# '-'-joined name.
assert all('-'.join(component) in super_HOGs for component in modified_HOGs_only)

_We could simply exclude all modified HOGs from the data as being ambiguous_

### Concat super HOG fasta files into a new directory

In [33]:
from os.path import join, basename, splitext
family_out_dir = r"D:\josiah\Documents\Research\Thesis - Genome Symmetry\DNA_Duplications\data\super_hog_test_viz"
HOGfasta = r"D:\josiah\Documents\Research\Thesis - Genome Symmetry\DNA_Duplications\data\HOGFasta"
# One output FASTA per merged family, named after its '-'-joined HOG names and
# holding the raw contents of each member HOG's FASTA file back to back.
for family in modified_HOGs_only:
    family_name = '-'.join(family) + '.fa'
    with open(join(family_out_dir, family_name), 'wb') as concat:  # output file concatenation
        for hog in family:
            with open(join(HOGfasta, hog + '.fa'), 'rb') as hog_file:  # smaller input file
                concat.write(hog_file.read())  # dump the whole file
                # Both files are open in binary mode, so the separator has to
                # be bytes; writing the str '\n' raises TypeError on Python 3.
                concat.write(b'\n')  # just to be safe
            

### Align the new gene family files

In [1]:
import subprocess
import multiprocessing
from datetime import datetime
import os
from os.path import dirname, join, basename
from Bio.Align.Applications import MuscleCommandline
from DNASkittleUtils.CommandLineUtils import just_the_name
from glob import glob

def do_alignment(args):
    """Run MUSCLE on one FASTA file unless its aligned output already exists.

    ``args`` is an ``(index, fa)`` pair so the function can be handed to
    ``pool.map`` together with ``enumerate``.  The output goes to an
    ``aligned`` folder next to the input file, under the input's name with a
    ``.fa`` extension.  Nothing is run or printed when that file is already
    there.
    """
    index, fa = args
    muscle_exe = 'muscle3.8.31_i86win32.exe'
    target = join(dirname(fa), 'aligned', just_the_name(fa) + '.fa')

    if os.path.exists(target):
        return  # output already present

    muscle_cline = MuscleCommandline(muscle_exe, input=fa, out=target)
    try:
        stdout, stderr = muscle_cline()
    except subprocess.CalledProcessError as err:
        # report the tool's error stream and carry on with the next file
        print(err.stderr)
    # NOTE(review): 274 is a hard-coded divisor, so the percentage is only
    # meaningful for a batch of that many files.
    print(datetime.now(), just_the_name(fa), '{:%}'.format(index / 274))

def batch_align_sequences(input_folder, output_folder):
    """Align every ``*.fa`` file in ``input_folder`` through the worker pool.

    Parameters
    ----------
    input_folder : str
        Folder scanned (non-recursively) for ``*.fa`` files.
    output_folder : str
        Folder created if missing.  It is not forwarded to ``do_alignment``,
        which picks its own output location beside each input file, so pass
        ``<input_folder>/aligned`` here for the returned path to be where
        the results land.

    Returns
    -------
    str
        Absolute path of ``output_folder``.
    """
    input_folder = os.path.abspath(input_folder)
    os.makedirs(output_folder, exist_ok=True)
    files = glob(os.path.join(input_folder, '*.fa'))
    # `pool` is the module-level multiprocessing.Pool; it must exist before
    # this function is called.
    pool.map(do_alignment, list(enumerate(files)))

    return os.path.abspath(output_folder)

if __name__ == '__main__':  # https://github.com/jupyter/notebook/issues/2080
    # worker pool that batch_align_sequences reads as a global
    pool = multiprocessing.Pool(processes=10)


In [None]:
# Align the concatenated super-HOG FASTA files.
family_fasta_dir = r"D:\josiah\Documents\Research\Thesis - Genome Symmetry\DNA_Duplications\data\super_hog_test_viz"
family_aligned_dir = r"D:\josiah\Documents\Research\Thesis - Genome Symmetry\DNA_Duplications\data\super_hog_test_viz\aligned"

batch_align_sequences(input_folder=family_fasta_dir, output_folder=family_aligned_dir)

In [None]:
# Same batch alignment, pointed at the phylogenetic_guides folder.
family_fasta_dir = r"D:\josiah\Documents\Research\Thesis - Genome Symmetry\DNA_Duplications\data\phylogenetic_guides"
family_aligned_dir = r"D:\josiah\Documents\Research\Thesis - Genome Symmetry\DNA_Duplications\data\phylogenetic_guides\aligned"

batch_align_sequences(input_folder=family_fasta_dir, output_folder=family_aligned_dir)

In [3]:
# Single-file run that calls do_alignment directly instead of going through the pool.
guide_fasta = r"D:\josiah\Documents\Research\Thesis - Genome Symmetry\DNA_Duplications\data\phylogenetic_guides\OG133.fa"
do_alignment((1, guide_fasta))

2018-03-19 17:17:06.080634 OG133 0.364964%


In [4]:

# Show the (index, path) pairs that a batch run over the candidates folder would hand to do_alignment.
candidates_dir = r'D:\josiah\Documents\Research\Thesis - Genome Symmetry\DNA_Duplications\data\candidates'
candidate_fastas = glob(join(candidates_dir, '*.fa'))
list(enumerate(candidate_fastas))

[(0,
  'D:\\josiah\\Documents\\Research\\Thesis - Genome Symmetry\\DNA_Duplications\\data\\candidates\\HOG10001.fa'),
 (1,
  'D:\\josiah\\Documents\\Research\\Thesis - Genome Symmetry\\DNA_Duplications\\data\\candidates\\HOG10002.fa'),
 (2,
  'D:\\josiah\\Documents\\Research\\Thesis - Genome Symmetry\\DNA_Duplications\\data\\candidates\\HOG10003.fa'),
 (3,
  'D:\\josiah\\Documents\\Research\\Thesis - Genome Symmetry\\DNA_Duplications\\data\\candidates\\HOG10007.fa'),
 (4,
  'D:\\josiah\\Documents\\Research\\Thesis - Genome Symmetry\\DNA_Duplications\\data\\candidates\\HOG10009.fa'),
 (5,
  'D:\\josiah\\Documents\\Research\\Thesis - Genome Symmetry\\DNA_Duplications\\data\\candidates\\HOG10011.fa'),
 (6,
  'D:\\josiah\\Documents\\Research\\Thesis - Genome Symmetry\\DNA_Duplications\\data\\candidates\\HOG10030.fa'),
 (7,
  'D:\\josiah\\Documents\\Research\\Thesis - Genome Symmetry\\DNA_Duplications\\data\\candidates\\HOG10059.fa'),
 (8,
  'D:\\josiah\\Documents\\Research\\Thesis - Genome

In [41]:
# Print one shell `cp` line per HOG named in big_super.
# NOTE(review): big_super is not assigned in the cells visible here - confirm where it comes from.
for hog_name in big_super.split('-'):
    print("cp %s.fa ../super_hog_test_viz/" % hog_name)

cp HOG27342.fa ../super_hog_test_viz/
cp HOG25547.fa ../super_hog_test_viz/
cp HOG9983.fa ../super_hog_test_viz/
cp HOG27399.fa ../super_hog_test_viz/
cp HOG2766.fa ../super_hog_test_viz/
cp HOG27814.fa ../super_hog_test_viz/
cp HOG2765.fa ../super_hog_test_viz/
cp HOG2763.fa ../super_hog_test_viz/


* output files with lists of gene names to look for in each HOG
* per species, find those gene names in the annotation
* count presence / absence of a gene name in an annotation
* group them back by super-HOGs
* End result: gene copy count per each gene family defined by a super-HOG

_Should I call this a multigene family?_