# Master project main script

The following cell aims to visualize the different internal nodes

In [62]:
for ag in ham_analysis_no_name.taxonomy.internal_nodes:
    print("\t- {}".format(ag.name))

	- SE003_aa/SE005_aa/SE007_aa
	- SE001_aa/SE003_aa/SE005_aa/SE007_aa/SE004_aa/SE008_aa/SE010_aa/SE006_aa/SE009_aa/SE002_aa
	- SE004_aa/SE008_aa/SE010_aa
	- SE005_aa/SE007_aa
	- SE008_aa/SE010_aa
	- SE001_aa/SE003_aa/SE005_aa/SE007_aa/SE004_aa/SE008_aa/SE010_aa/SE006_aa/SE009_aa
	- SE006_aa/SE009_aa
	- SE003_aa/SE005_aa/SE007_aa/SE004_aa/SE008_aa/SE010_aa/SE006_aa/SE009_aa
	- SE004_aa/SE008_aa/SE010_aa/SE006_aa/SE009_aa


In [303]:
import pyham
from Bio import SeqIO
import os
import matplotlib.pyplot as plt

# Path to nwk file
nwk_path = "./data/no_events/RealTree.nwk"

# The newick tree is extracted as a string
tree_str = pyham.utils.get_newick_string(nwk_path, type="nwk")

# Path to OrthoXML file
orthoxml_path = "./data/no_events/Output/HierarchicalGroups.orthoxml"

#ham_analysis = pyham.Ham(tree_str, orthoxml_path, use_internal_name=True)
ham_analysis_no_name = pyham.Ham(tree_str, orthoxml_path, use_internal_name=False)

## Loci assignment

In [304]:
def locus_to_id(file):
    
    """
    This function extracts loci from a fasta file and store the information as keys in a dictionary 
    and the related protein ID as value.

    Args:
        file (:obj:`str`): path to the file of interest.

    Returns:
        a dictionary mapping locus (:obj:`str`) with protein ID (:obj:`str`).
    """
    
    locus_to_id = {}
    for seq_record in SeqIO.parse(file, 'fasta'):
        locus_to_id[int(seq_record.description.split('locus: ')[-1])] = seq_record.description  # In the orthoxml file, the external id correspond to the description, this '.description' is used instead of '.id[:-1]'
            
    return locus_to_id

In [305]:
def get_locus(directory):
    
    """
    This function goes through each file of a folder and build a dictionary containing loci 
    as keys and the related protein ID as values. For each entry in the dictionary, the function 
    updates the corresponding :obj:`pyham.abstractgene.Gene` by adding its locus.

    Args:
        directory (:obj:`str`): directory of interest.

    Returns:
        xxx
    """

    path = directory
    for file in os.listdir(path):
        path_to_file = path + '/' + file
        locus_dictionary = locus_to_id(path_to_file)
 
        for entry in locus_dictionary:
            gene = ham_analysis_no_name.get_genes_by_external_id(locus_dictionary[entry])
            for element in gene:
                element.locus = entry
                
        ham_analysis_no_name.get_extant_genome_by_name(file[:-3]).locus_assignment = True  # -6 before

In [306]:
get_locus('/Users/karimsaied/Documents/biology/master/mls_2018/master_project/data/no_events/DB')

In [337]:
def get_gene_by_locus(genome, locus):
    
    """
    Get a :obj:`pyham.abstractgene.Gene` that match the query locus. 

    Args:
        genome (:obj:`pyham.genome.Genome`): extant genome of interest.
        locus (:obj:`int`): locus

    Returns:
        :obj:`pyham.abstractgene.Gene`
    """
    
    for gene in genome.genes:
        if gene.locus == locus:
            return gene 
    
    return None

### Sanity check for loci assignment

In [298]:
# First a list of genome object is created.
genomes = [ham_analysis_no_name.get_extant_genome_by_name(file[:-3]) 
           for file in os.listdir('/Users/karimsaied/Documents/biology/master/mls_2018/master_project/data/no_events/DB')]
processed = []

for genome in genomes:
    if genome.locus_assignment:
        processed.append(True)
    else:
        processed.append(False)

if False in processed:
    print('Some genomes have not been processed.')
else:
    print('All genome have been processed!')

All genome have been processed!


## Neighbors assignment

The following cell aims to pinpoint the issue I encounter with some ancestral genome

In [336]:
anc_genome = ham_analysis_no_name.get_ancestral_genome_by_name('SE003_aa/SE005_aa/SE007_aa/SE004_aa/SE008_aa/SE010_aa/SE006_aa/SE009_aa')
print(anc_genome.genes)
print(anc_genome.genes[0].neighbors)
print(anc_genome.genes[9].neighbors)
print(anc_genome.genes[1].children)
print(anc_genome.genes[1].children[0].neighbors)
print(anc_genome.genes[1].children[1].neighbors)

''' /!\ The first HOG object has neighbors but none of the following has neighbors'''

[<HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, <HOG()>, 

AttributeError: 'HOG' object has no attribute 'neighbors'

In [308]:
def assign_neighbors(genome):
    
    """
    This function assigns a left and right neighbor to each :obj:`pyham.abstractgene.Gene` 
    or :obj:`pyham.abstractgene.HOG`.

    Args:
        genome (:obj:`pyham.genome.Genome`): extant or ancestral genome of interest.

    Returns:
        xxx
    """
    
    if isinstance(genome, pyham.ExtantGenome):
        
        for extant_gene in genome.genes:
            extant_gene.neighbors = (get_gene_by_locus(genome, extant_gene.locus-1), 
                                     get_gene_by_locus(genome, extant_gene.locus+1))  # Tuple to be handle by set further            
                        
    if isinstance(genome, pyham.AncestralGenome):
    
        for ancestral_gene in genome.genes:  # For each HOG in the ancestral genome...
            
            tmp = set()
            
            for child in ancestral_gene.children:  # For each HOG, their children are considered
                # print('child neighbors: {}'.format(child.neighbors))
                if child.neighbors[0] == None:
                    tmp.add(None)
                else:
                    tmp.add(child.neighbors[0].parent)
                    
                if child.neighbors[1] == None:
                    tmp.add(None)
                else:
                    tmp.add(child.neighbors[1].parent)
                    
            if len(tmp) == 2:  # If all left neighbors have the same parent
                ancestral_gene.neighbors = tuple(tmp)  # Extraction of the neighbors from the set
            elif len(tmp) > 2:  # If children have different neighbors
                ancestral_gene.neighbors = (None, None)
                # If one neighbor is in common between tuples in tmp, try to  find a way to get it

## Synteny block reconstruction

In [309]:
def get_synteny_blocks(genome):
    
    """
    This function aims to reconstruct the synteny blocks using the left and right 
    neighbors of :obj:`pyham.abstractgene`.

    Args:
        genome (:obj:`pyham.genome.Genome`): extant or ancestral genome of interest.

    Returns:
        list of lists that contain :obj:`pyham.abstractgene.Gene` or 
        :obj:`pyham.abstractgene.HOG` in a specific order.
    """
    
    syn_blocks = []
    seen = []
    gene_collection = genome.genes
    tmp = []
    
    for abstract_gene in gene_collection:
        if abstract_gene not in seen:  # seen is a list to keep track of already processed gene.
            growing_end = abstract_gene
            seen.append(abstract_gene)
            tmp = [growing_end]  # Temporary list for each synteny block
            
            for abstract_gene2 in gene_collection:
                if growing_end in abstract_gene2.neighbors:
                    if abstract_gene2 in growing_end.neighbors:
                        seen.append(abstract_gene2)
                        tmp.append(abstract_gene2)
                        growing_end = abstract_gene2
                
                else:
                    pass
                
            syn_blocks.append(tmp)
        
        else:
            pass
        
    genome.synteny = syn_blocks

## Processing
The tree is traversed and the function created are applied for each node.

In [310]:
genome_processed = 0
for node in ham_analysis_no_name.taxonomy.tree.traverse('postorder'):
    try:
        assign_neighbors(node.genome)
        get_synteny_blocks(node.genome)
    except:
        print(node.genome.name)

SE003_aa/SE005_aa/SE007_aa/SE004_aa/SE008_aa/SE010_aa/SE006_aa/SE009_aa
SE001_aa/SE003_aa/SE005_aa/SE007_aa/SE004_aa/SE008_aa/SE010_aa/SE006_aa/SE009_aa
SE001_aa/SE003_aa/SE005_aa/SE007_aa/SE004_aa/SE008_aa/SE010_aa/SE006_aa/SE009_aa/SE002_aa
