# Locus extraction

In [34]:
import pyham
from Bio import SeqIO
import os

# Path to the newick (.nwk) tree file
nwk_path = "./data/RealTree.nwk"

# Read the newick tree from that file and keep it as a string
tree_str = pyham.utils.get_newick_string(nwk_path, type="nwk")

# Path to the OrthoXML file
orthoxml_path = "./data/hogs.orthoxml"

# Build the pyham analysis object from the tree string and the OrthoXML file.
# It is kept at module level and used later in this file to look genes up.
# NOTE(review): use_internal_name=True presumably makes pyham rely on the internal
# node names of the newick tree -- confirm against the pyham documentation.
ham_analysis = pyham.Ham(tree_str, orthoxml_path, use_internal_name=True)

In [35]:
def locus_by_id(file):
    '''
    Map every protein ID of a fasta file to its locus.

    Each record of ``file`` is read with ``SeqIO.parse``. The dictionary key is the
    record ID without its last character; the value is the third comma-separated
    field of the record description, stripped of its first seven characters and
    converted to ``int``. If an ID occurs more than once, the last record wins.
    '''

    # NOTE(review): the slices presumably target headers shaped like
    # ">ID, <field>, locus: <n>" (trailing comma on the ID, 7-character prefix
    # before the number) -- confirm against the actual fasta files.
    return {
        record.id[:-1]: int(record.description.split(',')[2][7:])
        for record in SeqIO.parse(file, 'fasta')
    }

def get_locus(directory):
    '''
    Attach a locus to the genes listed in the fasta files of a folder.

    For every regular file located directly inside ``directory``, the
    protein-ID -> locus dictionary is built with ``locus_by_id``. Each gene object
    that the module-level ``ham_analysis`` returns for a protein ID gets that locus
    stored in its ``locus`` attribute.

    Nothing is returned: the gene objects are modified in place.
    '''

    # os.listdir gives no ordering guarantee; sorting makes the result reproducible
    # when the same protein ID shows up in more than one file (last file wins).
    for file_name in sorted(os.listdir(directory)):
        path_to_file = os.path.join(directory, file_name)

        # os.listdir also returns sub-directories, which cannot be parsed as fasta.
        if not os.path.isfile(path_to_file):
            continue

        for protein_id, locus in locus_by_id(path_to_file).items():
            # The lookup result is iterated because several gene objects may be
            # returned for one external ID.
            for gene in ham_analysis.get_genes_by_external_id(protein_id):
                gene.locus = locus

In [36]:
# Run the locus annotation on the files of the DB folder.
# NOTE(review): absolute, machine-specific path -- adjust it when running elsewhere.
get_locus('/Users/karimsaied/Documents/biology/master/mls_2018/master_project/data/DB')

### Sanity check

In [37]:
# Count the extant genes whose ``locus`` attribute is truthy and print the total.
# NOTE(review): this is a truthiness test, so a locus equal to 0 is not counted.
list_genes = ham_analysis.get_list_extant_genes()

genes_with_locus = 0
for gene in list_genes:
    genes_with_locus += 1 if gene.locus else 0

print(genes_with_locus)

20005
