In [78]:
import pyham
from Bio import SeqIO
import os
import matplotlib.pyplot as plt

# Path to the species tree file (newick format)
nwk_path = "test_data/EstimatedSpeciesTree.nwk"

# The newick tree is extracted from that file as a string
tree_str = pyham.utils.get_newick_string(nwk_path, type="nwk")

# Path to the OrthoXML file.
# Alternative input kept for reference; per the original note it produces the same error:
#orthoxml_path = "../small_noEvents/OMA.2.3.0/Output/OrthologousGroups.orthoxml" # same error if I use this file
orthoxml_path = "test_data/HierarchicalGroups.orthoxml"

# Build the Ham object from the tree string and the OrthoXML file with
# use_internal_name=False. The use_internal_name=True variant is kept commented out:
#ham_analysis = pyham.Ham(tree_str, orthoxml_path, use_internal_name=True)
ham_analysis_no_name = pyham.Ham(tree_str, orthoxml_path, use_internal_name=False)

# Last expression of the cell: display the pyham version in use
pyham.__version__

'1.1.7'

In [127]:
def locus_to_id(file):
    """
    Build a locus -> record description lookup from a fasta file.

    For every record of the file, the locus is the text that follows the last
    ``'locus: '`` marker of the record description, and the mapped value is the
    complete description line.

    Args:
        file (:obj:`str`): path to the fasta file of interest.

    Returns:
        a dictionary mapping each locus (:obj:`str`) to the full description
        (:obj:`str`) of the record it was read from.
    """
    # Notes on the mapping:
    #  - a description without the 'locus: ' marker is keyed by the whole description;
    #  - when a locus occurs more than once, the last record read wins.
    headers = (record.description for record in SeqIO.parse(file, 'fasta'))
    return {header.split('locus: ')[-1]: header for header in headers}

In [128]:
#locus_to_id("test_data/DB/SE001_aa.fa")

{'1': 'G1_SE001, sequence type: type1, locus: 1',
 '2': 'G2_SE001, sequence type: type1, locus: 2',
 '3': 'G3_SE001, sequence type: type1, locus: 3',
 '4': 'G4_SE001, sequence type: type1, locus: 4',
 '5': 'G5_SE001, sequence type: type1, locus: 5',
 '6': 'G6_SE001, sequence type: type1, locus: 6',
 '7': 'G7_SE001, sequence type: type1, locus: 7',
 '8': 'G8_SE001, sequence type: type1, locus: 8',
 '9': 'G9_SE001, sequence type: type1, locus: 9',
 '10': 'G10_SE001, sequence type: type1, locus: 10',
 '11': 'G11_SE001, sequence type: type1, locus: 11',
 '12': 'G12_SE001, sequence type: type1, locus: 12',
 '13': 'G13_SE001, sequence type: type1, locus: 13',
 '14': 'G14_SE001, sequence type: type1, locus: 14',
 '15': 'G15_SE001, sequence type: type1, locus: 15',
 '16': 'G16_SE001, sequence type: type1, locus: 16',
 '17': 'G17_SE001, sequence type: type1, locus: 17',
 '18': 'G18_SE001, sequence type: type1, locus: 18',
 '19': 'G19_SE001, sequence type: type1, locus: 19',
 '20': 'G20_SE001, s

In [149]:
def get_locus(directory, ham=None):
    """
    Attach locus information to the genes of every genome file found in a folder.

    Each regular, non-hidden file of ``directory`` is parsed with :func:`locus_to_id`.
    For every (locus, external gene id) pair, the matching
    :obj:`pyham.abstractgene.Gene` gets its ``locus`` attribute set. Once a file has
    been processed, the extant genome named after the file (file name without its
    last extension) is flagged with ``locus_assignment = True``.

    Args:
        directory (:obj:`str`): directory of interest (one fasta file per genome).
        ham (optional): the Ham analysis object to update. When omitted, the
            notebook-level ``ham_analysis_no_name`` is used.

    Returns:
        None. The gene and genome objects are updated in place.

    Raises:
        AssertionError: if an external gene id does not resolve to exactly one gene.
    """
    if ham is None:
        # Backward-compatible default: fall back to the notebook global.
        ham = ham_analysis_no_name

    for file_name in os.listdir(directory):
        path_to_file = os.path.join(directory, file_name)

        # Sub-directories and hidden files (e.g. .DS_Store) are not genome fasta files.
        if file_name.startswith('.') or not os.path.isfile(path_to_file):
            continue

        for locus, gene_id in locus_to_id(path_to_file).items():
            genes = ham.get_genes_by_external_id(gene_id)

            # Each external id is expected to resolve to exactly one gene.
            assert len(genes) == 1, (
                'expected exactly 1 gene for external id {!r}, found {}'.format(gene_id, len(genes))
            )
            genes[0].locus = locus

        # splitext drops only the last extension and leaves extension-less names intact.
        genome_name = os.path.splitext(file_name)[0]
        ham.get_extant_genome_by_name(genome_name).locus_assignment = True

In [150]:
# Assign loci from every file in test_data/DB; this updates ham_analysis_no_name in place
get_locus('test_data/DB')


In [136]:
# Small scratch dictionary used by the following cells to compare dict views
x = {1: 'v 1',
     2: 'v 2'}

In [137]:
# Iterating over a dict directly yields its keys
list(x)

[1, 2]

In [138]:
# .keys() gives the same result as iterating over the dict itself
list(x.keys())

[1, 2]

In [139]:
# .values() yields only the mapped values
list(x.values())

['v 1', 'v 2']

In [140]:
# .items() yields (key, value) tuples, the form iterated over in get_locus
list(x.items())

[(1, 'v 1'), (2, 'v 2')]