In [3]:
from ete3 import EvolTree
from ete3.treeview.layouts import evol_clean_layout

path = '/Users/jra/Dropbox/ciliates/old/testing/cenH3/'
input_newick = path + 'cenH3_1.fa.tblastn.fa.blast.annotation.centromeric.fa.extract.aligned.nwk'
input_alignment = path + 'cenH3_1.fa.tblastn.fa.blast.annotation.centromeric.fa.extract.aligned.trimal'

# WHEN INPUTTING ALIGNMENT, ETE3 TRANSLATES THE CDS TO PROTEIN.
# ENSURE IT IS USING THE CORRECT CODON CODE: CILIATES ENCODE GLU INSTEAD OF STOP FOR 2 CODONS
tree = EvolTree(newick = input_newick, format=1)
tree.link_to_alignment(input_alignment)

print(tree)


               /-43138.PPENT_87_T0680088
            /-|
           |  |   /-5886.PPRIM_Ir42_T20394
           |   \-|
           |      \-5886.PPRIM_AZ9_T0660164
           |
         /-|         /-5885.PCAU_43c3d_T00320099
        |  |      /-|
        |  |     |   \-65128.PSEX_AZ84_T22733
        |  |   /-|
        |  |  |  |   /-224956.PQUADEC_NiA_T00430116
      /-|   \-|   \-|
     |  |     |      \-65130.PTRED_209_T71800001294020121
     |  |     |
     |  |      \-219701.PNOV_TE_T00130141
     |  |
   /-|   \-65126.PBIA_V14_T00250144
  |  |
  |  |      /-5888.PTET_32_T0770173
  |  |   /-|
  |  |  |   \-5888.PTET_51_T0800171
  |   \-|
--|     |   /-43137.POCT_138_T0510209
  |      \-|
  |         \-43137.POCT_K8_T71800002769450088
  |
  |--304693.PDEC_223_T01060010
  |
   \-304694.PDODEC_274_T00990128


In [4]:
partial_leaf_name = "5888.PTET_51"
output_file = input_alignment + "dS_values_ref.txt"

# Initialize variables to store the reference leaf and its full name
reference_leaf = None
reference_leaf_name = None

# Search for the leaf with a name that starts with the specified partial name
for leaf in tree:
    if leaf.name.startswith(partial_leaf_name):
        reference_leaf = leaf
        reference_leaf_name = leaf.name

if reference_leaf:
    # Calculate dS values for leaves relative to the reference leaf
    dS_values = {}

    for leaf in tree:
        # Calculate the dS value for the leaf relative to the reference leaf
        dS = reference_leaf.get_distance(leaf)
        dS_values[leaf.name] = dS

    with open(output_file, "w") as f:   
        for leaf_name, dS in dS_values.items():
            print(f"{leaf_name}: dS = {dS:.4f}")
            f.write(f"{leaf_name}: dS = {dS:.4f}\n")
        print(f"Ds values have been written to {output_file}")
        
else:
    print(f"No leaf found in the tree with a name starting with '{partial_leaf_name}'.")

43138.PPENT_87_T0680088: dS = 0.0725
5886.PPRIM_Ir42_T20394: dS = 0.0825
5886.PPRIM_AZ9_T0660164: dS = 0.0825
5885.PCAU_43c3d_T00320099: dS = 0.3659
65128.PSEX_AZ84_T22733: dS = 0.1595
224956.PQUADEC_NiA_T00430116: dS = 0.1282
65130.PTRED_209_T71800001294020121: dS = 0.1228
219701.PNOV_TE_T00130141: dS = 0.0966
65126.PBIA_V14_T00250144: dS = 0.0857
5888.PTET_32_T0770173: dS = 0.0000
5888.PTET_51_T0800171: dS = 0.0000
43137.POCT_138_T0510209: dS = 0.0274
43137.POCT_K8_T71800002769450088: dS = 0.0246
304693.PDEC_223_T01060010: dS = 0.0288
304694.PDODEC_274_T00990128: dS = 0.0300
Ds values have been written to /Users/jra/Dropbox/ciliates/old/testing/cenH3/cenH3_1.fa.tblastn.fa.blast.annotation.centromeric.fa.extract.aligned.trimaldS_values_ref.txt


In [5]:
tree.run_model('M2')

model2 = tree.get_evol_model('M2')
print(model2)

 Evolutionary Model M2:
        log likelihood       : -2234.600188
        number of parameters : 32
        sites inference      : BEB, NEB
        sites classes        : 
        proportions : p0=0.36542   p1=0.56288   p2=0.07169   
        w           : w0=0.0808    w1=1.0       w2=2.873     
        branches             : 
        mark:     , omega: None      , node_ids: 16  , name: Inner13
        mark:     , omega: 0.7984    , node_ids: 17  , name: Inner12
        mark:     , omega: 0.7984    , node_ids: 3   , name: 304693.PDEC_223_T01060010
        mark:     , omega: 0.7984    , node_ids: 4   , name: 304694.PDODEC_274_T00990128
        mark:     , omega: 0.7984    , node_ids: 18  , name: Inner8
        mark:     , omega: 0.7984    , node_ids: 26  , name: Inner11
        mark:     , omega: 0.7984    , node_ids: 19  , name: Inner7
        mark:     , omega: 0.7984    , node_ids: 13  , name: 65126.PBIA_V14_T00250144
        mark:     , omega: 0.7984    , node_ids: 27  , name: Inne

In [41]:
# M2 vs M1
tree.run_model('M2')
tree.run_model('M1')
pval = tree.get_most_likely('M2','M1')
model2 = tree.get_evol_model('M2')
print(model2)
print(pval)

if pval < 0.05:
    print('M2 model wins.')
    for s in range(len(model2.sites['BEB']['aa'])):
        if model2.sites['BEB']['p2'][s] > 0.95:
            print('positively selected site %s at position: %s, with probability: %s' % (model2.sites['BEB']['aa'][s], s+1, model2.sites['BEB']['p2'][s]))
else:
    print('M1 model is not rejected')


# M8 vs M7
tree.run_model('M7')
tree.run_model('M8')
model2 = tree.get_evol_model('M8')
pval = tree.get_most_likely('M8','M7')
print(model2)
print(pval)

# I AM NOT CERTAIN P2 REFERS TO THE SAME P VALUE AS P2 from THE M2 MODEL! TO VERIFY!!!
if pval < 0.05:
    print('M8 model wins.')
    for s in range(len(model2.sites['BEB']['aa'])):
        if model2.sites['BEB']['p2'][s] > 0.95:
            print('positively selected site %s at position: %s, with probability: %s' % (model2.sites['BEB']['aa'][s], s+1, model2.sites['BEB']['p2'][s]))
else:
    print('M7 model is not rejected')
    
    
# if both M7 and M1 are not rejected, stop here.

 Evolutionary Model M2:
        log likelihood       : -2234.600188
        number of parameters : 32
        sites inference      : BEB, NEB
        sites classes        : 
        proportions : p0=0.36542   p1=0.56288   p2=0.07169   
        w           : w0=0.0808    w1=1.0       w2=2.873     
        branches             : 
        mark:     , omega: None      , node_ids: 16  , name: Inner13
        mark:     , omega: 0.7984    , node_ids: 17  , name: Inner12
        mark:     , omega: 0.7984    , node_ids: 3   , name: 304693.PDEC_223_T01060010
        mark:     , omega: 0.7984    , node_ids: 4   , name: 304694.PDODEC_274_T00990128
        mark:     , omega: 0.7984    , node_ids: 18  , name: Inner8
        mark:     , omega: 0.7984    , node_ids: 26  , name: Inner11
        mark:     , omega: 0.7984    , node_ids: 19  , name: Inner7
        mark:     , omega: 0.7984    , node_ids: 13  , name: 65126.PBIA_V14_T00250144
        mark:     , omega: 0.7984    , node_ids: 27  , name: Inne

In [43]:
model2 = tree.get_evol_model('M2')
from ete3 import NCBITaxa

# Create an instance of the NCBITaxa class
ncbi = NCBITaxa()

# Function to get species name from TaxID
def get_species_name(taxid):
    try:
        lineage = ncbi.get_lineage(taxid)
        names = ncbi.get_taxid_translator(lineage)
        species_name = names[taxid]
        return species_name
    except:
        return f"Unknown_species_{taxid}"

# Modify the tree leaf names
for leaf in tree.iter_leaves():
    taxid = int(leaf.name.split('.')[0])
    species_name = get_species_name(taxid)
    leaf.name = species_name
    
    
col1 = {'NS' : '#000000', 
       'RX' : '#000000', 'RX+': '#000000', 
       'CN' : '#000000', 'CN+': '#000000', 
       'PS' : '#000000', 'PS+': '#000000'}
col2 = {'NS' : '#BCBCBC', 
       'RX' : '#5D63AB', 'RX+': '#5D63AB', 
       'CN' : '#659A62', 'CN+': '#659A62', 
       'PS' : '#F4C95D', 'PS+': '#F4C95D'}

#tree.show(histfaces=['M2'])
model2.set_histface(up=True, colors=col1, errors=True, kind='curve', ylim=[0,20], hlines = [1], hlines_col=['black'])
tree.render(input_alignment + "_M2_line.pdf", histfaces=['M2'])

model2.set_histface(up=True, colors=col2, errors=True, kind='stick', ylim=[0,4], hlines = [1], hlines_col=['black'])
tree.render(input_alignment + "_M2_bar.pdf", histfaces=['M2'])



model2 = tree.get_evol_model('M8')    
col1 = {'NS' : '#000000', 
       'RX' : '#000000', 'RX+': '#000000', 
       'CN' : '#000000', 'CN+': '#000000', 
       'PS' : '#000000', 'PS+': '#000000'}
col2 = {'NS' : '#BCBCBC', 
       'RX' : '#5D63AB', 'RX+': '#5D63AB', 
       'CN' : '#659A62', 'CN+': '#659A62', 
       'PS' : '#F4C95D', 'PS+': '#F4C95D'}

#tree.show(histfaces=['M2'])
model2.set_histface(up=True, colors=col1, errors=True, kind='curve', ylim=[0,20], hlines = [1], hlines_col=['black'])
tree.render(input_alignment + "_M8_line.pdf", histfaces=['M8'])

model2.set_histface(up=True, colors=col2, errors=True, kind='stick', ylim=[0,4], hlines = [1], hlines_col=['black'])
tree.render(input_alignment + "_M8_bar.pdf", histfaces=['M8'])


ValueError: invalid literal for int() with base 10: 'Paramecium pentaurelia'