## Define synapomorphies from alignment, for different taxonomic levels

In [1]:
import numpy as np
import pandas as pd
import sqlite3
#from __future__ import division
from Bio import AlignIO

### Start with synapomizer script had built before

In [2]:
# %load -r 22-32 /Users/hughcross/scripts/synapomizer.py
def number_alleles(column):
    leng = len(column)
    bp_list = []
    for bp in range(0,leng):
        pos = column[bp]
        if pos in bp_list:
            pass
        else:
            bp_list.append(pos)
    #no_alleles = len(bp_list) # maybe take out if a gap
    return bp_list

In [3]:
# %load -r 34-49 /Users/hughcross/scripts/synapomizer.py
def informative_alleles(character_dict):
    """a function to filter out uninformative positions"""
    new_dict = {}
    allele_list = []
    for k,v in character_dict.items():
        newd = v
        for key,value in newd.items():
            new_dict.setdefault(key, []).append(value)
    #print new_dict
    for keys, values in new_dict.items():
        alleles = set(values)
        #print keys
        #print alleles
        if len(alleles) > 1:
            allele_list.append(keys)
    return allele_list

In [4]:
# start with a file, then import muscle to make other alignments
# %load -r 9-12 /Users/hughcross/scripts/synapomizer.py
align_file = '/Users/hughcross/Analysis/CAMEL/cam2018/local_chenopod_ref_seqs alignment_for_mapping.nex'
alignment = AlignIO.read(align_file, 'nexus')
length = alignment.get_alignment_length()

In [5]:
# %load -r 14-19 /Users/hughcross/scripts/synapomizer.py
#create a dictionary with sample# : genus name ## note: need to make this part flexible so that different taxonomic levels can be used
name_map = {} # also maybe make a list of each genus, and a dict where key = genus, values = number of species in genus
genus_dict = {}
sample_map = {}
for i, record in enumerate(alignment):
    fullname = record.id
    newname = fullname.split('_')[0]
    name_map[i]=newname
    genus_dict.setdefault(newname, []).append(fullname)
    sample_map[i]=fullname

In [6]:
sample_map[0]

'Salsola_kali_AD_b36'

In [7]:
genus_dict['Sclerolaena']

['Sclerolaena_cuneata_AD_b32', 'Sclerolaena_deserticola_AD_b31']

In [8]:
# make a dictionary of lists for each clade
bluebush = ['Maireana','Enchylaena','Neobassia','Sclerolaena','Threlkeldia']
saltbush =['Atriplex','Chenopodium','Rhagodia','Einadia','Dysphania']
genera = genus_dict.keys()
print(genera)

dict_keys(['Salsola', 'Tecticornia', 'Atriplex', 'Chenopodium', 'Rhagodia', 'Dysphania', 'Einadia', 'Bassia', 'Suaeda', 'Enchylaena', 'Maireana', 'Neobassia', 'Sclerolaena', 'Threlkeldia'])


In [9]:
clades = {}
for gen in genera:
    if gen in bluebush:
        clades[gen]='bluebush'
    elif gen in saltbush:
        clades[gen]='saltbush'
    else:
        clades[gen]=gen
print(clades)

{'Salsola': 'Salsola', 'Tecticornia': 'Tecticornia', 'Atriplex': 'saltbush', 'Chenopodium': 'saltbush', 'Rhagodia': 'saltbush', 'Dysphania': 'saltbush', 'Einadia': 'saltbush', 'Bassia': 'Bassia', 'Suaeda': 'Suaeda', 'Enchylaena': 'bluebush', 'Maireana': 'bluebush', 'Neobassia': 'bluebush', 'Sclerolaena': 'bluebush', 'Threlkeldia': 'bluebush'}


In [10]:
# %load -r 51-66 /Users/hughcross/scripts/synapomizer.py
pos_list = []
for num in range(0,length):
    position = alignment[:,num] # add num to list
    #print position
    allele_list = number_alleles(position)
    no_alleles = len(allele_list)
    
    if '-' in allele_list:
        no_alleles = no_alleles - 1
    #print no_alleles

    if no_alleles > 1:
        #print position
        pos_list.append(num)
print(pos_list)
print(len(pos_list))

[27, 35, 68, 70, 71, 77, 81, 97, 103, 110, 111, 114, 118, 119, 120, 121, 122, 127, 133, 136, 139]
21


In [11]:
print(length)

149


In [12]:
ambigs = {'AT':'W','TA':'W','CG':'S','GC':'S','CT':'Y','TC':'Y','AG':'R','GA':'R','AC':'M','CA':'M','GT':'K','TG':'K'}

In [13]:
# %load -r 68-85 /Users/hughcross/scripts/synapomizer.py
## note: fixing ambiguous characters in original below (was 'continue')
final_chars = {}

for pos in pos_list:
    gen_char_dict = {}
    for sample in range(0,32): # change this to num of samples
        char = alignment[sample,pos]
        genus = name_map[sample]
        
        gen_char_dict.setdefault(genus, []).append(char)
        
    for key, values in gen_char_dict.items():
        #print key
        newlist = gen_char_dict[key]
        #print newlist
        check = set(newlist)
        #print check
        size = len(check)
        if size > 2:
            #final_chars.setdefault(key, {})[pos]='N'
            newchar = 'N'
        elif size == 2:
            amb_str = ''
            for bp in check:
                amb_str = amb_str + bp
            if '-' in amb_str:
                newchar = amb_str.replace('-','') # may want to count the gaps and adjust later
            else:
                newchar = ambigs[amb_str]
                #final_chars.setdefault(key, {})[pos]=newchar
            
        else:
            newchar = newlist[0]
        final_chars.setdefault(key, {})[pos]=newchar
        

In [14]:
# %load -r 68-85 /Users/hughcross/scripts/synapomizer.py
## note: fixing ambiguous characters in original below (was 'continue')
final_chars_clade = {}

for pos in pos_list:
    cld_char_dict = {}
    for sample in range(0,32): # change this to num of samples
        char = alignment[sample,pos]
        genus = name_map[sample]
        clade = clades[genus]
        
        cld_char_dict.setdefault(clade, []).append(char)
        
    for key, values in cld_char_dict.items():
        #print key
        newlist = cld_char_dict[key]
        #print newlist
        check = set(newlist)
        #print check
        size = len(check)
        if size > 2:
            #final_chars.setdefault(key, {})[pos]='N'
            newchar = 'N'
        elif size == 2:
            amb_str = ''
            for bp in check:
                amb_str = amb_str + bp
            if '-' in amb_str:
                newchar = amb_str.replace('-','') # may want to count the gaps and adjust later
            else:
                newchar = ambigs[amb_str]
                #final_chars.setdefault(key, {})[pos]=newchar
            
        else:
            newchar = newlist[0]
        final_chars_clade.setdefault(key, {})[pos]=newchar
        

In [15]:
final_chars_clade['Bassia']

{27: 'C',
 35: 'T',
 68: 'T',
 70: 'A',
 71: 'T',
 77: 'T',
 81: 'T',
 97: 'C',
 103: 'T',
 110: 'G',
 111: 'R',
 114: 'A',
 118: 'G',
 119: 'R',
 120: 'A',
 121: 'W',
 122: 'A',
 127: '-',
 133: '-',
 136: '-',
 139: '-'}

In [16]:
# %load -r 68-85 /Users/hughcross/scripts/synapomizer.py
## note: fixing ambiguous characters in original below (was 'continue')

final_char_indiv = {}
for pos in pos_list:
    sample_char_dict = {}
    for sample in range(0,32): # change this to num of samples
        char = alignment[sample,pos]
        
        samp_indiv = sample_map[sample]
        
        sample_char_dict.setdefault(samp_indiv, []).append(char)
    for key, values in sample_char_dict.items():
        #print key
        newlist = sample_char_dict[key]
        #print newlist
        check = set(newlist)
        #print check
        size = len(check)
        if size > 2:
            #final_chars.setdefault(key, {})[pos]='N'
            newchar = 'N'
        elif size == 2:
            amb_str = ''
            for bp in check:
                amb_str = amb_str + bp
            if '-' in amb_str:
                newchar = amb_str.replace('-','') # may want to count the gaps and adjust later
            else:
                newchar = ambigs[amb_str]
                #final_chars.setdefault(key, {})[pos]=newchar
            
        else:
            newchar = newlist[0]
        
        final_char_indiv.setdefault(key, {})[pos]=newchar

In [17]:
final_char_indiv['Atriplex_deserticola_gi440584215']

{27: 'C',
 35: 'C',
 68: 'T',
 70: 'A',
 71: 'T',
 77: 'T',
 81: 'C',
 97: 'C',
 103: 'G',
 110: 'G',
 111: 'C',
 114: 'A',
 118: 'G',
 119: 'A',
 120: 'A',
 121: 'A',
 122: 'G',
 127: 'A',
 133: 'C',
 136: 'G',
 139: 'T'}

In [18]:
# %load -r 89-95 /Users/hughcross/scripts/synapomizer.py
# To filter out the characters that have only one alleles; it is variable but only one synapomorphy
## perhaps compare dictionaries in final_chars, if all at one position the same, then delete. 
### try to make it a function 

inform_alleles = informative_alleles(final_chars)
print(inform_alleles)
print(len(inform_alleles))

[27, 35, 68, 70, 71, 77, 81, 97, 103, 110, 111, 114, 118, 119, 120, 121, 122, 127, 133, 136, 139]
21


In [19]:
# %load -r 97-109 /Users/hughcross/scripts/synapomizer.py
# the informative_alleles function makes a list of informative alleles, then the next loop creates a filtered dictionary of dicts for each genus
filtered_chars1 = {}
for k,v in final_chars.items():
    tax_dict = v  #final_chars[k]
    #print tax_dict
    for key, value in tax_dict.items():
        if key in inform_alleles:
            filtered_chars1.setdefault(k, {})[key]=value

# now maybe sort the dictionary
print(filtered_chars1)
print(name_map)
#now to deal with gaps?, maybe later

{'Salsola': {27: 'C', 35: 'T', 68: 'T', 70: 'C', 71: 'T', 77: 'A', 81: 'C', 97: 'C', 103: 'T', 110: 'G', 111: 'G', 114: 'A', 118: 'G', 119: 'A', 120: 'A', 121: 'G', 122: 'A', 127: '-', 133: '-', 136: '-', 139: '-'}, 'Tecticornia': {27: 'C', 35: 'T', 68: 'T', 70: 'A', 71: 'T', 77: 'A', 81: 'C', 97: 'C', 103: 'T', 110: 'T', 111: 'A', 114: 'A', 118: 'G', 119: 'G', 120: 'A', 121: 'A', 122: '-', 127: '-', 133: '-', 136: '-', 139: '-'}, 'Atriplex': {27: 'C', 35: 'Y', 68: 'T', 70: 'A', 71: 'T', 77: 'T', 81: 'C', 97: 'C', 103: 'G', 110: 'G', 111: 'C', 114: 'A', 118: 'G', 119: 'A', 120: 'A', 121: 'A', 122: 'G', 127: 'A', 133: 'C', 136: 'G', 139: 'T'}, 'Chenopodium': {27: 'C', 35: 'T', 68: 'T', 70: 'M', 71: 'N', 77: 'Y', 81: 'Y', 97: 'C', 103: 'K', 110: 'G', 111: 'C', 114: 'R', 118: 'K', 119: 'A', 120: 'M', 121: 'N', 122: 'N', 127: 'N', 133: 'N', 136: 'N', 139: 'T'}, 'Rhagodia': {27: 'C', 35: 'T', 68: 'T', 70: 'A', 71: 'T', 77: 'T', 81: 'C', 97: 'C', 103: 'G', 110: 'G', 111: 'C', 114: 'A', 118: 

In [20]:
# %load -r 97-109 /Users/hughcross/scripts/synapomizer.py
# the informative_alleles function makes a list of informative alleles, then the next loop creates a filtered dictionary of dicts for each genus
filtered_chars_cld = {}
for k,v in final_chars_clade.items():
    tax_dict = v  #final_chars[k]
    #print tax_dict
    for key, value in tax_dict.items():
        if key in inform_alleles:
            filtered_chars_cld.setdefault(k, {})[key]=value

# now maybe sort the dictionary
print(filtered_chars_cld)
#print(name_map)
#now to deal with gaps?, maybe later

{'Salsola': {27: 'C', 35: 'T', 68: 'T', 70: 'C', 71: 'T', 77: 'A', 81: 'C', 97: 'C', 103: 'T', 110: 'G', 111: 'G', 114: 'A', 118: 'G', 119: 'A', 120: 'A', 121: 'G', 122: 'A', 127: '-', 133: '-', 136: '-', 139: '-'}, 'Tecticornia': {27: 'C', 35: 'T', 68: 'T', 70: 'A', 71: 'T', 77: 'A', 81: 'C', 97: 'C', 103: 'T', 110: 'T', 111: 'A', 114: 'A', 118: 'G', 119: 'G', 120: 'A', 121: 'A', 122: '-', 127: '-', 133: '-', 136: '-', 139: '-'}, 'saltbush': {27: 'C', 35: 'Y', 68: 'T', 70: 'M', 71: 'N', 77: 'Y', 81: 'Y', 97: 'C', 103: 'K', 110: 'G', 111: 'C', 114: 'R', 118: 'K', 119: 'A', 120: 'M', 121: 'N', 122: 'N', 127: 'N', 133: 'N', 136: 'N', 139: 'N'}, 'Bassia': {27: 'C', 35: 'T', 68: 'T', 70: 'A', 71: 'T', 77: 'T', 81: 'T', 97: 'C', 103: 'T', 110: 'G', 111: 'R', 114: 'A', 118: 'G', 119: 'R', 120: 'A', 121: 'W', 122: 'A', 127: '-', 133: '-', 136: '-', 139: '-'}, 'Suaeda': {27: 'T', 35: 'T', 68: 'G', 70: 'A', 71: 'T', 77: 'T', 81: 'T', 97: 'T', 103: 'T', 110: 'G', 111: 'C', 114: 'A', 118: 'G', 11

In [21]:
# %load -r 97-109 /Users/hughcross/scripts/synapomizer.py
# the informative_alleles function makes a list of informative alleles, then the next loop creates a filtered dictionary of dicts for each genus
filtered_chars_indiv = {}
for k,v in final_char_indiv.items():
    tax_dict = v  #final_chars[k]
    #print tax_dict
    for key, value in tax_dict.items():
        if key in inform_alleles:
            filtered_chars_indiv.setdefault(k, {})[key]=value

# now maybe sort the dictionary
#print(filtered_chars_indiv)
#print(name_map)
#now to deal with gaps?, maybe later

In [22]:
print(filtered_chars_indiv['Sclerolaena_cuneata_AD_b32'])

{27: 'C', 35: 'T', 68: 'T', 70: 'A', 71: 'T', 77: 'T', 81: 'C', 97: 'C', 103: 'T', 110: 'G', 111: 'G', 114: 'A', 118: 'G', 119: 'A', 120: 'A', 121: 'T', 122: 'A', 127: '-', 133: '-', 136: '-', 139: '-'}


In [23]:
# maybe make genotype dict for each genus (could also make it string)
genotypes = {} # dict of lists: {Felis: ['T', 'T',...]}
for k,v in filtered_chars1.items():
    for i in inform_alleles:
        geno = v[i]
        genotypes.setdefault(k, []).append(geno)
len(genotypes)

14

In [24]:
# maybe make genotype dict for each genus (could also make it string)
clade_genotypes = {} # dict of lists: {Felis: ['T', 'T',...]}
for k,v in filtered_chars_cld.items():
    for i in inform_alleles:
        geno = v[i]
        clade_genotypes.setdefault(k, []).append(geno)
len(clade_genotypes)
        

6

In [25]:
# maybe make genotype dict for each genus (could also make it string)
genotypes_indiv = {} # dict of lists: {Felis: ['T', 'T',...]}
for k,v in filtered_chars_indiv.items():
    for i in inform_alleles:
        geno = v[i]
        genotypes_indiv.setdefault(k, []).append(geno)
len(genotypes_indiv)
        

32

In [79]:
clade_genotypes['saltbush']

['C',
 'Y',
 'T',
 'M',
 'N',
 'Y',
 'Y',
 'C',
 'K',
 'G',
 'C',
 'R',
 'K',
 'A',
 'M',
 'N',
 'N',
 'N',
 'N',
 'N',
 'N']

In [80]:
clade_strings = {}
for k,v in clade_genotypes.items():
    vstring = ''.join(v)
    clade_strings[k]=vstring


In [81]:
clade_strings

{'Bassia': 'CTTATTTCTGRAGRAWA----',
 'Salsola': 'CTTCTACCTGGAGAAGA----',
 'Suaeda': 'TTGATTTTTGCAGGATA----',
 'Tecticornia': 'CTTATACCTTAAGGAA-----',
 'bluebush': 'CTTATTCCTGGAKAATA----',
 'saltbush': 'CYTMNYYCKGCRKAMNNNNNN'}

In [82]:
# output to file
strout = open('chenopod_trnl_genotypes.txt','w')

In [83]:
for k,v in clade_strings.items():
    strout.write(k+'\t'+v+'\n')
strout.close()

In [38]:
# note: use amb from below
clade_str_regex = {}
for k,v in clade_strings.items():
    newstring = ''
    for letter in v:
        if letter in amb:
            regs = amb[letter]
            newstring += '['
            for r in regs:
                newstring += r
            newstring += ']'
        else:
            newstring += letter
    clade_str_regex[k]=newstring

In [39]:
clade_str_regex

{'Bassia': 'CTTATTTCTG[AG]AG[AG]A[AT]A----',
 'Salsola': 'CTTCTACCTGGAGAAGA----',
 'Suaeda': 'TTGATTTTTGCAGGATA----',
 'Tecticornia': 'CTTATACCTTAAGGAA-----',
 'bluebush': 'CTTATTCCTGGA[GT]AATA----',
 'saltbush': 'C[CT]T[AC][ACGT][CT][CT]C[GT]GC[AG][GT]A[AC][ACGT][ACGT][ACGT][ACGT][ACGT][ACGT]'}

In [40]:
# may output these to file, or convert to regex when searching
# but for now, to check we can do it
import re

In [44]:
clade_regex_comp = {}
for k,v in clade_str_regex.items():
    vcomp = re.compile(r'%s' % (v))
    clade_regex_comp[k]=vcomp

In [45]:
clade_regex_comp

{'Bassia': re.compile(r'CTTATTTCTG[AG]AG[AG]A[AT]A----', re.UNICODE),
 'Salsola': re.compile(r'CTTCTACCTGGAGAAGA----', re.UNICODE),
 'Suaeda': re.compile(r'TTGATTTTTGCAGGATA----', re.UNICODE),
 'Tecticornia': re.compile(r'CTTATACCTTAAGGAA-----', re.UNICODE),
 'bluebush': re.compile(r'CTTATTCCTGGA[GT]AATA----', re.UNICODE),
 'saltbush': re.compile(r'C[CT]T[AC][ACGT][CT][CT]C[GT]GC[AG][GT]A[AC][ACGT][ACGT][ACGT][ACGT][ACGT][ACGT]',
 re.UNICODE)}

In [47]:
bas1 = 'CTTATTTCTGGAGGATA----'

In [53]:
if clade_regex_comp['Tecticornia'].search(bas1):
    print('sample is Bassia')
else:
    print('sample is not')

sample is not


In [54]:
# now put in a loop
matches = {}
for k,v in clade_regex_comp.items():
    if v.search(bas1):
        matches.setdefault(k, []).append(bas1)
print(matches)

{'Bassia': ['CTTATTTCTGGAGGATA----']}


In [58]:
samples = {}
samples['bas1'] = 'CTTATTTCTGGAGGATA----'
samples['salt1'] = 'CCTAACTCGGCAGAAAAAAAA'
                   # CYTMNYYCKGCRKAMNNNNNN
samples['tec1'] = 'CTTATACCTTAAGGAA-----'

In [59]:
# note: have to deal with differing lengths
matches = {}
for k,v in samples.items():
    for key,value in clade_regex_comp.items():
        if value.search(v):
            matches.setdefault(key, []).append(k)
print(matches)

{'Bassia': ['bas1'], 'saltbush': ['salt1'], 'Tecticornia': ['tec1']}


## find differences between genotypes

In [28]:
cladelist = clade_genotypes.keys()

In [49]:
# start with clades
diffs = {} # {pos: {base:[taxa],base2:[taxa]}, pos}
for base in range(0,len(inform_alleles)):
    base_pos = inform_alleles[base]
    # make list 
    baselist = []
    for k,v in clade_genotypes.items():
        allele = v[base]
        baselist.append(allele)
    # find if differences
    baseset = set(baselist)
    if len(baseset) > 1:
        # quantify each base
        for b in baseset:
            # set up temp dict to get list for each base
            allele_list = []
            for clad in cladelist:
                if b == clade_genotypes[clad][base]:
                    #print(b, clad)
                    #diffs.setdefault(base_pos, )
                    allele_list.append(clad)
                    #allele_dict.setdefault(b, []).append(clad)#[b]=clad#.append(clad)
            # now add dict to diffs
            diffs.setdefault(base_pos, {})[b]=allele_list

In [50]:
diffs

{27: {'C': ['Salsola', 'Tecticornia', 'saltbush', 'Bassia', 'bluebush'],
  'T': ['Suaeda']},
 35: {'T': ['Salsola', 'Tecticornia', 'Bassia', 'Suaeda', 'bluebush'],
  'Y': ['saltbush']},
 68: {'G': ['Suaeda'],
  'T': ['Salsola', 'Tecticornia', 'saltbush', 'Bassia', 'bluebush']},
 70: {'A': ['Tecticornia', 'Bassia', 'Suaeda', 'bluebush'],
  'C': ['Salsola'],
  'M': ['saltbush']},
 71: {'N': ['saltbush'],
  'T': ['Salsola', 'Tecticornia', 'Bassia', 'Suaeda', 'bluebush']},
 77: {'A': ['Salsola', 'Tecticornia'],
  'T': ['Bassia', 'Suaeda', 'bluebush'],
  'Y': ['saltbush']},
 81: {'C': ['Salsola', 'Tecticornia', 'bluebush'],
  'T': ['Bassia', 'Suaeda'],
  'Y': ['saltbush']},
 97: {'C': ['Salsola', 'Tecticornia', 'saltbush', 'Bassia', 'bluebush'],
  'T': ['Suaeda']},
 103: {'K': ['saltbush'],
  'T': ['Salsola', 'Tecticornia', 'Bassia', 'Suaeda', 'bluebush']},
 110: {'G': ['Salsola', 'saltbush', 'Bassia', 'Suaeda', 'bluebush'],
  'T': ['Tecticornia']},
 111: {'A': ['Tecticornia'],
  'C': ['sal

## now find unique genotypes for each clade

In [34]:
# first have to modify to account for ambigs
# ambigs = {'AT':'W','TA':'W','CG':'S','GC':'S','CT':'Y','TC':'Y','AG':'R','GA':'R','AC':'M','CA':'M','GT':'K','TG':'K'}
# make dict of lists for each ambig
#amb = {'W':['A','T']}
amb = {}
for k,v in ambigs.items():
    if v in amb:
        continue
    else:
        amb.setdefault(v, []).append(k[0])
        amb.setdefault(v, []).append(k[1])
print(amb)

{'W': ['A', 'T'], 'S': ['C', 'G'], 'Y': ['C', 'T'], 'R': ['A', 'G'], 'M': ['A', 'C'], 'K': ['G', 'T']}


In [35]:
amb['N']=['A','C','G','T']

In [84]:
amb

{'K': ['G', 'T'],
 'M': ['A', 'C'],
 'N': ['A', 'C', 'G', 'T'],
 'R': ['A', 'G'],
 'S': ['C', 'G'],
 'W': ['A', 'T'],
 'Y': ['C', 'T']}

In [59]:
diffs[121]

{'A': ['Tecticornia', 'Bassia', 'saltbush'],
 'C': ['saltbush'],
 'G': ['Salsola', 'saltbush'],
 'N': ['saltbush'],
 'T': ['Suaeda', 'bluebush', 'Bassia', 'saltbush'],
 'W': ['Bassia']}

In [52]:
# have to copy the dict this way, otherwise the original will be altered too downstream
diffsA = {} # to just add to them
for k,v in diffs.items():
    diffsA[k]=v


In [58]:
# now adjust diffs
for key,value in diffsA.items():
    #print(key)
    # first make a list of all bases
    bases = value.keys()
    #for nt in bases:
     #   newlist = []
    #for k,v in value.items():
    for k in list(value):
        
        if k in amb:
            seplist = amb[k]
            for base in seplist:
                if base in diffsA[key]:
                    for sp in diffs[key][k]:
                        if sp not in diffsA[key][base]:
                            diffsA[key][base].append(sp)
                else:
                    newlist = []
                    for sp in diffs[key][k]:
                        newlist.append(sp)
                    diffsA[key][base]=newlist

In [55]:
diffsA[35]

{'C': ['saltbush'],
 'T': ['Salsola', 'Tecticornia', 'Bassia', 'Suaeda', 'bluebush', 'saltbush'],
 'Y': ['saltbush']}

In [46]:
for key,value in clade_genotypes.items():
    print(key,'\t', value)

Salsola 	 ['C', 'T', 'T', 'C', 'T', 'A', 'C', 'C', 'T', 'G', 'G', 'A', 'G', 'A', 'A', 'G', 'A', '-', '-', '-', '-']
Tecticornia 	 ['C', 'T', 'T', 'A', 'T', 'A', 'C', 'C', 'T', 'T', 'A', 'A', 'G', 'G', 'A', 'A', '-', '-', '-', '-', '-']
saltbush 	 ['C', 'Y', 'T', 'M', 'N', 'Y', 'Y', 'C', 'K', 'G', 'C', 'R', 'K', 'A', 'M', 'N', 'N', 'N', 'N', 'N', 'N']
Bassia 	 ['C', 'T', 'T', 'A', 'T', 'T', 'T', 'C', 'T', 'G', 'R', 'A', 'G', 'R', 'A', 'W', 'A', '-', '-', '-', '-']
Suaeda 	 ['T', 'T', 'G', 'A', 'T', 'T', 'T', 'T', 'T', 'G', 'C', 'A', 'G', 'G', 'A', 'T', 'A', '-', '-', '-', '-']
bluebush 	 ['C', 'T', 'T', 'A', 'T', 'T', 'C', 'C', 'T', 'G', 'G', 'A', 'K', 'A', 'A', 'T', 'A', '-', '-', '-', '-']


In [48]:
print('saltbush\t',clade_genotypes['saltbush'])
print('bluebush\t',clade_genotypes['bluebush'])

saltbush	 ['C', 'Y', 'T', 'M', 'N', 'Y', 'Y', 'C', 'K', 'G', 'C', 'R', 'K', 'A', 'M', 'N', 'N', 'N', 'N', 'N', 'N']
bluebush	 ['C', 'T', 'T', 'A', 'T', 'T', 'C', 'C', 'T', 'G', 'G', 'A', 'K', 'A', 'A', 'T', 'A', '-', '-', '-', '-']


In [74]:
## try to get autotomorphies and synapomorphies
#auts = {}

# now how about synapomorphies
syns = {}
for k,v in diffs.items():
    print(k)
    pots = {} # temp dict to store those going to check
    basels = []
    for key,value in v.items():
        # try sets to get unique sets
        if 1 < len(value) < 6: # 6 is the number of clades, so anything less is potential
            if key not in amb:
                pots[key]=set(value)
                basels.append(key)
        elif len(value) == 1: 
            if key not in amb: # may have to check this
                pots[key]=set(value)
                basels.append(key)
    # now check sets
    print(pots)
    for bs in range(0,len(pots)-1):
        base = basels[bs]
        base1 = basels[bs+1]
        print(pots[base] - pots[base1])
        if len(pots[base] - pots[base1]) > 0:
            syns.setdefault(k, {})[base]=value

27
{'C': {'saltbush', 'bluebush', 'Salsola', 'Bassia', 'Tecticornia'}, 'T': {'Suaeda'}}
{'saltbush', 'bluebush', 'Salsola', 'Bassia', 'Tecticornia'}
35
{'C': {'saltbush'}}
68
{'G': {'Suaeda'}, 'T': {'saltbush', 'bluebush', 'Salsola', 'Bassia', 'Tecticornia'}}
{'Suaeda'}
70
{'C': {'saltbush', 'Salsola'}, 'A': {'bluebush', 'saltbush', 'Bassia', 'Suaeda', 'Tecticornia'}}
{'Salsola'}
71
{'A': {'saltbush'}, 'C': {'saltbush'}, 'G': {'saltbush'}}
set()
set()
77
{'A': {'Tecticornia', 'Salsola'}, 'T': {'bluebush', 'Bassia', 'Suaeda', 'saltbush'}, 'C': {'saltbush'}}
{'Tecticornia', 'Salsola'}
{'bluebush', 'Bassia', 'Suaeda'}
81
{'C': {'Tecticornia', 'saltbush', 'bluebush', 'Salsola'}, 'T': {'saltbush', 'Bassia', 'Suaeda'}}
{'bluebush', 'Tecticornia', 'Salsola'}
97
{'C': {'saltbush', 'bluebush', 'Salsola', 'Bassia', 'Tecticornia'}, 'T': {'Suaeda'}}
{'saltbush', 'bluebush', 'Salsola', 'Bassia', 'Tecticornia'}
103
{'G': {'saltbush'}}
110
{'G': {'saltbush', 'bluebush', 'Salsola', 'Bassia', 'Suaeda'}

In [76]:
# try for autapomorphies
auts = {} # {pos: clade}
for k,v in diffs.items():
    for key,value in v.items():
        if len(value) == 1: 
            if key not in amb:
                auts.setdefault(k, {})[key]=value[0]
print(auts)

{27: {'T': 'Suaeda'}, 35: {'C': 'saltbush'}, 68: {'G': 'Suaeda'}, 71: {'A': 'saltbush', 'C': 'saltbush', 'G': 'saltbush'}, 77: {'C': 'saltbush'}, 97: {'T': 'Suaeda'}, 103: {'G': 'saltbush'}, 110: {'T': 'Tecticornia'}, 114: {'G': 'saltbush'}, 120: {'C': 'saltbush'}, 121: {'C': 'saltbush'}, 122: {'-': 'Tecticornia', 'C': 'saltbush', 'G': 'saltbush', 'T': 'saltbush'}, 127: {'A': 'saltbush', 'C': 'saltbush', 'G': 'saltbush', 'T': 'saltbush'}, 133: {'A': 'saltbush', 'C': 'saltbush', 'G': 'saltbush', 'T': 'saltbush'}, 136: {'A': 'saltbush', 'C': 'saltbush', 'G': 'saltbush', 'T': 'saltbush'}, 139: {'A': 'saltbush', 'C': 'saltbush', 'G': 'saltbush', 'T': 'saltbush'}}


### Need to add CIGAR functions to read score from usearch output, and assign bp of each seq