# Pseudocode for DNA translation

### seems like a dictionary would be awesome here
### read 3 letter DNA codes and resulting protein tranlation into a dictionary
### search a DNA sequence for AUG as the starting codon
### iterate through rest of sequence and translate using the dictionary
### kick out amino acid sequence


In [3]:
dna = "ATGTTCGGT"

In [4]:
for start in range(0,7,3):
    codon = dna[start:start+3]
    print("one codon is " + codon)

one codon is ATG
one codon is TTC
one codon is GGT


In [5]:
last_codon_start = len(dna)
for start in range(0,last_codon_start,3):
    codon = dna[start:start+3]
    print("one codon is " + codon)

one codon is ATG
one codon is TTC
one codon is GGT


In [6]:
gencode = {
'ATA':'I', 'ATC':'I',
'ACA':'T', 'ACC':'T',
'AAC':'N', 'AAT':'N',
'AGC':'S', 'AGT':'S',
'CTA':'L', 'CTC':'L',
'CCA':'P', 'CCC':'P',
'CAC':'H', 'CAT':'H',
'CGA':'R', 'CGC':'R',
'GTA':'V', 'GTC':'V',
'GCA':'A', 'GCC':'A',
'GAC':'D', 'GAT':'D',
'GGA':'G', 'GGC':'G',
'TCA':'S', 'TCC':'S',
'TTC':'F', 'TTT':'F',
'TAC':'Y', 'TAT':'Y',
'TGC':'C', 'TGT':'C',
'ATT':'I',
'ACG':'T',
'AAA':'K',
'AGA':'R',
'CTG':'L',
'CCG':'P',
'CAA':'Q',
'CGG':'R',
'GTG':'V',
'GCG':'A',
'GAA':'E',
'GGG':'G',
'TCG':'S',
'TTA':'L',
'TAA':'_',
'TGA':'_',
'ATG':'M',
'ACT':'T',
'AAG':'K',
'AGG':'R',
'CTT':'L',
'CCT':'P',
'CAG':'Q',
'CGT':'R',
'GTT':'V',
'GCT':'A',
'GAG':'E',
'GGT':'G',
'TCT':'S',
'TTG':'L',
'TAG':'_',
'TGG':'W'}

In [7]:
print(gencode['CAT'])

H


In [11]:
last_codon_start = len(dna)
protein = ""
for start in range(0,last_codon_start,3):
    codon = dna[start:start+3]
    print("one codon is " + codon)
    aa = gencode.get(codon)
    print("the amino acid is " + aa)
    protein = protein + aa

one codon is ATG
the amino acid is M
one codon is TTC
the amino acid is F
one codon is GGT
the amino acid is G


In [12]:
print("protein sequence is " + protein)

protein sequence is MFG


In [19]:
def translate_dna(dna):
    last_codon_start = len(dna) - 2
    protein = ""
    for start in range(0,last_codon_start,3):
        codon = dna[start:start+3]
        aa = gencode.get(codon.upper(),'X')
        protein = protein + aa
    return protein

In [20]:
print(translate_dna("ATGTTCGGT"))
print(translate_dna("ATCGATCGATCGTTGCTTATCGATCAG"))
print(translate_dna("actgatcgtagctagctgacgtatcgtat"))
print(translate_dna("ACGATCGATCGTNACGTACGATCGTACTCG"))

MFG
IDRSLLIDQ
TDRS_LTYR
TIDRXVRSYS


In [21]:
assert(translate_dna("ATGTTCGGT")) == "MFG"
assert(translate_dna("ATCGATCGATCGTTGCTTATCGATCAG")) == "IDRSLLIDQ"
assert(translate_dna("actgatcgtagcttgcttacgtatcgtat")) == "TDRSLLTYR"
assert(translate_dna("ACGATCGATCGTNACGTACGATCGTACTCG")) == "TIDRXVRSYS"

# Switching to Allesina and Williams

In [35]:
import scipy # for random numbers

def build_population(N, p):
    population = []
    for i in range(N):
        allele1 = "A"
        if scipy.random.rand() > p:
            allele1 = "a"
        allele2 = "A"
        if scipy.random.rand() > p:
            allele2 = "a"
        population.append((allele1, allele2))
    return population

In [36]:
build_population(10, 0.7)


[('A', 'A'),
 ('A', 'a'),
 ('a', 'A'),
 ('A', 'A'),
 ('A', 'a'),
 ('a', 'A'),
 ('A', 'a'),
 ('a', 'A'),
 ('a', 'a'),
 ('A', 'A')]

In [37]:
def compute_frequencies(population):
    AA = population.count(('A','A'))
    Aa = population.count(('A','a'))
    aA = population.count(('a','A'))
    aa = population.count(('a','a'))
    return({'AA': AA,
            'aa': aa,
            'Aa': Aa,
            'aA': aA})

    

In [38]:
my_pop = build_population(6,0.5)
my_pop
compute_frequencies(my_pop)

{'AA': 1, 'Aa': 1, 'aA': 3, 'aa': 1}

In [42]:
def reproduce_population(population):
    new_generation = []
    N = len(population)
    for i in range(N):
        dad = scipy.random.randint(N)
        mom = scipy.random.randint(N)
        chr_mom = scipy.random.randint(2)
        offspring = (population[mom][chr_mom],population[dad][1 - chr_mom])
        new_generation.append(offspring)
    return(new_generation)

In [43]:
reproduce_population(my_pop)

[('a', 'A'), ('A', 'a'), ('A', 'a'), ('A', 'a'), ('a', 'a'), ('A', 'A')]

In [52]:
def simulate_drift(N,p):
    my_pop = build_population(N,p)
    fixation = False
    num_generations = 0
    while fixation == False:
        genotype_counts = compute_frequencies(my_pop)
        if genotype_counts['AA'] == N or genotype_counts['aa'] ==N:
            print('An allele reached fixation at generation ', num_generations)
            print('The genotype counts are ', genotype_counts)
            fixation == True
            break
        my_pop = reproduce_population(my_pop)
        num_generations = num_generations + 1

In [53]:
simulate_drift(100,0.5)

An allele reached fixation at generation  141
The genotype counts are  {'aa': 0, 'AA': 100, 'Aa': 0, 'aA': 0}


In [54]:
simulate_drift(100,0.9)

An allele reached fixation at generation  42
The genotype counts are  {'aa': 0, 'AA': 100, 'Aa': 0, 'aA': 0}


# 4.9.1

In [56]:
import csv

In [58]:
with open ('/home/eeb177-student/Downloads/Jiang2013_data.csv') as csvfile:
    reader = csv.DictReader(csvfile, delimiter = '\t')
    taxa = []
    r_values = []
    for row in reader:
        taxa.append(row['Taxon'])
        r_values.append(float(row['r']))


In [59]:
taxa[:5]

['Fish', 'Fish', 'Fish', 'Amphibian', 'Amphibian']

In [60]:
r_values[:5]

[-0.11, 0.38, 0.51, 0.868, 0.297]

In [69]:
def get_mean_r(names,values,target_taxon='Fish'):
    n = len(names)
    mean_r = 0.0
    sample_size = 0
    for i in range(n):
        if names[i] == target_taxon:
            mean_r = mean_r + values[i]
            sample_size = sample_size + 1
    return mean_r/sample_size

In [70]:
get_mean_r(taxa,r_values,target_taxon = 'Fish')

0.39719005173783783

In [71]:
def get_taxa_list(names):
    return(set(names))

In [72]:
get_taxa_list(taxa)

{'Amphibian',
 'Annelids',
 'Bird',
 'Chelicerate',
 'Crustacean',
 'Fish',
 'Gastropod',
 'Insect',
 'Mammal',
 'Protist',
 'Reptile'}

In [73]:
for t in get_taxa_list(taxa):
    print(t, get_mean_r(taxa,r_values,target_taxon = t))

Reptile 0.11750000000000002
Gastropod 0.40099999999999997
Amphibian 0.18552824175524468
Protist 0.61402
Fish 0.39719005173783783
Bird 0.13175671104423078
Crustacean 0.40302827731946345
Chelicerate 0.49113529650000004
Mammal 0.009
Annelids 0.2
Insect 0.19664531553867934


In [75]:
def get_p_value_for_mean_r(names,values,target_taxon='Fish',num_simulations=1000):
    observed = get_mean_r(names,values,target_taxon)
    rnd_names = names[:]
    p_value = 0.0
    for i in range(num_simulations):
        scipy.random.shuffle(rnd_names)
        tmp = get_mean_r(rnd_names,values,target_taxon)
        if tmp >= observed:
            p_value = p_value +1
    p_value = p_value/num_simulations
    return(target_taxon, round(observed,3),round(p_value,5))

In [76]:
get_p_value_for_mean_r(taxa,r_values,'Fish',50000)

('Fish', 0.397, 0.00386)

In [77]:
for t in get_taxa_list(taxa):
    print(get_p_value_for_mean_r(taxa,r_values,t,50000))

('Reptile', 0.118, 0.93182)
('Gastropod', 0.401, 0.07896)
('Amphibian', 0.186, 0.99996)
('Protist', 0.614, 0.0034)
('Fish', 0.397, 0.00294)
('Bird', 0.132, 0.9998)
('Crustacean', 0.403, 0.0)
('Chelicerate', 0.491, 0.01132)
('Mammal', 0.009, 0.84272)
('Annelids', 0.2, 0.59062)
('Insect', 0.197, 0.99856)
