In [1]:
# Using individual variables is fine for simple activities:
# a single base count only needs a single named variable.
dna = "ATCGATCGATCGTACGCTGA"
# str.count returns how many times the substring "A" occurs in dna
a_count = dna.count("A")

In [2]:
# One variable per base is a little unwieldy but still okay:
# four bases means four near-identical lines and four names to keep track of.
dna = "ATCGATCGATCGTACGCTGA"
a_count = dna.count("A")
t_count = dna.count("T")
g_count = dna.count("G")
c_count = dna.count("C")

In [6]:
# Separate variables are NOT practical for larger sets such as the 64 possible
# trinucleotides; a list can hold all of the counts instead.
dna = "AATGATCGATCGTACGCTGA"
bases = ['A', 'T', 'G', 'C']
# Every overlapping 3-base window of dna. Counting against this list (rather
# than calling dna.count) matters because str.count only counts NON-overlapping
# matches: "AAAA".count("AAA") is 1 although AAA starts at two positions.
windows = [dna[start:start + 3] for start in range(len(dna) - 2)]
all_counts = []
for base1 in bases:
    for base2 in bases:
        for base3 in bases:
            trinucleotide = base1 + base2 + base3
            count = windows.count(trinucleotide)
            print("count is " + str(count) + " for " + trinucleotide)
            all_counts.append(count)
print(all_counts)

count is 0 for AAA
count is 1 for AAT
count is 0 for AAG
count is 0 for AAC
count is 0 for ATA
count is 0 for ATT
count is 1 for ATG
count is 2 for ATC
count is 0 for AGA
count is 0 for AGT
count is 0 for AGG
count is 0 for AGC
count is 0 for ACA
count is 0 for ACT
count is 1 for ACG
count is 0 for ACC
count is 0 for TAA
count is 0 for TAT
count is 0 for TAG
count is 1 for TAC
count is 0 for TTA
count is 0 for TTT
count is 0 for TTG
count is 0 for TTC
count is 2 for TGA
count is 0 for TGT
count is 0 for TGG
count is 0 for TGC
count is 0 for TCA
count is 0 for TCT
count is 2 for TCG
count is 0 for TCC
count is 0 for GAA
count is 2 for GAT
count is 0 for GAG
count is 0 for GAC
count is 1 for GTA
count is 0 for GTT
count is 0 for GTG
count is 0 for GTC
count is 0 for GGA
count is 0 for GGT
count is 0 for GGG
count is 0 for GGC
count is 0 for GCA
count is 1 for GCT
count is 0 for GCG
count is 0 for GCC
count is 0 for CAA
count is 0 for CAT
count is 0 for CAG
count is 0 for CAC
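count is 0 for CTA
count is 0 for CTT
count is 1 for CTG
count is 0 for CTC
count is 1 for CGA
count is 1 for CGT
count is 0 for CGG
count is 1 for CGC
count is 0 for CCA
count is 0 for CCT
count is 0 for CCG
count is 0 for CCC
[0, 1, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0]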

In [7]:
# A bare list of counts is not very useful on its own: to read one count, the
# exact index position of that trinucleotide must already be known.
# Example: TGA sits at position 24, because the loops run in A, T, G, C order
# (A=0, T=1, G=2, C=3), so T-G-A -> 1*16 + 2*4 + 0 = 24.
tga_position = 24
tga_count = all_counts[tga_position]
print("count for TGA is " + str(tga_count))

count for TGA is 2


In [11]:
# Build two lists with 1:1 correspondence: all_trinucleotides[i] is the
# trinucleotide whose count is stored in all_counts[i].
dna = "AATGATCGATCGTACGCTGA"
bases = ['A', 'T', 'G', 'C']
# Every overlapping 3-base window of dna. Counting against this list (rather
# than calling dna.count) matters because str.count only counts NON-overlapping
# matches: "AAAA".count("AAA") is 1 although AAA starts at two positions.
windows = [dna[start:start + 3] for start in range(len(dna) - 2)]
all_trinucleotides = []
all_counts = []
for base1 in bases:
    for base2 in bases:
        for base3 in bases:
            trinucleotide = base1 + base2 + base3
            count = windows.count(trinucleotide)
            all_trinucleotides.append(trinucleotide)
            all_counts.append(count)
print(all_counts)
print(all_trinucleotides)

[0, 1, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0]
['AAA', 'AAT', 'AAG', 'AAC', 'ATA', 'ATT', 'ATG', 'ATC', 'AGA', 'AGT', 'AGG', 'AGC', 'ACA', 'ACT', 'ACG', 'ACC', 'TAA', 'TAT', 'TAG', 'TAC', 'TTA', 'TTT', 'TTG', 'TTC', 'TGA', 'TGT', 'TGG', 'TGC', 'TCA', 'TCT', 'TCG', 'TCC', 'GAA', 'GAT', 'GAG', 'GAC', 'GTA', 'GTT', 'GTG', 'GTC', 'GGA', 'GGT', 'GGG', 'GGC', 'GCA', 'GCT', 'GCG', 'GCC', 'CAA', 'CAT', 'CAG', 'CAC', 'CTA', 'CTT', 'CTG', 'CTC', 'CGA', 'CGT', 'CGG', 'CGC', 'CCA', 'CCT', 'CCG', 'CCC']


In [9]:
# With the two parallel lists, the position of a trinucleotide can be looked up
# in all_trinucleotides, and that same position then selects its count in all_counts.
tga_index = all_trinucleotides.index("TGA")
tga_count = all_counts[tga_index]
print("count for TGA is " + str(tga_count))

count for TGA is 2


In [14]:
# Problems with parallel lists: they must be kept fully synchronized, and the
# two-step lookup (index, then subscript) is also a slow process.
# The solution is DICTIONARIES, which store key/value pairs
# (like the trinucleotide/count pairs above). Literal syntax: {key: value, ...}
# The values here are regular-expression patterns (| means "or", [] is a
# character set). The r prefix only marks the literal as a raw string for later
# use in regular expressions; it has no effect on the dictionary keys.
enzymes = {
    'EcoRI': r'GAATTC',
    'AvaII': r'GG(A|T)CC',
    'BisI': r'GC[ATGC]GC',
}
# Retrieval is by dict_name[key]
print(enzymes['AvaII'])


GG(A|T)CC


In [15]:
# Dictionary values can be any Python object (strings and numbers are just the
# simplest cases); keys must be hashable, e.g. strings, numbers or tuples.
# Keys are unique and each key maps to exactly one value (storing a regular
# expression as the value is one way to describe several sequences under one key).
# A dictionary can also be created empty and have items added one at a time:
enzymes = dict()
enzymes["EcoRI"] = r"GAATTC"
enzymes["AvaII"] = r"GG(A|T)CC"
enzymes["BisI"] = r"GC[ATGC]GC"

In [16]:
# Keys can be deleted with the "pop" method, which also returns the value that was stored.
# NOTE: this cell is not idempotent - once "EcoRI" has been removed, running it
# again raises KeyError (pop only tolerates a missing key when given a default).
enzymes.pop("EcoRI")

'GAATTC'

In [17]:
# "EcoRI" was removed by the pop call, so only the other two enzymes remain
print(enzymes)

{'AvaII': 'GG(A|T)CC', 'BisI': 'GC[ATGC]GC'}


In [18]:
# The earlier trinucleotide example can be stored as a dictionary instead
# (using loops again for expediency): the trinucleotide is the key, its count the value.
dna = "AATGATCGATCGTACGCTGA"
bases = ['A', 'T', 'G', 'C']
# Every overlapping 3-base window of dna. Counting against this list (rather
# than calling dna.count) matters because str.count only counts NON-overlapping
# matches: "AAAA".count("AAA") is 1 although AAA starts at two positions.
windows = [dna[start:start + 3] for start in range(len(dna) - 2)]
counts = {}
for base1 in bases:
    for base2 in bases:
        for base3 in bases:
            trinucleotide = base1 + base2 + base3
            count = windows.count(trinucleotide)
            counts[trinucleotide] = count
print(counts)

{'AAA': 0, 'AAT': 1, 'AAG': 0, 'AAC': 0, 'ATA': 0, 'ATT': 0, 'ATG': 1, 'ATC': 2, 'AGA': 0, 'AGT': 0, 'AGG': 0, 'AGC': 0, 'ACA': 0, 'ACT': 0, 'ACG': 1, 'ACC': 0, 'TAA': 0, 'TAT': 0, 'TAG': 0, 'TAC': 1, 'TTA': 0, 'TTT': 0, 'TTG': 0, 'TTC': 0, 'TGA': 2, 'TGT': 0, 'TGG': 0, 'TGC': 0, 'TCA': 0, 'TCT': 0, 'TCG': 2, 'TCC': 0, 'GAA': 0, 'GAT': 2, 'GAG': 0, 'GAC': 0, 'GTA': 1, 'GTT': 0, 'GTG': 0, 'GTC': 0, 'GGA': 0, 'GGT': 0, 'GGG': 0, 'GGC': 0, 'GCA': 0, 'GCT': 1, 'GCG': 0, 'GCC': 0, 'CAA': 0, 'CAT': 0, 'CAG': 0, 'CAC': 0, 'CTA': 0, 'CTT': 0, 'CTG': 1, 'CTC': 0, 'CGA': 1, 'CGT': 1, 'CGG': 0, 'CGC': 1, 'CCA': 0, 'CCT': 0, 'CCG': 0, 'CCC': 0}


In [19]:
# Finding a count is much easier with a dictionary: the trinucleotide itself is
# the key, so no index position has to be known or looked up first.
print(counts["TGA"])

2


In [20]:
# Avoid storing "0" counts: they are useless here, since we want to know which
# trinucleotides are present in the DNA, not which are absent.
dna = "AATGATCGATCGTACGCTGA"
bases = ['A', 'T', 'G', 'C']
# Every overlapping 3-base window of dna. Counting against this list (rather
# than calling dna.count) matters because str.count only counts NON-overlapping
# matches: "AAAA".count("AAA") is 1 although AAA starts at two positions.
windows = [dna[start:start + 3] for start in range(len(dna) - 2)]
counts = {}
for base1 in bases:
    for base2 in bases:
        for base3 in bases:
            trinucleotide = base1 + base2 + base3
            count = windows.count(trinucleotide)
            # the if statement stores only the relevant (non-zero) counts
            if count > 0:
                counts[trinucleotide] = count
print(counts)

{'AAT': 1, 'ATG': 1, 'ATC': 2, 'ACG': 1, 'TAC': 1, 'TGA': 2, 'TCG': 2, 'GAT': 2, 'GTA': 1, 'GCT': 1, 'CTG': 1, 'CGA': 1, 'CGT': 1, 'CGC': 1}


In [21]:
# Looking up a key that was never stored raises KeyError.
# The error is caught and printed here so that the demonstration is still
# visible but the notebook keeps running under "Restart Kernel -> Run All"
# instead of stopping at this cell.
try:
    print(counts["AAA"])
except KeyError as missing_key:
    # str() of a KeyError is the repr of the missing key, e.g. 'AAA'
    print("KeyError: " + str(missing_key))

KeyError: 'AAA'

In [22]:
# A missing key can be worked around with an "in" test (so the program doesn't break):
if "AAA" in counts:
    # square brackets subscript the dictionary; counts("AAA") would try to CALL
    # it and raise TypeError as soon as the key is actually present
    print(counts["AAA"])
# ...or by using the .get method of dictionaries.
# For a key that is present, plain subscripting
print(counts["TGA"])
# works like
print(counts.get("TGA"))
# but .get can also take a second argument: the default value returned when the key is not present
print("count for TGA is " + str(counts.get('TGA', 0)))
print("count for AAA is " + str(counts.get('AAA', 0)))
print("count for GTA is " + str(counts.get('GTA', 0)))
print("count for TTT is " + str(counts.get('TTT', 0)))

2
2
count for TGA is 2
count for AAA is 0
count for GTA is 1
count for TTT is 0


In [23]:
# Without a second argument, .get just returns None for a missing key
# (no KeyError is raised)
print(counts.get("AAA"))

None


In [24]:
# Dictionary keys can be iterated over with the .keys method. In Python 3 it
# returns a view of the keys (not a separate list), which can be looped over
# just like a list.
for trinucleotide in counts.keys():
    # skip everything that does not occur exactly twice
    if counts[trinucleotide] != 2:
        continue
    print(trinucleotide)

ATC
TGA
TCG
GAT


In [25]:
# Since Python 3.7 a dictionary keeps its keys in insertion order (older
# versions gave no ordering guarantee), which is not necessarily the order wanted.
# sorted() is a built-in function (not a method) that returns a NEW list in
# ascending order; for strings that means character-by-character comparison,
# i.e. alphabetical order for these upper-case keys.
# Iterating a dictionary yields its keys, so sorted(counts) sorts the keys.
for trinucleotide in sorted(counts):
    if counts[trinucleotide] == 2:
        print(trinucleotide)

ATC
GAT
TCG
TGA


In [26]:
# Keys and values can be processed together with the .items() method.
# .items() yields (key, value) pairs (as a view in Python 3), so the loop
# header names two variables: one for the key and one for the value.
for trinucleotide, count in counts.items():
    # skip everything that does not occur exactly twice
    if count != 2:
        continue
    print(trinucleotide)

ATC
TGA
TCG
GAT
