# Data types

In [3]:
# string
"hello world"

# another string
"ACTGATCGATTACGTATAGTATTTGCTATCATACATATATATCGATGCGTTCAT"

# int
1

# float
1.23442432

1.23442432

# Making variables

In [4]:
dna_sequence = "ACTGATCGATTACGTATAGTATTTGCTATCATACATATATATCGATGCGTTCAT"
print(dna_sequence)
len(dna_sequence)

# note that dna_sequence is an informative name!

ACTGATCGATTACGTATAGTATTTGCTATCATACATATATATCGATGCGTTCAT


54

# Functions vs Methods

In [5]:
# Functions are defined externally, and not an implicit attribute of an object
len(dna_sequence)
# Here - len() is a function that finds the length of many types of objects.


# Methods are specific to certain types of objects. Strings have some methods, lists have
# some methods, etc. Something like .lower() and .upper() are methods.
# Note that methods are called with a "." and then the method name. Methods are
# simply functions that are defined within an object
dna_sequence.lower()

54


# Subsetting strings, and indexing

In [47]:
# Python is 0-indexed! So counting things starts at "position" 0. Sorry.
# I kinda hate it, but it does make some things a little easier. 

# For subsetting strings, you can use your_string[start_index:end_index]. However, the start_index
# is "inclusive" but the end_index is "excluseive" What do I mean?
# The character at start_index is INCLUDED in the subset, but the character at 
# end_index is NOT included.

my_string = "ABCDEFGHI" 

# Value: ABCDEFGHI
# index: 012345678

my_string[0:3] # ABC - includes characters with indexes 0, 1, and 2

my_string[3:5] # DE - includes characters with indexes 3, 4 

'DE'

# Problem 1 - Calculate AT content

In [9]:
problem1_dna_sequence = "ACTGATCGATTACGTATAGTATTTGCTATCATACATATATATCGATGCGTTCAT"

# first, count the number of A's and T's. Here, we're using the .count() method that
# is part of every string.
a_count = problem1_dna_sequence.count("A")
t_count = problem1_dna_sequence.count("T")

# Then, we need to count the total length of the dna
total_dna_length = len(problem1_dna_sequence)

# The AT content is simply the sum of a_count and t_count over total length.
AT_content = (a_count + t_count)/total_dna_length
print(AT_content)

0.6851851851851852


# Complementing DNA

In [16]:
problem2_dna_sequence = "ACTGATCGATTACGTATAGTATTTGCTATCATACATATATATCGATGCGTTCAT"

# To complement, we want to replace all A's with T's, all T's with A's, etc.
# You might think we should replace all As first, then replace all Ts, etc... 
# but then, you could end up changing all of the T's back to A's! 

# To get around this problem, we can make use of the fact that the case (uppercase vs 
# lowercase) matters. When we do each replacement, the replacement will be lowercase.

problem2_dna_sequence = problem2_dna_sequence.replace("A", "t")
problem2_dna_sequence = problem2_dna_sequence.replace("T", "a")
problem2_dna_sequence = problem2_dna_sequence.replace("C", "g")
problem2_dna_sequence = problem2_dna_sequence.replace("G", "c")
problem2_dna_sequence = problem2_dna_sequence.upper()
problem2_dna_sequence

'TGACTAGCTAATGCATATCATAAACGATAGTATGTATATATAGCTACGCAAGTA'

# Restriction fragment lengths

In [27]:
problem3_dna_sequence = "ACTGATCGATTACGTATAGTAGAATTCTATCATACATATATATCGATGCGTTCAT"
cut_site = "GAATTC"

# We can use the sting's .find() method to find the cut site
cut_position = problem3_dna_sequence.find(cut_site)
cut_position

# This gives the number 21... how does this work?
#           1         2         3         4         5
# 0123456789012345678901234567890123456789012345678901234   - remember: 0 indexing! I hate it too
# ACTGATCGATTACGTATAGTAGAATTCTATCATACATATATATCGATGCGTTCAT
#                      ^

# So, 21 is the index position of the "G" in GAATTC. However, the cut happens after
# the G. So the real cut_position is
cut_position = problem3_dna_sequence.find(cut_site) + 1

# So, the first fragment is 
frag1 = problem3_dna_sequence[:cut_position]
# with a length of
len(frag1) # 22

# And the second fragment is 
frag2 = problem3_dna_sequence[cut_position:]
len(frag2) # 33


33

# Splice out the intron - part 1 - extract the coding region

In [31]:
problem4_dna_sequence = "ATCGATCGATCGATCGACTGACTAGTCATAGCTATGCATGTAGCTACTCGATCGATCGATCGATCGATCGATCGATCGATCGATCATGCTATCATCGATCGATATCGATGCATCGACTACTAT"
first_exon_end_character = 63
second_exon_start_character = 91

# First, we need to adjust the positions. The 63rd character is index 62, and the 91st
# character is index 90. Sorry.
first_exon_end = 62
second_exon_start = 90

first_exon = problem4_dna_sequence[:first_exon_end]
second_exon = problem4_dna_sequence[second_exon_start:]

coding_sequence = first_exon + second_exon
print(coding_sequence)

ATCGATCGATCGATCGACTGACTAGTCATAGCTATGCATGTAGCTACTCGATCGATCGATCGATCATCGATCGATATCGATGCATCGACTACTAT


# Splice out the intron - part 2 - get percentage of DNA that is coding

In [32]:
# We can just count the number of nucleotides in the exons, and divde by total length
exon_lengths = len(first_exon) + len(second_exon)
total_length = len(problem4_dna_sequence)
coding_percentage = exon_lengths/total_length
print(coding_percentage)

0.7723577235772358


# Splice out the intron - part 3 - write out coding DNA in uppercase, non-coding in lowercase

In [40]:
# Here, we need to also use subsetting to get the intron string
intron = problem4_dna_sequence[first_exon_end:second_exon_start]

# So, the answer is...
answer = first_exon + intron.lower() + second_exon
print(answer)

# Sanity check! Lets make sure the answer is the same length as the input sequence
if len(answer) == len(problem4_dna_sequence):
    print("We got it right!")

ATCGATCGATCGATCGACTGACTAGTCATAGCTATGCATGTAGCTACTCGATCGATCGATCGatcgatcgatcgatcgatcgatcatgctATCATCGATCGATATCGATGCATCGACTACTAT
We got it right!
