In [35]:
############################################
###                                      ### 
###   This notebook will introduce you   ###
###   to some basic Python syntax,       ###
###   particularly focused on lists,     ###
###   dictionaries, for loops and        ###
###   if/else control statements.        ### 
###                                      ### 
############################################

# To "fix" integer division behavior
from __future__ import division


In [None]:
############################
###                      ### 
###        Lists         ###
###                      ### 
############################

# A list is a variable type that holds an ordered collection of items
emptylist1 = []       # An empty list written with square brackets
emptylist2 = list()   # An empty list built by calling list()
print(type(emptylist1))
print(type(emptylist2))
print("")

# Items may be of any type (strings, integers, floats, even other lists or dictionaries),
# and different types can sit side by side in the same list
# The list below holds: a string, an integer, a float, another string,
    # and finally a nested list with two strings and two integers
nested = ['3', '4', 3, 4]
thisIsAList = ['This is a string', 3, 4.5, "askdjfh", nested]
print("This is what it looks like when you print a list")
print(thisIsAList)

print("\nHere is what it looks like if you use the type() function on a list")
list_type = type(thisIsAList)
print(list_type)

# len() reports how many items a list contains
    # (it is the same function that measures the length of a string)
item_count = len(thisIsAList)
print("\nThis list contains %d items" % item_count)


# As with strings, dir() shows the built-in methods that belong to a list
print("\nHere are the built-in methods/functions associated with lists in Python (many are __magic__):")
list_methods = dir(thisIsAList)
print(list_methods)

In [None]:
# The items in a list are ordered and you can use indexing to retrieve particular elements from a list
# These indices start at 0; therefore, here is how you can retrieve the first element of the list
print ("Here is the first element of the list: %s" % (thisIsAList[0]))
# And likewise, here is the second element of the list
print ("Here is the second element of the list: %d" % (thisIsAList[1]))
## **Note the change to the string formatting because the second element is an integer

## Alternatively, you could use the same string formatting, but convert the integer manually into a string
print ("Here is the second element of the list, pre-fromatted as string: %s" % (str(thisIsAList[1])))

# You can also extract a range of items from the list, by specifying start and stop indices separated by a ":"
    # **Note: the first index is inclusive, the second is exclusive
print ("\nHere are the first two elements of the list:")
print (thisIsAList[0:2])
# Because our subset starts at the very beginning of the list, we can also just leave out the start index
print ("Same thing, different syntax:")
print (thisIsAList[:2])

# Don't forget...you can also use negative indices, which count back from the end of the list, starting with -1
    # Leaving out the stop index means "keep going through the end of the list"
    # (a stop index of -1 would leave out the last element, because the stop index is exclusive)
print ("\nHere are the last two elements of the list, specified with negative indices:")
print (thisIsAList[-2:])


In [None]:
# Unlike strings, lists are mutable
    # This means that you can directly make changes to lists, without needing to make a copy

# This statement will generate a list containing all integers between 0 and 20
    # And assign this list to the variable range21
    # Wrapping range() in list() guarantees a real, changeable list on every Python version
    # (on Python 3, range() by itself does not return a list and cannot be modified)
range21 = list(range(21))
print(range21)

# Here we will replace the first item in the list with the string 'A'
range21[0] = 'A'
print("\nAltered list:")
print (range21)

# Below here is code that attempts to perform a similar operation with a string
# Uncomment the second command and re-run the cell to see what happens
string = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# string[0] = "a"

# In addition to replacing items, lists support several other in-place changes:

# You can add items to the end of a list
range21.append("Added to end")
print ("\nAfter appending a string item")
print (range21)

# You can insert items into the middle
    # The first argument is the index the new item will occupy
range21.insert(3, "Insert as fourth item")
print ("\nAfter inserting a string item")
print (range21)

# You can remove items by value (only the first matching item is removed)
range21.remove('A')
print ("\nAfter removing the first item by specifying it's value")
print (range21)
# Or by position
del(range21[0])
print ("\nAfter removing the first item by specifying it's index")
print (range21)

# You can even sort the items of a list
    # NOTE: at this point the list mixes integers and strings
    # Python 2 will sort such a list, but Python 3 raises a TypeError instead
range21.sort()
print ("\nAfter sorting the list, in place")
print (range21)


In [None]:
# A few more built-in functions that come in handy with lists

# sum() returns the total of all the values in a list
numbers = [1, 2, 4, 5, 7.8, 91.4, -30.4]
print(sum(numbers))
# ...but it raises an error if the list holds anything non-numerical
#print(sum(['a', 1, 2, 4, 5, 7.8, 91.4, -30.4]))

# sorted() is a second way to sort a list
    # Unlike .sort(), it hands back a sorted copy and leaves the original alone
x = [1000, 100, 0, -100000, -10]
print("\nHere is the list assigned to variable x")
print(x)
print("Here is a sorted version of that list")
x_sorted = sorted(x)
print(x_sorted)
print("Note that the original list has not been changed")
print(x)

# max() returns the largest value in a list
largest = max(x)
print("\nHere the maximum value in x: %d" % largest)
# min() returns the smallest value in a list
smallest = min(x)
print("Here the minimum value in x: %d" % smallest)

# Be aware, however, that these functions can give unexpected results with lists that contain a mix of variable types
    # NOTE: this example relies on Python 2, which is willing to compare values of different types
    # Python 3 refuses to order, e.g., a string against an integer and raises a TypeError instead
y = ['a', 'A', 1, 2, 3, 4, range(10)]
print("\nDid you expect this to be the max value?")
print(max(y))


In [62]:
###       -----> Exercise 1.1 <-----
#         Use lists and associated functions to calculate the sum and the mean 
#         of all integers between 0 and 1,000,000
#         but excluding all numbers between 550,000 and 750,000 (inclusive)





In [None]:
############################
###                      ### 
###  Generating lists    ###
###    from strings      ### 
###                      ### 
############################

# Turning a string into a list is a routine step when reading data in from a file
filestring = "1000\tblue\t3.4567\t1 July 2006\t-336ft\t122 22.48 W"

# list() will do it, but it makes a separate list item out of every single character
alist = list(filestring)
char_count = len(alist)
print("A list generated with the list() function, with %d items:" % char_count)
print(alist)
# That is rarely the result you are after

# The built-in string method .split() instead cuts the string wherever a delimiter occurs
    # The fields here are separated by tabs, so the delimiter to split on is '\t'
delimiter = '\t'
alist = filestring.split(delimiter)
field_count = len(alist)
print("\nA list generated with the built-in .split() function, with %d items:" % field_count)
print(alist)


In [None]:
# !!!!! Caution: beware of making "copies" of mutable objects, like lists

# list(range(...)) guarantees a real, changeable list on every Python version
    # (on Python 3, range() by itself does not return a list and cannot be modified)
original = list(range(10))

# This may seem like you're making a copy,
# but actually you're just creating a second variable that points to the same list
pseudocopy = original
print("\nHere is the pseudocopy variable before altering the original:")
print (pseudocopy)
original[1:4] = ['B', 'C', 'D']
print("Here is the pseudocopy variable after making a change to the original variable:")
print (pseudocopy)

# Start again from a fresh, unaltered list
original = list(range(10))

# Here is how to make an actual copy
    # Slicing from the beginning through the end builds a new list with the same items
copy = original[:]
print("\nHere is the copy variable before altering the original:")
print (copy)
original[1:4] = ['B', 'C', 'D']
print("Here is the copy variable after making a change to the original variable:")
print (copy)





In [None]:
###################################
###                             ### 
###         Dictionaries        ###
###                             ### 
###################################

# Dictionaries allow you to store key:value pairs
emptydict1 = {}       # This is one way to create an empty dictionary
emptydict2 = dict()   # This is another way
# Both types are formatted into a single string so the line prints the same way on Python 2 and 3
print("%s %s" % (type(emptydict1), type(emptydict2)))

# You can also define a dictionary along with key:value pairs
aDictionary = {1: "one", "one": 1, 2: "two", "two": 2, 3: "three", "three": 3, 4: "four", "four": 4}

# Unlike lists, the contents of dictionaries are not looked up by position
    # On Python 2 (and Python 3 before version 3.7) they are not kept in any particular order either
print ("\nThis is what it looks like when you print a dictionary")
print (aDictionary)
# Note that the key:value pairs are not printed in sorted order
    # and, on the Python versions mentioned above, not necessarily in the order they were entered

# Again, we can take a look at the built-in methods using the dir() function
print("\nHere are the built-in methods/functions associated with dictionaries in Python (many are __magic__):")
print (dir(aDictionary))

# Dictionaries are also MUTABLE. Here I will change several of the values associated with the dictionary keys
aDictionary[1] = 'uno'
aDictionary['one'] = 'one'
aDictionary['three'] = 3.33
print ("\nHere is the dictionary following these changes:")
print (aDictionary)
# Note that with dictionaries, values are accessed by using the associated key, NOT an index

# You can also add new values to an existing dictionary
aDictionary[5] = 'five'
# Or remove existing values (the key is removed together with its value)
del(aDictionary["four"])
print ("\nHere is the dictionary following the addition and deletion:")
print (aDictionary)

# Important dictionary rules to be aware of:
    # 1. Keys can be any NON-MUTABLE variable type. This means lists and dictionaries cannot be keys
    # 2. Keys must be unique within a dictionary, but values can be repeated
    # 3. Values can be anything, including lists and dictionaries
    
    


In [None]:
############################
###                      ### 
###       for loops      ###
###                      ### 
############################

# As you'll recall from class 3, which focused on shell scripting,
    # for loops cycle through each item in a list or other collection of variables
        # And each time through, they execute the same block of commands
# Therefore, for loops provide a very efficient way to complete repetitive tasks

# Here is a simple example that loops through a list of integers and prints their square roots
    # Raising a number to the power 0.5 gives its square root
    # Writing the exponent as 0.5 (rather than 1/2) gives the right answer even when
    # Python 2's integer division is in effect, where 1/2 would evaluate to 0
for each in range(2,21,2):
    print ("The square root of %d is %.4f" %(each, each**0.5))
print("for loop complete!")
# Note: Only the indented command is run in each iteration of the for loop
#       The indentation indicates the block of code contained within the loop
#       However, you could certainly add additional commands within this block
# Note: the third number provided to the range() function is the step size between integers



In [None]:
###       -----> Exercise 1.2 <-----
#         Write a for loop that will generate a dictionary where:
#               1) The keys will be the integers from 1-10 (inclusive)
#               2) The values will be the reciprocal of those integers (e.g., the reciprocal of 5 is 1/5) 
#         Note: A dictionary needs to exist before values can be added to it










In [None]:
############################
###                      ### 
###      if, else        ###
###                      ### 
############################

# Another really important part of Python scripts are conditional statements
    # These make it possible to have certain blocks of code executed only under certain circumstances

# More often than not, these are in the form of if/else statements
# For example:
for each in range(1,11):
    if each%3==0:
        print("%d is divsible by 3" % each)
    else:
        print("%d is NOT divsible by 3" % each)

# An if statement must be followed by some type of logical expression to be evaluated
    # If the result of that evaluation is True, then the block of code will be executed
    # In the example above, we are testing to see if the remainder after dividing by three is equal "==" to 0
# However, there are no logical expressions associated with an else statement
    # Instead, the block of code following this statement is executed whenever the expression tested in the if statement is false



In [None]:
############################
###                      ### 
###        elif          ###
###                      ### 
############################

# There is also elif, which is a hybrid of else and if
    # elif must follow an if (or another elif) statement
    # Like else, the block of code linked to the elif statement will only run if all preceding if/elif statements are false
    # However, in addition, there must be a logical expression associated with an elif statement
        # And this expression must evaluate to True for the block of code to be executed

# Here is an example that builds upon the previous if/else statements:
for each in range(1,11):
    if each%3==0:
        print("%d is divsible by 3" % each)
    elif each%2==0:
        print("%d is NOT divsible by 3, but IS divisible by 2" % each)
    else:
        print("%d is NOT divsible by 2 or 3" % each)


In [11]:
###       -----> Exercise 1.3 <-----
#         Below, the variable seq1 contains one of the Zika virus genome sequences from the exercises last week
#         Use a for loop to move through the sequence using a sliding window
#             - The length of each window should be 100 nucleotides
#             - And you should move over 50 nucleotides between each window
#         For each window, first test to see if the window contains >50% A and T
#             if so, add the sequence corresponding to that window to a designated list
#         Then test to see if the window contains >50% G and C
#             if so, add the sequence corresponding to that window to a different list
#         Finally, test to see if the window contains >50% N
#             if so, add the sequence corresponding to that window to a third list
#         At the end, print the length of each list

seq1 = 'NNNNNNNNNNNNNTGTGAATCAGACTGCGACAGTTCGAGTTTGAAGCGAAAGCTAGCAACAGTATCAACAGGTTTTATTTTGGATTTGGAAACGAGAGTTTCTGGTCATGAAAAACCCAAAAAAGAAATCCGGAGGATTCCGGATTGTCAATATGCTAAAACGCGGAGTAGCCCGTGTGAGCCCCTTTGGGGGCTTGAAGAGGCTGCCAGCCGGACTTCTGCTGGGTCATGGGCCCATCAGGATGGTCTTGGCGATTCTAGCCTTTTTGAGATTCACGGCAATCAAGCCATCACTGGGTCTCATCAATAGATGGGGTTCAGTGGGGAAAAAAGAGGCTATGGAAATAATAAAGAAGTTCAAGAAAGATCTGGCTGCCATGCTGAGAATAATCAATGCTAGGAAGGAGAAGAAGAGACGAGGCGCAGATACTAGTGTCGGAATTGTTGGCCTCCTGCTGACCACAGCTATGGCAGCGGAGGTCACTAGACGTGGGAGTGCATACTATATGTACTTGGACAGAAACGATGCTGGGGAGGCCATATCTTTTCCAACCACATTGGGGATGAATAAGTGTTATATACAGATCATGGATCTTGGACACATGTGTGATGCCACCATGAGCTATGAATGCCCTATGCTGGATGAGGGGGTGGAACCAGATGACGTCGATTGTTGGTGCAACACGACGTCAACTTGGGTTGTGTACGGAACCTGCCATCACAAAAAAGGTGAAGCACGGAGATCTAGAAGAGCTGTGACGCTCCCCTCCCATTCCACTAGGAAGCTGCAAACGCGGTCGCAAACCTGGTTGGAATCAAGAGAATACACAAAGCACTTGATTAGAGTCGAAAATTGGATATTCAGGAACCCTGGCTTCGCGTTAGCAGCAGCTGCCATCGCTTGGCTTTTGGGAAGCTCAACGAGCCAAAAAGTCATATACTTGGTCATGATACTGCTGATTGCCCCGGCATACAGCATCAGGTGCATAGGAGTCAGCAATAGGGACTTTGTGGAAGGCATGTCAGGTGGGACTTGGGTTGATGTTGTCTTGGAACATGGAGGTTGTGTCACCGTAATGGCACAGGACAAACCGACTGTCGACATAGAGCTGGTTACAACAACAGTCAGCAACATGGCGGAGGTAAGATCCTACTGCTATGAGGCATCAATATCAGACATGGCTTCGGACAGCCGCTGCCCAACACAAGGTGAAGCCTACCTTGACAAGCAATCAGACACTCAATATGTCTGCAAAAGAACGTTAGTGGACAGAGGCTGGGGAAATGGATGTGGACTTTTTGGCAAAGGGAGCCTGGTGACATGCGCTAAGTTTGCATGCTCCAAGAAAATGACCGGGAAGAGCATCCAGCCAGAGAATCTGGAGTACCGGATAATGCTGTCAGTTCATGGCTCCCAGCACAGTGGGATGATCGTTAATGACACAGGACATGAAACTGATGAGAATAGAGCGAAGGTTGAGATAACGCCCAATTCACCAAGAGCCGAAGCCACCCTGGGGGGTTTTGGAAGCCTAGGACTTGATTGTGAACCGAGGACAGGCCTTGACTTTTCAGATTTGTATTACTTGACTATGAATAACAAGCACTGGTTGGTCCACAAGGAGTGGTTCCACGACATTCCATTACCTTGGCACGCTGGGGCAGACACCGGAACTCCACACTGGAACAACAAAGAAGCACTGGTAGAGTTCAAGGACGCACATGCCAAAAGGCAAACTGTCGTGGTTCTAGGGAGTCAAGAAGGAGCAGTTCACACGGCCCTTGCTGGAGCTCTGGAGGCTGAGATGGATGGTGCAAAGGGAAGGCTGTCCTCTGGCCACTTGAAATGTCGCCTGAAAATGGATAAACTTAGATTGAAGGGCGTGTCATACTCCTTGTGTACCGCAGCGTTCACATTCACCAAGATCCCGGCTGAAACACTGCACGGGACAGTCACAGTGGAGGTACAGTACGCAGGGACAGATGGACCTT
GCAAGGTTCCAGCTCAGATGGCGGTGGACATGCAAACTCTGACCCCAGTTGGGAGGTTGATAACCGCTAACCCCGTAATCACTGAAAGCACTGAGAACTCTAAGATGATGCTGGAACTTGATCCACCATTTGGGGACTCTTACATTGTCATAGGAGTCGGGGAGAAGAAGATCACCCACCACTGGCACAGGAGTGGTAGCACCATTGGAAAAGCATTTGAAGCCACTGTGAGAGGTGCCAAGAGAATGGCAGTCTTGGGAGACACAGCCTGGGACTTTGGATCAGTTGGAGGCGCTCTCAACTCATTGGGCAAGGGCATCCATCAAATTTTTGGAGCAGCTTTCAAATCATTGTTTGGAGGAATGTCCTGGTTCTCACAAATCCTCATTGGAACGTTGCTGATGTGGTTGGGTCTGAACACAAAGAATGGATCTATTTCCCTTATGTGCTTGGCCTTAGGGGGAGTGTTGATCTTCTTATCCACAGCCGTCTCTGCTGATGTGGGGTGCTCGGTGGACTTCTCAAAGAAGGAGACGAGATGCGGTACAGGGGTGTTCGTCTATAACGACGTTGAAGCCTGGAGGGACAGGTACAAGTACCATCCTGACTCCCCCCGTAGATTGGCAGCAGCAGTCAAGCAAGCCTGGGAAGATGGTATCTGCGGGATCTCCTCTGTTTCAAGAATGGAAAACATCATGTGGAGATCAGTAGAAGGGGAGCTCAATGCAATCCTGGAAGAGAATGGAGTTCAACTGACGGTCGTTGTGGGATCTGTAAAAAACCCCATGTGGAGAGGTCCACAGAGATTGCCCGTGCCTGTGAACGAGCTGCCCCACGGCTGGAAGGCTTGGGGGAAATCGTACTTCGTCAGAGCAGCAAAGACAAATAACAGCTTTGTCGTGGATGGTGACACACTGAAGGAATGCCCACTCAAACATAGAGCATGGAACAGCTTTCTTGTGGAGGATCATGGGTTCGGGGTATTTCACACTAGTGTCTGGCTCAAGGTTAGAGAAGATTATTCATTAGAGTGTGATCCAGCCGTTATTGGAACAGCTGTTAAGGGAAAGGAGGCTGTACACAGTGATCTAGGCTACTGGATTGAGAGTGAGAAGAATGACACATGGAGGCTGAAGAGGGCCCATCTGATCGAGATGAAAACATGTGAATGGCCAAAGTCCCACACATTGTGGACAGATGGAATAGAAGAGAGTGATCTGATCATACCCAAGTCTTTAGCTGGGCCACTCAGCCATCACAATACCAGAGAGGGCTACAGGACCCAAATGAAAGGGCCATGGCACAGTGAAGAGCTTGAAATTCGGTTTGAGGAATGCCCAGGCACTAAGGTCCACGTGGAGGAAACATGTGGAACAAGAGGACCATCTCTGAGATCAACCACTGCAAGCGGAAGGGTGATCGAGGAATGGTGCTGCAGAGAGTGCACAATGCCCCCACTGTCGTTCCGGGCTAAAGATGGCTGTTGGTATGGAATGGAGATAAGGCCCAGGAAAGAACCAGAAAGCAACTTAGTAAGGTCAGTGGTGACTGCAGGATCAACTGATCACATGGATCACTTCTCCCTTGGAGTGCTTGTGATTCTGCTCATGGTGCAGGAAGGGCTGAAGAAGAGAATGACCACAAAGATCATCATAAGCACATCAATGGCAGTGCTGGTAGCTATGATCCTGGGAGGATTTTCAATGAGTGACCTGGCTAGGCTTGCAATTTTGATGGGTGCCACCTTCGCGGAAATGAACACTGGAGGAGATGTAGCTCATCTGGCGCTGATAGCGGCATTCAAAGTCAGACCAGCGTTGCTGGTATCTTTCATCTTCAGAGCTAATTGGACACCCCGTGAAAGCATGCTGCTGGCCTTGGCCTCGTGTCTTTTGCAAACTGCGATCTCCGCCTTGGAAGGCGACCTGATGGTTCTCATCAATGGTTTTGCTTTGGCCTGGTTGGCAATACGAGCGATGGTTGTTCCACGCACTGATAAC
ATCACCGTGGCAATCCTGGCTGCTCTGACACCACTGGCCCGGGGCACACTGCTTGTGGCGTGGAGAGCAGGCCTTGCTACTTGCGGGGGGTTTATGCTCCTCTCTCTGAAGGGAAAAGGCAGTGTGAAGAAGAACTTACCATTTGTCATGGCCCTGGGACTAACCGCTGTGAGGCTGGTTGACCCCATCAACGTGGTGGGACTGCTGTTGCTCACAAGGAGTGGGAAGCGGAGCTGGCCCCCTAGCGAAGTACTCACAGCTGTTGGCCTGATATGCGCATTGGCTGGAGGGTTCGCCAAGGCAGATATAGAGATGGCTGGGCCCATGGCCGCGGTCGGTCTGCTAATTGTCAGTTACGTGGTCTCAGGAAAGAGTGTGGACATGTACATTGAAAGAGTAGGTGACATCACATGGGAAAAAGATGCGGAAGTCACTGGAAACAGTCCCCGGCTCGATGTGGCGCTAGATGAGAGTGGTGATTTCTCCCTGGTGGAGGATGACGGTCCCCCCATGAGAGAGATCATACTCAAGGTAGTCCTGATGACCATCTGTGGCATGAACCCAATAGCCATACCCTTTGCAGCTGGAGCGTGGTACGTATACGTGAAGACTGGAAAAAGGAGTGGTGCTCTATGGGATGTGCCTGCTCCCAAGGAAGTAAAAAAGGGGGAGACCACAGATGGAGTGTACAGAGTAATGACTCGTAGACTGCTAGGTTCAACACAAGTTGGAGTGGGAGTTATGCAAGAGGGGGTCTTTCACACTATGTGGCACGTCACAAAAGGATCCGCGCTGAGAAGCGGTGAAGGGAGACTTGATCCATACTGGGGAGATGTCAAGCAGGATCTGGTGTCATACTGTGGTCCATGGAAGCTAGATGCCGCCTGGGACGGGCACAGCGAGGTGCAGCTCTTGGCCGTGCCCCCCGGAGAGAGAGCGAGGAACATCCAGACTCTGCCCGGAATATTTAAGACAAAGGATGGGGACATTGGAGCGGTTGCGCTGGATTACCCAGCAGGAACTTCAGGATCTCCAATCCTAGACAAGTGTGGGAGAGTGATAGGACTTTATGGCAATGGGGTCGTGATCAAAAATGGGAGTTATGTTAGTGCCATCACCCAAGGGAGGAGGGAGGAAGAGACTCCTGTTGAGTGCTTCGAGCCTTCGATGCTGAAGAAGAAGCAGCTAACTGTCTTAGACTTGCATCCTGGAGCTGGGAAAACCAGGAGAGTTCTTCCTGAAATAGTCCGTGAAGCCATAAAAACAAGACTCCGTACTGTGATCTTAGCTCCAACCAGGGTTGTCGCTGCTGAAATGGAGGAAGCCCTTAGAGGGCTTCCAGTGCGTTATATGACAACAGCAGTCAATGTCACCCACTCTGGAACAGAAATCGTCGACTTAATGTGCCATGCCACCTTCACTTCACGTCTACTACAGCCAATCAGAGTCCCCAACTATAATCTGTATATTATGGATGAGGCCCACTTCACAGATCCCTCAAGTATAGCAGCAAGAGGATACATTTCAACAAGGGTTGAGATGGGCGAGGCGGCTGCCATCTTCATGACCGCCACGCCACCAGGAACCCGTGACGCATTTCCGGACTCCAACTCACCAATTATGGACACCGAAGTGGAAGTCCCAGAGAGAGCCTGGAGCTCAGGCTTTGATTGGGTGACGGATCATTCTGGAAAAACAATTTGGTTTGTTCCAAGCGTGAGGAACGGCAATGAGATCGCAGCTTGTCTGACAAAGGCTGGAAAACGGGTCATACAGCTCAGCAGAAAGACTTTTGAGACAGAGTTCCAGAAAACAAAACATCAAGAGTGGGACTTTGTCGTGACAACTGACATTTCAGAGATGGGCGCCAACTTTAAAGCTGACCGTGTCATAGATTCCAGGAGATGCCTAAAGCCGGTCATACTTGATGGCGAGAGAGTCATTCTGGCTGGACCCATGCCTGTCACACATGCCAGCGCTGCCCAGAGGAGGGGGCGCAT
AGGCAGGAATCCCAACAAACCTGGAGATGAGTATCTGTATGGAGGTGGGTGCGCAGAGACTGACGAAGACCATGCACACTGGCTTGAAGCAAGAATGCTCCTTGACAATATTTACCTCCAAGATGGCCTCATAGCCTCGCTCTATCGACCTGAGGCCGACAAAGTAGCAGCCATTGAGGGAGAGTTCAAGCTTAGGACGGAGCAAAGGAAGACCTTTGTGGAACTCATGAAAAGAGGAGATCTTCCTGTTTGGCTGGCCTATCAGGTTGCATCTGCCGGAATAACCTACACAGATAGAAGATGGTGCTTTGATGGCACAACCAACAACACCATAATGGAAGACAGTGTGCCGGCAGAGGTGTGGACCAGACACGGAGAGAAAAGAGTGCTCAAACCGAGGTGGATGGACGCCAGAGTTTGTTCAGATCATGCGGCCCTGAAGTCATTCAAGGAGTTTGCCGCTGGGAAAAGAGGAGCGGCTTTTGGAGTGATGGAAGCCCTGGGAACACTGCCAGGACACATGACAGAGAGATTCCAGGAAGCCATTGACAACCTCGCTGTGCTCATGCGGGCAGAGACTGGAAGCAGGCCTTACAAAGCCGCGGCGGCCCAATTGCCGGAGACCCTAGAGACCATTATGCTTTTGGGGTTGCTGGGAACAGTCTCGCTGGGAATCTTTTTCGTCTTGATGAGGAACAAGGGCATAGGGAAGATGGGCTTTGGAATGGTGACTCTTGGGGCCAGCGCATGGCTCATGTGGCTCTCGGAAATTGAGCCAGCCAGAATTGCATGTGTCCTCATTGTTGTGTTCCTATTGCTGGTGGTGCTCATACCTGAGCCAGAAAAGCAAAGATCTCCCCAGGACAACCAAATGGCAATCATCATCATGGTAGCAGTAGGTCTTCTGGGCTTGATTACCGCCAATGAACTCGGATGGTTGGAGAGAACAAAGAGTGACCTAAGCCATCTAATGGGAAGGAGAGAGGAGGGAGCAACCATAGGATTCTCAATGGACATTGACCTGCGGCCAGCCTCAGCTTGGGCCATCTATGCCGCCTTGACAACTTTCATTACCCCAGCCGTCCAACATGCAGTGACCACTTCATACAACAACTACTCCTTAATGGCGATGGCCACGCAAGCTGGAGTGTTGTTTGGTATGGGCAAAGGGATGCCATTCTACGCATGGGACTTTGGAGTCCCGCTGCTAATGATAGGTTGCTACTCACAATTAACACCCCTGACCCTAATAGTGGCCATCATTTTGCTCGTGGCGCACTACATGTACTTGATCCCAGGGCTGCAGGCAGCAGCTGCGCGTGCTGCCCAGAAGAGAACGGCAGCTGGCATCATGAAGAACCCTGTTGTGGATGGAATAGTGGTGACTGACATTGACACAATGACAATTGACCCCCAAGTGGAGAAAAAGATGGGACAGGTGCTACTCATAGCAGTAGCCGTCTCCAGCGCCATACTGTCGCGGACCGCCTGGGGGTGGGGGGAGGCTGGGGCCCTGATCACAGCCGCAACTTCCACTTTGTGGGAAGGCTCTCCGAACAAGTACTGGAACTCCTCTACAGCCACTTCACTGTGTAACATTTTTAGGGGAAGTTACTTGGCTGGAGCTTCTCTAATCTACACAGTAACAAGAAACGCTGGCTTGGTCAAGAGACGTGGGGGTGGAACAGGAGAGACCCTGGGAGAGAAATGGAAGGCCCGCTTGAACCAGATGTCGGCCCTGGAGTTCTACTCCTACAAAAAGTCAGGCATCACCGAGGTGTGCAGAGAAGAGGCCCGCCGCGCCCTCAAGGACGGTGTGGCAACGGGAGGCCATGCTGTGTCCCGAGGAAGTGCAAAGCTGAGATGGTTGGTGGAGCGGGGATACCTGCAGCCCTATGGAAAGGTCATTGATCTTGGATGTGGCAGAGGGGGCTGGAGTTACTACGCCGCCACCATCCGCAAAGTTCAAGAAGTGAAAGGATACACAAAAGGAGGCCCTG
GTCATGAAGAACCCGTGTTGGTGCAAAGCTATGGGTGGAACATAGTCCGTCTCAAGAGTGGGGTGGACGTCTTTCATATGGCGGCTGAGCCGTGTGACACGTTGCTGTGTGACATAGGTGAGTCATCATCTAGTCCTGAAGTGGAAGAAGCACGGACGCTCAGAGTCCTCTCCATGGTGGGGGATTGGCTTGGAAAAAGACCAGGAGCCTTTTGTATAAAAGTGTTGTGCCCATACACCAGCACTATGATGGAAACCCTGGAGCGACTGCAGCGTAGGTATGGGGGAGGACTGGTCAGAGTGCCACTCTCCCGCAACTCTACACATGAGATGTACTGGGTCTCTGGAGCGAAAAGCAACACCATAAAAAGTGTGTCCACCACGAGCCAGCTCCTCTTGGGGCGCATGGACGGGCCTAGGAGGCCAGTGAAATATGAGGAGGATGTGAATCTCGGCTCTGGCACGCGGGCTGTGGTAAGCTGCGCTGAAGCTCCCAACATGAAGATCATTGGTAACCGCATTGAAAGGATCCGCAGTGAGCATGCGGAAACGTGGTTCTTTGACGAGAACCACCCATATAGGACATGGGCTTACCATGGAAGCTATGAGGCCCCCACACAAGGGTCAGCGTCCTCTCTAATAAACGGGGTTGTCAGGCTCCTGTCAAAACCCTGGGATGTGGTGACTGGAGTCACAGGAATAGCCATGACCGACACCACACCGTATGGTCAGCAAAGAGTTTTCAAGGAAAAAGTGGACACTAGGGTGCCAGACCCCCAAGAGGGCACTCGTCAGGTTATGAGCATGGTCTCTTCCTGGTTGTGGAAAGAGCTAGGCAAACACAAACGGCCACGAGTCTGTACCAAAGAAGAGTTCATCAACAAGGTTCGTAGCAATGCAGCATTAGGGGCAATATTTGAAGAGGAAAAAGAGTGGAAGACTGCAGTGGAAGCTGTGAACGATCCAAGGTTCTGGGCTCTAGTGGACAAGGAAAGAGAGCACCACCTGAGAGGAGAGTGCCAGAGTTGTGTGTACAACATGATGGGAAAAAGAGAAAAGAAACAAGGGGAATTTGGAAAGGCCAAGGGCAGCCGCGCCATCTGGTATATGTGGCTAGGGGCTAGATTTCTAGAGTTCGAAGCCCTTGGATTCTTGAACGAGGATCACTGGATGGGGAGAGAGAACTCAGGAGGTGGTGTTGAAGGGCTGGGACTACAAAGACTCGGATATGTCCTAGAAGAGATGAGTCGCATACCAGGAGGAAGGATGTATGCAGATGACACTGCTGGCTGGGACACCCGCATCAGCAGGTTTGATCTGGAGAATGAAGCTCTAATCACCAACCAAATGGAGAAAGGGCACAGGGCCTTGGCATTGGCCATAATCAAGTACACATACCAAAACAAAGTGGTAAAGGTCCTTAGACCAGCTGAAAAAGGGAAAACAGTCATGGACATTATTTCGAGACAAGACCAAAGGGGGAGCGGACAAGTTGTCACTTACGCTCTTAACACATTTACCAACCTAGTGGTGCAACTCATTCGGAATATGGAGGCTGAGGAAGTTCTAGAGATGCAAGACTTGTGGCTGCTGCGGAGGTCAGAGAAAGTGACCAACTGGTTGCAGAGCAACGGATGGGATAGGCTCAAACGAATGGCAGTCAGTGGAGATGATTGCGTTGTGAAGCCAATTGATGATAGGTTTGCACATGCCCTCAGGTTCTTGAATGATATGGGAAAAGTTAGGAAGGACACACAAGAGTGGAAACCCTCAACTGGATGGGACAACTGGGAAGAAGTTCCGTTTTGCTCCCACCACTTCAACAAGCTCCATCTCAAGGACGGGAGGTCCATTGTGGTTCCCTGCCGCCACCAAGATGAACTGATTGGCCGGGCCCGCGTCTCTCCAGGGGCGGGATGGAGCATCCGGGAGACTGCTTGCCTAGCAAAATCATATGCGCAAATGTGGCAGCTCCTTTATTTCCACAGAAGGGACCTCCGA
CTGATGGCCAATGCCATTTGTTCATCTGTGCCAGTTGACTGGGTTCCAACTGGGAGAACTACCTGGTCAATCCATGGAAAGGGAGAATGGATGACCACTGAAGACATGCTTGTGGTGTGGAACAGAGTGTGGATTGAGGAGAACGACCACATGGAAGACAAGACCCCAGTTACGAAATGGACAGACATTCCCTATTTGGGAAAAAGGGAAGACTTGTGGTGTGGATCTCTCATAGGGCACAGACCGCGCACCACCTGGGCTGAGAACATTAAAAACACAGTCAACATGGTGCGCAGGATCATAGGTGATGAAGAAAAGTACATGGACTACCTATCCACCCAAGTTCGCTACTTGGGTGAAGAAGGGTCTACACCTGGAGTGCTGTAAGCACCAATCTTAATGTTGTCAGGCCTGCTAGTCAGCCACAGCTTGGGGAAAGCTGTGCAGCCTGTGACCCCCCCAGGAGAAGCTGGGAAACCAAGCCTATAGTCAGGCCGGGAACGCCATGGCACGGAAGAAGCCATGCTGCCTGTGAGCCCCTCAGAGGACACTGAGTCAAAAAACCCCACGCGCTTGGAGGCGCAGGATGGGAAAAGAAGGTGGCGACCTTCCCCACCCTTCAATCTGGGGCCTGAACTGGAGATCAGCTGTGGATCTCCAGAAGAGGGACTAGTGGTTAGAGGAGACCCCCCGGAAAACGCAAAACAGCATATTGACGCTGGGAAAGACCAGAGACTCCATGAGTTTCCACCACGCTGGCCGCCAGGCACAGANNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN'





In [None]:
############################
###                      ### 
###  List Comprehension  ###
###                      ### 
############################

# List comprehension is a really powerful tool in Python
# With list comprehension, you can very easily perform a task on every item in a list
# This tool essentially allows for simple for loops within the format of a list

# Here is a simple example that takes a list of integers and turns them into strings
    # list(range(10)) makes sure intlist really is a list on every Python version
    # (on Python 3, range() by itself is not a list and would print as "range(0, 10)")
intlist = list(range(10))
print("List of integers:")
print(intlist)
strlist = [str(x) for x in intlist]
print("\nList of strings generated through list comprehension:")
print(strlist)

# The 'x' in the above command is an arbitrarily chosen variable
    # which serves as a placeholder for each integer within intlist

# Here is another version where every integer in the list is multiplied by 10
    # And a different variable name is chosen
x10list = [each*10 for each in intlist]
print("\nList of integers multiplied by 10:")
print(x10list)

# You can even use if statements within list comprehension
    # Only the items for which the if expression is True end up in the new list
x10listeven = [each*10 for each in intlist if each%2==0]
print("\nList of even integers multiplied by 10:")
print(x10listeven)



In [None]:
###       -----> Exercise 1.4 <-----
#         
#         The list below contains Lassa virus Z genes
#         Use list comprehension to generate:
#             1) a list containing the length of each gene
#             2) a list with the % GC content for each gene

genes = ["ATGGGCAACAGACAGGCTAAACCGACAAAAGTCGACGAACATCAAAGAGCTCACCTAGTGCCAGATGCATCCCATCTAGGCCCTCAGTTTTGCAAAAGTTGCTGGTTTGAGAACAAGGGGTTGGTAGAGTGCAACAACCACTACCTGTGTCTCAACTGTCTCAGCCTGCTCCTCAGCGTTAGCAGCAGATGCCCGATCTGCAAGATGCCCCTTCCCACCAAGCTGAAGGCAGCAGCACAGCCGAGCGCACCCTCAACTGAGGCGGCCCAGAACACAGCACCTCCACCGTATGCCCCATGA", 
         "ATGGGAAACAAGCAAGCCAAGGCCCCAGAACCAAAGGATAGTCCGAGAGCCAGCCTGATCCCAGATGCTACACATCTAGGGCCACAGTTCTGTAAGAGCTGCTGGTTCGAAAACAAGGGCCTGGTTGAGTGCAACAACCACTATTTGTGCCTCAACTGCCTCACCTTACTCTTAAGTGTCAGCAACAGGTGCCCCATCTGCAAGATGCCTCTCCCCACAAAACTGAGACCATCAGCCGCTCCGACAGCACCCCCAGCCGGAGCAGTGGACAGCATCAGACCTCCACCCTACAGTCCCTGA", 
         "ATGGGGAATAAGCAAGCCAAAGATCCAAAGACAGAGAGCAGCCCAAGGGCCAGTCTCATTCCGGATGCCACACACCTTGGACCACAATTTTGCAAGAGTTGTTGGTTTGAAAACAAAGGTTTAGTTGAGTGTAACAACCACTACCTGTGCCTCAACTGCCTCTCCCTACTTTTGAGTGTCAGCAACAGGTGCCCCATCTGTAAGATGCCTCTTCCCACGAAGCTCAAGCCGATAACCACGCCAACAGCACCACAAATCACCAGAGAGAGCATCACCAATCCCCCACCATACACACCCTAA", 
         "ATGGGAGCCAGACAGACCAAGCAACCTCAGATTGAGGGATCTCCTAGAGCCTCTCTGGTGCCCGATGCAAGCCATCTAGGACCCCAGTTCTGCAAGAGCTGCTGGTTTGAGAACAAAGGGCTTGTGGAGTGTAACAATCATTACCTTTGCCTCAATTGCCTCAGCCTCCTACTCAGTGTCAGCAACAGGTGCCCTATCTGCAAGATGCCCCTCCCCACCAAGCTGAGAGTGTCAAGCGCTCCCAGTGCACCCCCAGCGGCCACGGCCCAACCCGGAACTCCTCCACCATACAGCCCCTAG", 
         "ATGGGCAACAAGCAGACCAGGTCCCCACCCAAACCAGAGCACCCCAGACCAACCCTGCTACCCGACGCATCCCACCTGGGCCCCCAATTCTGCAAGAGCTGCTGGTTTGAGAACAAGGGACTGGTGGAGTGTAATAACCATTACCTCTGTCTAAACTGTCTCACACTGCTTCTCAGCGTGAGCGACAGATGTCCTATTTGTAAGATGCCCCTCCCCACCAAGCTGGCAGTCCGAACCCAACCAAGTGCACCCCCACTCAACCAGGGCAACACTCAATCCTCCCCGCCCCCCTACAGCCCCTAA"]



In [None]:
# maketrans is a function in the string module on Python 2, but a method of str on Python 3
try:
    from string import maketrans
except ImportError:
    maketrans = str.maketrans

############################
###                      ### 
###       Functions      ###
###                      ### 
############################

# Just like in bash, you can define your own functions in Python
# Functions make it easy to reuse in multiple places within your program
# Functions also help to enhance the readability of your scripts

# Here I define a function that will reverse complement a DNA sequence
def revcomp(dna):
    """Return the reverse complement of a DNA sequence.

    The sequence is converted to upper case, reversed, and each base is
    swapped for its complement (A<->T, C<->G). Any other character
    (for example N) is passed through unchanged.

    dna: the sequence as a string, in upper, lower or mixed case
    returns: the reverse-complemented sequence as an upper-case string
    """
    # maketrans is a function in the string module on Python 2, but a method
    # of str on Python 3, so use whichever one this interpreter provides
    try:
        from string import maketrans as make_table
    except ImportError:
        make_table = str.maketrans
    intab = "ACTG"     # each base in this string...
    outtab = "TGAC"    # ...is replaced by the base at the same position here
    trantab = make_table(intab, outtab)
    return dna.upper()[::-1].translate(trantab)

# Try the function out on two different strings:
    # one entirely lower case, and one that mixes cases and contains N's
first_example = revcomp('aaagtctcgagggttctctagagggggaattaaggccctctcgaggaatatagggggttttaaagata')
second_example = revcomp('TTTTCTCTNNNgagggggaatGGAAGAGTGctcgaggaatatagggggttTTAATATAgata')
print(first_example)
print(second_example)


# Functions can even be used inside other functions
def revcomp_countGs(dna):
    """Return how many G's appear in the reverse complement of dna.

    The reverse complement is produced by the revcomp() function defined
    earlier in this cell.
    """
    return revcomp(dna).count('G')

# Count the G's in the reverse complement of the same two sequences as before
first_example = revcomp_countGs('aaagtctcgagggttctctagagggggaattaaggccctctcgaggaatatagggggttttaaagata')
second_example = revcomp_countGs('TTTTCTCTNNNgagggggaatGGAAGAGTGctcgaggaatatagggggttTTAATATAgata')
print(first_example)
print(second_example)


In [None]:
############################
###                      ### 
###   Reading files      ###
###                      ### 
############################

# An important part of most analysis scripts is reading information from a file
# Here is an example where a file is opened, and stepped through line by line
# The first five lines are printed to the screen

# Opening the file in a "with" block closes it automatically when the block ends,
    # even if an error interrupts the loop part-way through
with open("MACVCarvallo68_R1_Q20_cutadapt_paired_bwamem_3.5_dels.txt", "r") as fin:  # Opens the file
    linecount=0
    for line in fin:        # Steps through the file 1-by-1
        linecount+=1        # Increments the linecount variable
        if linecount<=5:    # Checks to see if the linecount is less than or equal to 5
            # A line read from a file normally still ends with its own newline character,
            # and print adds another, so a blank line appears after each printed line
            print(line)
# Once the loop has finished, linecount holds the total number of lines in the file

In [None]:
###       -----> Exercise 1.5 <-----
#         
#   The file "lassa_seqs.fasta" contains the same five lassa sequences used in Exercise 1.4, 
#   but in fasta format, with each sequence on a single line
#   Write a function to calculate the GC content of a sequence
#   Then read in the contents of the "lassa_seqs.fasta" file
#   and use the function you've written to calculate the GC content for each sequence
#      **hint: they better match the values from exercise 1.4!
#      **For 'extra credit', write out a tab-delimited file with two columns of data:
#             1) The sequence name
#             2) The GC content

