# This notebook will introduce you to even more Python syntax
##   Particularly, it will focus on 1) list/dictionary comprehension, 2) reading from/writing to files, 3) string formatting and 4) python modules.

# List comprehension

In [None]:
# List comprehension is a powerful tool: it replaces a simple for loop whose only job is to build a list

# As a simple example, we'll take a list of integers and turn each one into a string
# First, we need to generate the starting list of integers:
intlist = list(range(10))
print("List of integers:", intlist)

# Now, list comprehension generates the corresponding list of strings in a single command:
strlist = [str(item) for item in intlist]
print("\nList of strings generated with list comprehension:", strlist)

# Note: The 'item' in the above command is an arbitrarily chosen variable name
    # which serves as a placeholder for each integer within intlist

In [None]:
# Here is another version, where every integer in the list is multiplied by 10
# And just for fun, the placeholder variable has a different name this time
x10list = [value * 10 for value in intlist]
print("\nList of integers multiplied by 10:", x10list)

In [None]:
# if statements can also be used within list comprehension
# Here, an integer from intlist is only included if it is even (i.e., divisible by 2)
# Those that are included are still multiplied by 10
x10listeven = [value * 10 for value in intlist if value % 2 == 0]
print("\nList of even integers multiplied by 10:", x10listeven)

In [None]:
# And here is a list comprehension that uses both if and else:
#     even integers are divided by 2, while odd integers are multiplied by 2
# Note: the '/' operator always returns a float, so the halved (even) entries are floats
ifelselist = [value / 2 if value % 2 == 0 else value * 2 for value in intlist]
print("\nList of even integers divided by 2 and odd integers multiplied by 2:", ifelselist)

# Assignment 1.1
## The next cell contains a string of tab-delimited integers. With this as a starting place, please do the following:
### 1. Convert the string into a list (note: this could happen prior to or within the list comprehension).
### 2. Use list comprehension to generate a new version of the list called l2 that only contains integers >=2.
### 3. Print the length of l2, as well as the sum of the integers contained in l2. 


In [None]:
# Starting data for Assignment 1.1: a single string of integers separated by tab characters ("\t")
s="2\t0\t1\t2\t0\t0\t2\t3\t0\t0\t4\t8\t0\t0\t1\t3\t0\t0\t0\t0\t1\t0\t2\t2\t3\t0\t0\t0\t3\t0\t1\t5\t1\t2\t0\t0\t1\t0\t1\t0\t0\t1\t1\t4\t0\t0\t0\t0\t4\t0\t2\t2\t1\t2\t1\t1\t1\t1\t0\t0\t0\t1\t0\t3\t0\t5\t0\t1\t1\t0\t0\t2\t1\t0\t4\t0\t0\t0\t0\t2\t1\t0\t1\t5\t0\t3\t2\t3\t4\t0\t0\t0\t1\t0\t8\t0"



# Assignment 1.2
## The next cell contains a list with Lassa virus Z gene sequences. Use list comprehension to:
### 1. Generate a list containing the length of each gene.
### 2. Generate a separate list with the % GC content for each gene.
## Please print both new lists.


In [None]:
# Starting data for Assignment 1.2: a list of five Lassa virus Z gene sequences, one string per gene
genes = ["ATGGGCAACAGACAGGCTAAACCGACAAAAGTCGACGAACATCAAAGAGCTCACCTAGTGCCAGATGCATCCCATCTAGGCCCTCAGTTTTGCAAAAGTTGCTGGTTTGAGAACAAGGGGTTGGTAGAGTGCAACAACCACTACCTGTGTCTCAACTGTCTCAGCCTGCTCCTCAGCGTTAGCAGCAGATGCCCGATCTGCAAGATGCCCCTTCCCACCAAGCTGAAGGCAGCAGCACAGCCGAGCGCACCCTCAACTGAGGCGGCCCAGAACACAGCACCTCCACCGTATGCCCCATGA", 
         "ATGGGAAACAAGCAAGCCAAGGCCCCAGAACCAAAGGATAGTCCGAGAGCCAGCCTGATCCCAGATGCTACACATCTAGGGCCACAGTTCTGTAAGAGCTGCTGGTTCGAAAACAAGGGCCTGGTTGAGTGCAACAACCACTATTTGTGCCTCAACTGCCTCACCTTACTCTTAAGTGTCAGCAACAGGTGCCCCATCTGCAAGATGCCTCTCCCCACAAAACTGAGACCATCAGCCGCTCCGACAGCACCCCCAGCCGGAGCAGTGGACAGCATCAGACCTCCACCCTACAGTCCCTGA", 
         "ATGGGGAATAAGCAAGCCAAAGATCCAAAGACAGAGAGCAGCCCAAGGGCCAGTCTCATTCCGGATGCCACACACCTTGGACCACAATTTTGCAAGAGTTGTTGGTTTGAAAACAAAGGTTTAGTTGAGTGTAACAACCACTACCTGTGCCTCAACTGCCTCTCCCTACTTTTGAGTGTCAGCAACAGGTGCCCCATCTGTAAGATGCCTCTTCCCACGAAGCTCAAGCCGATAACCACGCCAACAGCACCACAAATCACCAGAGAGAGCATCACCAATCCCCCACCATACACACCCTAA", 
         "ATGGGAGCCAGACAGACCAAGCAACCTCAGATTGAGGGATCTCCTAGAGCCTCTCTGGTGCCCGATGCAAGCCATCTAGGACCCCAGTTCTGCAAGAGCTGCTGGTTTGAGAACAAAGGGCTTGTGGAGTGTAACAATCATTACCTTTGCCTCAATTGCCTCAGCCTCCTACTCAGTGTCAGCAACAGGTGCCCTATCTGCAAGATGCCCCTCCCCACCAAGCTGAGAGTGTCAAGCGCTCCCAGTGCACCCCCAGCGGCCACGGCCCAACCCGGAACTCCTCCACCATACAGCCCCTAG", 
         "ATGGGCAACAAGCAGACCAGGTCCCCACCCAAACCAGAGCACCCCAGACCAACCCTGCTACCCGACGCATCCCACCTGGGCCCCCAATTCTGCAAGAGCTGCTGGTTTGAGAACAAGGGACTGGTGGAGTGTAATAACCATTACCTCTGTCTAAACTGTCTCACACTGCTTCTCAGCGTGAGCGACAGATGTCCTATTTGTAAGATGCCCCTCCCCACCAAGCTGGCAGTCCGAACCCAACCAAGTGCACCCCCACTCAACCAGGGCAACACTCAATCCTCCCCGCCCCCCTACAGCCCCTAA"]


# Dictionary comprehension

In [None]:
# The same 'simplified version of a for loop' syntax can also be used to build dictionaries

# This simple example creates a dictionary with:
# 1. A key for each of the integers 0-9
# 2. A corresponding value that is the key squared
d = {num: num ** 2 for num in range(10)}
print("Dictionary generated with dict comprehension:", d)


In [None]:
# Dictionary comprehension is also a handy way to initialize values in a dictionary for a set of keys

# Let's say we wanted to use a dictionary to keep track of samples originating from each zip code in Flagstaff
    # We can use dict comprehension to initiate a dictionary with a key for each zip code and a value of zero
# Note: zip codes are stored as strings (not integers), and each one is exactly five digits long
zips = ["86001", "86002", "86004", "86011"]
zipCounts = {zip_code: 0 for zip_code in zips}
print("This dictionary is ready to go:", zipCounts)

In [None]:
# As an aside...there may be times when you want to use a dictionary to store counts,
    # but you won't know ahead of time all of the keys you'll want to use
# In these cases, the get() dictionary method can be very useful
# It retrieves the value for an existing key, but if the key doesn't exist
# it returns a default value of your choosing instead (rather than raising an error)

# A simple dictionary to use for some examples
testD = {"a": 2, "b": 3, "c": 6}

# Each call below asks for the value associated with a given key,
# or for 0 if that key does not exist
print(testD.get("a", 0))   # "a" is in the dictionary, so its value (2) is returned
print(testD.get("d", 0))   # "d" is NOT in the dictionary, so the default (0) is returned

# This type of call makes it easy to increment a count each time a key is observed,
# whether or not that key has been seen before
# For example, here we "observe" the keys "a" and "d" once each:
for key in ("a", "d"):
    testD[key] = testD.get(key, 0) + 1


In [None]:
# Now, let's print the updated values associated with the "a" and "d" keys (one per line)
for key in ("a", "d"):
    print(testD[key])


In [None]:
# if/else can be used in dict comprehension too
    # but it is written separately for the key and for the value

# In this example, only the value uses if/else: it depends on whether the integer key is even or odd
numTypes = {
    num: ("even" if num % 2 == 0 else "odd")
    for num in range(10)
}
print(numTypes)

In [None]:
# An if statement at the very end of the comprehension determines which keys are included at all

# Here, only keys >=5 make it into the dictionary
numTypes_min5 = {
    num: ("even" if num % 2 == 0 else "odd")
    for num in range(10)
    if num >= 5
}
print(numTypes_min5)

# Assignment 1.3
## The next cell contains a list of integers called intsL. For this assignment, please do the following:
### 1. Use dictionary comprehension to initiate a dictionary called intsD that contains a key for each integer in intsL, along with a value of 0. Note: it's ok to overwrite keys in this step, but you can also use a set() to handle duplicates. https://realpython.com/python-sets/
### 2. Write a for loop that iterates through intsL and for each integer encountered, increment the associated value in intsD. 
## Please print intsD before and after your for loop.


In [None]:
# Starting data for Assignment 1.3: a list of integers in which many values occur more than once
intsL = [4, 0, 0, 0, 2, 0, 5, 3, 0, 0, 0, 3, 2, 0, 1, 0, 0, 0, 0, 1, 3, 0, 1, 2, 0, 3, 0, 1, 0, 0, 0, 5, 1, 0, 1, 0, 0, 0, 4, 0, 0, 2, 0, 1, 3, 0, 0, 0, 21, 1, 6, 0, 2, 0, 0, 0, 12, 0, 0, 2, 1, 0, 1, 3, 2, 2, 0, 1, 2, 0, 6, 1, 1, 0, 1, 0, 0, 12, 0, 3, 1, 5, 3, 1, 0, 0, 0, 4, 8, 1, 3, 0, 3, 0, 0, 2]



### 3. In the next cell, remake the intsD dictionary using the empty "intsD2" dictionary as a starting place. Instead of using dictionary comprehension to initiate the keys with the value 0, use the get() method within the for loop. 

In [None]:
# Empty dictionary to use as the starting place: keys are NOT initialized ahead of time here
intsD2 = {}



# Reading from files

In [None]:
# Reading information from a file is an important part of most analysis scripts

# In this example, a file is opened and stepped through line by line,
    # and the first five lines are printed to the screen along the way

# Note: I'm using a relative path to the file, but you could also use an absolute path
with open("Assignment/MACVCarvallo68_R1_Q20_cutadapt_paired_bwamem_3.5_dels.txt", "r") as fin:
    # Initiate a variable to count the lines
    linecount = 0
    # Step through the file line-by-line
    for line in fin:
        # Increment the count for every line, whether or not it gets printed
        linecount += 1
        # Only print the line if it is one of the first five
        # Note: each line still ends with its end of line character and print() adds another,
            # which is why a blank line shows up after each printed line
        if linecount <= 5:
            print(line)

# Once the whole file has been read, print the total number of lines it contains
print("Total line count:", linecount)


In [None]:
# The strip methods are a group of string methods commonly used when reading from input files
# They remove certain characters from the beginning and/or end of a string
    # (characters in the middle of the string are never touched)

# By default, these methods remove all kinds of whitespace characters (spaces, tabs, new lines)

# strip() removes these characters from both the beginning and the end of the string
print(
    "strip() version:",
    "\t\n \n\t This part of the string will remain. \t\t\t\n\n\n\n\n".strip(),
)


In [None]:
# rstrip() removes these characters only from the end ("right") of the string,
    # so the whitespace at the beginning is still there when this is printed
print(
    "rstrip() version:",
    "  \t\n \n\t This part of the string will remain. \t\t\t\n\n\n\n\n".rstrip(),
)


In [None]:
# lstrip() removes these characters only from the beginning ("left") of the string,
    # so the whitespace at the end is still there when this is printed
print(
    "lstrip() version:",
    "  \t\n \t This part of the string will remain. \t\t\t\n\n\n\n\n".lstrip(),
)


In [None]:
# You can also tell these functions exactly which characters to remove
# A common use is to remove end of line characters when reading lines from a file
    # which looks like this:
raw_line = "When you read in lines from a file, they will end with a new line character.\n"
line = raw_line.rstrip("\n")
# Ending the cell with the variable name displays its value, showing that the "\n" is gone
line

In [None]:
# When reading in lines from a tab-delimited file, one of the most common things I do
    # is remove the line ending character and then split on tabs to make a list
    # Both steps can be chained together in a single command, like this

l = "one\ttwo\tthree\n"
# rstrip("\n") runs first, and split("\t") is then applied to the string it returns
cols = l.rstrip("\n").split("\t")
print(cols)

# Assignment 1.4
## In the next cell:
### 1. Step through the fasta file, "Assignment/694003_Betacoronavirus-1_Spike_mafft-ginsi_nearFull.fasta", line by line. 
### 2. Create a list called "names" that contains all of the sequence names from this fasta file. Remember, all name lines begin with ">", but ">" is NOT part of the name. Also make sure you remove the end of line characters before adding the names to the list. 
### 3. Use len() to determine the number of sequences contained in this file, and print this value to the screen.
### 4. Use print() to display the first and last items in "names"


# String formatting

In [None]:
# So far, in this notebook and those from previous classes, I've printed many different types of variables to the screen
    # simply by listing them, separated by commas, inside the print statement
# That approach is simple because it accepts any type of variable, but it gives you little control over how the output is displayed
    # The items are always just separated by a single space
# String formatting provides better control over the way that results are output
    # And it is useful in many contexts, not just when printing results to the screen

# The basic syntax for string formatting with the % operator, as described in your book, is:

# "string result" % (variable1, variable2, etc.)

# Each variable that you want to include in the string needs its own placeholder
    # and a placeholder always starts with a '%' character

# These are the primary placeholders you will need:
    # %s is for a string
    # %d is for an integer
    # %f is for a floating point number

# The example below uses all of these variable types
string1 = "male"
string2 = "eagles"
integer = 331
floating = 55.9
# The placeholders are filled in order, from left to right, using the values in the parentheses
template = "We observed %d %s, %.2f%% were %s."
output = template % (integer, string2, floating, string1)
print(output)

# A couple of notes about this example:
#    1. The ".2" between '%' and 'f' controls the number of digits shown following the decimal point
#    2. Because '%' is a special character used in string formatting,
#       if you actually want a '%' in your string, you need to use '%%'



In [None]:
# In this example, a series of integers (10 to the power of 1 through 6) is printed, one per line,
    # without any string formatting, so the numbers all line up on the left
for i in range(1, 7):
    print(10 ** i)

In [None]:
# This time, string formatting is used to right-adjust the same integers
    # The 7 in "%7d" is the minimum width, so shorter numbers are padded with spaces on the left
for i in range(1, 7):
    print("%7d" % (10 ** i))

In [None]:
# f-strings are another approach to string formatting
# Here, the first string formatting example has been redone using an f-string

output = f"We observed {integer} {string2}, {floating:.2f}% were {string1}."
print(output)

# Note the f character just before the opening quote of the formatted string and
    # the use of curly brackets to mark where each variable is inserted
# Also note that a '%' does not need to be doubled inside an f-string

# Assignment 1.5
## In the next cell, I've defined several variables. Please use string formatting (% operator or f-strings) to print the following message using a single command AND using all of the defined variables:

## "88.9% of male patients and 67.3847% of female patients completed trial A-00100." 


In [None]:
# Variables to use in the formatted message: two floats, two strings and one integer
a, b = 67.38475, 88.9183740
c, d = "male", "female"
e = 100



# Writing to files

In [None]:
# Writing results to output files is another important aspect of most analysis scripts
# Here, a new output file is opened for writing ("w" mode)
with open("powers.tsv", "w") as fout:
    # The header row goes in first
    fout.write("Integer\tInteger^2\tInteger^3\tInteger^4\n")
    # Then we step through the integers 2-10
    for i in range(2, 11):
        # Each integer gets its own line in the output file, which includes the starting integer
            # as well as that integer to the power of 2, 3 and 4
        # Note: string formatting is used to make the line tab-delimited, and it must end with "\n"
            # because write() does not add an end of line character for you
        row = f"{i}\t{i**2}\t{i**3}\t{i**4}\n"
        fout.write(row)

# After running this cell, open the resulting output file with your plain text editor to see the results

In [None]:
# You can also nest several different with statements
# For example, to open both read and write mode file objects

# Note: the input file is opened first on purpose
    # Opening a file in "w" mode immediately creates it (or empties it, if it already exists)
    # So if "input.txt" were missing, opening the output first would leave behind an empty "output.txt"
with open("input.txt", "r") as fin:             # First, I'm opening a read mode file object
    with open("output.txt", "w") as fout:       # Now, I'm opening a write mode file object

        for line in fin:                        # Stepping through the lines in the input file
            if line.startswith('Would'):
                fout.write(line)                # Writing lines that start with "Would" to the output file

# Assignment 1.6
## The file "Assignment/MACVCarvallo68_R1_Q20_cutadapt_paired_bwamem_3.5_dels.txt" contains information about defective interfering genomes of Machupo virus. With the exception of the header, each line contains information about a particular deletion mutant.

## Write python code that reads through this file line by line and:

### 1. Creates a dictionary containing one key for each unique mutant (i.e., same "RefName", "DelLeft" and "DelRight") and with the values indicating the number of occurrences of the corresponding mutant.


### 2. Creates an output file called "unique_deletions.tsv" that includes one row per unique mutant and four tab-delimited columns: "RefName", "DelLeft", "DelRight" and the number of occurrences. Please make sure that your output file includes a header row.

## Please upload your output file to BbLearn along with your completed jupyter notebook. 
