In [5]:
cd ~/Desktop/2016-adv-begin-python/

/Users/jason/Desktop/2016-adv-begin-python


In [6]:
# Problem 1: Hardcoded input filename
# Problem 2: No input checking
# Problem 3: No empty-file input checking => crash on division by zero


fp          = open('ecoli_ref-100k.fq') 
count       = 0
total_bases = 0
total_ns    = 0

for line in fp:
    if (n % 4) == 1:
        seq = line.strip()
        total_bases += len(seq)
        total_ns += seq.count('N')
    count += 1

print(count)
print(total_bases)
print(total_ns)

----

fp = open('ecoli_ref-100k.fq')

count = 0

total_bases = 0
total_ns = 0
for line in fp:
    if (count % 4) == 1:
        seq = line.strip()
        total_bases += len(seq)
        total_ns += seq.count('N')
    count += 1

print(count)
print(total_bases)
print(total_ns)

SyntaxError: invalid syntax (<ipython-input-6-d776faea8637>, line 18)

## Problem 1: Filename hardcoded

Change function to handle variable input filenames

In [7]:
import sys

print('Loading sequences from', sys.argv[1])
fp = open(sys.argv[1])

Loading sequences from -f


FileNotFoundError: [Errno 2] No such file or directory: '-f'

## Problem 2: No input checking

Simple refactoring: Splitting the function 

## Problem 3: No empty-file input check

When given an empty input-file the function currently crashes on a "division by zero" error

# Encapsulating functions

*Note: Doesn't simplify the code, instead it GENERALIZES the code*

- `enumerate` is an example of a generator

In [16]:
import sys

In [17]:
def load_sequences(filename):
    list_of_seqs = []
    
    fp = open(filename)
    for line in fp:
        if (count % 4) == 1:
            seq = line.strip()
            list_of_seqs.append(seq)
    return list_of_seqs

In [18]:
def count_Ns(seq):
    n_bases = len(seq)
    n_ns    = seq.count('N')
    return n_bases, n_ns


In [19]:
# Define a MAIN function
def main():

    print('Loading sequences from', sys.argv[1])
    seqs        = load_sequences(sys.argv[1])
    
    count       = 0
    total_bases = 0
    total_ns    = 0
    
    for seq in seqs:
        n_bases, n_ns = count_Ns(seq)
        total_bases += n_bases
        total_ns += n_ns
        count += 1

    print("total sequences", count)
    print("fraction of bases that are ns:", total_ns / total_bases)
    
# Call the MAIN function
main() # Why? To encapsulate for later use (i.e. generalizing)

Loading sequences from -f


FileNotFoundError: [Errno 2] No such file or directory: '-f'

## Code reusability & Modules

Creating a **module** of utility functions for code reuse: `utils.py`

- **Advantages: Easy to read**
- Can make code obvious by naming functions and variables well 
- Modules are how you can create re-usable code and that you can *trust*
- Can host modules at Python Package Index


- modules provide namespaces for functions, variables, etc.
   - **namespace**: grouping of functions, vars, etc.
   - **Module-Global variable**: variables defined w/n module are global within a run-instance of a program
   - If using global, then better to explicilty declare that global variable
   - M


In [25]:
import sys
import jfaUtils # Our own module

seqs = jfaUtils.load_sequences('ecoli_ref-100k.fq')

tracking = [] # initalize an empty list named 'tracking'

for seq in seqs:
    n_bases, n_ns = jfaUtils.count_Ns(seq)
    tracking.append(n_ns)

print(tracking)

AttributeError: module 'jfaUtils' has no attribute 'load_sequences'

# Creating a Summary of the data

In [26]:
import sys
import jfaUtils

seqs = jfaUtils.load_sequences(sys.argv[1])

tracking = []
for seq in seqs:
    n_bases, n_ns = jfaUtils.count_Ns(seq)
    tracking.append(n_ns)


# Summary vars: initialize     
num_with_zero  = 0
num_with_one   = 0
num_with_more  = 0

# Summary vars: accumulate
for count in tracking:
    if count == 0:
        num_with_zero += 1
    elif count == 1:
        num_with_one  += 1
    else:
        num_with_more += 1

# Summary vars: print
print(num_with_zero, num_with_one, num_with_more)
print(sum(tracking))

AttributeError: module 'jfaUtils' has no attribute 'load_sequences'

# Using `main` function


```
def main():
    ...
    ...

if __name__ == '__main__':
    main()
```

### Reasons for using a `main()` function
This prevents execution of the `main()` function, except when this script is being run on the command line

Example: `foo.py`

In [27]:
# foo.py

print('executing on import')

def f():
    print('executing in a function')
    
if(__name__ == '__main__'):
    print('executing in a script')


executing on import
executing in a script


## Handling input variables: Arg Parse module

**`sys.argv[1]`**

```
> python n-dist.py`
> python n-dist.py 'ecoli_ref-100k.fq'
```


- Can add configurability to function
 - example: Add plot ability
 
- What tells `argparse` the difference between option & req'd arg? Is it the use of dash (`-` or `--`)?
 - `nargs` parameter for `add_argument` function
  - `nargs='?'`: Enforces either 0 or 1 arguments
  - `nargs='+'`: Adds ability to add in multiple input arguments


Documentation:
- Google: 'docs python argparse'

## Sh-banging: `#! /usr/bin/env python3`

`#! /usr/bin/env python3`
- `#!` called a "sh-bang"
- Ensures the function runs in python version 3
- At top of a file made executable with: chmod +x *scriptname* means that you can specify the version of Python to run a script with

In [28]:
import sys
import utils
import argparse ## ARGPARSE

def main():
    parser = argparse.ArgumentParser()   # Create an arg parser
    
    # Configure: Specifying REQUIRED arguments
    parser.add_argument('sequencefile')  # takes in 1 required input
    
    # Configure: Specifying OPTIONAL arguments
    parser.add_argument('-v', '--version', help='print versions')
    parser.add_argument('sequencefile',
                        help='a FASTQ file full of sequences')
    
    # Parse arguments
    args = parser.parse_args()
    
    print('Using version', utils.version, 'of utils.py')

    # access input args via ARGS.SEQUENCEFILE
    seqs = utils.load_sequences(args.sequencefile)  

    tracking = []
    for seq in seqs:
        n_bases, n_ns = utils.count_Ns(seq)
        tracking.append(n_ns)

    num_with_zero = 0
    num_with_one = 0
    num_with_more = 0

    for count in tracking:
        if count == 0:
            num_with_zero += 1
        elif count == 1:
            num_with_one += 1
        else:
            num_with_more += 1

    print(num_with_zero, num_with_one, num_with_more)
    print(sum(tracking))

    
if __name__ == '__main__':
    main()


ImportError: No module named 'utils'

In [33]:
'ABcDSF'.upper.count('N')

AttributeError: 'builtin_function_or_method' object has no attribute 'count'