In [None]:
import skbio # scikit-bio belongs to the scipy stack.

For a library to become part of the SciPy ecosystem, it must build on existing SciPy libraries.
Library authors cannot create objects unique to their libraries that are functionally equivalent
    to existing SciPy or built-in objects.
    The rationale is that learning enough code to get started and be productive is time-intensive.
    SciPy users should be able to leverage their knowledge of existing SciPy objects and built-ins
    when using new objects.

In [None]:
# documentation for skbio.io.read() says it's a generator.
# python generators can be iterated through only once, after which they're exhausted.
# a for loop (or list(), as here) calls next() on the generator implicitly,
# so no explicit `next` statement is needed to advance it.
inList = list(
    skbio.io.read("../dataset/All-Unigene1000.fa.bz2", format='fasta', compression='bz2')
)

In [None]:
# the fasta recordset we're using has only 1000 fasta records
len(inList) # verifying that inList has as many elements as the file has records

In [None]:
# a closer look at the first scikit-bio object we've collected.
# .values is an ndarray of byte-chars of the data string; the comprehension
# ascii-decodes every element while collecting them into a plain list.
nuclList = [byteChar.decode('ascii') for byteChar in inList[0].values]
# split and join are inverse operations:
# split breaks a string up into the elements of a list,
# join glues the elements of a list (when they are all strings)
# back together into a single string
nuclStr = ''.join(nuclList) # sep.join(list-of-strings)

In [None]:
len(nuclStr) # length of the nucleotide string of the first fasta record

In [None]:
type(inList[0])

In [None]:
# documentation on skbio.sequence mentions the metadata, which is collected in a dictionary
inList[0].metadata

## now that we have a handle on the scikit-bio object, let's extract the data we want, add to it through computation, then write out our resultset to a datafile.

In [None]:
def create_nuclStr(inListPos: int, sequences=None) -> str:
    '''Return the nucleotide string held by the sequence object at position inListPos.

    Parameters
    ----------
    inListPos : int
        Index of the sequence object within the collection.
    sequences : indexable collection, optional
        Objects exposing a ``.values`` attribute that iterates over byte-chars.
        When omitted, the notebook-level ``inList`` is used, so existing
        one-argument calls keep working.

    Returns
    -------
    str
        The ascii-decoded characters of ``.values`` joined into one string.

    Raises
    ------
    IndexError
        If inListPos is outside the collection (propagated from indexing).
    '''
    if sequences is None:
        # fall back to the collection built earlier in the notebook
        sequences = inList
    byteChars = sequences[inListPos].values # an ndarray of byte-chars
    # decode each byte-char to a one-character str and join them in one pass
    return ''.join(byteChar.decode('ascii') for byteChar in byteChars)

In [None]:
# collect one [id, nucleotide string] record per sequence object;
# the collection is a list-of-lists, or LoL
dfList = []
for index, seqObj in enumerate(inList):
    recID = seqObj.metadata['id']
    nuclStr = create_nuclStr(index)
    record = [recID, nuclStr]
    dfList.append(record)

In [None]:
dfList[1]

In [None]:
len(dfList)

In [None]:
# create module, save to working directory
import myModule as mM

In [None]:
# store the result of mM.get_gc_content_per() as the third field of every record.
# slice assignment (instead of append) keeps this cell idempotent: on the first run
# it extends the two-field record exactly as append would; on a re-run it replaces
# everything after the second field instead of piling on another value.
for record in dfList:
    record[2:] = [mM.get_gc_content_per(record[1])]

In [None]:
dfList[:5]
# what went wrong?

In [None]:
# let's modify the function in our module to allow both upper- and lowercase nucleotide string as input
del mM
# these magics allow the notebook to reload contents of edited modules
# into session memory
%load_ext autoreload
%autoreload 2

In [None]:
import myModule as mM

### now we reload our (edited) module, re-run our code, and view the first 5 records

In [None]:
# rebuild the list-of-lists from scratch: one [id, nucleotide string] record per sequence object
dfList = []
for index, seqObj in enumerate(inList):
    recID = seqObj.metadata['id']
    nuclStr = create_nuclStr(index)
    record = [recID, nuclStr]
    dfList.append(record)

# extend every record with the value returned by the module's gc-content function
for record in dfList:
    gcContent = mM.get_gc_content_per(record[1])
    record.append(gcContent)

# view the first 5 records
dfList[:5]

In [None]:
# let's make a pandas dataframe out of our dataset
import pandas as pd
df = pd.DataFrame(dfList, columns=('UniqID', 'nuclStr', 'gc_content_%'))

In [None]:
df[:5]

In [None]:
# let's say we want to append string length to each record.
# it is taken from the metadata description string in the scikit-bio objects:
# the second whitespace-separated token, converted to int
# (assumes every description has that layout --- TODO confirm against the fasta headers)
for index, seqObj in enumerate(inList):
    strLength = seqObj.metadata['description'].split()[1]
    # slice assignment pins the size to the fourth field, so re-running this cell
    # overwrites it rather than appending a fifth value, which would no longer
    # fit a four-column dataframe
    dfList[index][3:] = [int(strLength)]
dfList[:5]

In [None]:
# let's remake our pandas dataframe out of our dataset
import pandas as pd
df = pd.DataFrame(dfList, columns=('UniqID', 'nuclStr', 'gc_content_%', 'size'))

In [None]:
df[:5]

In [None]:
# reorder the columns so that 'size' comes before 'gc_content_%'
df = df.reindex(columns=['UniqID', 'nuclStr', 'size', 'gc_content_%'])

In [None]:
df[:5]

# saving our work

In [None]:
# time to persist the resultset; pandas offers many output formats.
# a csv file is easy to share with colleagues or to feed into other data processing software.
df.to_csv(path_or_buf='intermediate_record_set.csv', sep=',')
# a pickle file is for ourselves: it lets us pick up later exactly where we left off.
df.to_pickle(path='df.bin')

### pickling saves our collections to our mass storage drive (hard drive, usb drive, etc) in a format that allows us to restore them directly to python session memory later.
### this allows us to work on a collection over time --- update its values, add new values, and delete values --- as we might over long-running projects, writing it out when we stop working on the project, then reading it directly back into memory when we pick up where we left off.
### let's see how that works.

In [None]:
%ls -ltr

In [None]:
%whos

### in python, the del keyword allows us to delete objects from session memory.
### let's delete the df dataframe from memory, then restore it from the pickle file.

In [None]:
del df

In [None]:
%whos # it's gone!

In [None]:
# if we were starting a new session that we're restoring df to, we'd need to first import pandas ---
import pandas as pd

In [None]:
# and now we unpickle df into session memory...
# the path must match the one df.to_pickle('df.bin') wrote to, i.e. the working directory.
# NOTE: unpickling can execute arbitrary code --- only load pickle files you created or trust.
df = pd.read_pickle('df.bin')

In [None]:
%whos # it's back! tuh-duh!

In [None]:
# optionally, bundle the csv file into a bzip2-compressed tar archive
import tarfile
with tarfile.open(name='../dataset/intermediate_record_set.csv.tar.bz2', mode='w:bz2') as archive:
    archive.add('intermediate_record_set.csv')