In [52]:
import skbio # scikit-bio belongs to the scipy stack.

For a library to become part of the SciPy ecosystem, it must build on the current SciPy libraries.
Library authors cannot create objects unique to their libraries that are functionally equivalent
    to existing SciPy or built-in objects.
    The rationale is that learning enough of a new library to get started and be productive is time-intensive.
    SciPy users should be able to leverage their knowledge of existing SciPy objects and built-ins
    when using new objects.

In [53]:
# documentation for skbio.io.read() says it's a generator.
# python generators can be iterated through only once, after which they're exhausted.
# list() consumes the generator to the end and keeps every record it yields, in file order.
# (a for loop or list() calls next() on the generator for us; writing the bare name
# `next` inside a loop body does nothing at all.)
inList = list(skbio.io.read("All-Unigene1000.fa.bz2", format='fasta', compression='bz2'))

In [54]:
# The full All-Unigene.fa has 148787 fasta records; the file read above
# (All-Unigene1000.fa.bz2) appears to be a 1000-record subset of it, so 1000 is
# the count to expect here -- TODO confirm against the source file.
len(inList) # verifying that inList has as many elements as the file has records

1000

In [55]:
# a closer look at the first scikit-bio object we collected
firstValues = inList[0].values # ndarray holding the data string as single byte-chars
# build a plain python list, ascii-decoding every byte-char along the way
nuclList = [byteChar.decode('ascii') for byteChar in firstValues]
# join is the inverse of split:
#   split breaks a string up into the elements of a list
#   join glues the elements of a list (all of them strings) back into one string
nuclStr = ''.join(nuclList) # sep.join(list-of-strings)

In [None]:
len(nuclStr) # length of the nucleotide string of the first fasta record

In [None]:
type(inList[0])

In [28]:
# documentation on skbio.sequence mentions the metadata, which is collected in a dictionary
inList[0].metadata

{'id': 'Unigene1_All', 'description': 'size  535    gap  0  0%'}

## now that we have a handle on the scikit-bio object, let's extract the data we want, add to it through computation, then write out our resultset to a datafile.

In [56]:
def create_nuclStr(inListPos: int) -> str:
    '''Return the nucleotide string of the sequence stored at inList[inListPos].

    Args:
        inListPos: index into the notebook-level list `inList`, whose items are
            the scikit-bio sequence objects collected from the fasta file.

    Returns:
        The characters of that sequence as one ordinary python str.

    Raises:
        IndexError: if inListPos is outside the bounds of inList.
    '''
    # str() on a scikit-bio sequence returns its characters, ascii-decoded, as a
    # single string. That is the same result as decoding every byte-char of
    # .values in a python loop and joining the pieces, without the per-byte work.
    return str(inList[inListPos])

In [57]:
# one [record id, nucleotide string] list per sequence object;
# the collection as a whole is a list-of-lists, or LoL
dfList = [
    [seqObj.metadata['id'], create_nuclStr(pos)]
    for pos, seqObj in enumerate(inList)
]

In [58]:
dfList[1]

['Unigene2_All',
 'CCCATTCCCGGCATAGGTGGTGCAGGTAAAGCAAGAGGCAAAAGCTGAGCATACATATTCTGCTGTGCAAGTACTTCTTTCTCTTCATGCTCCTTGGCTTTGACTTCCTTTTGAGCTTCAATTTTGTCCTTCACAAGCTCGTCGACTTTTCCGGTATACTCGCGCATAAACTGTAACAAGTATGGGAAGGCAAAGTCGATCATATTGTTTACCCAGGCAAGTTCAAGAGCCACATCAGGCCGAACTAAATCGTAACAAACGAAGAGGCATGAGGCAAAGCATTCTTTCTTTCCCTTTTCGATGAAGTAAACAAGCAACTCCTCTGCAAGTTCACGGTCACCAGATTGTGAGGCCGTCTCCATGGCATCTTTGTAAAGGTTGTCTTTCTTAGACAGCGCAATTGACTGTCTCCATCTGCCTGCTTTCTTATAAATGTAAGCAGCCACACGTCTCATTTCAAGAAGCTCGTGTTTCTCAATCTTCTGTGCGAGGCCAATCTGGTCAAAGTTATCATGCAAATCTATTGATTCATGAAGCCTGTCATAGTCTTCGTCCTCAACATAAATCTCATTTAAAGCCTCGTTCACAGCAGACACGTTATTACTCTGAACTGCAACCATGTATGGCTTCACAAGACGCAAATGACCAGCCTTCCGCATAATGTCAACAACACGTGTATGATCCACACGGAGTGCAAGCACATTGAGCATATCATTGATAAGATCAGGATGTTCTTGCAAGTAAAAATGAACGGCCTTATAATATAGCTCCACATTTGCAACTTTAACAGCAACATCCTTGAACTGCATGTGATCCCATGCTTCAGGAGAATGGTTCATAATGGTGGTTGCAGCATTATCAAACTCATCATACTGGATGTACAAATAAGTAAGTTCCTTCCAATGCTGTTGTTCATCGCAAGCTCGTATAAGCTTGGGAATATTGAGACGGGTTGAAAAAAGTTTGATATGCTCCATAAGC

In [10]:
len(dfList)

148787

In [60]:
import myModule as mM

In [14]:
# extend every record in place with the value mM.get_gc_content_per() computes
# for its nucleotide string (record[1]).
# note: each run of this cell appends one more value to every record.
for seqRecord in dfList:
    gcPercent = mM.get_gc_content_per(seqRecord[1])
    seqRecord.append(gcPercent)

In [15]:
dfList[:5]
# what went wrong?

[['Unigene1_All',
  'ATCATTATTGATAGCAACAACAATCCGGAGCACTTCCTCACCACCAATCCATACTATGATTCTCGCGTTGTGGGTAAATATTGTGAGAAACGTGATCCTACCCTGGCAGTTGTAGCTTACAGGAGAGGACAATGTGATGATGAACTCATCAATGTTACGAATAAGAACTCTTTGTTCAAACTGCAGGCCAGATATGTAGTTGAAAGGATGGACGGCGATCTGTGGGAAAAGGTTCTTACTCCTGATAATGCCTTTAGAAGACAGCTCATTGATCAAGTTGTGTCAACAGCTTTGCCTGAGAGTAAAAGCCCAGAGCAAGTTTCTGCTGCTGTTAAGGCTTTCATGACTGCTGATCTTCCCCATGAATTAATTGAGCTTCTTGAAAAGATAGTATTGCAGAATTCAGCATTCAGTGGGAACTTTAATCTGCAAAACCTGCTTATCTTAACAGCCATTAAAGCAGATCCAACTCGAGTTATGGATTACATTAATAGATTGGATAACTTTGATGGACCAGCTGTTGGTGAAGTGGCTA',
  0.0],
 ['Unigene2_All',
  'CCCATTCCCGGCATAGGTGGTGCAGGTAAAGCAAGAGGCAAAAGCTGAGCATACATATTCTGCTGTGCAAGTACTTCTTTCTCTTCATGCTCCTTGGCTTTGACTTCCTTTTGAGCTTCAATTTTGTCCTTCACAAGCTCGTCGACTTTTCCGGTATACTCGCGCATAAACTGTAACAAGTATGGGAAGGCAAAGTCGATCATATTGTTTACCCAGGCAAGTTCAAGAGCCACATCAGGCCGAACTAAATCGTAACAAACGAAGAGGCATGAGGCAAAGCATTCTTTCTTTCCCTTTTCGATGAAGTAAACAAGCAACTCCTCTGCAAGTTCACGGTCACCAGATTGTGAGGCCGTCTCCATGGCATCTTTGTAAAGGTTGTCTTTCTTAGACAGCGCAATTGACTGTCTCC

In [62]:
# let's modify the function in our module to allow both upper- and lowercase nucleotide strings as input
# `del` only unbinds the name mM in this notebook session; the module is imported again in a later cell
del mM
# these magics allow the notebook to reload contents of edited modules
# into session memory.
# NOTE: keep every magic on a line of its own -- the error output recorded for this cell
# shows a trailing "# comment" being read as part of the extension name.
%load_ext autoreload
%autoreload 2

ModuleNotFoundError: No module named 'autoreload # note that jupyter told us to %reload_ext'

In [66]:
import myModule as mM

### now we re-import our (edited) module, re-run our code, and view the first 5 records

In [67]:
# start over from the sequence objects: one [record id, nucleotide string] list each,
# collected into a list-of-lists, or LoL
dfList = [
    [seqObj.metadata['id'], create_nuclStr(pos)]
    for pos, seqObj in enumerate(inList)
]

# then extend every record in place with the value the (edited) module computes
# for its nucleotide string
for seqRecord in dfList:
    seqRecord.append(mM.get_gc_content_per(seqRecord[1]))

dfList[:5]

[['Unigene1_All',
  'ATCATTATTGATAGCAACAACAATCCGGAGCACTTCCTCACCACCAATCCATACTATGATTCTCGCGTTGTGGGTAAATATTGTGAGAAACGTGATCCTACCCTGGCAGTTGTAGCTTACAGGAGAGGACAATGTGATGATGAACTCATCAATGTTACGAATAAGAACTCTTTGTTCAAACTGCAGGCCAGATATGTAGTTGAAAGGATGGACGGCGATCTGTGGGAAAAGGTTCTTACTCCTGATAATGCCTTTAGAAGACAGCTCATTGATCAAGTTGTGTCAACAGCTTTGCCTGAGAGTAAAAGCCCAGAGCAAGTTTCTGCTGCTGTTAAGGCTTTCATGACTGCTGATCTTCCCCATGAATTAATTGAGCTTCTTGAAAAGATAGTATTGCAGAATTCAGCATTCAGTGGGAACTTTAATCTGCAAAACCTGCTTATCTTAACAGCCATTAAAGCAGATCCAACTCGAGTTATGGATTACATTAATAGATTGGATAACTTTGATGGACCAGCTGTTGGTGAAGTGGCTA',
  41.1214953271028],
 ['Unigene2_All',
  'CCCATTCCCGGCATAGGTGGTGCAGGTAAAGCAAGAGGCAAAAGCTGAGCATACATATTCTGCTGTGCAAGTACTTCTTTCTCTTCATGCTCCTTGGCTTTGACTTCCTTTTGAGCTTCAATTTTGTCCTTCACAAGCTCGTCGACTTTTCCGGTATACTCGCGCATAAACTGTAACAAGTATGGGAAGGCAAAGTCGATCATATTGTTTACCCAGGCAAGTTCAAGAGCCACATCAGGCCGAACTAAATCGTAACAAACGAAGAGGCATGAGGCAAAGCATTCTTTCTTTCCCTTTTCGATGAAGTAAACAAGCAACTCCTCTGCAAGTTCACGGTCACCAGATTGTGAGGCCGTCTCCATGGCATCTTTGTAAAGGTTGTCTTTCTTAGACAGCGCA

In [68]:
# turn the list-of-lists into a pandas dataframe, one named column per record field
import pandas as pd
columnNames = ('UniqID', 'nuclStr', 'gc_content_%')
df = pd.DataFrame(data=dfList, columns=columnNames)

In [27]:
df[:5]

Unnamed: 0,UniqID,nuclStr,gc_content_%
0,Unigene1_All,ATCATTATTGATAGCAACAACAATCCGGAGCACTTCCTCACCACCA...,41.121495
1,Unigene2_All,CCCATTCCCGGCATAGGTGGTGCAGGTAAAGCAAGAGGCAAAAGCT...,42.135972
2,Unigene3_All,TATAAAACGACGTCGTTTAATCTCGGCTAATAGAATAGTTATATAA...,42.721893
3,Unigene4_All,CAACACCTGTTTCCATTAACTCCTCTCGATATAAAATTAAATTTCG...,45.15778
4,Unigene5_All,CTCCAAATTCTGCAATCTTGACTGTTTCAGTCTTGGTGTCTACCAG...,41.263941


In [69]:
# let's say we want to append string length to each record.
# it is the second whitespace-separated token of the metadata description string
# in the scikit-bio objects, e.g. 'size  535    gap  0  0%' -> 535
for pos, seqObj in enumerate(inList):
    descriptionTokens = seqObj.metadata['description'].split()
    dfList[pos].append(int(descriptionTokens[1]))
dfList[:5]

[['Unigene1_All',
  'ATCATTATTGATAGCAACAACAATCCGGAGCACTTCCTCACCACCAATCCATACTATGATTCTCGCGTTGTGGGTAAATATTGTGAGAAACGTGATCCTACCCTGGCAGTTGTAGCTTACAGGAGAGGACAATGTGATGATGAACTCATCAATGTTACGAATAAGAACTCTTTGTTCAAACTGCAGGCCAGATATGTAGTTGAAAGGATGGACGGCGATCTGTGGGAAAAGGTTCTTACTCCTGATAATGCCTTTAGAAGACAGCTCATTGATCAAGTTGTGTCAACAGCTTTGCCTGAGAGTAAAAGCCCAGAGCAAGTTTCTGCTGCTGTTAAGGCTTTCATGACTGCTGATCTTCCCCATGAATTAATTGAGCTTCTTGAAAAGATAGTATTGCAGAATTCAGCATTCAGTGGGAACTTTAATCTGCAAAACCTGCTTATCTTAACAGCCATTAAAGCAGATCCAACTCGAGTTATGGATTACATTAATAGATTGGATAACTTTGATGGACCAGCTGTTGGTGAAGTGGCTA',
  41.1214953271028,
  535],
 ['Unigene2_All',
  'CCCATTCCCGGCATAGGTGGTGCAGGTAAAGCAAGAGGCAAAAGCTGAGCATACATATTCTGCTGTGCAAGTACTTCTTTCTCTTCATGCTCCTTGGCTTTGACTTCCTTTTGAGCTTCAATTTTGTCCTTCACAAGCTCGTCGACTTTTCCGGTATACTCGCGCATAAACTGTAACAAGTATGGGAAGGCAAAGTCGATCATATTGTTTACCCAGGCAAGTTCAAGAGCCACATCAGGCCGAACTAAATCGTAACAAACGAAGAGGCATGAGGCAAAGCATTCTTTCTTTCCCTTTTCGATGAAGTAAACAAGCAACTCCTCTGCAAGTTCACGGTCACCAGATTGTGAGGCCGTCTCCATGGCATCTTTGTAAAGGTTGTCTTTCTTAGA

In [70]:
# rebuild the pandas dataframe now that every record also carries its size
import pandas as pd
columnNames = ('UniqID', 'nuclStr', 'gc_content_%', 'size')
df = pd.DataFrame(data=dfList, columns=columnNames)

In [71]:
df[:5]

Unnamed: 0,UniqID,nuclStr,gc_content_%,size
0,Unigene1_All,ATCATTATTGATAGCAACAACAATCCGGAGCACTTCCTCACCACCA...,41.121495,535
1,Unigene2_All,CCCATTCCCGGCATAGGTGGTGCAGGTAAAGCAAGAGGCAAAAGCT...,42.135972,3942
2,Unigene3_All,TATAAAACGACGTCGTTTAATCTCGGCTAATAGAATAGTTATATAA...,42.721893,845
3,Unigene4_All,CAACACCTGTTTCCATTAACTCCTCTCGATATAAAATTAAATTTCG...,45.15778,919
4,Unigene5_All,CTCCAAATTCTGCAATCTTGACTGTTTCAGTCTTGGTGTCTACCAG...,41.263941,269


In [72]:
# reorder the columns so that 'size' comes before 'gc_content_%'.
# selecting with a list of labels raises KeyError on a misspelled column name,
# whereas pd.DataFrame(df, columns=...) would quietly add an all-NaN column for it.
df = df[['UniqID', 'nuclStr', 'size', 'gc_content_%']]

In [40]:
df[:5]

Unnamed: 0,UniqID,nuclStr,size,gc_content_%
0,Unigene1_All,ATCATTATTGATAGCAACAACAATCCGGAGCACTTCCTCACCACCA...,535,41.121495
1,Unigene2_All,CCCATTCCCGGCATAGGTGGTGCAGGTAAAGCAAGAGGCAAAAGCT...,3942,42.135972
2,Unigene3_All,TATAAAACGACGTCGTTTAATCTCGGCTAATAGAATAGTTATATAA...,845,42.721893
3,Unigene4_All,CAACACCTGTTTCCATTAACTCCTCTCGATATAAAATTAAATTTCG...,919,45.15778
4,Unigene5_All,CTCCAAATTCTGCAATCTTGACTGTTTCAGTCTTGGTGTCTACCAG...,269,41.263941


In [73]:
# let's save our resultset. pandas gives us lots of file output options.
# NOTE(review): index=False is not passed, so the row index is presumably written
# as an unnamed first column -- confirm that whatever reads this file expects it.
df.to_csv('intermediate_record_set.csv', sep=',')

In [76]:
%ls -ltr

total 1246
-rw-rw---- 1 jderry CCBB_Workshops_1  11020 May  1 09:24 4_functions_modules.ipynb
-rw-rw---- 1 jderry CCBB_Workshops_1   5121 May  2 15:51 2_python_as_calculator.ipynb
-rw-rw---- 1 jderry CCBB_Workshops_1   3588 May  2 22:20 3_variables.ipynb
-rw-rw---- 1 jderry CCBB_Workshops_1   7147 May  5 14:01 5_algorithms_plus_data_structures_equal_programs.ipynb
-rw-rw---- 1 jderry CCBB_Workshops_1   9949 May  5 14:38 1_python_and_jupyter_preliminaries.ipynb
-rw-rw---- 1 jderry CCBB_Workshops_1    992 May  6 19:32 6_working_with_datasets_i_fasta.ipynb
-rw-rw---- 1 jderry CCBB_Workshops_1    428 May  7 10:56 myModule.py
drwxrwx--- 2 jderry CCBB_Workshops_1      3 May  7 10:58 [0m[01;34m__pycache__[0m/
-rw-rw---- 1 jderry CCBB_Workshops_1 240177 May  7 15:42 All-Unigene1000.fa.bz2
-rw-rw---- 1 jderry CCBB_Workshops_1 839663 May  7 15:48 intermediate_record_set.csv
-rw-rw---- 1 jderry CCBB_Workshops_1 232494 May  7 15:48 intermediate_record_set.csv.tar.bz2
-rw-rw---- 1 jd

In [75]:
import tarfile
# pack the csv written earlier into a bzip2-compressed tar archive;
# the with-block closes the archive once the file has been added
archiveName = 'intermediate_record_set.csv.tar.bz2'
with tarfile.open(archiveName, mode='w:bz2') as archive:
    archive.add('intermediate_record_set.csv')