## 1. Dictionaries, Maps and Hashtables

#### *  Dictionaries are the central data structure in Python.

#### *  Dicts store an arbitrary number of objects, each identified by a unique hashable *key*.

#### *  Dicts are *unordered* (key, value) mappings.

#### *  Dicts are often called *maps*, *hashmaps*, *lookup tables*, or *associative arrays*.

#### *  Dicts allow for efficient lookup, insert, update and delete operations of any object associated with a given key.

### Built-in **dict** mapping data type

#### Indexed by keys that can be of any hashable data type (e.g. immutable types such as strings and numbers, or tuples of hashable types only)

#### Dicts are based on a well tuned hashtable implementation

#### *O(1)* time complexity for lookups, inserts, deletes and updates 

#### Dicts can be created by several means:

#### -  Curly-braces dictionary expression syntax, comma-separated list of key:value pairs
#### -  Dictionary comprehension: {}
#### -  **dict** type constructor: dict()

In [1]:
DNA_bases = {
    'A':'adenine',
    'T':'thymine',
    'C':'cytosine',
    'G':'guanine'
}

RNA_bases = dict(zip(['A','U','C','G'], ['adenine','uracil','cytosine','guanine']))

hexadecimals = {x:hex(x) for x in range(16)}

In [2]:
hexadecimals

{0: '0x0',
 1: '0x1',
 2: '0x2',
 3: '0x3',
 4: '0x4',
 5: '0x5',
 6: '0x6',
 7: '0x7',
 8: '0x8',
 9: '0x9',
 10: '0xa',
 11: '0xb',
 12: '0xc',
 13: '0xd',
 14: '0xe',
 15: '0xf'}

In [3]:
DNA_bases

{'A': 'adenine', 'T': 'thymine', 'C': 'cytosine', 'G': 'guanine'}

In [4]:
RNA_bases

{'A': 'adenine', 'U': 'uracil', 'C': 'cytosine', 'G': 'guanine'}

In [6]:
dir(dict)

['__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'clear',
 'copy',
 'fromkeys',
 'get',
 'items',
 'keys',
 'pop',
 'popitem',
 'setdefault',
 'update',
 'values']

In [7]:
help(dict)

Help on class dict in module builtins:

class dict(object)
 |  dict() -> new empty dictionary
 |  dict(mapping) -> new dictionary initialized from a mapping object's
 |      (key, value) pairs
 |  dict(iterable) -> new dictionary initialized as if via:
 |      d = {}
 |      for k, v in iterable:
 |          d[k] = v
 |  dict(**kwargs) -> new dictionary initialized with the name=value pairs
 |      in the keyword argument list.  For example:  dict(one=1, two=2)
 |  
 |  Methods defined here:
 |  
 |  __contains__(self, key, /)
 |      True if the dictionary has the specified key, else False.
 |  
 |  __delitem__(self, key, /)
 |      Delete self[key].
 |  
 |  __eq__(self, value, /)
 |      Return self==value.
 |  
 |  __ge__(self, value, /)
 |      Return self>=value.
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  __getitem__(...)
 |      x.__getitem__(y) <==> x[y]
 |  
 |  __gt__(self, value, /)
 |      Return self>value.
 |  
 |  __init__(self,

In [8]:
# Extracting a subset of a dictionary
# Make a dict of purines using a dictionary comprehension

codes = {'A', 'G'}
purines = {k:v for k,v in DNA_bases.items() if k in codes}
purines

{'A': 'adenine', 'G': 'guanine'}

In [9]:
# Make a dict of pyrimidines
# Remove purines keys

DNA_pyrimidines = {k:DNA_bases[k] for k in DNA_bases.keys() - codes}
DNA_pyrimidines

{'C': 'cytosine', 'T': 'thymine'}

In [10]:
RNA_pyrimidines = {k:RNA_bases[k] for k in RNA_bases.keys() - codes}
RNA_pyrimidines

{'C': 'cytosine', 'U': 'uracil'}

In [11]:
list(RNA_bases)

['A', 'U', 'C', 'G']

In [12]:
len(RNA_bases)

4

In [13]:
it = iter(RNA_bases)
next(it)

'A'

In [14]:
RNA_pyrimidines.get('A') #returns default None since the key is not in the dictionary

In [15]:
hexadecimals_extra = {x:hex(x) for x in range(16, 32)}
hexadecimals.update(hexadecimals_extra)
hexadecimals

{0: '0x0',
 1: '0x1',
 2: '0x2',
 3: '0x3',
 4: '0x4',
 5: '0x5',
 6: '0x6',
 7: '0x7',
 8: '0x8',
 9: '0x9',
 10: '0xa',
 11: '0xb',
 12: '0xc',
 13: '0xd',
 14: '0xe',
 15: '0xf',
 16: '0x10',
 17: '0x11',
 18: '0x12',
 19: '0x13',
 20: '0x14',
 21: '0x15',
 22: '0x16',
 23: '0x17',
 24: '0x18',
 25: '0x19',
 26: '0x1a',
 27: '0x1b',
 28: '0x1c',
 29: '0x1d',
 30: '0x1e',
 31: '0x1f'}

In [16]:
# Find commonalities in two dictionaries

# keys in common
hexadecimals.keys() & hexadecimals_extra.keys()

{16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}

In [17]:
# (key, value) pairs not in common
RNA_bases.items() - DNA_bases.items()

{('U', 'uracil')}

### Specialized dictionary implementations from Python's standard library

### **collections.defaultdict**

#### dict subclass that calls a factory function to supply default values for missing keys

In [18]:
from collections import defaultdict

nucleobases = [('purine', 'A'), ('pyrimidine', 'T'), ('pyrimidine', 'U'), ('pyrimidine', 'C'), ('purine', 'G')]
nuc_defdict = defaultdict(list)

# accessing a missing key creates it and 
# initializes it using the default factory
# e.g. list(), set() or any callable passed to the constructor
for k, v in nucleobases:
    nuc_defdict[k].append(v)
    
sorted(nuc_defdict.items())

[('purine', ['A', 'G']), ('pyrimidine', ['T', 'U', 'C'])]

In [19]:
# accessing an uninitialized key
# results in default value assignment
# here an empty list
nuc_defdict['other_nuc']

[]

In [20]:
'''
defaultdict(lambda :None)
defaultdict(lambda :50)


def defaultvalue():
    return 50

other_defdict = defaultdict(defaultvalue)
'''

'\ndefaultdict(lambda :None)\ndefaultdict(lambda :50)\n\n\ndef defaultvalue():\n    return 50\n\nother_defdict = defaultdict(defaultvalue)\n'

### **collections.OrderedDict**

#### dict subclass that remembers the original insertion order of entries added to it when iterating or serializing

In [21]:
from collections import OrderedDict

gene_ids = {"symbol": "http://identifiers.org/hgnc.symbol/",
            "ensembl.gene": "http://identifiers.org/ensembl.gene/",
            "ensembl.protein": "http://identifiers.org/ensembl.protein/",
            "ensembl.transcript": "http://identifiers.org/ensembl.transcript/",
            "go.CC.id": "http://identifiers.org/go/",
            "go.CC.pubmed": "http://identifiers.org/pubmed/",
            "go.MF.id": "http://identifiers.org/go/",
            "go.MF.pubmed": "http://identifiers.org/pubmed/",
            "go.BP.id": "http://identifiers.org/go/",
            "go.BP.pubmed": "http://identifiers.org/pubmed/",
            "generif.pubmed": "http://identifiers.org/pubmed/",
            "homologene.id": "http://identifiers.org/homologene/",
            "interpro.id": "http://identifiers.org/interpro/",
            "pathway.biocarta.id": "http://identifiers.org/biocarta.pathway/",
            "pathway.kegg.id": "http://identifiers.org/kegg.pathway/",
            "pathway.pharmgkb.id": "http://identifiers.org/pharmgkb.pathways/",
            "pathway.reactome.id": "http://identifiers.org/reactome/",
            "pathway.sympdb.id": "http://identifiers.org/smpdb/",
            "pathway.wikipathways.id": "http://identifiers.org/wikipathways/",
            "refseq.genomic": "http://identifiers.org/refseq/",
            "refseq.protein": "http://identifiers.org/refseq/",
            "refseq.rna": "http://identifiers.org/refseq/",
            "uniprot.Swiss-Prot": "http://identifiers.org/uniprot/"
           }

gene_info_ordered = OrderedDict(gene_ids)

In [22]:
gene_info_ordered

OrderedDict([('symbol', 'http://identifiers.org/hgnc.symbol/'),
             ('ensembl.gene', 'http://identifiers.org/ensembl.gene/'),
             ('ensembl.protein', 'http://identifiers.org/ensembl.protein/'),
             ('ensembl.transcript',
              'http://identifiers.org/ensembl.transcript/'),
             ('go.CC.id', 'http://identifiers.org/go/'),
             ('go.CC.pubmed', 'http://identifiers.org/pubmed/'),
             ('go.MF.id', 'http://identifiers.org/go/'),
             ('go.MF.pubmed', 'http://identifiers.org/pubmed/'),
             ('go.BP.id', 'http://identifiers.org/go/'),
             ('go.BP.pubmed', 'http://identifiers.org/pubmed/'),
             ('generif.pubmed', 'http://identifiers.org/pubmed/'),
             ('homologene.id', 'http://identifiers.org/homologene/'),
             ('interpro.id', 'http://identifiers.org/interpro/'),
             ('pathway.biocarta.id',
              'http://identifiers.org/biocarta.pathway/'),
             ('pathway.keg

In [23]:
# reassignment of an existing key doesn't change the order
# OrderedDict maintains internally a doubly linked list for ordering the keys in the insertion order

gene_info_ordered['generif.pubmed'] = 'some_uri'
gene_info_ordered

OrderedDict([('symbol', 'http://identifiers.org/hgnc.symbol/'),
             ('ensembl.gene', 'http://identifiers.org/ensembl.gene/'),
             ('ensembl.protein', 'http://identifiers.org/ensembl.protein/'),
             ('ensembl.transcript',
              'http://identifiers.org/ensembl.transcript/'),
             ('go.CC.id', 'http://identifiers.org/go/'),
             ('go.CC.pubmed', 'http://identifiers.org/pubmed/'),
             ('go.MF.id', 'http://identifiers.org/go/'),
             ('go.MF.pubmed', 'http://identifiers.org/pubmed/'),
             ('go.BP.id', 'http://identifiers.org/go/'),
             ('go.BP.pubmed', 'http://identifiers.org/pubmed/'),
             ('generif.pubmed', 'some_uri'),
             ('homologene.id', 'http://identifiers.org/homologene/'),
             ('interpro.id', 'http://identifiers.org/interpro/'),
             ('pathway.biocarta.id',
              'http://identifiers.org/biocarta.pathway/'),
             ('pathway.kegg.id', 'http://identif

In [24]:
for key in gene_info_ordered:
    print(key, gene_info_ordered[key])

symbol http://identifiers.org/hgnc.symbol/
ensembl.gene http://identifiers.org/ensembl.gene/
ensembl.protein http://identifiers.org/ensembl.protein/
ensembl.transcript http://identifiers.org/ensembl.transcript/
go.CC.id http://identifiers.org/go/
go.CC.pubmed http://identifiers.org/pubmed/
go.MF.id http://identifiers.org/go/
go.MF.pubmed http://identifiers.org/pubmed/
go.BP.id http://identifiers.org/go/
go.BP.pubmed http://identifiers.org/pubmed/
generif.pubmed some_uri
homologene.id http://identifiers.org/homologene/
interpro.id http://identifiers.org/interpro/
pathway.biocarta.id http://identifiers.org/biocarta.pathway/
pathway.kegg.id http://identifiers.org/kegg.pathway/
pathway.pharmgkb.id http://identifiers.org/pharmgkb.pathways/
pathway.reactome.id http://identifiers.org/reactome/
pathway.sympdb.id http://identifiers.org/smpdb/
pathway.wikipathways.id http://identifiers.org/wikipathways/
refseq.genomic http://identifiers.org/refseq/
refseq.protein http://identifiers.org/refseq/
r

In [25]:
gene_info_ordered['generif.pubmed'] = 'http://identifiers.org/pubmed/'
gene_info_ordered

OrderedDict([('symbol', 'http://identifiers.org/hgnc.symbol/'),
             ('ensembl.gene', 'http://identifiers.org/ensembl.gene/'),
             ('ensembl.protein', 'http://identifiers.org/ensembl.protein/'),
             ('ensembl.transcript',
              'http://identifiers.org/ensembl.transcript/'),
             ('go.CC.id', 'http://identifiers.org/go/'),
             ('go.CC.pubmed', 'http://identifiers.org/pubmed/'),
             ('go.MF.id', 'http://identifiers.org/go/'),
             ('go.MF.pubmed', 'http://identifiers.org/pubmed/'),
             ('go.BP.id', 'http://identifiers.org/go/'),
             ('go.BP.pubmed', 'http://identifiers.org/pubmed/'),
             ('generif.pubmed', 'http://identifiers.org/pubmed/'),
             ('homologene.id', 'http://identifiers.org/homologene/'),
             ('interpro.id', 'http://identifiers.org/interpro/'),
             ('pathway.biocarta.id',
              'http://identifiers.org/biocarta.pathway/'),
             ('pathway.keg

In [26]:
for key in gene_info_ordered:
    print(key, gene_info_ordered[key])

symbol http://identifiers.org/hgnc.symbol/
ensembl.gene http://identifiers.org/ensembl.gene/
ensembl.protein http://identifiers.org/ensembl.protein/
ensembl.transcript http://identifiers.org/ensembl.transcript/
go.CC.id http://identifiers.org/go/
go.CC.pubmed http://identifiers.org/pubmed/
go.MF.id http://identifiers.org/go/
go.MF.pubmed http://identifiers.org/pubmed/
go.BP.id http://identifiers.org/go/
go.BP.pubmed http://identifiers.org/pubmed/
generif.pubmed http://identifiers.org/pubmed/
homologene.id http://identifiers.org/homologene/
interpro.id http://identifiers.org/interpro/
pathway.biocarta.id http://identifiers.org/biocarta.pathway/
pathway.kegg.id http://identifiers.org/kegg.pathway/
pathway.pharmgkb.id http://identifiers.org/pharmgkb.pathways/
pathway.reactome.id http://identifiers.org/reactome/
pathway.sympdb.id http://identifiers.org/smpdb/
pathway.wikipathways.id http://identifiers.org/wikipathways/
refseq.genomic http://identifiers.org/refseq/
refseq.protein http://ide

In [27]:
# control the order of fields in a JSON encoding
# using the data contained in an OrderedDict

import json
json.dumps(gene_info_ordered)

'{"symbol": "http://identifiers.org/hgnc.symbol/", "ensembl.gene": "http://identifiers.org/ensembl.gene/", "ensembl.protein": "http://identifiers.org/ensembl.protein/", "ensembl.transcript": "http://identifiers.org/ensembl.transcript/", "go.CC.id": "http://identifiers.org/go/", "go.CC.pubmed": "http://identifiers.org/pubmed/", "go.MF.id": "http://identifiers.org/go/", "go.MF.pubmed": "http://identifiers.org/pubmed/", "go.BP.id": "http://identifiers.org/go/", "go.BP.pubmed": "http://identifiers.org/pubmed/", "generif.pubmed": "http://identifiers.org/pubmed/", "homologene.id": "http://identifiers.org/homologene/", "interpro.id": "http://identifiers.org/interpro/", "pathway.biocarta.id": "http://identifiers.org/biocarta.pathway/", "pathway.kegg.id": "http://identifiers.org/kegg.pathway/", "pathway.pharmgkb.id": "http://identifiers.org/pharmgkb.pathways/", "pathway.reactome.id": "http://identifiers.org/reactome/", "pathway.sympdb.id": "http://identifiers.org/smpdb/", "pathway.wikipathways.

In [28]:
#delete
removed = gene_info_ordered.pop('generif.pubmed')
removed

'http://identifiers.org/pubmed/'

In [29]:
gene_info_ordered

OrderedDict([('symbol', 'http://identifiers.org/hgnc.symbol/'),
             ('ensembl.gene', 'http://identifiers.org/ensembl.gene/'),
             ('ensembl.protein', 'http://identifiers.org/ensembl.protein/'),
             ('ensembl.transcript',
              'http://identifiers.org/ensembl.transcript/'),
             ('go.CC.id', 'http://identifiers.org/go/'),
             ('go.CC.pubmed', 'http://identifiers.org/pubmed/'),
             ('go.MF.id', 'http://identifiers.org/go/'),
             ('go.MF.pubmed', 'http://identifiers.org/pubmed/'),
             ('go.BP.id', 'http://identifiers.org/go/'),
             ('go.BP.pubmed', 'http://identifiers.org/pubmed/'),
             ('homologene.id', 'http://identifiers.org/homologene/'),
             ('interpro.id', 'http://identifiers.org/interpro/'),
             ('pathway.biocarta.id',
              'http://identifiers.org/biocarta.pathway/'),
             ('pathway.kegg.id', 'http://identifiers.org/kegg.pathway/'),
             ('path

In [30]:
# insert
gene_info_ordered['generif.pubmed'] = 'http://identifiers.org/pubmed/'
gene_info_ordered

OrderedDict([('symbol', 'http://identifiers.org/hgnc.symbol/'),
             ('ensembl.gene', 'http://identifiers.org/ensembl.gene/'),
             ('ensembl.protein', 'http://identifiers.org/ensembl.protein/'),
             ('ensembl.transcript',
              'http://identifiers.org/ensembl.transcript/'),
             ('go.CC.id', 'http://identifiers.org/go/'),
             ('go.CC.pubmed', 'http://identifiers.org/pubmed/'),
             ('go.MF.id', 'http://identifiers.org/go/'),
             ('go.MF.pubmed', 'http://identifiers.org/pubmed/'),
             ('go.BP.id', 'http://identifiers.org/go/'),
             ('go.BP.pubmed', 'http://identifiers.org/pubmed/'),
             ('homologene.id', 'http://identifiers.org/homologene/'),
             ('interpro.id', 'http://identifiers.org/interpro/'),
             ('pathway.biocarta.id',
              'http://identifiers.org/biocarta.pathway/'),
             ('pathway.kegg.id', 'http://identifiers.org/kegg.pathway/'),
             ('path

### **collections.Counter**

#### dict subclass for counting hashable objects

#### a dictionary that maps the hashable input items to the number of occurrences

#### Counter instances can be combined using various mathematical operations

In [32]:
from collections import Counter

seq = 'CAGCCTCCCGCGACGATGCCCCTCAACGTTAGCTTCACCAACAGGAACTATGACCTCGAC\
TACGACTCGGTGCAGCCGTATTTCTACTGCGACGAGGAGGAGAACTTCTACCAGCAGCAG\
CAGCAGAGCGAGCTGCAGCCCCCGGCGCCCAGCGAGGATATCTGGAAGAAATTCGAGCTG\
CTGCCCACCCCGCCCCTGTCCCCTAGCCGCCGCTCCGGGCTCTGCTCGCCCTCCTACGTT\
GCGGTCACACCCTTCTCCCTTCGGGGAGACAACGACGGCGGTGGCGGGAGCTTCTCCACG\
GCCGACCAGCTGGAGATGGTGACCGAGCTGCTGGGAGGAGACATGGTGAACCAGAGTTTC\
ATCTGCGACCCGGACGACGAGACCTTCATCAAAAACATCATCATCCAGGACTGTATGTGG\
AGCGGCTTCTCGGCCGCCGCCAAGCTCGTCTCAGAGAAGCTGGCCTCCTACCAGGCTGCG\
CGCAAAGACAGCGGCAGCCCGAACCCCGCCCGCGGCCACAGCGTCTGCTCCACCTCCAGC\
TTGTACCTGCAGGATCTGAGCGCCGCCGCCTCAGAGTGCATCGACCCCTCGGTGGTCTTC\
CCCTACCCTCTCAACGACAGCAGCTCGCCCAAGTCCTGCGCCTCGCAAGACTCCAGCGCC\
TTCTCTCCGTCCTCGGATTCTCTGCTCTCCTCGACGGAGTCCTCCCCGCAGGGCAGCCCC\
GAGCCCCTGGTGCTCCATGAGGAGACACCGCCCACCACCAGCAGCGACTCTG'

counts = Counter(seq)
counts

Counter({'C': 292, 'A': 146, 'G': 204, 'T': 130})

In [33]:
# two most frequently occuring items in the seq
counts.most_common(2)

[('C', 292), ('G', 204)]

### **collections.ChainMap**

#### dict-like class for creating a single, updatable view of multiple mappings

#### underlying mappings, stored in a list, are successively scanned till a key is found

#### operations that alter the mappings always affect the first mapping listed

In [34]:
from collections import ChainMap

binding_motif_core = {'pos2': 'A', 'pos3': 'T', 'pos4': 'T', 'pos5': 'G', 'pos6': 'C', 
                      'pos7': 'A', 'pos8': 'C', 'pos9': 'A', 'pos10': 'A'}

binding_motif_extended_variant = {'pos1': 'T', 'pos2': 'G', 'pos3': 'T', 'pos4': 'T', 'pos5': 'T', 'pos6': 'C', 
                      'pos7': 'G', 'pos8': 'T', 'pos9': 'C', 'pos10': 'A', 'pos11': 'T'}

binding_motif = ChainMap(binding_motif_core, binding_motif_extended_variant)
 
print(binding_motif['pos1'])    # from extended variant
print(binding_motif['pos2'])    # from core

print(len(binding_motif))
print(list(binding_motif.keys()))
print(list(binding_motif.values()))

print("First mapping: ", binding_motif_core)
del binding_motif['pos2']
print("Altered first mapping: ", binding_motif_core)

try:
    del binding_motif['pos2']
except KeyError as err:
    print("Error trying to alter the second mapping: ", err)
    
binding_motif['pos2'] = 'A'
print(sorted(binding_motif_core, key=lambda x: int(x[3:])))

T
A
11
['pos1', 'pos2', 'pos3', 'pos4', 'pos5', 'pos6', 'pos7', 'pos8', 'pos9', 'pos10', 'pos11']
['T', 'A', 'T', 'T', 'G', 'C', 'A', 'C', 'A', 'A', 'T']
First mapping:  {'pos2': 'A', 'pos3': 'T', 'pos4': 'T', 'pos5': 'G', 'pos6': 'C', 'pos7': 'A', 'pos8': 'C', 'pos9': 'A', 'pos10': 'A'}
Altered first mapping:  {'pos3': 'T', 'pos4': 'T', 'pos5': 'G', 'pos6': 'C', 'pos7': 'A', 'pos8': 'C', 'pos9': 'A', 'pos10': 'A'}
Error trying to alter the second mapping:  "Key not found in the first mapping: 'pos2'"
['pos2', 'pos3', 'pos4', 'pos5', 'pos6', 'pos7', 'pos8', 'pos9', 'pos10']


![TF_Sequence_Logo](https://jaspar.genereg.net/static/logos/all/svg/MA0466.1.svg)

### **types.MappingProxyType**

#### wrapper for creating immutable versions of dictionaries

In [35]:
from types import MappingProxyType

DNA_bases_read_only = MappingProxyType(DNA_bases)

print(DNA_bases_read_only['T'])

DNA_bases_read_only['T'] = 'uracil'    #invalid

thymine


TypeError: 'mappingproxy' object does not support item assignment

![DNA_RNA_cartoon.png](./Images/DNA_RNA_cartoon.png)

## 2. Array Data Structures

#### * Arrays are contiguous (not linked) data structures that store information in adjoining blocks of memory.

#### * Arrays consist of fixed-size data records that allow each element to be efficiently located based in its index.

#### * *O(1)* time complexity for accessing an element by its index. 

### **list**

#### *ordered* and *mutable* dynamic arrays

#### elements can be added or removed and the list will automatically adjust the storage by allocating or releasing memory

#### can hold elements of arbitrary data types 

#### less tightly packed than a types array, takes up more space

In [36]:
from scipy import stats

arr = [min, max, type, stats.describe, [0, 1, 1, 2, 3, 5, 8]]
arr

[<function min>,
 <function max>,
 type,
 <function scipy.stats.stats.describe(a, axis=0, ddof=1, bias=True, nan_policy='propagate')>,
 [0, 1, 1, 2, 3, 5, 8]]

In [37]:
arr[0]

<function min>

In [38]:
arr[-1]

[0, 1, 1, 2, 3, 5, 8]

In [39]:
arr[1](arr[-1])

8

In [40]:
arr[2](arr[-1])

list

In [41]:
arr[4].append(13)

In [42]:
arr

[<function min>,
 <function max>,
 type,
 <function scipy.stats.stats.describe(a, axis=0, ddof=1, bias=True, nan_policy='propagate')>,
 [0, 1, 1, 2, 3, 5, 8, 13]]

In [43]:
arr[-2](arr[-1])

DescribeResult(nobs=8, minmax=(0, 13), mean=4.125, variance=19.553571428571427, skewness=1.0874194280027905, kurtosis=-0.03979733533495944)

In [44]:
arr[-1][::-1]

[13, 8, 5, 3, 2, 1, 1, 0]

In [45]:
from numpy import mean, std

arr.append(mean)
arr.append(std)

arr

[<function min>,
 <function max>,
 type,
 <function scipy.stats.stats.describe(a, axis=0, ddof=1, bias=True, nan_policy='propagate')>,
 [0, 1, 1, 2, 3, 5, 8, 13],
 <function numpy.mean(a, axis=None, dtype=None, out=None, keepdims=<no value>)>,
 <function numpy.std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=<no value>)>]

In [46]:
arr[-2](arr[-3])

4.125

In [47]:
# a list comprehension
[idx for idx, item in enumerate(arr[-3]) if item % 2 == 0]

[0, 3, 6]

### **tuple**

#### *immutable* 

#### elements can't be added or removed dynamically 

#### elements are defined at creation time

#### can hold elements of arbitrary data types 

In [48]:
arr = 'A', 'T', 'G'
arr

('A', 'T', 'G')

In [49]:
arr[0] ='T'

TypeError: 'tuple' object does not support item assignment

In [50]:
del arr[0]

TypeError: 'tuple' object doesn't support item deletion

In [51]:
arr.append('T')

AttributeError: 'tuple' object has no attribute 'append'

In [52]:
# adding elements creates a copy of the tuple
arr + ('A',)

('A', 'T', 'G', 'A')

### **array.array**

#### *mutable space-efficient typed* arrays 

#### store characters, integer or floating point numeric data types

#### the *type* is specified at creation time using a single character *typecode*

In [54]:
import array

help(array.array)

Help on class array in module array:

class array(builtins.object)
 |  array(typecode [, initializer]) -> array
 |  
 |  Return a new array whose items are restricted by typecode, and
 |  initialized from the optional initializer value, which must be a list,
 |  string or iterable over elements of the appropriate type.
 |  
 |  Arrays represent basic values and behave very much like lists, except
 |  the type of objects stored in them is constrained. The type is specified
 |  at object creation time by using a type code, which is a single character.
 |  The following type codes are defined:
 |  
 |      Type code   C Type             Minimum size in bytes 
 |      'b'         signed integer     1 
 |      'B'         unsigned integer   1 
 |      'u'         Unicode character  2 (see note) 
 |      'h'         signed integer     2 
 |      'H'         unsigned integer   2 
 |      'i'         signed integer     2 
 |      'I'         unsigned integer   2 
 |      'l'         signed int

In [55]:
arr = array.array('f', (0.001, 0.005, 0.01, 0.05, 0.1, 0.5))
arr

array('f', [0.0010000000474974513, 0.004999999888241291, 0.009999999776482582, 0.05000000074505806, 0.10000000149011612, 0.5])

In [56]:
arr.append(1)
arr

array('f', [0.0010000000474974513, 0.004999999888241291, 0.009999999776482582, 0.05000000074505806, 0.10000000149011612, 0.5, 1.0])

In [57]:
arr[0] = 'I am typed'

TypeError: must be real number, not str

### **other array data structures**

#### **str** - *immutable* arrays of Unicode characters

#### **bytes** - *immutable* arrays of single bytes

#### **bytearray** - *mutable* arrays of single bytes

In [58]:
# str
arr = 'ATCG'

# immutable
arr[0] = 'Y'

TypeError: 'str' object does not support item assignment

In [59]:
# mutable representation of a string - list of characters 
list(arr)

['A', 'T', 'C', 'G']

In [60]:
''.join(list(arr))

'ATCG'

In [61]:
arr = bytes([*range(0, 16, 4)])
arr

b'\x00\x04\x08\x0c'

In [62]:
del arr[0]

TypeError: 'bytes' object doesn't support item deletion

In [63]:
arr = bytearray([*range(0, 16, 4)])
arr

bytearray(b'\x00\x04\x08\x0c')

In [64]:
arr[3] = 256

ValueError: byte must be in range(0, 256)

In [65]:
bytes(arr) #conversion copies the data

b'\x00\x04\x08\x0c'

## 3. Sets

#### * *unordered* and *mutable* collections of *unique* elements

#### * backed by the **dict** data type

#### * *O(1)* time complexity for membership tests 

#### * *O(n)* for set operations: union, intersection, difference, etc.

In [66]:
DNA_bases = {'A', 'T', 'C', 'G'}
RNA_bases = {'A', 'U', 'C', 'G'}

In [67]:
# membership test
'A' in DNA_bases

True

In [68]:
DNA_bases.add('A')

In [69]:
DNA_bases

{'A', 'C', 'G', 'T'}

In [70]:
 DNA_bases | RNA_bases   #equivalent to DNA_bases.union(RNA_bases)

{'A', 'C', 'G', 'T', 'U'}

In [71]:
# intersection
DNA_bases & RNA_bases   #equivalent to DNA_bases.intersection(RNA_bases)

{'A', 'C', 'G'}

In [72]:
# difference
DNA_bases - RNA_bases   #equivalent to DNA_bases.difference(RNA_bases)

{'T'}

In [73]:
# symmetric difference
DNA_bases ^ RNA_bases   #equivalent to DNA_bases.symmetric_difference(RNA_bases)

{'T', 'U'}

In [74]:
set([0, 1, 1, 2, 3, 5, 8, 13])

{0, 1, 2, 3, 5, 8, 13}

In [75]:
from itertools import product

# a set comprehension
codons = {''.join(l) for l in product('AUCG', repeat = 3)}
codons

{'AAA',
 'AAC',
 'AAG',
 'AAU',
 'ACA',
 'ACC',
 'ACG',
 'ACU',
 'AGA',
 'AGC',
 'AGG',
 'AGU',
 'AUA',
 'AUC',
 'AUG',
 'AUU',
 'CAA',
 'CAC',
 'CAG',
 'CAU',
 'CCA',
 'CCC',
 'CCG',
 'CCU',
 'CGA',
 'CGC',
 'CGG',
 'CGU',
 'CUA',
 'CUC',
 'CUG',
 'CUU',
 'GAA',
 'GAC',
 'GAG',
 'GAU',
 'GCA',
 'GCC',
 'GCG',
 'GCU',
 'GGA',
 'GGC',
 'GGG',
 'GGU',
 'GUA',
 'GUC',
 'GUG',
 'GUU',
 'UAA',
 'UAC',
 'UAG',
 'UAU',
 'UCA',
 'UCC',
 'UCG',
 'UCU',
 'UGA',
 'UGC',
 'UGG',
 'UGU',
 'UUA',
 'UUC',
 'UUG',
 'UUU'}

In [76]:
len(codons)

64

In [77]:
codons.add('UUU')
len(codons)

64

#### **frozenset** - *immutable* sets

#### frozen sets are hashable and can be used as dictionary keys or elements of other sets

In [78]:
codons_read_only = frozenset(codons)
codons_read_only.add('UUU')

AttributeError: 'frozenset' object has no attribute 'add'

![codon_wheel.png](./Images/codon_wheel.png)

## 4. Stacks & Queues

#### * Stacks - collections that support *last-in, first-out (LIFO)* semantics for insert and delete operations

#### * Queues - collections that support *first-in, first-out (FIFO)* semantics for inserts and deletes

### **collections.deque**

#### *double-ended queue* used to implement fast and robust stacks and queues

#### *deque* objects implemented as doubly-linked lists

#### supports adding and removing elements from either end with constant *O(1)* performance

In [80]:
from collections import deque

help(deque)

Help on class deque in module collections:

class deque(builtins.object)
 |  deque([iterable[, maxlen]]) --> deque object
 |  
 |  A list-like sequence optimized for data accesses near its endpoints.
 |  
 |  Methods defined here:
 |  
 |  __add__(self, value, /)
 |      Return self+value.
 |  
 |  __bool__(self, /)
 |      self != 0
 |  
 |  __contains__(self, key, /)
 |      Return key in self.
 |  
 |  __copy__(...)
 |      Return a shallow copy of a deque.
 |  
 |  __delitem__(self, key, /)
 |      Delete self[key].
 |  
 |  __eq__(self, value, /)
 |      Return self==value.
 |  
 |  __ge__(self, value, /)
 |      Return self>=value.
 |  
 |  __getattribute__(self, name, /)
 |      Return getattr(self, name).
 |  
 |  __getitem__(self, key, /)
 |      Return self[key].
 |  
 |  __gt__(self, value, /)
 |      Return self>value.
 |  
 |  __iadd__(self, value, /)
 |      Implement self+=value.
 |  
 |  __imul__(self, value, /)
 |      Implement self*=value.
 |  
 |  __init__(self, /, 

In [81]:
# stack implementation using collections.deque 

# the stack maintains a history of commands 
# for installing deepchem - a library built on the TensorFlow platform,
# here with GPU acceleration -
# to facilitate the use of deep learning in the life sciences
# from genomics, to microscopy to drug discovery and beyond

from collections import deque
history = deque()

history.appendleft('conda create --name deepchem python=3.7')
history.appendleft('conda activate deepchem')
history.appendleft('conda install -c conda-forge rdkit deepchem=2.5.0')
history.appendleft('pip install tensorflow-gpu~=2.4')
history.appendleft('conda list | grep tensor')
history.appendleft('conda install cudatoolkit=11')
history.appendleft('conda install -c conda-forge cudnn')
history.appendleft('conda list cudnn')
history.appendleft('python -c "import tensorflow as tf; print(tf.config.list_physical_devices(device_type="GPU"))"')

history

deque(['python -c "import tensorflow as tf; print(tf.config.list_physical_devices(device_type="GPU"))"',
       'conda list cudnn',
       'conda install -c conda-forge cudnn',
       'conda install cudatoolkit=11',
       'conda list | grep tensor',
       'pip install tensorflow-gpu~=2.4',
       'conda install -c conda-forge rdkit deepchem=2.5.0',
       'conda activate deepchem',
       'conda create --name deepchem python=3.7'])

In [82]:
history.popleft()

'python -c "import tensorflow as tf; print(tf.config.list_physical_devices(device_type="GPU"))"'

In [83]:
history.popleft()

'conda list cudnn'

In [84]:
history.popleft()

'conda install -c conda-forge cudnn'

In [85]:
history

deque(['conda install cudatoolkit=11',
       'conda list | grep tensor',
       'pip install tensorflow-gpu~=2.4',
       'conda install -c conda-forge rdkit deepchem=2.5.0',
       'conda activate deepchem',
       'conda create --name deepchem python=3.7'])

In [86]:
history.appendleft('conda install -c conda-forge cudnn')
history

deque(['conda install -c conda-forge cudnn',
       'conda install cudatoolkit=11',
       'conda list | grep tensor',
       'pip install tensorflow-gpu~=2.4',
       'conda install -c conda-forge rdkit deepchem=2.5.0',
       'conda activate deepchem',
       'conda create --name deepchem python=3.7'])

In [87]:
history.appendleft('conda list cudnn')

In [88]:
history.appendleft('python -c "import tensorflow as tf; print(tf.config.list_physical_devices(device_type="GPU"))"')
history

deque(['python -c "import tensorflow as tf; print(tf.config.list_physical_devices(device_type="GPU"))"',
       'conda list cudnn',
       'conda install -c conda-forge cudnn',
       'conda install cudatoolkit=11',
       'conda list | grep tensor',
       'pip install tensorflow-gpu~=2.4',
       'conda install -c conda-forge rdkit deepchem=2.5.0',
       'conda activate deepchem',
       'conda create --name deepchem python=3.7'])

In [89]:
history.rotate(1)
history

deque(['conda create --name deepchem python=3.7',
       'python -c "import tensorflow as tf; print(tf.config.list_physical_devices(device_type="GPU"))"',
       'conda list cudnn',
       'conda install -c conda-forge cudnn',
       'conda install cudatoolkit=11',
       'conda list | grep tensor',
       'pip install tensorflow-gpu~=2.4',
       'conda install -c conda-forge rdkit deepchem=2.5.0',
       'conda activate deepchem'])

In [90]:
history.rotate(-1)
history

deque(['python -c "import tensorflow as tf; print(tf.config.list_physical_devices(device_type="GPU"))"',
       'conda list cudnn',
       'conda install -c conda-forge cudnn',
       'conda install cudatoolkit=11',
       'conda list | grep tensor',
       'pip install tensorflow-gpu~=2.4',
       'conda install -c conda-forge rdkit deepchem=2.5.0',
       'conda activate deepchem',
       'conda create --name deepchem python=3.7'])

In [91]:
# queue implementation using collections.deque 

# the queue holds a batch of CP2K QM/MM simulations 
# submission jobs to be executed on HPC clusters in the FIFO order

from collections import deque

jobs = deque()

with open('./run_02011.bash', 'rt') as f:
    [jobs.append(job.strip()) for job in f if not job.startswith('#')]
    
jobs

deque(['jobid=`qsub -N trHbN_02011_01                          ./run.cp2k`',
       'jobid=`qsub -N trHbN_02011_02 -W depend=afterok:$jobid ./run.cp2k`',
       'jobid=`qsub -N trHbN_02011_03 -W depend=afterok:$jobid ./run.cp2k`',
       'jobid=`qsub -N trHbN_02011_04 -W depend=afterok:$jobid ./run.cp2k`',
       'jobid=`qsub -N trHbN_02011_05 -W depend=afterok:$jobid ./run.cp2k`',
       'jobid=`qsub -N trHbN_02011_06 -W depend=afterok:$jobid ./run.cp2k`',
       'jobid=`qsub -N trHbN_02011_07 -W depend=afterok:$jobid ./run.cp2k`',
       'jobid=`qsub -N trHbN_02011_08 -W depend=afterok:$jobid ./run.cp2k`',
       'jobid=`qsub -N trHbN_02011_09 -W depend=afterok:$jobid ./run.cp2k`',
       'jobid=`qsub -N trHbN_02011_10 -W depend=afterok:$jobid ./run.cp2k`'])

In [92]:
jobs.popleft()

'jobid=`qsub -N trHbN_02011_01                          ./run.cp2k`'

In [93]:
jobs.popleft()

'jobid=`qsub -N trHbN_02011_02 -W depend=afterok:$jobid ./run.cp2k`'