In [43]:
__description__ = \
"""
FOR EACH ROUND:
1. Sample LIBRARY_SIZE sequences from a larger pool.  
    Problems: random sampling and (maybe) bias in initial library.
2. Sample from LIBRARY_SIZE according to protein/peptide binding constant.
    Limiting reagent: phage particles binding to an infinite field of protiens (Do individual clones become
                      limiting?)
    Problems: Random sampling low numbers, competition between proteins if we run out, 
    
3. Amplify output from previous sample (phage into log-growth bacteria).  
    Limiting reagent: phage particles into infinite vat of bacteria
    Problems: multiple hits, slow growers, fast growers
    

TO QUERY EACH ROUND
4. Amplify amplified phage using PCR (to append adapters)
5. Amplify PCR to obtain illumina ma-jiggits.
6. Sample reads in Illumina machine
"""

import numpy as np
import scipy.sparse
import string

AMINO_ACIDS = ("A","C","D","E","F",
               "G","H","I","K","L",
               "M","N","P","Q","R",
               "S","T","V","W","Y")

BASES = ("A","T","G","C")

BINARY = ("A","D")

class SeqIntegerMapper:
    """
    To save memory and avoid big-ole dicts, treat sequences as base-"alphabet size" numbers that
    are converted to base 10 integers.  Can handle alphabets up to 36 letters long. As written, we'll
    run out of long ints for 64 bit systems for 15 amino acid peptides, but supposedly python3 doesn't
    actually run out of ints.
    """
    
    def __init__(self,seq_length,alphabet=AMINO_ACIDS):
        
        self._alphabet = alphabet
        self._base = len(alphabet)
        self._seq_length = seq_length
        self.possible_digits = string.digits + string.ascii_lowercase
        
        self.max_int = self._base**(self._seq_length)
        
        self.string_to_base = dict([(letter,self.possible_digits[i]) for i, letter in enumerate(alphabet)])
                
    @property
    def alphabet(self):
        """ Get alphabet. """
        return self._alphabet

    @property
    def base(self):
        """ Get base. """
        return self._base
    
    @property
    def seq_length(self):
        """ Get base. """
        return self._seq_length
        
    def seqToInt(self,sequence_string):
        """
        Return the base 10 integer equivalent of a sequence.
        """
    
        sequence_in_base = "".join([self.string_to_base[s] for s in sequence_string])
        
        return int(sequence_in_base,self._base)
    
    
    def intToSeq(self,sequence_integer):
        """
        Return the sequence encoded by this base 10 integer.
        """
            
        i = 0
        digits = list(self._alphabet[0])*self._seq_length
        while sequence_integer:
            digits[i] = self._alphabet[sequence_integer % self._base]
            sequence_integer //= self._base
            i += 1
          
        digits.reverse()
        
        return "".join(digits)
    
def uniqueCounter(x):
    
    # count unique elements, like bincount but (supposedly) faster
    # Code fragment by Eelco Hoogendoorn
    # http://w3facility.org/question/numpy-frequency-counts-for-unique-values-in-an-array/
    
    unique, inverse = np.unique(x, return_inverse=True)
    count = np.zeros(len(unique), np.int)
    np.add.at(count, inverse, 1)
    
    return np.vstack(( unique, count)).T



In [44]:
class Pool:
    """
    Basic class holding a pool of sequence. 
    """ 

    def __init__(self,sequence_length=12,alphabet=(0,1)):
        """
        """

        self.sequence_length = sequence_length
        self.alphabet = alphabet[:]
        
        self.mapper = SeqIntegerMapper(self.sequence_length,
                                       self.alphabet)
        
        self._pool_exists = False
    
    
    def createRandomPool(self,initial_pool_size,max_K=1e6):
        """
        """
        
        initial_sample = np.random.randint(0,self.mapper.max_int,initial_pool_size)
      
        unique_out = uniqueCounter(initial_sample)
    
        self._all_seq = unique_out[:,0]
    
        # Return this sucker as a sparse matrix that will match the indices 
        self._pool_pops = []        
        self._pool_pops.append(scipy.sparse.lil_matrix((self.mapper.max_int,1),dtype=int))
        self._pool_pops[-1][unique_out[:,0],np.zeros(unique_out[:,0].size,dtype=int)] = unique_out[:,1]
        self._pool_pops[-1] = scipy.sparse.csr_matrix(self._pool_pops[-1])
        
        
        # Generate random affinities (uniform in log scale)
        max_affinity = np.log10(max_K) 
        self._affinities = 10**(np.random.uniform(low=0.0,high=max_affinity,size=self._pool_pops[-1].size))
        
        self._pool_exists = True
        
    
    def addNewStep(self,new_bit):
        """
        """
        
        self._pool_pops.append(new_bit)
    
    @property
    def pool_exists(self):
        """
        """
        
        return self._pool_exists
    
    @property
    def current_counts(self):
        """
        """
        
        return self._pool_pops[-1][self._all_seq,0]
    
    @property
    def affinities(self):
        """
        """
        
        return self._affinities

    @property
    def all_seq(self):
        """
        """
        
        return self._all_seq
    

In [59]:
class SamplerBaseClass(object):
    """
    
    """
    
    def __init__(self):
        """
        """
        
        # Not a lot to initialize for the base class
        
        pass
        
    def _sample(self,possibilities,weights,sample_size):
        """
        Core sampling function.
        """

        sampled = np.random.choice(possibilities,
                                   size=sample_size,
                                   replace=True,
                                   p=weights)
        
        unique_out = uniqueCounter(sampled)
    
        # Return this sucker as a sparse matrix that will match the indices 
        out_array = scipy.sparse.lil_matrix((self.mapper.max_int,1),dtype=int)
        out_array[unique_out[:,0],np.zeros(unique_out[:,0].size,dtype=int)] = unique_out[:,1]
        out_array = scipy.sparse.csr_matrix(out_array)
        
        return out_array
    
    def poolSanityCheck(self,pool_instance):
        """
        Make sure that it makes sense to sample this pool. 
        """
        
        if not pool_instance.pool_exists:
            err = "pool must be initialized prior to sampling.\n\n"
            raise PhageDisplaySimulatorError(err)
    
    
    def runExperiment(self,pool_instance,sample_size):
        """
        Method assuming that everything is awesome; input frequencies reflect output frequuencies
        limited only by sampling error.  
        """
        
        
        self.poolSanityCheck(pool_instance)
        
        weights = (pool_instance.current_counts/(pool_instance.current_counts.sum())).toarray().flatten()        
        
        return self._sample(pool_instance.all_seq,
                            weights=np.array(weights),
                            sample_size=sample_size)
        

class PCRAmplificationSampler(SamplerBaseClass):
    """
    """

    pass
    

class PhageAmplificationSampler(SamplerBaseClass):
    """
    Class for simulating the amplification of phage that occurs after a binding round.
    """

    pass

class PipetteSampler(SamplerBaseClass):
    """
    Class for simulating the downsampling that occurs when you take a small fraction of the 
    total pool using a pipette.  
    """

    pass

        
class BindingSampler(SamplerBaseClass):
    """
    """
    
    def runExperiment(self,pool_instance,sample_size):
        """
        asdf
        """

        self.poolSanityCheck(pool_instance)
        
        weights = pool_instance.current_counts*pool_instance.affinities
        weights = weights/np.sum(weights)
        
        return self._sample(pool_instance.all_seqs,
                            weights=weights,
                            sample_size=sample_size)
        


class IlluminaRunSampler(SamplerBaseClass):
    pass
    

In [60]:

# P = instance_of_pool_class
num_rounds = 3

pipet = PipetteSampler()
screen = BindingSampler()
amplify = PhageAmplificationSampler()

pool = Pool()
pool.createRandomPool(1000)
for i in range(num_rounds):
    pool.addNewStep(pipet.runExperiment(pool,100))
    pool.addNewStep(screen.runExperiment(pool,10))
    pool.addNewStep(amplify.runExperiment(pool,1000))


    




[ 0.002  0.002  0.001  0.001  0.002  0.001  0.002  0.001  0.001  0.001
  0.004  0.002  0.001  0.003  0.001  0.001  0.001  0.001  0.001  0.001
  0.002  0.001  0.001  0.001  0.001  0.001  0.001  0.001  0.001  0.001
  0.001  0.001  0.001  0.001  0.001  0.001  0.002  0.001  0.001  0.001
  0.002  0.001  0.002  0.001  0.001  0.002  0.001  0.001  0.001  0.001
  0.001  0.002  0.001  0.001  0.001  0.001  0.001  0.001  0.001  0.001
  0.001  0.001  0.001  0.001  0.001  0.001  0.001  0.001  0.001  0.001
  0.001  0.001  0.001  0.001  0.001  0.001  0.002  0.001  0.001  0.001
  0.001  0.001  0.001  0.001  0.001  0.001  0.001  0.001  0.002  0.001
  0.001  0.001  0.001  0.001  0.001  0.001  0.001  0.001  0.001  0.001
  0.001  0.001  0.001  0.001  0.001  0.001  0.002  0.001  0.001  0.001
  0.001  0.001  0.001  0.001  0.001  0.001  0.001  0.001  0.001  0.001
  0.001  0.001  0.001  0.001  0.001  0.001  0.001  0.002  0.001  0.001
  0.001  0.001  0.001  0.001  0.001  0.001  0.001  0.001  0.001  0.001
  0.00

AttributeError: 'PipetteSampler' object has no attribute 'mapper'