In [3]:
# Standard imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Add a local mavenn checkout to the front of sys.path so that it takes
# precedence over any installed copy. The location can be overridden with
# the MAVENN_PATH environment variable; the default is the original
# developer-specific path, so existing behavior is unchanged when the
# variable is not set.
import os
import sys
path_to_mavenn_local = os.environ.get('MAVENN_PATH',
                                      '/Users/jkinney/github/mavenn')

# Only insert when the path is not already first, so that re-running this
# cell does not keep stacking duplicate entries onto sys.path.
if sys.path[0] != path_to_mavenn_local:
    sys.path.insert(0,path_to_mavenn_local)

# Load mavenn and check path (displayed as the cell output)
import mavenn
mavenn.__path__

['/Users/jkinney/github/mavenn/mavenn']

In [60]:
# Built-in alphabets available to MAVE-NN, keyed by name.
# Each value is a 1D array of single-character strings; 'protein*' is the
# 20-letter protein alphabet followed by the '*' character.
alphabet_dict = {
    name: np.array(list(letters))
    for name, letters in (
        ('dna', 'ACGT'),
        ('rna', 'ACGU'),
        ('protein', 'ACDEFGHIKLMNPQRSTVWY'),
        ('protein*', 'ACDEFGHIKLMNPQRSTVWY*'),
    )
}

from mavenn.src.error_handling import handle_errors, check
@handle_errors
def validate_alphabet(alphabet):
    """
    Returns a validated, sorted alphabet.

    parameters
    ----------
    alphabet: (str, list, set, np.ndarray, or pd.Series)
        If a str, it is interpreted as the name of one of the built-in
        alphabets in alphabet_dict:
            ['dna','rna','protein','protein*'].
        Otherwise it must be a 1D, non-empty collection of unique,
        single-character strings.

    returns
    -------
    alphabet: (np.ndarray)
        A new 1D array containing the alphabet characters in sorted
        order. The input object, and the built-in arrays stored in
        alphabet_dict, are never modified.

    Every validation failure is reported through check(), imported from
    mavenn.src.error_handling.
    """

    valid_types = (str,list,set,np.ndarray,pd.Series)
    check(isinstance(alphabet,valid_types),
          f'type(alphabet)={type(alphabet)} is invalid. '
          f'Must be one of {valid_types}.')

    # If alphabet is a string, replace with array from alphabet_dict
    if isinstance(alphabet,str):
        check(alphabet in alphabet_dict,
              f'Unknown alphabet={alphabet}. Must be one of [{alphabet_dict.keys()}].')
        alphabet = alphabet_dict[alphabet]

    # If alphabet is a set, cast as np.ndarray
    elif isinstance(alphabet,set):
        alphabet = np.array(list(alphabet))

    # If alphabet is a list, cast as np.ndarray
    elif isinstance(alphabet,list):
        alphabet = np.array(alphabet)

    # If alphabet is a pd.Series, get values
    elif isinstance(alphabet, pd.Series):
        alphabet = alphabet.values

    # Make sure alphabet is 1D
    check(len(alphabet.shape)==1,
          f'Alphabet must be 1D. alphabet.shape={alphabet.shape}')

    # Make sure the entries of alphabet are unique
    check(len(alphabet) == len(set(alphabet)),
          f'Entries of alphabet are not unique.')

    # Make sure alphabet is not empty
    check(len(alphabet) > 0,
          f'len(alphabet)={len(alphabet)}; must be >= 1.')

    # Make sure all alphabet entries are strings
    check(all([isinstance(c,str) for c in alphabet]),
          'Alphabet contains non-string characters.')

    # Make sure all alphabet entries are single-character
    # (runs after the string check, so len(c) is always defined here)
    check(all([len(c)==1 for c in alphabet]),
          'Alphabet contains entries that are not single characters.')

    # Sort into a NEW array. An in-place alphabet.sort() would reorder the
    # caller's array / Series data and the shared arrays in alphabet_dict
    # (e.g. 'protein*', where '*' sorts before 'A').
    alphabet = np.sort(alphabet)

    return alphabet

In [59]:
# Inputs that validate_alphabet is expected to accept
valid_inputs = [
    'dna',
    'rna',
    'protein',
    'protein*',
    np.array(['A','B','C']),
    {'A','B','C'},
    ['A','B','C'],
    pd.Series(['A','B','C'])
]

# Inputs that validate_alphabet is expected to reject
invalid_inputs = [
    'xna',
    'protein-',
    ['A','B','A'],
    [],
    {'A':5},
    np.array([['A','B'],['C','D']]),
    np.arange(5),
    pd.Series([])
]

# Run the passing suite first, then the failing suite. The should_fail
# keyword is passed alongside the function's own argument; the returned
# object exposes .mistake and .result, which are used below to report any
# case that did not behave as expected.
for suite, expect_failure in ((valid_inputs, False), (invalid_inputs, True)):
    for arg in suite:
        result = validate_alphabet(arg, should_fail=expect_failure)
        if result.mistake:
            print(f'Failure on input {arg}; got {result.result}')

Expected success.
Expected success.
Expected success.
Expected success.
Expected success.
Expected success.
Expected success.
Expected success.
Expected error: Unknown alphabet=xna. Must be one of [dict_keys(['dna', 'rna', 'protein', 'protein*'])].
Expected error: Unknown alphabet=protein-. Must be one of [dict_keys(['dna', 'rna', 'protein', 'protein*'])].
Expected error: Entries of alphabet are not unique.
Expected error: len(alphabet)=0; must be >= 1.
Expected error: type(alphabet)=<class 'dict'> is invalid. Must be one of (<class 'str'>, <class 'list'>, <class 'set'>, <class 'numpy.ndarray'>, <class 'pandas.core.series.Series'>).
Expected error: Alphabet must be 1D. alphabet.shape=(2, 2)
Expected error: Alphabet contains non-string characters.
Expected error: len(alphabet)=0; must be >= 1.
