In [22]:
import json
from pomegranate import DiscreteDistribution, HiddenMarkovModel
from utils import *
from Bio.Alphabet import IUPAC


class GenerativeHMM(Model):

    def __init__(self, args):
        """
        Initializes the HMM to perform generative tasks
        Parameters
        ----------
        args : dictionary
            defines the hyper-parameters of the HMM
        args.name : string 
            defines the name of the HMM
        args.hidden_size : int 
            defines the hidden size
        args.epochs: int
            sets the number of epochs
        args.batch_size : int
            sets the batch size
        args.vocabulary : str
            all the characters used in the hmm
        """
        self.name = args["name"]
        self.hidden_size = args["hidden_size"]
        self.epochs = args["epochs"]
        self.batch_size = args["batch_size"]
        self.all_characters = args["vocabulary"]
        self.num_characters = len(self.all_characters)
        self.character_to_int = dict(zip(self.all_characters, range(self.num_characters)))
        self.int_to_character = dict(zip(range(self.num_characters), self.all_characters))
        self.indexes = [pair[1] for pair in sorted(self.character_to_int.items(), key = lambda x : x[1])]
        self.emission_size = len(self.indexes)
        self.model = None
        self.train_loss_history = []
        self.valid_loss_history = []
        self.build_model()
    
    def build_model(self): 
        distributions = []
        for _ in range(self.hidden_size): 
            emission_probs = np.random.random(self.num_characters)
            emission_probs = emission_probs / emission_probs.sum()
            distributions.append(DiscreteDistribution(dict(zip(self.all_characters, emission_probs))))
        trans_mat = np.random.random((self.hidden_size, self.hidden_size))
        trans_mat = trans_mat / trans_mat.sum(axis=1, keepdims=1)
        starts = np.random.random(self.hidden_size)
        starts = starts / starts.sum()
        # testing initializations
        np.testing.assert_almost_equal(starts.sum(), 1)
        np.testing.assert_array_almost_equal(np.ones(self.hidden_size), trans_mat.sum(axis=1))
        self.model = HiddenMarkovModel.from_matrix(trans_mat, distributions, starts)
        self.model.bake()

    def print_vars(self):
        print(self.__dict__)
        
    def fit(self, train_dataloader, valid_dataloader=None, verbose=True, logger=None, save_model=True, weights=None):
        """
        Fits the model on an HMM with self.hidden_size
        """    
        start_time = time.time()
        self.train_loss_history, self.valid_loss_history = [], []
        num_of_batches = train_dataloader.shape[0]
        batch_size = train_dataloader.shape[1]
        num_of_datapoints = num_of_batches * batch_size
        for epoch in range(1, self.epochs + 1):
            total_train_loss = 0
            np.random.shuffle(train_dataloader)
            for batch_idx, x in enumerate(train_dataloader):
                total_train_loss += model.summarize(x)
                model.from_summaries()
            self.train_loss_history.append(total_train_loss / num_of_datapoints)
            
            if valid_dataloader:
                valid_loss = self.evaluate(valid_dataloader, verbose=False, logger=logger)
                self.valid_loss_history.append(valid_loss)
            if verbose: 
                print("-" * 50, file = logger)
                print('epoch: {0}, train loss: {1:.4f}, valid loss: {2:.4f}, time: {3:.4f}'.format(
                      epoch, self.train_loss_history[-1], self.valid_loss_history[-1], time.time() - start_time), file = logger)
                print("-" * 50, file = logger)
            if epoch % 50 == 0:
                self.save_model(epoch, self.train_loss_history[-1]) 
            
    def sample(self, n, length):
        """
        Input:
        n is number of samples
        length is how long you want each sample to be
        """
        return np.array(["".join(seq) for seq in self.model.sample(n=n, length=length)])
        
    def predict(self, x_test): 
        """
        predict the log probability of obtaining the sequences in x_test
        log(P(X1, X2, ..., X_test)) = sum(log(P(Xi)))
        Input: x_test a list of sequences. should be 2 or 3 dimensional
        """
        assert(len(np.array(x_test).shape) == 2 or len(np.array(x_test).shape) == 3)
        return sum([self.model.log_probability(seq) for seq in np.array(x_test)])


    def show_model(self): 
        self.model.plot()
        
    def save_model(self, path): 
        with open(path, 'w') as f:
            json.dump(self.model.to_json(), f)
    
    def load_model(self, path): 
        with open(path, 'r') as f:
            json_model = json.load(f)
        self.model = HiddenMarkovModel.from_json(json_model)


def hmm_base_args(): 
    return {
        "name" : "base HMM",
        "hidden_size" : 5,
        "max_iterations" : 10,
        "n_jobs" : 1,
        "batch_size" : 5,
        "epoch" : 2,
        "char_to_int" : {"A" : 0, "C" : 1, "T" : 2, "G" : 3},
        "build_from_samples" : False
    }



def hmm_amino_acid_args(): 
    args = hmm_base_args()
    amino_acids = get_all_amino_acids()
    indexes = list(range(len(amino_acids)))
    assert(len(amino_acids) == 21)
    assert(amino_acids == "*" + IUPAC.protein.letters) #*ACDEFGHIKLMNPQRSTVWY
    args["char_to_int"] = dict(zip(amino_acids, indexes))
    return args


In [48]:
print("Loading data...")
start_time = time.time()
X_train, X_test, y_train, y_test = load_gfp_data("./data/gfp_amino_acid_")
print("Finished loading data in {0:.2f} seconds".format(time.time() - start_time))
wild_type_amino_acid = get_wild_type_amino_acid_sequence()
assert(X_train[0] == wild_type_amino_acid)
assert(count_substring_mismatch(wild_type_amino_acid, X_train[1000]) == 8)

def get_data(X_train, length, n = 100, random=True): 
    if not random: 
        data = X_train[0:length]
    else: 
        indexes = np.random.choice(len(X_train), n)
        data = X_train[indexes]
    return np.array([list(x)[0:length] for x in data])

def sample_and_score(hmm, wild_type, n = 100, length = 100, logger = None):
    """
    use the hmm model to sample n sequences of size = length. 
    then use the wild_type to count how far off the average sample is from the wild_type
    """
    assert(len(wild_type) == length)
    samples = hmm.sample(n, length)        
    average_diff = np.average([count_substring_mismatch(seq, wild_type) for seq in samples])
    print("Average difference: {0:.2f}, or {1:.2f} mismatches per letter".format(average_diff, 
                                                                 average_diff / length), file = logger)
    print("Example sequence {0}".format(samples[np.random.randint(0, n)]), file = logger)
    return average_diff

small_length, medium_length, large_length = 15, len(wild_type_amino_acid) // 4, len(wild_type_amino_acid)
small_X = get_data(X_train, small_length, 100)
medium_X = get_data(X_train, medium_length, 100)
large_X = get_data(X_train, large_length, 100)

Loading data...
Finished loading data in 0.12 seconds


In [24]:
def get_args(parser_args):
    args = hmm_amino_acid_args()
    args["n_jobs"] = parser_args.n_jobs
    args["hidden_size"] = parser_args.hidden_size
    args["max_iterations"] = parser_args.max_iterations
    args["name"] = parser_args.name
    args["length"] = parser_args.length
    return args

def get_base_args():
    base_args = hmm_amino_acid_args()
    base_args["name"] = "hmm_base"
    base_args["epochs"] = 100
    base_args["vocabulary"] = get_all_amino_acids()
    base_args["hidden_size"] = 20
    base_args["n_jobs"] = 10
    base_args["length"] = 15
    return base_args

args = get_base_args()
print(args)

{'name': 'hmm_base', 'hidden_size': 20, 'max_iterations': 10, 'n_jobs': 10, 'batch_size': 5, 'epoch': 2, 'char_to_int': {'*': 0, 'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10, 'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15, 'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20}, 'build_from_samples': False, 'epochs': 100, 'vocabulary': '*ACDEFGHIKLMNPQRSTVWY', 'length': 15}


In [25]:
start_time = time.time()
hmm = GenerativeHMM(args)
logger = None
hmm.fit(small_X)
print("Finished training in {:.2f} seconds".format(time.time() - start_time), file = logger)
print("HMM Parameters:", file = logger)
print(hmm.print_vars(), file = logger)
sample_and_score(hmm, wild_type_amino_acid[0:args["length"]], 100, args["length"], logger = logger)
wild_type_prob = np.e ** hmm.predict([list(wild_type_amino_acid[0:args["length"]])])
mutation_prob = np.e ** hmm.predict([list(wild_type_amino_acid[0:args["length"] - 3] + "ACG")])
print("Wild type prob: {0}. Mutation prob: {1}".format(wild_type_prob, mutation_prob), file = logger)
model_path = "./models/{0}.json".format(hmm.name)
hmm.save_model(model_path)
cached_hmm = GenerativeHMM(args)
cached_hmm.load_model(model_path)
try: 
    for i in get_all_amino_acids():
        for j in get_all_amino_acids(): 
            np.testing.assert_almost_equal(hmm.predict([list(i + j)]), cached_hmm.predict([list(i + j)]))
    print("Successfully finished training and saving {0} model!".format(hmm.name), file = logger)
    if logger: logger.close()
except:
    for i in get_all_amino_acids():
        for j in get_all_amino_acids(): 
            print(hmm.predict([list(i + j)]), cached_hmm.predict([list(i + j)]), file = logger)
    print("Error in loading {0} hmm".format(hmm.name), file = logger)
    if logger: logger.close()

[1] Improvement: 1336.223475011725	Time (s): 0.1046
[2] Improvement: 111.1468854854229	Time (s): 0.1024
[3] Improvement: 218.65429119075043	Time (s): 0.0865
[4] Improvement: 348.08273661015346	Time (s): 0.07817
[5] Improvement: 420.8811535710579	Time (s): 0.08485
[6] Improvement: 463.89714353256977	Time (s): 0.1015
[7] Improvement: 474.79322156771514	Time (s): 0.09721
[8] Improvement: 441.09742083295635	Time (s): 0.07907
[9] Improvement: 268.7403434433527	Time (s): 0.1107
[10] Improvement: 99.07030035367359	Time (s): 0.09766
[11] Improvement: 38.110772435433034	Time (s): 0.08867
[12] Improvement: 21.9508826067443	Time (s): 0.09634
[13] Improvement: 42.44725766911097	Time (s): 0.08218
[14] Improvement: 95.53541038232635	Time (s): 0.1089
[15] Improvement: 103.91593258642538	Time (s): 0.07432
[16] Improvement: 20.756637162305424	Time (s): 0.06649
[17] Improvement: 0.13027005433551153	Time (s): 0.07377
[18] Improvement: 0.0019168209456523755	Time (s): 0.1161
[19] Improvement: 5.54282139830

Successfully finished training and saving hmm_base model!


In [33]:
distributions = []
hidden_size = 50
num_characters = 21
all_characters = get_all_amino_acids()
for _ in range(hidden_size): 
    emission_probs = np.random.random(num_characters)
    emission_probs = emission_probs / emission_probs.sum()
    distributions.append(DiscreteDistribution(dict(zip(all_characters, emission_probs))))
trans_mat = np.random.random((hidden_size, hidden_size))
trans_mat = trans_mat / trans_mat.sum(axis=1, keepdims=1)
starts = np.random.random(hidden_size)
starts = starts / starts.sum()
# testing initializations
np.testing.assert_almost_equal(starts.sum(), 1)
np.testing.assert_array_almost_equal(np.ones(hidden_size), trans_mat.sum(axis=1))
model = HiddenMarkovModel.from_matrix(trans_mat, distributions, starts)

In [49]:
num_data = 1000
batch_size = 10
x_train = get_data(X_train, medium_length, num_data)
x_train = x_train.reshape(num_data // batch_size, batch_size, x_train.shape[1])
x_valid = get_data(X_train, medium_length, num_data)
x_valid = x_valid.reshape(num_data // batch_size, batch_size, x_valid.shape[1])
x_test = get_data(X_test, medium_length, num_data)
x_test = x_test.reshape(num_data // batch_size, batch_size, x_test.shape[1])

In [61]:
start_time = time.time()
train_loss_history, valid_loss_history = [], []
num_of_batches = x_train.shape[0]
batch_size = x_train.shape[1]
num_of_datapoints = num_of_batches * batch_size
for epoch in range(1, 100 + 1):
    total_train_loss = 0
    np.random.shuffle(x_train)
    for batch_idx, x in enumerate(x_train):
        print()
    train_loss_history.append(total_train_loss / num_of_datapoints)
    print(total_train_loss / num_of_datapoints)

[['S' 'K' 'G' 'E' 'G' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'R' 'P' 'T' 'L']
 ['S' 'E' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'Y' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'H' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'E' 'L' 'T' 'L' 'E' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E

  'P' 'W' 'P' 'T' 'L']]
[['S' 'K' 'G' 'E' 'E' 'L' 'S' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'P' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'V' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'T' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' '

  'P' 'W' 'P' 'T' 'L']]
[['S' 'K' 'G' 'E' 'E' 'L' 'S' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' '

  'P' 'W' 'P' 'T' 'L']]
[['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'Q' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' '

[['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'N' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'L' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'V' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'Q' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'F' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E

[['S' 'K' 'G' 'G' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'R' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'E' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'A' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E

  'P' 'W' 'P' 'T' 'L']]
[['S' 'K' 'G' 'G' 'E' 'L' 'F' 'I' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'L' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'L' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' '*' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' '

  'P' 'W' 'P' 'T' 'L']]
[['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'R' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' '

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




[['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'L' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'R' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' '

  'P' 'W' 'P' 'T' 'L']]
[['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'R' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'P' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'A' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'V' 'H' 'K' 'F' 'S' 'E' 'S' 'G' 'E' 'G' 'E' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'F' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' 'L']
 ['S' 'K' 'G' 'E' 'E' 'L' 'F' 'T' 'G' 'V' 'V' 'P' 'I' 'L' 'V' 'E' 'L' 'D'
  'G' 'D' 'V' 'N' 'G' 'H' 'K' 'F' 'S' 'V' 'S' 'G' 'E' 'G' 'G' 'G' 'D' 'A'
  'T' 'Y' 'G' 'K' 'L' 'T' 'L' 'K' 'F' 'I' 'C' 'T' 'T' 'G' 'K' 'L' 'P' 'V'
  'P' 'W' 'P' 'T' '

KeyboardInterrupt: 

In [67]:
z = get_data(X_train, medium_length, num_data)
model.fit(z, max_iterations=100, verbose=True)
wild_type_prob = np.e ** model.predict([list(wild_type_amino_acid[0:medium_length])])
mutation_prob = np.e ** model.predict([list(wild_type_amino_acid[0:medium_length - 3] + "ACG")])
print(wild_type_prob, mutation_prob)

[1] Improvement: nan	Time (s): 1.669
Total Training Improvement: nan
Total Training Time (s): 3.5096


TypeError: unhashable type: 'numpy.ndarray'