In [49]:
import pandas as pd
import numpy as np
import time 

from sklearn.model_selection import train_test_split
from utils import load_gfp_data, one_hot_decode, one_hot_encode, normalize

In [2]:
# Load the GFP data through the project helper and report the wall-clock time it took.
print("Loading data...")
start_time = time.time()
# load_gfp_data is imported from the local utils module; its result is unpacked into four values.
X_train, X_test, y_train, y_test = load_gfp_data()
print("Finished loading data in {0:.2f} seconds".format(time.time() - start_time))

Loading data...
Finished loading data in 18.83 seconds


In [116]:
def generate_random_gfp_data_mutations(num_of_mutations_lst, num_per_mutation_count = 1000, wild_type_sequence = None, seed = None):
    """
    Generate random point mutants of a wild-type nucleotide sequence.

    Input: num_of_mutations_lst is a list of mutation counts (number of positions that differ from the base sequence),
           num_per_mutation_count is an int giving how many sequences to generate for each mutation count,
           wild_type_sequence is an optional string over the alphabet "ACTG" to mutate; when None, the
           "nucSequence" value of the first row of ./data/gfp_data.csv is used,
           seed is an optional int; when given, a private np.random.RandomState(seed) makes the result
           reproducible, when None the global np.random state is used.
    Output: a pandas dataframe with two columns: 'mutation_count' (the number of mutations) and
            'mutation_sequence' (the mutated sequence as a string). Rows are grouped by mutation count,
            in the order the counts appear in num_of_mutations_lst.
    Raises: AssertionError if 200000 or more sequences are requested,
            KeyError if the base sequence contains a character outside "ACTG" at a mutated position,
            ValueError (from the sampler) if a mutation count exceeds the sequence length.
    Side effect: prints the elapsed time in seconds.
    """
    start_time = time.time()
    total_data_points = len(num_of_mutations_lst) * num_per_mutation_count
    assert total_data_points < 200000, "refusing to generate 200000 or more sequences"

    if wild_type_sequence is None:
        # Only the first row is needed, so do not parse the whole file.
        wild_type_sequence = pd.read_csv("./data/gfp_data.csv", nrows = 1)["nucSequence"].values[0]
    wild_type_arr = np.array(list(wild_type_sequence), dtype = str)
    sequence_length = len(wild_type_sequence)

    # One row of single characters per requested sequence. np.tile also copes with
    # zero rows, where np.vstack on an empty list would raise.
    mutation_lst = np.tile(wild_type_arr, (total_data_points, 1))
    # [c0, c0, ..., c1, c1, ...]: each count repeated num_per_mutation_count times.
    mutation_count_lst = np.repeat(num_of_mutations_lst, num_per_mutation_count)

    bases = "ACTG"
    base_index_map = {base: position for position, base in enumerate(bases)}
    # np.random (module) and RandomState expose the same choice/randint API.
    rng = np.random if seed is None else np.random.RandomState(seed)

    for i, mutation_count in enumerate(mutation_count_lst):
        # Distinct positions, so the row ends up exactly mutation_count edits away.
        mutation_index = rng.choice(sequence_length, mutation_count, replace = False).tolist()
        for j in mutation_index:
            k = base_index_map[mutation_lst[i, j]]
            # An offset of 1..3 modulo 4 always lands on a different base.
            new_index = (k + rng.randint(1, 4)) % 4
            mutation_lst[i, j] = bases[new_index]

    mutation_sequence_lst = np.array(["".join(row) for row in mutation_lst], dtype = str)
    mutated_df = pd.DataFrame.from_dict({'mutation_count' : mutation_count_lst, 'mutation_sequence' : mutation_sequence_lst})
    print(time.time() - start_time)
    return mutated_df
        

In [138]:
def save_gfp_data(X_train, X_test, y_train, y_test):
    """
    Write the four given arrays to fixed .npy files under ./data/ with np.save.

    Input: X_train, X_test, y_train, y_test are the objects to persist.
    Output: None. Files are written in the order x_train, x_test, y_train, y_test.
    """
    destinations = (
        ("./data/gfp_x_train.npy", X_train),
        ("./data/gfp_x_test.npy", X_test),
        ("./data/gfp_y_train.npy", y_train),
        ("./data/gfp_y_test.npy", y_test),
    )
    for path, array in destinations:
        np.save(path, array)
    
def load_saved_gfp_data():
    """
    Read back the four .npy files under ./data/ with np.load.

    Output: a tuple (X_train, X_test, y_train, y_test), loaded in that order.
    """
    paths = (
        "./data/gfp_x_train.npy",
        "./data/gfp_x_test.npy",
        "./data/gfp_y_train.npy",
        "./data/gfp_y_test.npy",
    )
    X_train, X_test, y_train, y_test = [np.load(path) for path in paths]
    return X_train, X_test, y_train, y_test

def save_mutated_gfp_data(mutated_df):
    """
    Write the mutation dataframe to ./data/mutated_df.csv without the index column.

    Input: mutated_df is the pandas dataframe to persist.
    Output: None.
    """
    # to_csv documents `index` as a bool; pass False explicitly instead of relying on None being falsy.
    mutated_df.to_csv("./data/mutated_df.csv", index = False)
    
def load_saved_mutated_gfp_data():
    """
    Read ./data/mutated_df.csv back into a pandas dataframe.

    Output: the dataframe produced by pd.read_csv with default options.
    """
    mutated_df = pd.read_csv("./data/mutated_df.csv")
    return mutated_df


In [None]:
# NOTE(review): this cell has no execution count and defines the same name as the
# next cell, whose definition shadows this one — consider deleting this cell.
def test_mutation_df(mutated_df): 
    """
    Check that every row's recorded mutation count matches its sequence.

    Input: mutated_df is a dataframe with 'mutation_count' and 'mutation_sequence' columns.
    Relies on names that are not defined in this function and must already exist in the
    notebook namespace: count_substring_mismatch, base_sequence, test_size, num_of_mutations_lst.
    Raises: AssertionError on the first mismatch, or if the row count differs from
            test_size * len(num_of_mutations_lst).
    """
    for i in range(len(mutated_df)): 
        assert(mutated_df["mutation_count"].values[i] == count_substring_mismatch(base_sequence, mutated_df["mutation_sequence"].values[i]))
    assert(len(mutated_df) == test_size * len(num_of_mutations_lst))

In [144]:
def test_mutation_df(mutated_df):
    """
    Verify that each row's 'mutation_count' equals the mismatch count of its sequence.

    Input: mutated_df is a dataframe with 'mutation_count' and 'mutation_sequence' columns.
    Uses names taken from the notebook namespace rather than from arguments:
    count_substring_mismatch, base_sequence, test_size and num_of_mutations_lst.
    Raises: AssertionError on the first row that disagrees, or if the number of rows is
            not test_size * len(num_of_mutations_lst).
    """
    for row in range(len(mutated_df)):
        recorded = mutated_df["mutation_count"].values[row]
        observed = count_substring_mismatch(base_sequence, mutated_df["mutation_sequence"].values[row])
        assert recorded == observed
    expected_rows = test_size * len(num_of_mutations_lst)
    assert len(mutated_df) == expected_rows
    
def test_load_save_mutation_df(mutated_df):
    """
    Round-trip the dataframe through the CSV helpers and compare with the input.

    Saves mutated_df with save_mutated_gfp_data (overwriting the CSV on disk), reloads it
    with load_saved_mutated_gfp_data, then asserts that columns, index and values match,
    in that order.
    """
    save_mutated_gfp_data(mutated_df)
    reloaded = load_saved_mutated_gfp_data()
    for attribute in ("columns", "index", "values"):
        np.testing.assert_array_equal(getattr(reloaded, attribute), getattr(mutated_df, attribute))
    
def test_load_save_gfp_data(X_train, X_test, y_train, y_test):
    """
    Round-trip the four arrays through the .npy helpers and compare with the inputs.

    Saves the arguments with save_gfp_data (overwriting the files on disk), reloads them
    with load_saved_gfp_data, then asserts element-wise equality for X_train, X_test,
    y_train and y_test, in that order.
    """
    save_gfp_data(X_train, X_test, y_train, y_test)
    loaded_X_train, loaded_X_test, loaded_y_train, loaded_y_test = load_saved_gfp_data()
    comparisons = (
        (loaded_X_train, X_train),
        (loaded_X_test, X_test),
        (loaded_y_train, y_train),
        (loaded_y_test, y_test),
    )
    for loaded, original in comparisons:
        np.testing.assert_array_equal(loaded, original)

In [139]:
# Persist the four in-memory arrays to the .npy files under ./data/.
save_gfp_data(X_train, X_test, y_train, y_test)

In [126]:
# Read the saved arrays back under different names so a later cell can compare them with the originals.
train_X, test_X, train_y, test_y = load_saved_gfp_data()

In [134]:
# Reload the saved mutation table; this reads the same path as load_saved_mutated_gfp_data().
df_mutated = pd.read_csv("./data/mutated_df.csv")

In [146]:
# Run the three checks defined above. The two load/save checks overwrite the files under ./data/.
# NOTE(review): `mutated_df` is not assigned in any cell visible here (the CSV is loaded into
# `df_mutated`) — confirm it is defined earlier in the kernel before running this cell.
test_mutation_df(mutated_df)
test_load_save_mutation_df(mutated_df)
test_load_save_gfp_data(X_train, X_test, y_train, y_test)

In [140]:
# Check that the arrays reloaded from disk equal the in-memory originals, element by element.
# These are the same four comparisons that test_load_save_gfp_data performs after its own save/load.
np.testing.assert_array_equal(train_X, X_train)
np.testing.assert_array_equal(test_X, X_test)
np.testing.assert_array_equal(train_y, y_train)
np.testing.assert_array_equal(test_y, y_test)