# Prepare sort-seq dataset for use in MAVE-NN

In [1]:
# Standard imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# Insert the local mavenn checkout at the beginning of sys.path so that the
# `import mavenn` below resolves to it ahead of later path entries.
import sys
path_to_mavenn_local = '../../../../'
sys.path.insert(0,path_to_mavenn_local)

# Load mavenn and print its package path to confirm which copy was imported
import mavenn
print(mavenn.__path__)

# For testing
# NOTE(review): vec_data_to_mat_data is not referenced in the cells visible
# here -- confirm whether this import is still needed.
from mavenn.src.utils import vec_data_to_mat_data

['../../../../mavenn']


In [2]:
# NOTE: these sequences aren't the right length. The full sequences would have
# to be taken from Kinney et al., 2010.
# TODO: revisit later; it would be nice to include the full sequence.
# For now, just display the length of one sequence from the dataset.
len('GGCTGTTCACTTTATGCTTCCGGCTTGTATTTTGTGTGC')

39

In [3]:
# Read the raw sort-seq table that ships inside the mavenn package directory.
# The first CSV column holds the row labels, so it is used as the index.
raw_data_path = (mavenn.__path__[0]
                 + '/examples/datasets/sort_seq/full-wt/full-wt-sort_seq.csv')
raw_df = pd.read_csv(raw_data_path, index_col=[0])
raw_df.head()

Unnamed: 0,seq,bin,ct
0,GGCTGTTCACTTTATGCTTCCGGCTTGTATTTTGTGTGC,4,23.0
1,GGTTTTACACATTATGCTTCCGGCTCGTCTCTTGTGTGG,2,12.0
2,GGCTTAACACTTAATGCTTCCGGCTCGTATGTTGTGTGG,1,11.0
3,GGTTTTACACTTTATGCTTCCCGCTCGTAAGGTGTGTCG,5,10.0
4,GGCTTTACACTTTATGCGTCCGGCTCGTATGTTGCGTGG,2,10.0


In [4]:
# Refine contents of raw data file.
# Rename by label ('seq' -> 'x', 'bin' -> 'y') instead of overwriting the
# column list positionally: a label-based rename cannot mislabel columns if
# their order changes, and it is a no-op when the cell is re-run (the original
# positional assignment made a second run fail on the missing 'seq' column).
raw_df = raw_df.rename(columns={'seq': 'x', 'bin': 'y'})

# Counts are read in as floats; store them as integers
raw_df['ct'] = raw_df['ct'].astype(int)

# Array of sequences, one entry per row of raw_df
sequences = raw_df['x'].values
raw_df.head()

Unnamed: 0,x,y,ct
0,GGCTGTTCACTTTATGCTTCCGGCTTGTATTTTGTGTGC,4,23
1,GGTTTTACACATTATGCTTCCGGCTCGTCTCTTGTGTGG,2,12
2,GGCTTAACACTTAATGCTTCCGGCTCGTATGTTGTGTGG,1,11
3,GGTTTTACACTTTATGCTTCCCGCTCGTAAGGTGTGTCG,5,10
4,GGCTTTACACTTTATGCGTCCGGCTCGTATGTTGCGTGG,2,10


In [5]:
# Reshape from long format (one row per sequence/bin pair) to wide format:
# one row per sequence 'x', one column per bin 'y', holding the count 'ct'.
# Sequence/bin pairs absent from the raw data become 0 counts.
counts_wide = raw_df.pivot(index='x', columns='y', values='ct')
data_df = counts_wide.fillna(0).astype(int)

# Name the count columns 'ct_<bin>'. Assigning a plain list also clears the
# 'y' name that the pivot leaves on the columns axis.
y_cols = [f'ct_{bin_id}' for bin_id in data_df.columns]
data_df.columns = y_cols

# Sanity check: how many sequences (rows) have a total count of 0?
num_zero_rows = (data_df.values.sum(axis=1) == 0).sum()
print('rows summing to 0:', num_zero_rows)
data_df.head()

rows summing to 0: 0


Unnamed: 0_level_0,ct_0,ct_1,ct_2,ct_3,ct_4,ct_5,ct_6,ct_7,ct_8,ct_9
x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AAATACACACTTGCTGCTTCCGGCTCGTATGTTGTGTGG,0,0,0,1,0,0,0,0,0,0
AAATTTACACTGTATGCTTCCGGCTCGCATGGCGTTTGC,0,0,1,0,0,0,0,0,0,0
AAATTTACACTTTATGCATCAGACTCGTATGTTGTGTGG,1,0,0,0,0,0,0,0,0,0
AAATTTACACTTTATGCTTCTGGCGCGTATGCGGCGTGG,0,0,0,1,0,0,0,0,0,0
AACATTACATTTTATGCTTCCGGCTCGTATGGTGTGTGG,0,1,0,0,0,0,0,0,0,0


In [6]:
# Flag a random subset of sequences as training data: each row is marked True
# with probability training_frac. The seed is fixed so the flags are
# reproducible from run to run.
N = len(data_df)
training_frac = .8
np.random.seed(0)
data_df['training_set'] = (np.random.rand(N) < training_frac)

# Move the sequences out of the index and into a regular 'x' column.
# The reset is non-inplace and guarded so that re-running this cell does not
# insert a spurious 'index' column once 'x' is already a column.
if 'x' not in data_df.columns:
    data_df = data_df.reset_index()
data_df.head()

Unnamed: 0,x,ct_0,ct_1,ct_2,ct_3,ct_4,ct_5,ct_6,ct_7,ct_8,ct_9,training_set
0,AAATACACACTTGCTGCTTCCGGCTCGTATGTTGTGTGG,0,0,0,1,0,0,0,0,0,0,True
1,AAATTTACACTGTATGCTTCCGGCTCGCATGGCGTTTGC,0,0,1,0,0,0,0,0,0,0,True
2,AAATTTACACTTTATGCATCAGACTCGTATGTTGTGTGG,1,0,0,0,0,0,0,0,0,0,True
3,AAATTTACACTTTATGCTTCTGGCGCGTATGCGGCGTGG,0,0,0,1,0,0,0,0,0,0,True
4,AACATTACATTTTATGCTTCCGGCTCGTATGGTGTGTGG,0,1,0,0,0,0,0,0,0,0,True


In [7]:
# Remove sequences (rows) whose total count across all bins is 0
ix = data_df[y_cols].sum(axis=1) > 0

# Report the number of rows removed (the filter acts on rows, not columns);
# the vectorized .sum() replaces Python's builtin sum over the Series.
print(f'Dropping {(~ix).sum()} rows with 0 counts.')
data_df = data_df[ix].reset_index(drop=True)
data_df.head()

Dropping 0 columns with 0 counts.


Unnamed: 0,x,ct_0,ct_1,ct_2,ct_3,ct_4,ct_5,ct_6,ct_7,ct_8,ct_9,training_set
0,AAATACACACTTGCTGCTTCCGGCTCGTATGTTGTGTGG,0,0,0,1,0,0,0,0,0,0,True
1,AAATTTACACTGTATGCTTCCGGCTCGCATGGCGTTTGC,0,0,1,0,0,0,0,0,0,0,True
2,AAATTTACACTTTATGCATCAGACTCGTATGTTGTGTGG,1,0,0,0,0,0,0,0,0,0,True
3,AAATTTACACTTTATGCTTCTGGCGCGTATGCGGCGTGG,0,0,0,1,0,0,0,0,0,0,True
4,AACATTACATTTTATGCTTCCGGCTCGTATGGTGTGTGG,0,1,0,0,0,0,0,0,0,0,True


In [8]:
# Assign each sequence to the training, validation, or test set
N = len(data_df)
training_frac=.8  # NOTE: not read by the split below; test_frac/val_frac drive it
test_frac = .2
val_frac = .2

# One uniform draw per row, seeded for reproducibility
np.random.seed(0)
r = np.random.rand(N)

# r < test_frac                          -> test
# test_frac <= r < test_frac + val_frac  -> validation
# everything else                        -> training
val_upper = test_frac + val_frac
ix_test = (r < test_frac)
ix_val = (r >= test_frac) & (r < val_upper)
ix_train = (r >= val_upper)

data_df['set'] = ''
for set_label, set_mask in (('training', ix_train),
                            ('validation', ix_val),
                            ('test', ix_test)):
    data_df.loc[set_mask, 'set'] = set_label

# Every row must have received a label
assert data_df['set'].ne('').all()

# Shuffle the row order for extra safety
data_df = data_df.sample(frac=1).reset_index(drop=True)

# Order columns: set label, then per-bin counts, then the sequence
column_order = ['set', *y_cols, 'x']
data_df = data_df[column_order]
data_df.head(20)

Unnamed: 0,set,ct_0,ct_1,ct_2,ct_3,ct_4,ct_5,ct_6,ct_7,ct_8,ct_9,x
0,test,0,0,0,0,0,0,0,0,1,0,GGCTTTACACTTTAAGCTGCCGCATCGTATGTTATGTGG
1,training,0,1,0,0,0,0,0,0,0,0,GGCTATACATTTTATGTTTCCGGGTCGTATTTTGTGTGG
2,training,0,0,0,0,0,0,0,0,1,0,GGCTTTACATTTTATGCTTCCTTCACGTATGTTGTGTCT
3,test,0,0,0,0,0,1,0,0,0,0,GGCATTACTCTTTGTGCTTCCGGCTCGTATGTTGTGTGG
4,test,0,0,0,0,0,0,0,1,0,0,GACTTTTCAATTTATGCTTTCAGTTGGTATGTTGTGTAG
5,training,0,0,0,0,0,0,0,0,0,1,CGCTTTACACTTTCCGCTGCCGGCCCGTATTTTATGGGG
6,training,0,0,0,1,0,0,0,0,0,0,GGCTCTACCGTTTAGGCTTACGCCTCGAATATTGTGTGG
7,training,0,0,0,0,0,0,0,1,0,0,GGCTTTACACTTTACGCTTCCGTATCTTACGTTATGTGG
8,test,0,0,0,5,0,0,0,0,0,0,GCCTGTACACTGTCTGCTTCCGCCTCGTATGTTGTGTGG
9,test,0,0,0,0,0,0,0,1,0,0,GGCTTGACACTGTTTGCTTCCGGCTCCTATGGTGTGTGG


In [9]:
# Show size of compressed dataset file
file_name = 'sortseq_data.csv.gz'
data_df.to_csv(file_name, compression='gzip', index=False)
print('df (zipped):')
!du -mh $file_name
!mv $file_name ../.

df (zipped):
396K	sortseq_data.csv.gz


In [10]:
# Load the example dataset by name through mavenn and preview the first rows
dataset_name = 'sortseq'
loaded_df = mavenn.load_example_dataset(dataset_name)
loaded_df.head()

Unnamed: 0,set,ct_0,ct_1,ct_2,ct_3,ct_4,ct_5,ct_6,ct_7,ct_8,ct_9,x
0,test,0,0,0,0,0,0,0,0,1,0,GGCTTTACACTTTAAGCTGCCGCATCGTATGTTATGTGG
1,training,0,1,0,0,0,0,0,0,0,0,GGCTATACATTTTATGTTTCCGGGTCGTATTTTGTGTGG
2,training,0,0,0,0,0,0,0,0,1,0,GGCTTTACATTTTATGCTTCCTTCACGTATGTTGTGTCT
3,test,0,0,0,0,0,1,0,0,0,0,GGCATTACTCTTTGTGCTTCCGGCTCGTATGTTGTGTGG
4,test,0,0,0,0,0,0,0,1,0,0,GACTTTTCAATTTATGCTTTCAGTTGGTATGTTGTGTAG
