# Prepare the GB1 dataset for easy packaging and loading into MAVE-NN

In [1]:
# Standard imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# Insert mavenn at beginning of path
# Putting the relative repository path first on sys.path makes the
# `import mavenn` below resolve to that local copy ahead of any other copy
# on the path (assumes the notebook is run from its own directory -- TODO confirm).
import sys
path_to_mavenn_local = '../../../../'
sys.path.insert(0,path_to_mavenn_local)

#Load mavenn and check path
# Printing mavenn.__path__ lets the reader verify which copy was imported.
import mavenn
print(mavenn.__path__)

# MAVE-NN utilities
# mutations_to_dataset is called later in this notebook as
# mutations_to_dataset(wt_seq=..., mut_df=..., y_df=...) to build data_df.
from mavenn.src.dev import mutations_to_dataset

['../../../../mavenn']


In [2]:
# GB1 wild-type (WT) amino-acid sequence
wt_seq = 'QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE'

# Read counts for the WT sequence in the input library and after selection
WT_input_count = 1_759_616
WT_selection_count = 3_041_819

# WT log2 enrichment, with a pseudocount of 1 added to each count.
# It is subtracted from every variant's log2 enrichment in later cells,
# so that the WT sequence has y = 0.
wt_enrichment = (WT_selection_count + 1) / (WT_input_count + 1)
wt_y = np.log2(wt_enrichment)

In [3]:
# Mutation table for the WT sequence: it has the same ('id', 'l', 'c')
# columns as the mutant tables built below, but no rows.
mut_columns = ['id', 'l', 'c']
mut0_df = pd.DataFrame(columns=mut_columns)
mut0_df.head()

Unnamed: 0,id,l,c


In [4]:
# Measurement table for the WT sequence: a single row with id 0
y0_df = pd.DataFrame({
    'id': [0],
    'input_ct': [WT_input_count],
    'selected_ct': [WT_selection_count],
    'hamming_dist': [0],
})

# Read counts with a pseudocount of 1
wt_selected_plus1 = y0_df['selected_ct'] + 1
wt_input_plus1 = y0_df['input_ct'] + 1

# y: log2 enrichment relative to WT (exactly 0 for the WT row itself)
y0_df['y'] = np.log2(wt_selected_plus1/wt_input_plus1) - wt_y

# dy: log2(e) * sqrt(1/(selected+1) + 1/(input+1)), stored as the
# uncertainty on y
y0_df['dy'] = np.log2(np.exp(1))*np.sqrt(1/wt_selected_plus1 + 1/wt_input_plus1)

# Number of sequences so far; used as the id offset for the 1pt mutants
N0 = len(y0_df)

# Preview dataframe
y0_df.head()

Unnamed: 0,id,input_ct,selected_ct,hamming_dist,y,dy
0,0,1759616,3041819,0,0.0,0.001366


## Create mut1_df and y1_df for 1pt mutants

In [5]:
# Read the single-mutant count table, treating the string "nan" as missing,
# and make sure the rows carry a fresh 0..n-1 index.
# (The 'oslon' spelling is the name of the file on disk.)
csv_1pt = 'oslon_data_single_mutants_ambler.csv'
olson_1pt_df = pd.read_csv(csv_1pt, na_values="nan").reset_index(drop=True)
olson_1pt_df.head()

Unnamed: 0,WT amino acid,Position,Mutation,Input Count,Selection Count
0,Q,2,A,14663,38476
1,Q,2,C,13001,23023
2,Q,2,D,11488,18085
3,Q,2,E,9501,15629
4,Q,2,F,4770,13332


In [6]:
### Prepare mut_df for 1pt mutants ###

# One row per single mutant: sequence id, position l, and mutant character c.
# Ids are offset by N0 because the WT sequence occupies the first N0 ids.
mut1_df = pd.DataFrame()
mut1_df['id'] = olson_1pt_df.index + N0
mut1_df['l'] = olson_1pt_df['Position']
mut1_df['c'] = olson_1pt_df['Mutation']

# Represent positions as strings (so they can be validated below), then sort
mut1_df['l'] = [str(l) for l in mut1_df['l']]
mut1_df = mut1_df.sort_values(by=['id','l','c']).reset_index(drop=True)

# Compute number of sequences (WT + 1pt mutants).
# This is done BEFORE any rows are dropped, so that the id range N0..N1-1
# still has one id per row of olson_1pt_df.
N1 = max(mut1_df['id'])+1

# Drop entries where l cannot be cast as an integer.
# The pattern is a raw string (a bare '\.' in a normal string literal is an
# invalid escape sequence) and is matched against the WHOLE string, so values
# such as '12abc' are dropped here instead of raising in astype(float) below.
import re
position_pattern = re.compile(r'[0-9]+(?:\.[0-9]*)?')
ix = [bool(position_pattern.fullmatch(l)) for l in mut1_df['l']]

# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
mut1_df = mut1_df[ix].copy()

# Position 2 in the source file corresponds to wt_seq[0]; shift to 0-based
mut1_df['l'] = mut1_df['l'].astype(float).astype(int)-2

print(f'len(mut1_df): {len(mut1_df)}')
print(f'N1: {N1}')
mut1_df.head()

len(mut1_df): 1045
N1: 1046


Unnamed: 0,id,l,c
0,1,0,A
1,2,0,C
2,3,0,D
3,4,0,E
4,5,0,F


In [7]:
### Prepare y_df for 1pt mutants ###

# One row per single mutant, with ids N0..N1-1 in the row order of olson_1pt_df
y1_df = pd.DataFrame({'id': range(N0, N1)})
y1_df['input_ct'] = olson_1pt_df['Input Count']
y1_df['selected_ct'] = olson_1pt_df['Selection Count']
y1_df['hamming_dist'] = 1

# Read counts with a pseudocount of 1
selected1_plus1 = y1_df['selected_ct'] + 1
input1_plus1 = y1_df['input_ct'] + 1

# y: log2 enrichment relative to WT
y1_df['y'] = np.log2(selected1_plus1/input1_plus1) - wt_y

# dy: log2(e) * sqrt(1/(selected+1) + 1/(input+1)), stored as the
# uncertainty on y
y1_df['dy'] = np.log2(np.exp(1))*np.sqrt(1/selected1_plus1 + 1/input1_plus1)

print(f'len(y1_df): {len(y1_df)}')
y1_df.head()

len(y1_df): 1045


Unnamed: 0,id,input_ct,selected_ct,hamming_dist,y,dy
0,1,14663,38476,1,0.602044,0.014001
1,2,13001,23023,1,0.034732,0.015827
2,3,11488,18085,1,-0.135053,0.017212
3,4,9501,15629,1,-0.071659,0.018767
4,5,4770,13332,1,0.692965,0.024338


## Create mut2_df and y2_df for 2pt mutants

In [8]:
# Read the double-mutant count table, treating the string "nan" as missing,
# and make sure the rows carry a fresh 0..n-1 index.
# (The 'oslon' spelling is the name of the file on disk.)
csv_2pt = 'oslon_data_double_mutants_ambler.csv'
olson_df = pd.read_csv(csv_2pt, na_values="nan").reset_index(drop=True)
olson_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Mut1 WT amino acid,Mut1 Position,Mut1 Mutation,Mut2 WT amino acid,Mut2 Position,Mut2 Mutation,Input Count,Selection Count,Mut1 Fitness,Mut2 Fitness
0,Q,2,A,Y,3.0,A,173.0,33.0,1.518,0.579
1,Q,2,A,Y,3.0,C,18.0,8.0,1.518,0.616
2,Q,2,A,Y,3.0,D,66.0,2.0,1.518,0.01
3,Q,2,A,Y,3.0,E,72.0,1.0,1.518,0.009
4,Q,2,A,Y,3.0,F,69.0,168.0,1.518,1.054


In [9]:
### Prepare mut_df for 2pt mutants ###

# Each double mutant contributes two rows (one per mutation) that share an id.
# Ids are offset by N1 because WT and the 1pt mutants occupy the first N1 ids.

# First mutation of each double mutant
df1 = pd.DataFrame()
df1['id'] = olson_df.index + N1
df1['l'] = olson_df['Mut1 Position']
df1['c'] = olson_df['Mut1 Mutation']

# Second mutation of each double mutant
df2 = pd.DataFrame()
df2['id'] = olson_df.index + N1
df2['l'] = olson_df['Mut2 Position']
df2['c'] = olson_df['Mut2 Mutation']

# Concatenate, represent positions as strings (so they can be validated
# below), then sort
mut2_df = pd.concat([df1, df2], axis=0)
mut2_df['l'] = [str(l) for l in mut2_df['l']]
mut2_df = mut2_df.sort_values(by=['id','l','c']).reset_index(drop=True)

# Get number of sequences (WT + 1pt + 2pt mutants).
# This is done BEFORE any rows are dropped, so that the id range N1..N2-1
# still has one id per row of olson_df.
N2 = max(mut2_df['id'])+1

# Drop entries where l cannot be cast as an integer.
# The pattern is a raw string (a bare '\.' in a normal string literal is an
# invalid escape sequence) and is matched against the WHOLE string, so values
# such as '12abc' are dropped here instead of raising in astype(float) below.
import re
position_pattern = re.compile(r'[0-9]+(?:\.[0-9]*)?')
ix = [bool(position_pattern.fullmatch(l)) for l in mut2_df['l']]

# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
mut2_df = mut2_df[ix].copy()

# Shift positions by 2 to get 0-based indices (same convention as mut1_df)
mut2_df['l'] = mut2_df['l'].astype(float).astype(int)-2

print(f'len(mut2_df): {len(mut2_df)}')
print(f'N2: {N2}')
mut2_df.head()

len(mut2_df): 1071834
N2: 536964


Unnamed: 0,id,l,c
0,1046,0,A
1,1046,1,A
2,1047,0,A
3,1047,1,C
4,1048,0,A


In [10]:
### Prepare y_df for 2pt mutants ###

# One row per double mutant, with ids N1..N2-1 in the row order of olson_df
y2_df = pd.DataFrame({'id': range(N1, N2)})
y2_df['input_ct'] = olson_df['Input Count']
y2_df['selected_ct'] = olson_df['Selection Count']
y2_df['hamming_dist'] = 2

# Read counts with a pseudocount of 1
selected2_plus1 = y2_df['selected_ct'] + 1
input2_plus1 = y2_df['input_ct'] + 1

# y: log2 enrichment relative to WT
y2_df['y'] = np.log2(selected2_plus1/input2_plus1) - wt_y

# dy: log2(e) * sqrt(1/(selected+1) + 1/(input+1)), stored as the
# uncertainty on y
y2_df['dy'] = np.log2(np.exp(1))*np.sqrt(1/selected2_plus1 + 1/input2_plus1)

print(f'len(y2_df): {len(y2_df)}')
y2_df.head()

len(y2_df): 535918


Unnamed: 0,id,input_ct,selected_ct,hamming_dist,y,dy
0,1046,173.0,33.0,2,-3.145154,0.270515
1,1047,18.0,8.0,2,-1.867676,0.583788
2,1048,66.0,2.0,2,-5.2708,0.851384
3,1049,72.0,1.0,2,-5.979498,1.03402
4,1050,69.0,168.0,2,0.481923,0.20506


## Concatenate datasets, save, and assess compression schemes

In [11]:
# Stack the WT, 1pt and 2pt mutation tables into a single frame
# with a fresh 0..n-1 index
mut_tables = [mut0_df, mut1_df, mut2_df]
mut_df = pd.concat(mut_tables, axis=0).reset_index(drop=True)
mut_df.head()

Unnamed: 0,id,l,c
0,1,0,A
1,2,0,C
2,3,0,D
3,4,0,E
4,5,0,F


In [12]:
# Stack the WT, 1pt and 2pt measurement tables
y_df = pd.concat([y0_df, y1_df, y2_df])
y_df['id'] = y_df['id'].astype(int)

# Every sequence id must appear exactly once
assert y_df['id'].is_unique

# Index the measurements by sequence id
y_df = y_df.set_index('id', drop=True)

# Randomly flag sequences for the training set, targeting an 80:20
# training:testing split. The seed is fixed so the split is reproducible.
target_split = .8
np.random.seed(0)
training_flag = np.random.rand(len(y_df)) < target_split
y_df.insert(loc=3, column='training_set', value=training_flag)

# Report the actual split
num_training = y_df['training_set'].sum()
split = num_training/len(y_df)
print(f'Split {100*split:.2f}% of sequences into training set')

# Preview df
y_df.head(10)

Split 80.02% of sequences into training set


Unnamed: 0_level_0,input_ct,selected_ct,hamming_dist,training_set,y,dy
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1759616.0,3041819.0,0,True,0.0,0.001366
1,14663.0,38476.0,1,True,0.602044,0.014001
2,13001.0,23023.0,1,True,0.034732,0.015827
3,11488.0,18085.0,1,True,-0.135053,0.017212
4,9501.0,15629.0,1,True,-0.071659,0.018767
5,4770.0,13332.0,1,True,0.692965,0.024338
6,12460.0,27778.0,1,True,0.366901,0.015555
7,33615.0,71252.0,1,False,0.294129,0.009546
8,13180.0,28931.0,1,False,0.344533,0.015161
9,10166.0,23382.0,1,True,0.411893,0.017139


In [13]:
# Build the final dataset from the WT sequence, the mutation table and the
# measurement table, then discard every row that contains a missing value.
# NOTE(review): dropna is kept in-place because it is not visible here whether
# mutations_to_dataset returns a new frame or y_df itself -- confirm.
data_df = mutations_to_dataset(y_df=y_df, mut_df=mut_df, wt_seq=wt_seq)
data_df.dropna(inplace=True)
data_df.head(n=10)

Unnamed: 0_level_0,input_ct,selected_ct,hamming_dist,training_set,y,dy,x
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1759616.0,3041819.0,0,True,0.0,0.001366,QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
1,14663.0,38476.0,1,True,0.602044,0.014001,AYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
2,13001.0,23023.0,1,True,0.034732,0.015827,CYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
3,11488.0,18085.0,1,True,-0.135053,0.017212,DYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
4,9501.0,15629.0,1,True,-0.071659,0.018767,EYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
5,4770.0,13332.0,1,True,0.692965,0.024338,FYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
6,12460.0,27778.0,1,True,0.366901,0.015555,GYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
7,33615.0,71252.0,1,False,0.294129,0.009546,HYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
8,13180.0,28931.0,1,False,0.344533,0.015161,IYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
9,10166.0,23382.0,1,True,0.411893,0.017139,KYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...


In [14]:
# Show size of compressed dataset file
file_name = 'gb1_data.csv.gz'
data_df.to_csv(file_name, compression='gzip', index=False)
print('data_df (zipped):')
!du -mh $file_name
!mv $file_name ../.

data_df (zipped):
 14M	gb1_data.csv.gz


In [15]:
# Show sizes of compressed mut_df and y_df files
# Then delete these files because they're not used

# mut_df
file_name = 'gb1_mut_df.csv.gz'
mut_df.to_csv(file_name, compression='gzip', index=False)
print('mut_df (zipped):')
!du -mh $file_name
!rm $file_name

# y_df
file_name = 'gb1_y_df.csv.gz'
y_df.to_csv(file_name, compression='gzip', index=False)
print('y_df (zipped):')
!du -mh $file_name
!rm $file_name

mut_df (zipped):
3.0M	gb1_mut_df.csv.gz
y_df (zipped):
 11M	gb1_y_df.csv.gz


In [16]:
# Show sizes of original files
print('Original files (unzipped):')
!du -mh $csv_1pt
!du -mh $csv_2pt

Original files (unzipped):
 20K	oslon_data_single_mutants_ambler.csv
 17M	oslon_data_double_mutants_ambler.csv


## Conclusion: It's better to just create data_df as we want it, then zip it, rather than store separate mutation, measurement, and wt sequence files. Zipping reduces the size just as much as this more complex storage scheme does.