In [1]:
# Standard imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import os
import sys

# Location of a local mavenn checkout to import from. Set the
# MAVENN_LOCAL_PATH environment variable to point at your own checkout;
# the fallback is the path this notebook was originally run with.
MAVENN_LOCAL_PATH = os.environ.get(
    'MAVENN_LOCAL_PATH',
    '/Users/tareen/Desktop/Research_Projects/2020_mavenn_github/mavenn_git_ssh_local')
sys.path.insert(0, MAVENN_LOCAL_PATH)

# Load mavenn
import mavenn
# Show which mavenn package was actually imported.
print(mavenn.__path__)

['/Users/tareen/Desktop/Research_Projects/2020_mavenn_github/mavenn_git_ssh_local/mavenn']


In [2]:
# Wild-type (WT) GB1 amino-acid sequence (55 residues).
GB1_WT_seq = 'QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE'

In [3]:
# Raw single-mutant counts from the GB1 paper (Olson et al.).
single_mutants_csv = 'GB1_single_mutant_data/oslon_data_single_mutants_ambler.csv'
GB1_data_single_mutants = pd.read_csv(single_mutants_csv)

# Preview the first rows of the raw table.
GB1_data_single_mutants.head()

Unnamed: 0,WT amino acid,Position,Mutation,Input Count,Selection Count
0,Q,2,A,14663,38476
1,Q,2,C,13001,23023
2,Q,2,D,11488,18085
3,Q,2,E,9501,15629
4,Q,2,F,4770,13332


In [4]:
def load_olson_data_GB1(csv_path='GB1_single_mutant_data/oslon_data_single_mutants_ambler.csv'):

    """
    Helper function to turn single mutant data provided by
    Olson et al. into sequence-values arrays.

    parameters
    ----------
    csv_path: (str)
        path to the single-mutant counts csv. It must contain the
        columns 'Position', 'Mutation', 'Input Count' and
        'Selection Count'. Defaults to the file used in this notebook.

    return
    ------
    gb1_df: (pd dataframe)
        dataframe with columns 'x' (sequence), 'input_ct',
        'selected_ct' and 'y' (log2 enrichment relative to WT).
        The first row is the WT sequence (y = 0); one row per csv
        row follows, in csv order. A pseudo count of 1 is added to
        the selection and input counts of each mutant.

    raises
    ------
    IndexError
        if a 'Position' value does not fall inside the WT sequence.

    """

    # GB1 WT sequence
    WT_seq = 'QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTE'

    # WT sequence library and selection counts.
    WT_input_count = 1759616
    WT_selection_count = 3041819

    # 'Position' value in the csv that corresponds to WT_seq[0].
    first_position = 2

    # load single mutants data
    single_mutants = pd.read_csv(csv_path, na_values="nan")

    # Build one mutated sequence per csv row; the WT sequence goes on top.
    # The local WT_seq is used (not a notebook-level global) so the function
    # works on a fresh kernel.
    sequences = [WT_seq]
    for position, mut in zip(single_mutants['Position'], single_mutants['Mutation']):
        mut_index = int(position) - first_position
        # A negative index would silently wrap around and mutate the wrong
        # residue, so reject anything outside the sequence explicitly.
        if not 0 <= mut_index < len(WT_seq):
            raise IndexError(
                f"Position {position} is outside the WT sequence "
                f"(valid positions: {first_position}-{first_position + len(WT_seq) - 1})")
        sequences.append(WT_seq[:mut_index] + mut + WT_seq[mut_index + 1:])

    # Enrichment of each mutant relative to WT. The pseudo count keeps the
    # ratio strictly positive so log2 stays finite for zero counts.
    input_counts = single_mutants['Input Count'].to_numpy(dtype=float)
    selection_counts = single_mutants['Selection Count'].to_numpy(dtype=float)
    wt_ratio = WT_selection_count / WT_input_count
    mutant_enrichment = ((selection_counts + 1) / (input_counts + 1)) / wt_ratio

    # WT enrichment relative to itself is 1, i.e. y = 0.
    enrichment = np.concatenate(([1.0], mutant_enrichment))

    gb1_df = pd.DataFrame(
        {
            'x': sequences,
            'input_ct': [WT_input_count] + single_mutants['Input Count'].tolist(),
            'selected_ct': [WT_selection_count] + single_mutants['Selection Count'].tolist(),
            'y': np.log2(enrichment),
        },
        columns=['x', 'input_ct', 'selected_ct', 'y'])
    return gb1_df

# Build the single-mutant dataframe (WT row first, then one row per mutant).
gb1_single_mutants_df = load_olson_data_GB1()

In [5]:
# Label every single-mutant row (and the WT row) as part of the training set.
gb1_single_mutants_df = gb1_single_mutants_df.assign(set='training')

In [6]:
# Display the single-mutant dataframe (columns: x, input_ct, selected_ct, y, set).
gb1_single_mutants_df

Unnamed: 0,x,input_ct,selected_ct,y,set
0,QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...,1759616,3041819,0.000000,training
1,AYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...,14663,38476,0.602044,training
2,CYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...,13001,23023,0.034731,training
3,DYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...,11488,18085,-0.135054,training
4,EYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...,9501,15629,-0.071659,training
...,...,...,...,...,...
1041,QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...,42231,26020,-1.488334,training
1042,QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...,29883,21199,-1.284983,training
1043,QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...,29251,12541,-2.011442,training
1044,QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...,17351,9023,-1.732937,training


In [7]:
# Load example data
# (mavenn's 'gb1' example dataset; it is combined with the single mutants in a later cell)
data_df = mavenn.load_example_dataset('gb1')

# NOTE(review): the test/training split below is commented out, so data_df
# keeps every row returned by load_example_dataset — confirm this is intended.
# # Separate test from data_df
# ix_test = data_df['set']=='test'
# test_df = data_df[ix_test].reset_index(drop=True)
# print(f'test N: {len(test_df):,}')

# # Remove test data from data_df
# data_df = data_df[~ix_test].reset_index(drop=True)
# print(f'training + validation N: {len(data_df):,}')
data_df

Unnamed: 0,set,dist,input_ct,selected_ct,y,x
0,training,2,173,33,-3.145154,AAKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
1,training,2,18,8,-1.867676,ACKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
2,training,2,66,2,-5.270800,ADKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
3,training,2,72,1,-5.979498,AEKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
4,training,2,69,168,0.481923,AFKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
...,...,...,...,...,...,...
530732,training,2,462,139,-2.515259,QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
530733,training,2,317,84,-2.693165,QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
530734,training,2,335,77,-2.896589,QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...
530735,training,2,148,28,-3.150861,QYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDD...


In [8]:
# Stack the single mutants on top of the example dataset and renumber the rows.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. Columns present in only one frame (e.g. 'dist') are
# filled with NaN for the rows of the other frame.
# NOTE: this cell rebinds data_df, so re-running it without re-running the
# cell that loads the example dataset stacks the single mutants on again.
data_df = pd.concat([gb1_single_mutants_df, data_df], ignore_index=True)

In [9]:
# Sanity check: (rows, columns) of the combined dataframe.
data_df.shape

(531783, 6)

In [10]:
# Save the single-mutant dataframe (not the combined data_df) to disk.
# The row index is written as the first, unnamed column.
single_mutants_out = 'GB1_single_mutant_data/GB1_single_mutant_data_mavenn_format.csv.gz'
gb1_single_mutants_df.to_csv(single_mutants_out, index=True)