In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
new_gapn_data_file = "20211018-GapN-data-up-to-SM160.csv"
data = pd.read_csv(new_gapn_data_file, index_col=0)
# data = data.dropna()
# drop the barcode of the sequences: MGSSHHHHHHSSGLVPRGSH
data['Sequence'] = data['Sequence'].apply(lambda x: x.replace('MGSSHHHHHHSSGLVPRGSH', '').strip('*'))
data.index = data.index.str.strip('p')

In [3]:
data['mutation_from_annot'] = data.WT_mutations.fillna('').str.split(' \+ ')\
    .apply(lambda x: set(xi.strip().rstrip() for xi in x))

In [4]:
query_row = data.loc['SM156']

def get_one_edit_matches(query_row):
    
    # Get mutations that are a subset of the current one
    subset_matches = data[data['mutation_from_annot'].apply(
        lambda x: x.issubset(query_row.mutation_from_annot))].drop(query_row.name)
    
    # If there's only one mutation, it's a direct anscestor of the WT
    if len(query_row.mutation_from_annot) == 1:
        return ('WT',)
    
    # Just checks how many missing mutations from each subset
    missing_lengths = (len(query_row.mutation_from_annot) - 
                       subset_matches.mutation_from_annot.apply(len))
    
    
    one_edit_matches = subset_matches[missing_lengths == 1]
    
    return tuple(one_edit_matches.index)

In [5]:
data['ancestor'] = data.apply(get_one_edit_matches, axis=1)

In [6]:
# These ones seem to be missing a direct ancestor
data[data['ancestor'] == ()]

Unnamed: 0,Sequence,WT_mutations,NAD+_initial_rate,NADP+_initial_rate,mutation_from_annot,ancestor
SM050,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,K176F + S210E,-0.00333,1.566667,"{K176F, S210E}",()
SM051,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,K176F + T179E,0.02,0.113333,"{K176F, T179E}",()
SM052,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,T179F + G209E + S210E,0.01333,0.006667,"{T179F, G209E, S210E}",()
SM054,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,N153A + K176H + T179E,-0.01333,0.04,"{K176H, N153A, T179E}",()
SM056,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,K176H + S210E,-0.04,1.253333,"{K176H, S210E}",()
SM057,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,P151S + N153A + K176F + T179E + Q180I + S210E,0.04,0.006667,"{P151S, S210E, T179E, Q180I, K176F, N153A}",()
SM058,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,P151A + S210E,-0.10667,3.493333,"{P151A, S210E}",()
SM060,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,P151S + N153A + T179E,-0.02333,-0.08,"{N153A, P151S, T179E}",()
SM061,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,K176F + R208W + S210E + I212V,0.03333,0.16,"{K176F, R208W, S210E, I212V}",()
SM062,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,N153A + K176F + I212V,0.00333,0.0,"{K176F, N153A, I212V}",()


In [11]:
# Some datapoints have multiple ancestors, some have none
exploded_data = data.explode(column='ancestor').dropna(subset=['ancestor'])
exploded_data

Unnamed: 0,Sequence,WT_mutations,NAD+_initial_rate,NADP+_initial_rate,mutation_from_annot,ancestor
WT,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,,0.17667,41.342222,{},WT
SM038,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,G209V,0.80667,39.055556,{G209V},WT
SM053,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,K176H + T179E + S210E,0.00333,0.053333,"{K176H, S210E, T179E}",SM056
SM055,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,P151A + T179E + Q180I + S210E,-0.00667,-0.013333,"{P151A, S210E, T179E, Q180I}",SM059
SM059,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,P151A + Q180I + S210E,0.00667,-0.106667,"{P151A, S210E, Q180I}",SM058
...,...,...,...,...,...,...
SM157,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,P151A + F152W + P177V + P178A + T179Q + R208Y ...,0.43000,3.920000,"{P178A, F152W, R208Y, P151A, T179Q, P177V, G209V}",SM155
SM157,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,P151A + F152W + P177V + P178A + T179Q + R208Y ...,0.43000,3.920000,"{P178A, F152W, R208Y, P151A, T179Q, P177V, G209V}",SM156
SM158,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,S150I + P151A + F152W + P177V + P178A + T179Q ...,0.01000,0.055556,"{P178A, F152W, R208Y, P151A, T179Q, P177V, G20...",SM157
SM159,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,P178A + T179Q + G209V + I233T,16.56000,21.205556,"{P178A, G209V, I233T, T179Q}",SM110


In [12]:
exploded_data['NAD+_rate_diff'] = (
    exploded_data['NAD+_initial_rate'].values - 
    data.reindex(exploded_data.ancestor)['NAD+_initial_rate'].values)

exploded_data.sort_values('NAD+_rate_diff')

Unnamed: 0,Sequence,WT_mutations,NAD+_initial_rate,NADP+_initial_rate,mutation_from_annot,ancestor,NAD+_rate_diff
SM144,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,F152Q + P178A + T179Q + G209V,0.62000,8.157778,"{P178A, T179Q, G209V, F152Q}",SM110,-15.64778
SM140,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,F152W + P178A + T179Q + G209V,1.03000,24.264444,"{P178A, G209V, F152W, T179Q}",SM110,-15.23778
SM129,MTKQYKNYVNGEWKLSENEFKIYEPASGAELGSVPAMSTEEVDYVY...,I19F + P178A + T179Q + R208Y + G209V,9.24511,20.014118,"{P178A, I19F, T179Q, R208Y, G209V}",SM126,-12.90460
SM150,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,F152S + P178A + T179Q + G209V,3.67000,7.070000,"{F152S, G209V, P178A, T179Q}",SM110,-12.59778
SM149,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,F152Q + P178A + T179E + G209V,0.10000,0.287778,"{P178A, G209V, T179E, F152Q}",SM108,-12.04667
...,...,...,...,...,...,...,...
SM109,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,G209V + P178S + T179Q,10.12889,27.493300,"{G209V, P178S, T179Q}",SM101,10.08111
SM108,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,G209V + P178A + T179E,12.14667,10.453300,"{G209V, P178A, T179E}",SM087,11.58445
SM125,MTKQYKNYVNGEWKLSENEFKIYEPASGAELGSVPAMSTEEVDYVY...,I19F + P178S + T179Q + G209V,22.37864,45.544518,"{G209V, T179Q, P178S, I19F}",SM109,12.24975
SM160,MTKQYKNYVNGEWKLSENEIKIYEPASGAELGSVPAMSTEEVDYVY...,P178A + T179Q + G209V + I233V,32.06000,23.254444,"{P178A, G209V, I233V, T179Q}",SM110,15.79222


In [13]:
exploded_data['NAD+_ancestor'] = data.reindex(exploded_data.ancestor)['NAD+_initial_rate'].values
exploded_data.to_csv('ancestor_rates.csv')