### Long-term fitness trajectories from the LTEE

Data available in the Supplemental Material of Good et al. Nature 2017. 
The file can be downloaded from Ben Good's GitHub repository [here](https://github.com/benjaminhgood/LTEE-metagenomic/blob/master/additional_data/Concatenated.LTEE.data.all.csv).

We follow the procedures from Wiser et al. 2013 [here](https://doi.org/10.1126/science.1243357). From the Supplemental Material, we are given the following information. 


- Summarizing statistical procedures to fit the two models
- Models were fit to the fitness trajectories using the `nls` function in R. 
- Model fits were compared using their Bayesian information criterion (BIC) scores. These were then converted into an odds ratio. 
    - Table S1 shows the BIC scores and odds ratios for fits to subsets of the data: a) all 12 populations and all time points, b) excluding 3 populations with incomplete trajectories and c) excluding 6 populations that evolved hypermutability
    - Table S2 summarizes BIC scores for fits to individual populations. This also indicates if the population was truncated or a hypermutator 
    - Table S4 lists the estimated parameters for the power law fit

On the bigger picture, there is also the talk from 2013 by Wiser on [Youtube](https://www.youtube.com/watch?v=CmyBn5Cezy4) with 127 views as of September 2022. 

In [None]:
### load data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

from scipy.optimize import curve_fit

In [None]:
import os
import os.path
from os import path

## create export directory if necessary
## foldernames for output plots/lists produced in this notebook
import os
# Folder that receives the figures written by this notebook; it is created
# on demand and left untouched if it already exists.
FIG_DIR = './figures/LTEE_fit/'
os.makedirs(name=FIG_DIR, exist_ok=True)
print("All  plots will be stored in: \n" + FIG_DIR)

In [None]:
# Load the concatenated LTEE data table from the local data folder.
df = pd.read_csv(filepath_or_buffer='./data/Concatenated.LTEE.data.all.csv')

In [None]:


### execute script to load modules here
# The script is deliberately executed inside this notebook's namespace, so
# every name it defines becomes available to the cells below.
# NOTE(review): presumably this is where FIGWIDTH_TRIPLET, FIGHEIGHT_TRIPLET,
# DPI, PAD_INCHES and `sns` come from -- confirm against setup_aesthetics.py.
# NOTE(review): exec runs arbitrary code; only use it on a trusted local file.
with open('setup_aesthetics.py') as setup_file:
    # compile with the real filename so tracebacks point into the script
    # instead of the anonymous '<string>'; the `with` block closes the file
    exec(compile(setup_file.read(), 'setup_aesthetics.py', 'exec'))

### explanation of column headers and labels

    607 is the wild-type strain 'REL607'
    D.0 is the dilution factor applied to count colonies in initial inoculum. 
    D.1 is the dilution factor applied to count colonies in the saturated population. We expect D.1 = 100*D.0. 
    

In [None]:
# Peek at the first two rows to see the column headers described above.
df.head(n=2)

In [None]:

### split into two subsets, according to the Ara-marker of evolving population
# a '+' anywhere in the population label marks an Ara-positive lineage
is_Ara_positive = np.array(['+' in v for v in df['Population'].values])

# Take explicit copies: new columns are assigned to both subsets afterwards,
# and writing into a boolean-mask slice of `df` triggers pandas'
# SettingWithCopyWarning.
df_pos = df[is_Ara_positive].copy()   # Ara-positive lineages
df_neg = df[~is_Ara_positive].copy()  # Ara-negative lineages

In [None]:
## treat Ara-positive populations
# sanity check: the wild-type competitor is always the red population
assert (df_pos['Red.Pop'] == '606').all()

### re-construct population sizes as colony count times dilution factor
### (suffix .0 / .1 follows the D.0 / D.1 columns)
for size_col, count_col, dilution_col in (
    ('Nwt.0', 'Red.0', 'D.0'),
    ('Nmut.0', 'White.0', 'D.0'),
    ('Nwt.1', 'Red.1', 'D.1'),
    ('Nmut.1', 'White.1', 'D.1'),
):
    df_pos[size_col] = df_pos[count_col] * df_pos[dilution_col]



In [None]:
## treat Ara-negative populations
# sanity check: the wild-type competitor is always the white population
assert (df_neg['White.Pop'] == '607').all()

### re-construct population sizes as colony count times dilution factor
### (colours are swapped relative to the Ara-positive case)
for size_col, count_col, dilution_col in (
    ('Nwt.0', 'White.0', 'D.0'),
    ('Nmut.0', 'Red.0', 'D.0'),
    ('Nwt.1', 'White.1', 'D.1'),
    ('Nmut.1', 'Red.1', 'D.1'),
):
    df_neg[size_col] = df_neg[count_col] * df_neg[dilution_col]


In [None]:
### join the two marker subsets back into a single table

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the replacement and, like append, keeps the original row labels.
df = pd.concat([df_pos, df_neg])

## reconstruct frequencies of the evolving population
## (suffix .0 / .1 as in the reconstructed population sizes)
df['xmut.0'] = df['Nmut.0']/(df['Nwt.0'] + df['Nmut.0'])
df['xmut.1'] = df['Nmut.1']/(df['Nwt.1']  + df['Nmut.1'])

In [None]:
### reconstruct fitness statistics

# s: change in the log-ratio (evolved : wild-type) between time point 0 and 1
log_ratio_start = np.log(df['Nmut.0'] / df['Nwt.0'])
log_ratio_end = np.log(df['Nmut.1'] / df['Nwt.1'])
df['s'] = log_ratio_end - log_ratio_start

# W: log fold-change of the evolved population divided by that of the wild-type
log_growth_mut = np.log(df['Nmut.1'] / df['Nmut.0'])
log_growth_wt = np.log(df['Nwt.1'] / df['Nwt.0'])
df['W'] = np.divide(log_growth_mut, log_growth_wt)

# delta_log: change in the log-frequency of the evolved population
log_freq_start = np.log(df['xmut.0'])
df['delta_log'] = np.log(df['xmut.1']) - log_freq_start

In [None]:
### consistency check: does the recomputed 'W' agree with the 'Fitness'
### column that ships with the dataset? (NaN is treated as equal to NaN)
np.allclose(a=df['W'], b=df['Fitness'], equal_nan=True)

In [None]:
## manual check: size of the largest discrepancy between 'W' and 'Fitness'
df['gap'] = df['W'] - df['Fitness']
# Report the largest deviation in absolute value: the signed maximum would
# not reveal a large negative gap.
print(df['gap'].abs().max())


In [None]:
# one plotting colour per fitness statistic ('s', 'W', 'delta_log')
color_s, color_W, color_deltalog = 'tab:grey', 'firebrick', 'navy'


In [None]:
color_hyper = 'tab:red'
def hyperbolic(t, a, b):
    """Hyperbolic trajectory ``1 + a*t / (t + b)``.

    Compare the first equation in the paper. ``t`` may be a scalar or an
    array-like supporting arithmetic; ``a`` and ``b`` are the two fit
    parameters.
    """
    saturating_gain = np.divide(a * t, t + b)
    return 1 + saturating_gain

color_power = 'tab:blue'
def powerlaw(t, a, b):
    """Power-law trajectory ``(b*t + 1) ** a``.

    Compare the second equation in the paper. ``t`` may be a scalar or an
    array-like supporting arithmetic; ``a`` is the exponent and ``b`` scales
    the time axis.
    """
    base = b * t + 1
    return np.power(base, a)

In [None]:
## we drop some superfluous columns: raw counts, dilution factors, competitor
## labels, reconstructed population sizes and the consistency-check columns
## ('Nwt.0' was listed twice in the original list; each label appears once now)
columns_auxiliary = ['Red.0', 'White.0', 'Red.1', 'White.1', 'D.0', 'D.1', 'gap',
                     'White.Pop', 'Red.Pop', 'Fitness',
                     'Nwt.0', 'Nmut.0', 'Nwt.1', 'Nmut.1']

df = df.drop(columns=columns_auxiliary)

In [None]:
## the alternative statistics start at zero; adding 1 gives them the same
## baseline as the model functions, which equal 1 at t = 0
for source_col, shifted_col in (('s', 's+1'), ('delta_log', 'delta_log+1')):
    df[shifted_col] = df[source_col] + 1

## reproduce fits to grand mean

In [None]:
## population labels that are excluded in the two reduced subsets

# populations listed as truncated
truncated_to_remove = [
    'Ara + 6',
    'Ara - 2',
    'Ara - 3',
]
# populations listed as hypermutators
hypermutators_to_remove = [
    'Ara - 1',
    'Ara - 2',
    'Ara - 3',
    'Ara - 4',
    'Ara + 3',
    'Ara + 6',
]

### recreate different datasets

In [None]:
### lookup table: subset label -> averaged dataframe (filled in further below)

subset2data = dict()

In [None]:
## columns that identify one time point of one population, and the statistics
## that are averaged over the replicates of that time point
group_keys = ['Population', 'Generation']
averaged_columns = ['xmut.0', 'xmut.1', 's', 'W', 'delta_log']

for subset_label, pop_to_exclude in zip(['all', 'truncated_removed', 'hypermutators_removed'],
                                        [[], truncated_to_remove, hypermutators_to_remove]):

    ### remove populations
    df_subset = df[~df['Population'].isin(pop_to_exclude)].copy(deep=True)

    ### we create a new dataframe where each timepoint and population is only represented once
    df_subset = df_subset.sort_values(by=['Population', 'Generation', 'Rep'])  # first sort for nice look
    df_averaged = df_subset.drop_duplicates(group_keys).copy(deep=True)
    df_averaged = df_averaged.drop(['Rep'], axis=1)
    df_averaged = df_averaged.reset_index()

    ## we average across the replicates with a single groupby instead of
    ## re-scanning the whole table once per averaged row
    ## (assumes 'Population' and 'Generation' have no missing values:
    ## groupby drops rows with NaN keys)
    replicates = df_subset.groupby(group_keys, sort=False)
    # align the per-group results with the rows of df_averaged by key
    row_keys = pd.MultiIndex.from_frame(df_averaged[group_keys])

    # as a collateral statistic, we count the number of replicates
    # (all rows of the group, including those with missing statistics)
    df_averaged['no_replicates'] = replicates.size().reindex(row_keys).to_numpy()

    # the group mean skips NaN; a group without any valid value stays NaN
    replicate_means = replicates[averaged_columns].mean().reindex(row_keys)
    for v in averaged_columns:
        df_averaged[v] = replicate_means[v].to_numpy()

    ## shift data points for alternative statistics, which are based at zero
    df_averaged['s+1'] = df_averaged['s'] + 1
    df_averaged['delta_log+1'] = df_averaged['delta_log'] + 1

    ## store reference
    subset2data[subset_label] = df_averaged

In [None]:
### plot distribution for degree of replication
# NOTE(review): FIGWIDTH_TRIPLET, FIGHEIGHT_TRIPLET, DPI and PAD_INCHES are not
# defined in the visible cells; presumably they come from setup_aesthetics.py.

for subset_label in ['all', 'truncated_removed', 'hypermutators_removed']:
    df_averaged = subset2data[subset_label]

    assert df_averaged['no_replicates'].min() >= 2, 'expect at least 2 replicates per point'

    ### manual check: does everyone have at least 2 replicates
    fig, ax = plt.subplots(figsize=(FIGWIDTH_TRIPLET, FIGHEIGHT_TRIPLET))
    # draw on the axes created above instead of relying on the "current axes"
    # (unit-width bins centred on the integers 1..10; log-scaled counts)
    df_averaged['no_replicates'].hist(ax=ax, bins=np.arange(0.5, 11.5), log=True)
    ax.set_xlabel('number of replicates for a single timepoint and population')
    ax.set_ylabel('count')
    ax.set_title('set with ' + subset_label + f' (n={df_averaged.shape[0]})')
    # the resolution keyword of savefig is lower-case `dpi`; an upper-case
    # `DPI=` keyword is not a savefig option
    fig.savefig(FIG_DIR + f"histogram_of_replicates_for_set_with_{subset_label}.pdf",
                dpi=DPI, bbox_inches='tight', pad_inches=PAD_INCHES)

    ## remove cluttering output: only the figure of the full dataset stays open
    if subset_label != 'all':
        plt.close(fig)

In [None]:
## manual inspection of outliers in the replicate counts (full dataset)
subset_to_inspect = 'all'
df_averaged = subset2data[subset_to_inspect]

## uncomment one of the following lines to list the corresponding rows
#df_averaged[df_averaged['no_replicates'] == 10]
#df_averaged[df_averaged['no_replicates'] == 4].sort_values('Generation')

## compute alternative statistics
## we add constant to data values, so we can fit them to the same model

### Perform model fit on each dataset

In [None]:
### set up Dataframe to store results: one row per (subset, target statistic)
# DataFrame.append was removed in pandas 2.0, so the per-subset frames are
# collected in a list and concatenated once.
df_results = pd.concat([
    pd.DataFrame(data={'subset_label': 3*[subset_label], 'target': ['W', 's+1', 'delta_log+1']})
    for subset_label in ['all', 'truncated_removed', 'hypermutators_removed']
])

# both columns move into the index, so results are addressed as
# df_results.at[(subset_label, target), column]
df_results = df_results.set_index(['subset_label', 'target'])


In [None]:
## one entry per model: prefix of the result columns, model function,
## colour of the fitted curve, legend format for the two fitted parameters
models_to_fit = [
    ('hyper', hyperbolic, color_hyper, 'hyperbolic: a=%.2f, b=%.0f'),
    ('powerlaw', powerlaw, color_power, 'powerlaw: a=%5.4f, b=%5.5f'),
]

# NOTE(review): `sns`, FIGWIDTH_TRIPLET and FIGHEIGHT_TRIPLET are not defined in
# the visible cells; presumably they come from setup_aesthetics.py -- confirm.
for subset_label in ['all', 'truncated_removed', 'hypermutators_removed']:
    df_averaged = subset2data[subset_label]

    for target in ['W', 's+1', 'delta_log+1']:

        ## keep only finite data points: curve_fit raises on NaN/inf input, and
        ## the stored number of data points must match what was actually fitted
        is_valid = np.isfinite(df_averaged[target])
        t = df_averaged.loc[is_valid, 'Generation']
        y = df_averaged.loc[is_valid, target]

        df_results.at[(subset_label, target), 'n_datapoints'] = y.shape[0]

        fig, ax = plt.subplots(figsize=(1.5*FIGWIDTH_TRIPLET, 1.1*FIGHEIGHT_TRIPLET))

        ## plot raw data
        sns.scatterplot(x=t, y=y, ax=ax, color='grey')
        sns.lineplot(x=t, y=y, ax=ax, color='grey')

        ## total sum of squares, shared by the R^2 of both models
        tss = np.power(y - y.mean(), 2).sum()

        for prefix, model, color, param_format in models_to_fit:
            ## least-squares fit from curve_fit's default start values
            popt, _ = curve_fit(f=model, xdata=t, ydata=y)
            ## store
            df_results.at[(subset_label, target), prefix + '_a'] = popt[0]
            df_results.at[(subset_label, target), prefix + '_b'] = popt[1]
            ## compute trajectory
            y_hat = model(t, *popt)
            ## store sum of residuals squared and coefficient of determination
            rss = np.power(y_hat - y, 2).sum()
            rsquared = 1 - (rss/tss)
            df_results.at[(subset_label, target), prefix + '_rss'] = rss
            df_results.at[(subset_label, target), prefix + '_rsquared'] = rsquared
            ## plot trajectory
            param_label = param_format % tuple(popt)
            sns.lineplot(x=t, y=y_hat, color=color, ax=ax,
                         label=param_label + rf", $R^2={rsquared:.2f}$")

        ## remove cluttering output: only figures of the full dataset stay open
        if subset_label != 'all':
            plt.close(fig)

## Calculate model comparison stats


In [None]:
## set parameters

k = 3 # two curve parameters plus one extra parameter for variance of error distribution


for subset_label in ['all', 'truncated_removed', 'hypermutators_removed']:
    for target in ['W', 's+1', 'delta_log+1']:

        ## read number of datapoints
        n = df_results.at[(subset_label, target), 'n_datapoints']

        for prefix in ['hyper', 'powerlaw']:
            ## read model results
            rss = df_results.at[(subset_label, target), prefix + '_rss']

            ## maximised Gaussian log-likelihood of a least-squares fit
            ## (error variance estimated as rss/n)
            log_likelihood = -n/2*(np.log(2*np.pi) + 1) - n/2*np.log(rss/n)

            ## Information criteria derived from the log-likelihood. The former
            ## shortcut n*log(rss) differs from -2*log_likelihood by a term that
            ## depends only on n, so differences between the two models (which
            ## share n and k) are unchanged; only the absolute scores shift to
            ## the standard definition.
            aic = 2*k - 2*log_likelihood
            bic = k*np.log(n) - 2*log_likelihood

            ## store (the '<prefix>_likelihood' column holds the LOG-likelihood)
            df_results.at[(subset_label, target), prefix + '_aic'] = aic
            df_results.at[(subset_label, target), prefix + '_bic'] = bic
            df_results.at[(subset_label, target), prefix + '_likelihood'] = log_likelihood



In [None]:
### model comparison based on BIC: difference of the scores (hyperbolic minus
### power law) and the ratio of the maximised likelihoods (power law over hyperbolic)

df_results['delta_bic'] = df_results['hyper_bic'].sub(df_results['powerlaw_bic'])

# the '*_likelihood' columns are subtracted and then exponentiated
log_likelihood_gain = df_results['powerlaw_likelihood'] - df_results['hyper_likelihood']
df_results['likelihood_ratio'] = np.exp(log_likelihood_gain)


In [None]:
## compute model comparison stats based on AIC
df_results['delta_aic'] = df_results['hyper_aic'] - df_results['powerlaw_aic']

## Relative likelihood exp((aic_min - aic_max)/2) of the model with the larger
## AIC, see
## https://en.wikipedia.org/wiki/Akaike_information_criterion#How_to_use_AIC_in_practice
## Since aic_min - aic_max equals -|delta_aic|, all rows are computed at once.
df_results['akaike_pvalue'] = np.exp(-df_results['delta_aic'].abs() / 2)

In [None]:
## columns of the summary table: BIC-based statistics first, then AIC-based ones
col_to_print = [
    'hyper_bic', 'powerlaw_bic', 'delta_bic', 'likelihood_ratio',
    'hyper_aic', 'powerlaw_aic', 'delta_aic', 'akaike_pvalue',
]

df_results.loc[:, col_to_print]