### Long-term fitness trajectories from the LTEE

The data are available in the Supplemental Material of Good et al. (Nature, 2017).
They can be downloaded from Ben Good's GitHub repository [here](https://github.com/benjaminhgood/LTEE-metagenomic/blob/master/additional_data/Concatenated.LTEE.data.all.csv).

We follow the procedures from Wiser et al. 2013 [here](https://doi.org/10.1126/science.1243357). From the Supplemental Material, we are given the following information. 


- Summarizing statistical procedures to fit the two models
- Models were fit to fitness trajectories using the ‘nls’ function in R.
- Model fits were compared using Bayesian information criterion (BIC) scores. These were then converted into odds ratios.
    - Table S1 shows the BIC scores and odds ratios for fits to subsets of the data: a) all 12 populations and all time points, b) excluding 3 populations with incomplete trajectories and c) excluding 6 populations that evolved hypermutability
    - Table S2 summarizes BIC scores for fits to individual populations. This also indicates if the population was truncated or a hypermutator 
    - Table S4 lists the estimated parameters for the power law fit

For the bigger picture, there is also a 2013 talk by Wiser on [YouTube](https://www.youtube.com/watch?v=CmyBn5Cezy4), which had 127 views as of September 2022.

In [None]:
### load data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

from scipy.optimize import curve_fit

In [None]:
import os
import os.path
from os import path

## output folder for the plots/lists produced in this notebook;
## it is created if missing and left untouched if it already exists
import os
FIG_DIR = './figures/LTEE_fit/'
os.makedirs(name=FIG_DIR, exist_ok=True)
print(f"All  plots will be stored in: \n{FIG_DIR}")

In [None]:
## read the averaged LTEE data; the path is relative to the current working directory
df = pd.read_csv(filepath_or_buffer='./output/LTEE_averaged_data.csv')

In [None]:


### run the shared setup script inside this notebook's namespace
## NOTE(review): exec gives the script full access to the notebook globals. Later cells
## use sns, DPI, PAD_INCHES and FIGHEIGHT_TRIPLET, which are presumably defined by this
## script - confirm. A regular import would make these dependencies explicit.
with open('setup_aesthetics.py') as setup_file:
    ## the context manager closes the file handle once the source has been read
    exec(setup_file.read())

In [None]:
## one color per fitness statistic
color_log_percycle   = 'navy'
color_logit_pergen   = 'firebrick'
color_logit_percycle = 'tab:grey'


In [None]:
color_hyper = 'magenta'
def hyperbolic(t, a, b):
    """Hyperbolic fitness trajectory: 1 + a*t / (t + b).

    Compare the first equation in the paper; this is the form used by the LTEE,
    assuming a starting fitness of 1.

    Parameters
    ----------
    t : time point(s) at which the model is evaluated (scalar or array-like
        supporting arithmetic with scalars)
    a, b : model parameters

    Returns
    -------
    The model value(s) at `t`.
    """
    gain = np.divide(a * t, t + b)
    return 1 + gain

color_power = 'cyan'
def powerlaw(t, a, b):
    """Power-law fitness trajectory: (b*t + 1)**a.

    Compare the second equation in the paper; this is the form used by the LTEE,
    assuming a starting fitness of 1.

    Parameters
    ----------
    t : time point(s) at which the model is evaluated (scalar or array-like
        supporting arithmetic with scalars)
    a : exponent of the power law
    b : rate multiplying the time inside the base

    Returns
    -------
    The model value(s) at `t`.
    """
    base = b * t + 1
    return np.power(base, a)

In [None]:
## drop superfluous columns that are not needed for the fits
## (each label is listed exactly once)
columns_auxiliary = ['Nwt.0', 'Nmut.0', 'Nwt.1', 'Nmut.1']

## NOTE: this rebinds df, so re-running the cell without reloading the data raises a
## KeyError because the columns are already gone
df = df.drop(columns=columns_auxiliary)

In [None]:
## the statistics are based at zero; add a copy of each one shifted by +1
## (the fitted models assume a starting fitness of 1)
for statistic in ('logit_pergen', 'logit_percycle', 'log_percycle'):
    df[statistic + '+1'] = df[statistic] + 1


## reproduce fits to grand mean

### recreate different datasets

In [None]:
### empty dict, intended for looking up the dataframe of each subset later on
subset2data = dict()

### Define list of targets

In [None]:
## each fit target (a column of the data) paired with its LaTeX axis label
targets_and_labels = [
    ('logit_pergen+1',   r'$s^{\mathrm{logit}}_{\mathrm{gen}}+1$'),
    ('logit_percycle+1', r'$s^{\mathrm{logit}}_{\mathrm{cycle}}+1$'),
    ('log_percycle+1',   r'$s^{\mathrm{log}}_{\mathrm{cycle}}+1$'),
]
list_targets = [pair[0] for pair in targets_and_labels]
list_labels  = [pair[1] for pair in targets_and_labels]


### Perform model fit on each dataset

In [None]:
## df_averaged is a second name for the same DataFrame object as df (no copy is made)
df_averaged = df

In [None]:
### set up an empty DataFrame to store the fit results;
### the fitting loop adds one row per target and one column per stored quantity
df_averaged_fit = pd.DataFrame()

In [None]:

## Fit both models to every target column and store, per target (row of df_averaged_fit):
##   n_datapoints, <model>_a, <model>_b, <model>_rss, <model>_rsquared
## where <model> is 'hyper' or 'powerlaw'.
for target in list_targets:

    ## keep only rows where the target is observed: curve_fit rejects NaN input
    ## (check_finite), and the fit then uses exactly the rows counted in n_datapoints
    observed = df_averaged[df_averaged[target].notna()]
    n_datapoints = observed.shape[0]
    df_averaged_fit.at[target, 'n_datapoints'] = n_datapoints

    t = observed['Generation']
    y = observed[target]

    ## total sum of squares around the mean, shared by both R^2 values
    tss = np.power(y - y.mean(), 2).sum()

    for prefix, model in (('hyper', hyperbolic), ('powerlaw', powerlaw)):

        ## least-squares fit; only the optimal parameters (a, b) are kept
        popt = curve_fit(f=model, xdata=t, ydata=y)[0]
        df_averaged_fit.at[target, prefix + '_a'] = popt[0]
        df_averaged_fit.at[target, prefix + '_b'] = popt[1]

        ## fitted trajectory at the observed time points
        y_hat = model(t, *popt)

        ## residual sum of squares and R^2 = 1 - RSS/TSS
        rss = np.power(y_hat - y, 2).sum()
        rsquared = 1 - (rss/tss)
        df_averaged_fit.at[target, prefix + '_rss'] = rss
        df_averaged_fit.at[target, prefix + '_rsquared'] = rsquared


In [None]:
### plot the data together with both fitted trajectories, one panel per fitness statistic
## NOTE(review): sns, FIGHEIGHT_TRIPLET, DPI and PAD_INCHES are not defined in the cells
## shown here; presumably they are provided by setup_aesthetics.py - confirm

fig, axes = plt.subplots(3, 1, sharex=True, figsize=(2*FIGHEIGHT_TRIPLET, 2*FIGHEIGHT_TRIPLET))

for target, ylabel, ax in zip(list_targets, list_labels, axes):

    t = df_averaged['Generation']
    y = df_averaged[target]

    ## plot raw data
    sns.scatterplot(x=t, y=y, ax=ax, color='grey', alpha=1)

    ## hyperbolic model: read the stored fit, recompute the trajectory and plot it
    popt_hyperbolic = df_averaged_fit.loc[target, ['hyper_a', 'hyper_b']].values
    y_hat = hyperbolic(t, *popt_hyperbolic)
    hyper_rsquared = df_averaged_fit.at[target, 'hyper_rsquared']
    hyper_label = fr"hyperbolic:$R^2={hyper_rsquared:.3f}$"
    sns.lineplot(x=t, y=y_hat, color=color_hyper, ax=ax, lw=3,
                 label=hyper_label)

    ## powerlaw model: read the stored fit, recompute the trajectory and plot it
    popt_powerlaw = df_averaged_fit.loc[target, ['powerlaw_a', 'powerlaw_b']].values
    y_hat = powerlaw(t, *popt_powerlaw)
    powerlaw_rsquared = df_averaged_fit.at[target, 'powerlaw_rsquared']
    powerlaw_label = fr"powerlaw: $R^2={powerlaw_rsquared:.3f}$"
    sns.lineplot(x=t, y=y_hat, color=color_power, ax=ax, lw=3,
                 label=powerlaw_label)

    ## the R^2 values of both fits are shown in the legend
    ax.legend(frameon=False, labelspacing=0.1, borderpad=-0.1)

    ax.set_ylabel(ylabel)


axes[-1].set_xlabel('time [#generations in evolution experiment]')

## the resolution keyword of savefig is the lowercase `dpi`
fig.savefig(FIG_DIR + "fit_trajectory_fitness+1_allpoints.pdf", dpi=DPI, bbox_inches='tight', pad_inches=PAD_INCHES)

#### Expected parameters for the power-law fit

    a = 0.0950
    b = 0.00515

In [None]:
## fitted power-law parameters (a, b) for the target 'logit_pergen+1'
df_averaged_fit.loc['logit_pergen+1', ['powerlaw_a', 'powerlaw_b']]

### Calculate correlation coefficient

In [None]:
from scipy.stats import pearsonr, spearmanr

In [None]:
### Pearson correlation between the observed data and the hyperbolic fit

## observed trajectory
t_vec = df_averaged['Generation']
y = df_averaged['logit_pergen+1']

## evaluate the hyperbolic model at the observed times, using the stored fit parameters
popt_hyperbolic = df_averaged_fit.loc['logit_pergen+1', ['hyper_a', 'hyper_b']].to_numpy()
y_hat = hyperbolic(t_vec, *popt_hyperbolic)

## first entry of the pearsonr result is the correlation coefficient
corr_hyperbolic = pearsonr(y, y_hat)[0]
print(corr_hyperbolic)

In [None]:
## squared Pearson correlation of data and hyperbolic fit
corr_hyperbolic**2

In [None]:
## stored R^2 of the hyperbolic fit for the target 'logit_pergen+1'
df_averaged_fit.loc['logit_pergen+1', 'hyper_rsquared']

In [None]:
### Pearson correlation between the observed data and the powerlaw fit

## observed trajectory
t_vec = df_averaged['Generation']
y = df_averaged['logit_pergen+1']

## evaluate the powerlaw model at the observed times, using the stored fit parameters
popt_powerlaw = df_averaged_fit.loc['logit_pergen+1', ['powerlaw_a', 'powerlaw_b']].to_numpy()
y_hat = powerlaw(t_vec, *popt_powerlaw)

## first entry of the pearsonr result is the correlation coefficient
corr_powerlaw = pearsonr(y, y_hat)[0]
print(corr_powerlaw)

### Calculate grandmean trajectory

In [None]:
## empty container for the grand mean: same columns as df_averaged,
## without 'Population' and indexed by 'Generation'
df_grandmean = (
    pd.DataFrame(columns=df_averaged.columns)
    .drop(columns=['Population'])
    .set_index('Generation')
)

In [None]:
## sorted list of the distinct time points found in the 'Generation' column
list_timepoints = sorted(set(df_averaged['Generation'].values))

In [None]:
## average every statistic over all rows (populations) sharing a time point
averaged_statistics = ['xmut.0', 'xmut.1', 'logit_percycle', 'logit_pergen', 'log_percycle']

for t in list_timepoints:
    rows_at_t = df_averaged['Generation'] == t
    ## number of rows contributing to the mean at this time point
    df_grandmean.at[t, 'no_replicates'] = sum(rows_at_t)
    for statistic in averaged_statistics:
        df_grandmean.at[t, statistic] = df_averaged.loc[rows_at_t, statistic].mean()

## shifted copies of the averaged statistics, based at fitness = 1
for statistic in ('logit_percycle', 'log_percycle', 'logit_pergen'):
    df_grandmean[statistic + '+1'] = df_grandmean[statistic] + 1


## Calculate correlation with mean trajectory

In [None]:
### Pearson correlation between the grand-mean trajectory and the hyperbolic fit

## grand-mean trajectory; the time points are the index of df_grandmean
t_vec = df_grandmean.index
y = df_grandmean['logit_pergen+1']

## evaluate the hyperbolic model with the parameters stored in df_averaged_fit
popt_hyperbolic = df_averaged_fit.loc['logit_pergen+1', ['hyper_a', 'hyper_b']].to_numpy()
y_hat = hyperbolic(t_vec, *popt_hyperbolic)

## first entry of the pearsonr result is the correlation coefficient
corr_hyperbolic = pearsonr(y, y_hat)[0]
print(corr_hyperbolic)



In [None]:
### Pearson correlation between the grand-mean trajectory and the powerlaw fit

## grand-mean trajectory; the time points are the index of df_grandmean
t_vec = df_grandmean.index
y = df_grandmean['logit_pergen+1']

## evaluate the powerlaw model with the parameters stored in df_averaged_fit
popt_powerlaw = df_averaged_fit.loc['logit_pergen+1', ['powerlaw_a', 'powerlaw_b']].to_numpy()
y_hat = powerlaw(t_vec, *popt_powerlaw)

## first entry of the pearsonr result is the correlation coefficient
corr_powerlaw = pearsonr(y, y_hat)[0]
print(corr_powerlaw)

##### expected values from the main text

     correlation of the mean trajectory with the fit of the
     powerlaw model:   r = 0.986
     hyperbolic model: r = 0.969

In [None]:
## correlations of both fits with the mean trajectory, one value per line
print(corr_powerlaw, corr_hyperbolic, sep='\n')

### fit to the grandmean

In [None]:
## prepare a new, empty data container for storing the fit results of the grand mean
df_grandmean_fit = pd.DataFrame()

## add a 'Generation' column to the input data, copied from the index of df_grandmean
df_grandmean['Generation'] = df_grandmean.index

In [None]:

## Fit both models to every target column of the grand mean and store, per target
## (row of df_grandmean_fit):
##   n_datapoints, <model>_a, <model>_b, <model>_rss, <model>_rsquared
## where <model> is 'hyper' or 'powerlaw'.
for target in list_targets:

    ## keep only rows where the target is observed: curve_fit rejects NaN input
    ## (check_finite), and the fit then uses exactly the rows counted in n_datapoints
    observed = df_grandmean[df_grandmean[target].notna()]
    n_datapoints = observed.shape[0]
    df_grandmean_fit.at[target, 'n_datapoints'] = n_datapoints

    t = observed['Generation']
    y = observed[target]

    ## total sum of squares around the mean, shared by both R^2 values
    tss = np.power(y - y.mean(), 2).sum()

    for prefix, model in (('hyper', hyperbolic), ('powerlaw', powerlaw)):

        ## least-squares fit; only the optimal parameters (a, b) are kept
        popt = curve_fit(f=model, xdata=t, ydata=y)[0]
        df_grandmean_fit.at[target, prefix + '_a'] = popt[0]
        df_grandmean_fit.at[target, prefix + '_b'] = popt[1]

        ## fitted trajectory at the observed time points
        y_hat = model(t, *popt)

        ## residual sum of squares and R^2 = 1 - RSS/TSS
        rss = np.power(y_hat - y, 2).sum()
        rsquared = 1 - (rss/tss)
        df_grandmean_fit.at[target, prefix + '_rss'] = rss
        df_grandmean_fit.at[target, prefix + '_rsquared'] = rsquared


In [None]:
### plot the grand-mean data together with both fitted trajectories,
### one panel per fitness statistic
## NOTE(review): sns, FIGHEIGHT_TRIPLET, DPI and PAD_INCHES are not defined in the cells
## shown here; presumably they are provided by setup_aesthetics.py - confirm

fig, axes = plt.subplots(3, 1, sharex=True, figsize=(2*FIGHEIGHT_TRIPLET, 2*FIGHEIGHT_TRIPLET))

for target, ylabel, ax in zip(list_targets, list_labels, axes):

    t = df_grandmean['Generation']
    y = df_grandmean[target]

    ## plot raw data; zorder = 10 is passed so the points are drawn above the fitted lines
    sns.scatterplot(x=t, y=y, ax=ax, color='grey', zorder=10, alpha=1)

    ## hyperbolic model: read the stored fit, recompute the trajectory and plot it
    popt_hyperbolic = df_grandmean_fit.loc[target, ['hyper_a', 'hyper_b']].values
    y_hat = hyperbolic(t, *popt_hyperbolic)
    hyper_rsquared = df_grandmean_fit.at[target, 'hyper_rsquared']
    hyper_label = fr"hyperbolic:$R^2={hyper_rsquared:.3f}$"
    sns.lineplot(x=t, y=y_hat, color=color_hyper, ax=ax, lw=3,
                 label=hyper_label)

    ## powerlaw model: read the stored fit, recompute the trajectory and plot it
    popt_powerlaw = df_grandmean_fit.loc[target, ['powerlaw_a', 'powerlaw_b']].values
    y_hat = powerlaw(t, *popt_powerlaw)
    powerlaw_rsquared = df_grandmean_fit.at[target, 'powerlaw_rsquared']
    powerlaw_label = fr"powerlaw: $R^2={powerlaw_rsquared:.3f}$"
    sns.lineplot(x=t, y=y_hat, color=color_power, ax=ax, lw=3,
                 label=powerlaw_label)

    ## the R^2 values of both fits are shown in the legend
    ax.legend(frameon=False, labelspacing=0.1, borderpad=-0.1)

    ax.set_ylabel(ylabel)


axes[-1].set_xlabel('time [#generations in evolution experiment]')

## the resolution keyword of savefig is the lowercase `dpi`
fig.savefig(FIG_DIR + "fit_trajectory_fitness+1_mean_timeseries.pdf", dpi=DPI, bbox_inches='tight', pad_inches=PAD_INCHES)