### Long-term fitness trajectories from the LTEE

The data are available in the Supplemental Material of Good et al., Nature 2017. 
They can be downloaded from Ben Good's GitHub repository [here](https://github.com/benjaminhgood/LTEE-metagenomic/blob/master/additional_data/Concatenated.LTEE.data.all.csv)

We follow the procedures from Wiser et al. 2013 [here](https://doi.org/10.1126/science.1243357). From the Supplemental Material, we are given the following information. 


- Summarizing statistical procedures to fit the two models
- Models were fit to fitness trajectories using the `nls` function in R. 
- Model fits were compared using Bayesian information criterion (BIC) scores. These were then converted into an odds ratio. 
    - Table S1 shows the BIC scores and odds ratios for fits to subsets of the data: a) all 12 populations and all time points, b) excluding 3 populations with incomplete trajectories and c) excluding 6 populations that evolved hypermutability
    - Table S2 summarizes BIC scores for fits to individual populations. This also indicates if the population was truncated or a hypermutator 
    - Table S4 lists the estimated parameters for the power law fit

On the bigger picture, there is also the talk from 2013 by Wiser on [Youtube](https://www.youtube.com/watch?v=CmyBn5Cezy4) with 127 views as of September 2022. 

In [None]:
### load data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

from scipy.optimize import curve_fit

In [None]:
import os
import os.path
from os import path

## output folder for the plots/lists produced in this notebook;
## the folder is created if it is missing and left alone if it already exists
import os
FIG_DIR = './figures/LTEE_fit/'
os.makedirs(FIG_DIR, exist_ok=True)
print("All  plots will be stored in: \n", FIG_DIR, sep="")

In [None]:
## read the concatenated LTEE table (see the download link at the top of the notebook)
raw_csv_path = './data/Concatenated.LTEE.data.all.csv'
df = pd.read_csv(raw_csv_path)

In [None]:


### execute script to load modules here
### The script runs in this notebook's namespace, so every name it defines becomes a global.
### NOTE(review): presumably this is where FIGHEIGHT_TRIPLET (used in a later plotting cell)
### comes from -- confirm against setup_aesthetics.py.
with open('setup_aesthetics.py') as setup_file:  # close the file handle after reading
    # compiling with the file name makes tracebacks point at the script instead of '<string>'
    exec(compile(setup_file.read(), 'setup_aesthetics.py', 'exec'))

### explanation of column headers and labels

    607 is the wild-type strain 'REL607'
    D.0 is the dilution factor applied to count colonies in initial inoculum. 
    D.1 is the dilution factor applied to count colonies in the saturated population. We expect D.1 = 100*D.0. 
    

In [None]:
## peek at the first two rows to see the column headers explained above
df.head(2)

In [None]:

### split into two subsets, according to the Ara-marker of evolving population
### (a '+' in the population label marks an Ara-positive lineage)
is_Ara_positive = np.array(['+' in v for v in df['Population'].values])

### take explicit copies: the following cells add columns to both subsets, and
### writing into a plain slice of df triggers pandas' SettingWithCopyWarning
df_pos = df[is_Ara_positive].copy()   ### Ara-positive lineages
df_neg = df[~is_Ara_positive].copy()  ### Ara-negative lineages

In [None]:
## treat Ara-positive population
# wild-type is always the red population
assert (df_pos['Red.Pop'] == '606').all(), "every Ara-positive assay should have '606' as its red population"

### re-construct population sizes: colony count times dilution factor.
### assign() returns a new frame, so nothing is written into a slice of df
### (avoids SettingWithCopyWarning); the dict form is needed because the
### column names contain dots.
df_pos = df_pos.assign(**{
    'Nwt.0':  df_pos['Red.0']*df_pos['D.0'],
    'Nmut.0': df_pos['White.0']*df_pos['D.0'],
    'Nwt.1':  df_pos['Red.1']*df_pos['D.1'],
    'Nmut.1': df_pos['White.1']*df_pos['D.1'],
})



In [None]:
## treat Ara-negative population
# wild-type is always the white population
assert (df_neg['White.Pop'] == '607').all(), "every Ara-negative assay should have '607' as its white population"

### re-construct population sizes: colony count times dilution factor.
### assign() returns a new frame, so nothing is written into a slice of df
### (avoids SettingWithCopyWarning); the dict form is needed because the
### column names contain dots.
df_neg = df_neg.assign(**{
    'Nwt.0':  df_neg['White.0']*df_neg['D.0'],
    'Nmut.0': df_neg['Red.0']*df_neg['D.0'],
    'Nwt.1':  df_neg['White.1']*df_neg['D.1'],
    'Nmut.1': df_neg['Red.1']*df_neg['D.1'],
})


In [None]:
### join
### (DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
###  and keeps the original row labels)
df = pd.concat([df_pos, df_neg])

## reconstruct frequencies of the evolved ("mut") strain at the start (.0) and end (.1)
df['xmut.0'] = df['Nmut.0']/(df['Nwt.0'] + df['Nmut.0'])
df['xmut.1'] = df['Nmut.1']/(df['Nwt.1']  + df['Nmut.1'])

## reconstruct log fold-changes
df['logfc_mut'] = np.log(df['Nmut.1']/df['Nmut.0'])
df['logfc_wt']  =  np.log(df['Nwt.1']/df['Nwt.0'])

## reconstruct selection coefficients based on logit
df['logit_percycle_from_freq'] = np.log(df['xmut.1']/(1-df['xmut.1'])) - np.log(df['xmut.0']/(1-df['xmut.0'])) 
df['logit_percycle']  = df['logfc_mut'] - df['logfc_wt']
df['logit_pergen']  = df['logit_percycle']/df['logfc_wt']
df['W'] = df['logit_pergen'] + 1  # algebraically equal to logfc_mut / logfc_wt

## reconstruct selection coefficients based on log
df['log_percycle']  =  np.log(df['xmut.1']) - np.log(df['xmut.0']) 

In [None]:
### sanity check: the reconstructed W should reproduce the 'Fitness' column of the dataset
### (entries that are NaN in both columns count as equal)
np.allclose(a=df['W'], b=df['Fitness'], equal_nan=True)

In [None]:
## manual check: largest deviation between reconstructed and tabulated fitness
df['gap'] = df['W'] - df['Fitness']
## use the absolute value: the maximum of the signed gap would miss large negative deviations
print(df['gap'].abs().max())


### Investigate truncation

In [None]:
## for every population, look up the last generation that has data
list_pop = set(df['Population'])
pop2truncation = {
    label: df.loc[df['Population'] == label, 'Generation'].max()
    for label in list_pop
}

In [None]:
## display the latest recorded generation of each population
pop2truncation

### Prepare the data for plotting

In [None]:


### split into two subsets, according to the Ara-marker of evolving population
### (a '+' in the population label marks an Ara-positive lineage)
### NOTE(review): this repeats the split done earlier in the notebook
is_Ara_positive = np.array(['+' in v for v in df['Population'].values])

### take explicit copies: the following cells add columns to both subsets, and
### writing into a plain slice of df triggers pandas' SettingWithCopyWarning
df_pos = df[is_Ara_positive].copy()   ### Ara-positive lineages
df_neg = df[~is_Ara_positive].copy()  ### Ara-negative lineages

In [None]:
## treat Ara-positive population
# wild-type is always the red population
assert (df_pos['Red.Pop'] == '606').all(), "every Ara-positive assay should have '606' as its red population"

### re-construct population sizes: colony count times dilution factor.
### assign() returns a new frame, so nothing is written into a slice of df
### (avoids SettingWithCopyWarning); the dict form is needed because the
### column names contain dots.
df_pos = df_pos.assign(**{
    'Nwt.0':  df_pos['Red.0']*df_pos['D.0'],
    'Nmut.0': df_pos['White.0']*df_pos['D.0'],
    'Nwt.1':  df_pos['Red.1']*df_pos['D.1'],
    'Nmut.1': df_pos['White.1']*df_pos['D.1'],
})



In [None]:
## treat Ara-negative population
# wild-type is always the white population
assert (df_neg['White.Pop'] == '607').all(), "every Ara-negative assay should have '607' as its white population"

### re-construct population sizes: colony count times dilution factor.
### assign() returns a new frame, so nothing is written into a slice of df
### (avoids SettingWithCopyWarning); the dict form is needed because the
### column names contain dots.
df_neg = df_neg.assign(**{
    'Nwt.0':  df_neg['White.0']*df_neg['D.0'],
    'Nmut.0': df_neg['Red.0']*df_neg['D.0'],
    'Nwt.1':  df_neg['White.1']*df_neg['D.1'],
    'Nmut.1': df_neg['Red.1']*df_neg['D.1'],
})


In [None]:
### join
### (DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
###  and keeps the original row labels)
df = pd.concat([df_pos, df_neg])

## reconstruct frequencies of the evolved ("mut") strain at the start (.0) and end (.1)
df['xmut.0'] = df['Nmut.0']/(df['Nwt.0'] + df['Nmut.0'])
df['xmut.1'] = df['Nmut.1']/(df['Nwt.1']  + df['Nmut.1'])

## reconstruct log fold-changes
df['logfc_mut'] = np.log(df['Nmut.1']/df['Nmut.0'])
df['logfc_wt']  =  np.log(df['Nwt.1']/df['Nwt.0'])

## reconstruct selection coefficients based on logit
df['logit_percycle_from_freq'] = np.log(df['xmut.1']/(1-df['xmut.1'])) - np.log(df['xmut.0']/(1-df['xmut.0'])) 
df['logit_percycle']  = df['logfc_mut'] - df['logfc_wt']
df['logit_pergen']  = df['logit_percycle']/df['logfc_wt']
df['W'] = df['logit_pergen'] + 1  # algebraically equal to logfc_mut / logfc_wt

## reconstruct selection coefficients based on log
df['log_percycle']  =  np.log(df['xmut.1']) - np.log(df['xmut.0']) 

In [None]:
## test: the per-cycle logit change computed from frequencies should agree with the
## one computed from log fold-changes (left: one against the other, right: their difference)
## NOTE(review): FIGHEIGHT_TRIPLET is not defined in this notebook; presumably it is
## provided by setup_aesthetics.py -- confirm.
fig, axes = plt.subplots(1,2, figsize = (2*FIGHEIGHT_TRIPLET, FIGHEIGHT_TRIPLET))
axes[0].scatter(df['logit_percycle_from_freq'], df['logit_percycle'])
axes[0].set_xlabel('logit_percycle_from_freq')
axes[0].set_ylabel('logit_percycle')
axes[1].scatter(df['logit_percycle_from_freq'], df['logit_percycle_from_freq'] -df['logit_percycle'])
axes[1].set_xlabel('logit_percycle_from_freq')
axes[1].set_ylabel('logit_percycle_from_freq - logit_percycle')

In [None]:
### sanity check: the reconstructed W should reproduce the 'Fitness' column of the dataset
### (entries that are NaN in both columns count as equal)
np.allclose(a=df['W'], b=df['Fitness'], equal_nan=True)

In [None]:
## manual check: largest deviation between reconstructed and tabulated fitness
df['gap'] = df['W'] - df['Fitness']
## use the absolute value: the maximum of the signed gap would miss large negative deviations
print(df['gap'].abs().max())


In [None]:
### the consistency checks are done, so the auxiliary columns can go
auxiliary_columns = ['Fitness', 'logit_percycle_from_freq', 'gap']
df = df.drop(columns=auxiliary_columns)

In [None]:
## save
## create the output folder first: to_csv does not create missing directories,
## so the cell would otherwise fail on a fresh checkout
os.makedirs('./output', exist_ok=True)
df.to_csv('./output/LTEE_all_data.csv',index=True)

### Handle the replication

Since we have different numbers of replicates for the different populations, we cannot pool all the replicates into one fit. Instead, we calculate the average for each population at each timepoint. This way, each population contributes exactly one datapoint at each timepoint (except for later timepoints, where some populations are truncated and do not contribute at all).

We will calculate averages of the fitness values, frequencies and LFC values. For the later analysis, we will use the average at the level of fitness values. This ensures that the grand mean (average over all populations) is the same in our procedure as in the original analysis (Wiser et al 2013). 

In [None]:
## metavariables that are removed to simplify the averaging
columns_to_drop = (
    ['Red.Pop', 'White.Pop']        # we do not want to distinguish these cases
    + ['Rep']                       # replicate marker is not needed, replicates are counted manually
    + ['Complete', 'Mutator.Ever']  # these are strings, cannot be averaged
    + ['Red.0', 'White.0', 'D.0', 'Red.1', 'White.1', 'D.1']  # superfluous information for our analysis
)

## drop the columns and index the rows by (Generation, Population)
df_simple = df.drop(columns=columns_to_drop).set_index(['Generation', 'Population'])
## show shape
df_simple.shape

In [None]:
## average over replicates: one row per (Generation, Population) pair
index = df_simple.index.drop_duplicates() # only one entry per population per timepoint

## A single groupby replaces the row-by-row fill of an empty frame. That loop left
## every column with dtype object and did one .loc lookup per pair; the groupby
## result has numeric columns.
## sort=False keeps the pairs in order of first appearance; reindex pins that order.
replicates = df_simple.groupby(level=['Generation', 'Population'], sort=False)
df_avg = replicates.mean().reindex(index)          # mean() skips NaN entries
df_avg['#Rep'] = replicates.size().reindex(index)  # number of replicate rows per pair

In [None]:
## largest number of replicates found for any population/timepoint pair
df_avg['#Rep'].max()

In [None]:
## show degree of replication: histogram of the replicate counts on a log-scaled count axis
## NOTE(review): np.arange excludes the stop value, so the last bin edge is 11.5
replicate_bins = np.arange(-0.5, 12.5, step=1)
ax = df_avg['#Rep'].hist(bins = replicate_bins, log = True)
ax.set(
    xlabel = 'no. replicates',
    ylabel = 'no. evolved lineages\n(specific timepoint & replicate)',
)

In [None]:
## save
## create the output folder first: to_csv does not create missing directories,
## so the cell would otherwise fail on a fresh checkout
os.makedirs('./output', exist_ok=True)
df_avg.to_csv('./output/LTEE_averaged_data.csv',index=True)

In [None]:
### alternative script for averaging

## columns that are averaged over the replicates; every other column of df_averaged
## keeps the value of the first replicate after sorting (it is NOT averaged)
averaged_columns = ['xmut.0', 'xmut.1', 'logit_percycle', 'logit_pergen', 'log_percycle']

### work on a deep copy of df (no populations are actually removed here)
df_subset = df.copy(deep = True)

### we create a new dataframe where each timepoint and population is only represented once
df_subset = df_subset.sort_values(by = ['Population', 'Generation', 'Rep']) # first sort for nice look
df_averaged = df_subset.drop_duplicates([ 'Population', 'Generation']).copy(deep=True)
df_averaged = df_averaged.drop(['Rep'], axis = 1)
df_averaged= df_averaged.reset_index()  # the previous row labels are kept as a column

## we average across the replicates in one groupby pass; the earlier version rebuilt
## two Python-level masks over all rows of df for every row of df_averaged
replicate_groups = df.groupby(['Population', 'Generation'])
replicate_counts = replicate_groups.size()                   # as a collateral statistic, we count the number of replicates
replicate_means = replicate_groups[averaged_columns].mean()  # mean() skips NaN entries

## align the per-group statistics with the rows of df_averaged
group_keys = pd.MultiIndex.from_frame(df_averaged[['Population', 'Generation']])
df_averaged['no_replicates'] = replicate_counts.reindex(group_keys).to_numpy()
df_averaged[averaged_columns] = replicate_means.reindex(group_keys).to_numpy()


## shift data points for alternative statistics, based at fitness = 1
df_averaged['logit_percycle+1'] = df_averaged['logit_percycle'] +1
df_averaged['log_percycle+1'] = df_averaged['log_percycle'] +1
df_averaged['logit_pergen+1'] = df_averaged['logit_pergen'] +1

