### Long-term fitness trajectories from the LTEE

The data are available in the Supplemental Material of Good et al., Nature 2017. 
They can also be downloaded from Ben Good's GitHub repository [here](https://github.com/benjaminhgood/LTEE-metagenomic/blob/master/additional_data/Concatenated.LTEE.data.all.csv).

In [None]:
### load data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
import os
import os.path
from os import path

## Make sure the export directory exists before any figure is written.
## Every plot/list produced by this notebook is stored below FIG_DIR.
import os
FIG_DIR = './figures/LTEE_trajectories/'
os.makedirs(name=FIG_DIR, exist_ok=True)
print("All  plots will be stored in: \n" + FIG_DIR)

In [None]:
## Read the averaged LTEE table into `df`.
## The un-averaged table can be loaded instead by enabling the commented-out line.
#df = pd.read_csv('./output/LTEE_all_data.csv')
df = pd.read_csv(filepath_or_buffer='./output/LTEE_averaged_data.csv')

In [None]:


### Execute the shared aesthetics script inside this notebook's namespace.
### NOTE(review): names used further down (FIGHEIGHT_TRIPLET, DPI, PAD_INCHES, sns)
### are presumably defined by this script -- confirm.
### The script is read inside a `with` block so the file handle is closed again
### instead of being left open for the lifetime of the kernel.
with open('setup_aesthetics.py') as setup_script:
    exec(setup_script.read())

### explanation of column headers and labels

    607 is the wild-type strain 'REL607'
    D.0 is the dilution factor applied to count colonies in initial inoculum. 
    D.1 is the dilution factor applied to count colonies in the saturated population. We expect D.1 = 100*D.0. 
    

In [None]:
## One fixed colour per fitness statistic, used by the trajectory panels below.
color_logit_percycle = 'tab:grey'  # logit statistic, per cycle
color_logit_pergen = 'firebrick'   # logit statistic, per generation
color_log_percycle = 'navy'        # log statistic, per cycle


In [None]:
## Plot the fitness trajectory of every population.
## Three stacked panels share the generation axis, one panel per fitness statistic:
##   top: logit_percycle, middle: logit_pergen, bottom: log_percycle

fig, axes = plt.subplots(3, 1, sharex=True, figsize=(2*FIGHEIGHT_TRIPLET, 2*FIGHEIGHT_TRIPLET))

ax, ax2, ax3 = axes

## groupby visits the populations in sorted (reproducible) order
for pop, df_subset in df.groupby('Population'):
    ## sort by timepoint so that each line is drawn in chronological order
    df_subset = df_subset.sort_values('Generation', ascending=True)
    x = df_subset['Generation']

    ax.plot(x, df_subset['logit_percycle'], color=color_logit_percycle, label=pop)
    ax2.plot(x, df_subset['logit_pergen'], color=color_logit_pergen)
    ax3.plot(x, df_subset['log_percycle'], color=color_log_percycle)

## set labels
ax.set_ylabel('evolved fitness' + r'  $s^{\mathrm{logit}}_{\mathrm{cycle}}$')
ax2.set_ylabel('evolved fitness' + r'  $s^{\mathrm{logit}}_{\mathrm{gen}}$')
ax3.set_ylabel('evolved fitness' + r'  $s^{\mathrm{log}}_{\mathrm{cycle}}$')
ax3.set_xlabel('time [#generations in evolution experiment]')

## The x-range is taken from the full data set, not from whichever population
## happened to be processed last in the loop above.
generation_max = df['Generation'].max()
for panel in axes:  # separate loop variable so that `ax` keeps pointing at the top panel
    panel.set_xlim(0, generation_max)
    panel.set_ylim(0)  # only the lower limit is fixed; the upper limit is left as is

## savefig expects the lowercase keyword `dpi`
fig.savefig(FIG_DIR + "timeseries_fitness_alternative_statistics.pdf", dpi=DPI, bbox_inches='tight', pad_inches=PAD_INCHES)


In [None]:
## All generations that occur in the data, in ascending order
list_generations = sorted(set(df['Generation'].values))

## Empty table with one row per generation; its columns are added by later cells
df_stats = pd.DataFrame(index=list_generations)
#df_stats['rho_Spearman'] = -1 # dummy default value

In [None]:
## For every generation, compare logit_percycle against the two other statistics
## and store the results as one row of df_stats.

## (column compared with logit_percycle, Spearman column, Pearson column)
comparisons = (
    ('logit_pergen', 'rho_percycle-pergen', 'r_percycle-pergen'),
    ('log_percycle', 'rho_logit-log', 'r_logit-log'),
)

for t in df_stats.index:

    snapshot = df[df['Generation'] == t]  # all rows recorded at this generation
    reference = snapshot['logit_percycle'].values

    ## rank (Spearman) and linear (Pearson) correlation with logit_percycle
    for other_column, rho_column, r_column in comparisons:
        other = snapshot[other_column].values
        spearman_rho, _ = stats.spearmanr(reference, other)
        df_stats.at[t, rho_column] = spearman_rho
        pearson_r, _ = stats.pearsonr(reference, other)
        df_stats.at[t, r_column] = pearson_r

    ## largest change in rank when going from logit_percycle to logit_pergen
    rank_shift = snapshot['logit_pergen'].rank() - snapshot['logit_percycle'].rank()
    df_stats.at[t, 'deltarank_max'] = rank_shift.max()
    df_stats.at[t, 'deltarank_abs_max'] = np.abs(rank_shift).max()


### Focus on the comparison at individual time points

In [None]:
## Generations that are marked in the time series and plotted as snapshots
list_timepoints = [4_000, 15_000, 30_000]

In [None]:

## Time series of the Spearman rank correlation between logit_pergen and logit_percycle

fig, ax = plt.subplots(1, 1, figsize=(2*FIGHEIGHT_TRIPLET, 2/3*FIGHEIGHT_TRIPLET))

x = df_stats.index
y = df_stats['rho_percycle-pergen']
ax.plot(x, y, color='silver', lw=3)

## mark the timepoints listed in list_timepoints
for t in list_timepoints:
    ax.axvline(t, ls='dotted', color='black')
ax.set_ylabel(r'$s^{\mathrm{logit}}_{\mathrm{gen}}$ and $s^{\mathrm{logit}}_{\mathrm{cycle}}$:'+'\n'
             + r"rank correlation $\rho$")

ax.set_xlabel('time [# generations in evolution experiment]')
ax.set_xlim(0, x.max())

## savefig expects the lowercase keyword `dpi`
fig.savefig(FIG_DIR + "timeseries_correlation.pdf", dpi=DPI, bbox_inches='tight', pad_inches=PAD_INCHES)


In [None]:

## Time series of the largest absolute rank difference between logit_pergen and logit_percycle

fig, ax = plt.subplots(1, 1, figsize=(2*FIGHEIGHT_TRIPLET, 2/3*FIGHEIGHT_TRIPLET))

x = df_stats.index
y = df_stats['deltarank_abs_max']
ax.plot(x, y, color='silver', lw=3)

## mark the timepoints listed in list_timepoints
for t in list_timepoints:
    ax.axvline(t, ls='dotted', color='black')
## raw string: the label contains LaTeX backslashes (`\m` is not a valid escape sequence)
ax.set_ylabel('max. rank difference\n' + r'betw. $s^{\mathrm{logit}}_{\mathrm{gen}}$ and $s^{\mathrm{logit}}_{\mathrm{cycle}}$')

ax.set_xlabel('time [# generations in evolution experiment]')
ax.set_xlim(0, x.max())
ax.set_ylim(0)

## savefig expects the lowercase keyword `dpi`
fig.savefig(FIG_DIR + "timeseries_maximum_mismatch.pdf", dpi=DPI, bbox_inches='tight', pad_inches=PAD_INCHES)


In [None]:
    
### Colormap that turns a generation number into a colour of the 'crest' palette.
### NOTE(review): `sns` is not imported in the visible cells -- presumably it is
### provided by setup_aesthetics.py; confirm.
cmap = sns.color_palette('crest', as_cmap=True)
## normalisation spans the full range of generations in the data
norm = plt.Normalize(vmin=df['Generation'].min(), vmax=df['Generation'].max())
sm = plt.cm.ScalarMappable(norm=norm, cmap=cmap)

In [None]:
### Snapshot plots at fixed timepoints. For each timepoint two figures are saved:
###   1. logit_percycle against logit_pergen (scatter, titled with the rank correlation)
###   2. logfc_wt against logfc_mut, with a shaded cone for the most mis-ranked point


for t in list_timepoints:
    df_to_calc = df[df['Generation'] == t]  # reduce data to current timepoint
    color_t = cmap(norm(t))  # colour of this timepoint on the generation colormap

    ## sort by misranking: difference in rank between the two logit statistics
    df_sorted = df_to_calc.copy(deep=True)
    df_sorted['deltarank_t'] = df_sorted['logit_pergen'].rank() - df_sorted['logit_percycle'].rank()
    df_sorted['deltarank_t_abs'] = np.abs(df_sorted['deltarank_t'])
    df_sorted = df_sorted.sort_values('deltarank_t_abs', ascending=True)
    ## last row after the ascending sort = largest absolute rank difference
    select = df_sorted.index[[-1]]

    ### plot selection coefficients
    fig, ax = plt.subplots(figsize=(0.3*FIGHEIGHT_TRIPLET, 0.3*FIGHEIGHT_TRIPLET))

    x = df_to_calc['logit_percycle']
    y = df_to_calc['logit_pergen']

    ax.scatter(x, y, color=color_t)  #color_logit_pergen)

    ## look up the correlation coefficients stored in df_stats for this timepoint
    rho = df_stats.at[t, 'rho_percycle-pergen']
    r = df_stats.at[t, 'r_percycle-pergen']

    #title = fr'Pearson $r={r:.2f}$, Spearman $\rho={rho:.2f}$'
    title = fr'$\rho={rho:.2f}$'  # short title
    ax.set_title(title, loc='center')
    ax.tick_params(labelleft=False, labelbottom=False, left=False, bottom=False)

    ax.set_xlabel(r'$s^{\mathrm{logit}}_{\mathrm{cycle}}$')
    ax.set_ylabel(r'$s^{\mathrm{logit}}_{\mathrm{gen}}$')

    ## savefig expects the lowercase keyword `dpi`
    fig.savefig(FIG_DIR + f"snapshot_percycle-vs-pergen_at_t={t}.pdf", dpi=DPI, bbox_inches='tight', pad_inches=PAD_INCHES)

    ## plot log fold-changes (LFCs)
    fig, ax = plt.subplots(figsize=(0.8*FIGHEIGHT_TRIPLET, 0.8*FIGHEIGHT_TRIPLET))

    x = df_to_calc['logfc_wt']
    y = df_to_calc['logfc_mut']

    ax.scatter(x, y, color=color_t, zorder=2)

    ## make plot square
    #ax.set_xlim(lfc_xmin,lfc_xmax)
    #ax.set_ylim(lfc_ymin,lfc_ymax)

    ### find axis limits chosen by the scatter plot; they are restored after the cone is drawn
    xmin, xmax = ax.get_xlim()
    fcwt_vec = np.linspace(xmin, xmax, num=100)

    ## plot red cone for a select point
    #select = range(df_to_calc.shape[0])
    for i in select:
    #for i in df_sorted.index:
        A, B = float(df_sorted.loc[i, 'logfc_wt']), float(df_sorted.loc[i, 'logfc_mut'])
        ## highlight the selected point with a large translucent marker behind the data
        ax.scatter(A, B, s=200, color='tab:blue', zorder=-1, alpha=0.25)

        x_fill = np.linspace(fcwt_vec[0], fcwt_vec[-1])
        ## line through the origin and the selected point (slope B/A)
        y_fill = B/A*x_fill

        ## shade the region between that line and the slope-1 line through (A, B)
        ax.fill_between(x_fill, (x_fill - A) + B, y_fill, color='tab:red', alpha=0.25)

    ax.set_xlim(xmin, xmax)
    ax.set_ylabel(r'mutant log fold-change $\mathrm{LFC}_{\mathrm{mut}}$')
    ax.set_xlabel(r'wild-type log fold-change $\mathrm{LFC}_{\mathrm{wt}}$')
    #ax.tick_params(labelleft=False, labelbottom = False, left =False, bottom=False)

    ## savefig expects the lowercase keyword `dpi`
    fig.savefig(FIG_DIR + f"snapshot_logfc-wt-vs-logfc-mut_at_t={t}.pdf", dpi=DPI, bbox_inches='tight', pad_inches=PAD_INCHES)
    
    