In [None]:
import pandas as pd           

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches

from scipy import integrate
from scipy import stats
import random

from scipy.stats import spearmanr, pearsonr
from latex_format import float2latex


In [None]:
### Update dependent parameters according to input
import os
import os.path
from os import path

## create export directory if necessary
## foldernames for output plots/lists produced in this notebook
import os
# Root folder for all figures written by this notebook; created if it does not exist yet.
FIG_DIR_STEM = './figures/trait_correlations/'
os.makedirs(FIG_DIR_STEM, exist_ok=True)


In [None]:


### execute the shared setup script inside this notebook's namespace
# NOTE(review): later cells use names that are never defined in the visible cells
# (sns, FIGHEIGHT_TRIPLET, DPI, PAD_INCHES) - presumably this script provides them; confirm.
# The context manager closes the file handle as soon as the script text has been read.
with open('setup_aesthetics.py') as setup_script:
    exec(setup_script.read())

In [None]:
# Colour name associated with this dataset.
# NOTE(review): not referenced by any other cell visible here - confirm it is still needed.
DATASET_COLOR = 'darkorange'



In [None]:
## set which trait distribution to plot
## DIST is used as a key into `dist2data` (built further down) and as the name of the
## figure sub-folder. Keys defined there: 'all_traits_vary', 'no_yield_variation',
## 'no_deleterious_yield', 'no_gmax_variation', 'no_lag_variation',
## 'only_gmax_varies', 'only_lag_varies', 'only_yield_varies'.

DIST = 'all_traits_vary'

See cell [below](#create-subsets-of-data-only-with-marginals) for a choice of trait distributions.

In [None]:
# Figures for the chosen DIST are written to their own sub-folder of FIG_DIR_STEM.
SUFFIX_DATASET = '{}/'.format(DIST)

FIG_DIR = f'{FIG_DIR_STEM}{SUFFIX_DATASET}'
os.makedirs(FIG_DIR, exist_ok=True)


### Load wild-type traits

In [None]:
# Positions of the CSV columns that form the row index when the trait table is read.
INDEX_COL = list(range(5))
# Cell contents that are parsed as missing values when the trait table is read.
list_na_representations = ['not_present', 'failed_to_compute']

In [None]:
# Trait table with one row per measured strain; the first INDEX_COL columns become the
# row index and the strings in list_na_representations are read as NaN.
PCWS_TRAITS_WARRINGER = './output/df_M3_traits.csv'
df_warringer = pd.read_csv(
    PCWS_TRAITS_WARRINGER,
    header=0,
    index_col=INDEX_COL,
    float_precision=None,
    na_values=list_na_representations,
)


In [None]:
### define default wild-type
# Rows flagged as wild-type in the trait table.
df_wildtypes = df_warringer[df_warringer['is_wildtype']==True]

# Reference wild-type: column-wise median over all wild-type rows.
# numeric_only=True restricts the median to numeric columns; without it pandas >= 2.0
# raises on non-numeric columns, whereas older versions silently dropped them.
WILDTYPE = df_wildtypes.median(axis = 0, numeric_only = True)

### Load mutant data (averaged)

In [None]:

# Averaged trait table; read with the default integer row index (no index_col).
PCWS_TRAITS_WARRINGER_AVERAGED = './output/df_M3_traits_averaged.csv'
df_averaged = pd.read_csv(
    PCWS_TRAITS_WARRINGER_AVERAGED,
    header=0,
    float_precision=None,
)

In [None]:
### assign wild-type label
def is_wildtype(row):
    """Return True if the row's 'genotype' entry equals the wild-type name 'BY4741'.

    `row` only needs to support `row['genotype']` (e.g. a pandas Series or a dict).
    """
    return bool(row['genotype'] == 'BY4741')
    

# Quick sanity check: apply the helper to the first row of the averaged table;
# the result is shown as the cell output.
row = df_averaged.iloc[0]
is_wildtype(row)

In [None]:
# Flag wild-type rows with a vectorised comparison (same rule as the `is_wildtype` helper).
# Unlike a row-wise apply of that helper, this cell can be re-run safely later on:
# the name `is_wildtype` is rebound to a boolean mask further down in the notebook.
df_averaged['is_wildtype'] = df_averaged['genotype'] == 'BY4741'

In [None]:
### append mutant values (averaged) to set of individual wild-type strains
# Knockouts are all rows of the averaged table that are not flagged as wild-type.
df_knockouts = df_averaged[~df_averaged['is_wildtype']]
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows
# (row labels are kept, columns are aligned by name, column order is not sorted).
# reset_index() turns the index levels of each table into ordinary columns so that
# the two tables can be aligned on them.
df_all_vary = pd.concat([df_wildtypes.reset_index(), df_knockouts.reset_index()])

In [None]:
### restore the index levels of the wild-type trait table on the combined table
# NOTE: not idempotent - after this cell has run the index columns no longer exist as
# ordinary columns, so running it a second time raises a KeyError.
index_col_names = df_warringer.index.names
df_all_vary = df_all_vary.set_index(keys=index_col_names)


In [None]:
## Count number of wild-types and knockouts
print(f"no. of knockouts: {len(df_knockouts)}")
print(f"no. of wildtypes: {len(df_wildtypes)}")

### Set units of time

In [None]:
# Convert to hours: growth rate is multiplied by 60, lag time is divided by 60.
# NOTE: not idempotent - running this cell twice applies the conversion twice.
df_all_vary['gmax'] = 60 * df_all_vary['gmax']  # growth rate per hour
df_all_vary['lag'] = df_all_vary['lag'] / 60    # lag time in hours



In [None]:
# Same conversion for the median wild-type: growth rate times 60, lag time divided by 60.
# NOTE: not idempotent - running this cell twice applies the conversion twice.
WILDTYPE['gmax'] = 60 * WILDTYPE['gmax']  # growth rate per hour
WILDTYPE['lag'] = WILDTYPE['lag'] / 60    # lag time in hours

### create subsets of data only with marginals

In [None]:
# Maps a distribution name (see DIST) to an independent deep copy of df_all_vary in
# which selected traits are overwritten with the median wild-type value.
dist2data = {}

## full dataset: all traits vary
dist2data['all_traits_vary'] = df_all_vary.copy(deep=True)

## distribution with no yield variation
tmp = df_all_vary.copy(deep=True)
tmp['yield'] = WILDTYPE['yield']
dist2data['no_yield_variation'] = tmp

## distribution with some yield variation, but only equal or larger than wild-type:
## every yield that is not strictly above the wild-type value is replaced by it
## (NaN compares False and is therefore replaced as well)
wt_yield = WILDTYPE['yield']
tmp = df_all_vary.copy(deep=True)
tmp['yield'] = [v if v > wt_yield else wt_yield for v in df_all_vary['yield']]
dist2data['no_deleterious_yield'] = tmp

## remaining distributions: name -> traits held at the median wild-type value
##   no_*_variation : one trait fixed, the other two vary
##   only_*_varies  : marginal distribution, the other two traits fixed
traits_held_at_wildtype = {
    'no_gmax_variation': ['gmax'],
    'no_lag_variation': ['lag'],
    'only_gmax_varies': ['lag', 'yield'],
    'only_lag_varies': ['yield', 'gmax'],
    'only_yield_varies': ['lag', 'gmax'],
}
for dist_name, fixed_traits in traits_held_at_wildtype.items():
    tmp = df_all_vary.copy(deep=True)
    for trait in fixed_traits:
        tmp[trait] = WILDTYPE[trait]
    dist2data[dist_name] = tmp


### Choose subset

In [None]:
# Select the frame for the chosen distribution.
# This is the very object stored in dist2data (no copy is taken), so the 'label'
# column added to df_input in a later cell also shows up in dist2data[DIST].
df_input = dist2data[DIST]

### plot trait distribution

In [None]:
# Row counts for the selected distribution.
n_datapoints = len(df_input)
# Boolean row mask of wild-type strains.
# NOTE: this rebinds `is_wildtype`, which earlier in the notebook names a helper function.
is_wildtype = df_input['is_wildtype']==True
n_knockouts = len(df_input[~df_input['is_wildtype']])

In [None]:
### Spearman rank correlation between growth rate and lag time, per strain group
# Knockouts first, wild-types second, so the variables left behind (x, y, rho, p, label)
# refer to the wild-type group afterwards.
for group_name, group_mask in (('knockouts', ~is_wildtype), ('wild-type', is_wildtype)):
    x = df_input.loc[group_mask, 'gmax'].values
    y = df_input.loc[group_mask, 'lag'].values
    rho, p = spearmanr(x, y)
    label = group_name + fr' ($\rho={rho:.2f}$, $p = {float2latex(p)}$)'
    print(label)




In [None]:
### Pearson correlation between growth rate and lag time, per strain group
# Knockouts first, wild-types second, so the variables left behind (x, y, r, p, label)
# refer to the wild-type group afterwards.
for group_name, group_mask in (('knockouts', ~is_wildtype), ('wild-type', is_wildtype)):
    x = df_input.loc[group_mask, 'gmax'].values
    y = df_input.loc[group_mask, 'lag'].values
    r, p = pearsonr(x, y)
    label = group_name + fr' ($r={r:.2f}$, $p = {float2latex(p)}$)'
    print(label)




In [None]:
def row2label(row):
    """Return the plot label for a row: 'wild-type' if row['is_wildtype'] equals True, else 'knockout'."""
    return 'wild-type' if row['is_wildtype'] == True else 'knockout'

# Add the plot label as a new column (used as `hue` and for sorting in the plots below).
# Note: df_input is the same object as dist2data[DIST], so that frame gains the column too.
df_input['label']  = df_input.apply(row2label,axis=1)

In [None]:
# Colour per plot label; 'wild-type median' is used for the median marker and guide lines.
palette = {
    'wild-type': 'orange',
    'knockout': 'dimgrey',
    'wild-type median': 'navy',
}

In [None]:


### joint scatter plot of growth rate vs. lag time with marginal densities

ratio = 5  # size ratio of joint axes to marginal axes, passed on to seaborn
# Rows are sorted by label so that 'knockout' rows come before 'wild-type' rows
# (presumably so that the wild-type points are drawn on top - confirm).
grid = sns.jointplot(data=df_input.sort_values('label'), x="gmax", y="lag",
                     hue = 'label', palette = palette,
                     marginal_kws = {'multiple':'layer', 'fill':False},
                     marginal_ticks= False, space = 0, ratio = ratio,
                     height = (1+1/ratio)*FIGHEIGHT_TRIPLET, rasterized=True)

### replot the marginal distributions as filled densities
palette['knockout'] = 'dimgrey' ### fix the knockout colour used for the marginals
ax = grid.ax_marg_x
sns.kdeplot(data = df_input, x='gmax', ax=ax, fill = True, hue = 'label', palette=palette, multiple = 'layer',
       legend = False)

ax = grid.ax_marg_y
sns.kdeplot(data = df_input, y='lag', ax=ax, fill = True, hue = 'label', palette=palette, multiple = 'layer',
       legend = False)


ax = grid.ax_joint
## plot median wild-type as a cross, with guide lines in both marginal panels
x = WILDTYPE['gmax']
y = WILDTYPE['lag']
ax.scatter(x,y, color = palette['wild-type median'], alpha = 1, rasterized = True, marker ='x')
grid.ax_marg_y.axhline(y, color = palette['wild-type median'])
grid.ax_marg_x.axvline(x, color = palette['wild-type median'])


## set label
ax = grid.ax_joint
ax.set_ylabel('lag time [hours]')
ax.set_xlabel('growth rate [per hour]')
## set legend
#ax.legend([],[])
#ax.legend(loc = 'upper left', bbox_to_anchor = (-1.05,0.99), frameon=False) # outside
ax.legend(loc = 'upper left', bbox_to_anchor = (-0.05,1.0), frameon=False) #inside

## fix the ticks
ax = grid.ax_marg_x
ax.tick_params(bottom = False)
ax = grid.ax_marg_y
ax.tick_params(left = False)

## set title (written as an annotation inside the top marginal panel)
title = f"n = {n_knockouts} knockouts"
ax = grid.ax_marg_x
ax.annotate(title, (0.02,0.05), xycoords ='axes fraction')
# the median annotation is skipped when the plotted trait is held at the wild-type value
if DIST not in ['no_gmax_variation']:
    ax.annotate('median\nwild-type', (0.75,0.6), xycoords ='axes fraction',color = palette['wild-type median'])

ax = grid.ax_marg_y
if DIST not in ['no_lag_variation']:
    ax.annotate('median\nwild-type', (0.5,0.02), xycoords ='axes fraction',color = palette['wild-type median'],
           rotation = 270)

# savefig's resolution keyword is lower-case `dpi`; the previous upper-case `DPI=` is not
# a savefig parameter, so the requested resolution was never applied.
grid.fig.savefig(FIG_DIR + "scatterplot_gmax-vs-lag.pdf", dpi = DPI, bbox_inches = 'tight', pad_inches = PAD_INCHES)

In [None]:
### Spearman rank correlation between growth rate and yield, per strain group
# Knockouts first, wild-types second, so the variables left behind (x, y, rho, p, label)
# refer to the wild-type group afterwards.
# The p-value is printed with 12 digits in scientific notation. Both labels now close the
# `$...$` math segment around the p-value (the wild-type label was missing the final `$`).
for group_name, group_mask in (('knockouts', ~is_wildtype), ('wild-type', is_wildtype)):
    x = df_input.loc[group_mask, 'gmax'].values
    y = df_input.loc[group_mask, 'yield'].values
    rho, p = spearmanr(x, y)
    label = group_name + fr' ($\rho={rho:.2f}$, $p={p:.12e}$)'
    print(label)




In [None]:
### Pearson correlation between growth rate and yield, per strain group
# Knockouts first, wild-types second, so the variables left behind (x, y, r, p, label)
# refer to the wild-type group afterwards.
for group_name, group_mask in (('knockouts', ~is_wildtype), ('wild-type', is_wildtype)):
    x = df_input.loc[group_mask, 'gmax'].values
    y = df_input.loc[group_mask, 'yield'].values
    r, p = pearsonr(x, y)
    label = group_name + fr' ($r={r:.2f}$, $p = {float2latex(p)}$)'
    print(label)




In [None]:

### joint scatter plot of growth rate vs. biomass yield with marginal densities

ratio = 5  # size ratio of joint axes to marginal axes, passed on to seaborn
# Rows are sorted by label so that 'knockout' rows come before 'wild-type' rows
# (presumably so that the wild-type points are drawn on top - confirm).
grid = sns.jointplot(data=df_input.sort_values('label'), x="gmax", y="yield",
                     hue = 'label', palette = palette,
                     marginal_kws = {'multiple':'layer', 'fill':False},
                     marginal_ticks= False, space = 0, ratio = ratio,
                     height = (1+1/ratio)*FIGHEIGHT_TRIPLET, rasterized=True)

### replot the marginal distributions as filled densities
palette['knockout'] = 'dimgrey' ### fix the knockout colour used for the marginals
ax = grid.ax_marg_x
sns.kdeplot(data = df_input, x='gmax', ax=ax, fill = True, hue = 'label', palette=palette, multiple = 'layer',
       legend = False)

ax = grid.ax_marg_y
sns.kdeplot(data = df_input, y='yield', ax=ax, fill = True, hue = 'label', palette=palette, multiple = 'layer',
       legend = False)

ax = grid.ax_joint
## plot median wild-type as a cross, with guide lines in both marginal panels
x = WILDTYPE['gmax']
y = WILDTYPE['yield']
ax.scatter(x,y, color = palette['wild-type median'], alpha = 1, rasterized = True, marker ='x')
grid.ax_marg_y.axhline(y, color = palette['wild-type median'])
grid.ax_marg_x.axvline(x, color = palette['wild-type median'])


## set label
ax = grid.ax_joint
ax.set_ylabel('biomass yield [OD/mM glucose]')
ax.set_xlabel('growth rate [per hour]')
## set legend
ax.legend([],[])  # replaced right away by the legend call below
#ax.legend(loc = 'upper left', bbox_to_anchor = (-1.05,0.99), frameon=False) # outside
ax.legend(loc = 'upper left', bbox_to_anchor = (-0.05,1.0), frameon=False) #inside

## fix the ticks
ax = grid.ax_marg_x
ax.tick_params(bottom = False)
ax = grid.ax_marg_y
ax.tick_params(left = False)

## set title (written as an annotation inside the top marginal panel)
title = f"n = {n_knockouts} knockouts"
ax = grid.ax_marg_x
ax.annotate(title, (0.01,0.05), xycoords ='axes fraction')
# the median annotation is skipped when the plotted trait is held at the wild-type value
if DIST not in ['no_gmax_variation']:
    ax.annotate('median\nwild-type', (0.75,0.6), xycoords ='axes fraction',color = palette['wild-type median'])

ax = grid.ax_marg_y
if DIST not in ['no_yield_variation']:
    ax.annotate('median\nwild-type', (0.5,0.37), xycoords ='axes fraction',color = palette['wild-type median'],
           rotation = 270)

# savefig's resolution keyword is lower-case `dpi`; the previous upper-case `DPI=` is not
# a savefig parameter, so the requested resolution was never applied.
grid.fig.savefig(FIG_DIR + "scatterplot_gmax-vs-yield.pdf", dpi = DPI, bbox_inches = 'tight', pad_inches = PAD_INCHES)

### add statistical tests for comparing trait variation

We want to make the statement that 

> the variation across knockouts in the trait X is even greater than the variation in the wild-types

The classic test to perform for this analysis is the F-test, and in our case, a right-tailed version of the F-test. 
However, the F-test is known to give misleading results when the distributions do not satisfy the assumption of normality: https://en.wikipedia.org/wiki/F-test_of_equality_of_variances

Instead, one of two alternatives is usually proposed: 

- Levene's test, based on the deviation from the mean: https://en.wikipedia.org/wiki/Levene%27s_test
- Brown-Forsythe test, based on the deviation from the median: https://en.wikipedia.org/wiki/Brown%E2%80%93Forsythe_test#cite_note-Good2005-2

A discussion on Stack Overflow leans towards the Levene test: https://stackoverflow.com/questions/21494141/how-do-i-do-a-f-test-in-python

In [None]:
### comparing the trait variance of knockouts and wild-types (see the loop below)

In [None]:
import scipy.stats

In [None]:
### Compare the trait variation of knockouts with that of the wild-types, trait by trait.
# For every trait the cell prints both sample standard deviations, the p-value of a
# right-tailed F-test (alternative: knockout variance > wild-type variance), and the
# results of scipy's Levene and Bartlett tests.

# boolean mask of wild-type rows; it is the same for every trait, so build it once
is_wildtype = df_input['label'] == 'wild-type'

for var in ['gmax', 'lag', 'yield']:
    print("=============================")
    print("Testing variation for " + var )

    # Trait values per group. Missing values are dropped up front: .std() skips them
    # anyway, but len() would count them in the degrees of freedom, and the
    # Levene/Bartlett tests would return NaN.
    traits_mut = df_input.loc[~is_wildtype, var].dropna()
    traits_wt = df_input.loc[is_wildtype, var].dropna()
    # sample standard deviations
    sd_mut = traits_mut.std()
    sd_wt = traits_wt.std()
    print(f"mutant: {sd_mut:.6f}")
    print(f"wildtype: {sd_wt:.6f}")

    ## compute F-statistic (ratio of the two sample variances)
    F = sd_mut**2/sd_wt**2
    # degrees of freedom
    dof_mut = len(traits_mut) - 1
    dof_wt = len(traits_wt) - 1
    # right-tailed p-value: survival function of the F distribution at the observed ratio
    p_value = scipy.stats.f.sf(F, dof_mut, dof_wt)
    print(f"Classic F-test: {p_value:.3e}")

    # center='median' is scipy's default and is spelled out here because it is the
    # median-based (Brown-Forsythe) variant of Levene's test, not the mean-based one.
    res = scipy.stats.levene(traits_mut, traits_wt, center='median')
    print(res)

    # Bartlett's test is shown for comparison; like the F-test it is sensitive to
    # departures from normality.
    res = scipy.stats.bartlett(traits_mut, traits_wt)
    print(res)