# Compare measured traits between replicates of the same growth curve

### Parameters of this notebook 

In [None]:
import pandas as pd           

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches

from scipy import integrate
from scipy import stats
import random


In [None]:
### Update dependent parameters according to input
import os
import os.path
from os import path

## output locations used by this notebook:
## FIG_DIR for plots, OUTPUT_DIR for newly written data tables
import os

FIG_DIR = './figures/warringer2003/'
OUTPUT_DIR = './output/'

## create both folders up front (no error if they already exist)
os.makedirs(FIG_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("All  plots will be stored in: \n" + FIG_DIR)
print("All  newly created datafiles will be stored in: \n" + OUTPUT_DIR)

In [None]:


### execute the shared setup script inside this notebook's namespace
### NOTE(review): the plotting cell further down uses names that are not defined
### in this notebook (`sns`, `FIGHEIGHT_TRIPLET`, `DPI`, `PAD_INCHES`) --
### presumably they are provided by this script; confirm.
with open('setup_aesthetics.py') as _setup_aesthetics_file:
    # read inside a context manager so the file handle is always closed
    exec(_setup_aesthetics_file.read())

In [None]:
# colour name reserved for this dataset; it is not referenced in the visible
# cells of this notebook (the scatter plots use `color`) -- presumably it is
# consumed by shared plotting code, TODO confirm
DATASET_COLOR = 'darkorange'


In [None]:
## dataset-specific sub-folder inside the general figure directory
## NOTE(review): FIG_DIR already ends in 'warringer2003/', so this resolves to a
## nested '.../warringer2003/warringer2003/' folder -- confirm this is intended.
SUFFIX_DATASET = 'warringer2003/'
FIG_DIR_DATASET = f"{FIG_DIR}{SUFFIX_DATASET}"

os.makedirs(FIG_DIR_DATASET, exist_ok=True)

### Load trait data

In [None]:
## setup background files from PLATEAU FINDER:
## the setup script is executed inside this notebook's namespace
with open('setup_plateau_finder_warringer2003.py') as _setup_plateau_finder_file:
    # read inside a context manager so the file handle is always closed
    exec(_setup_plateau_finder_file.read())


In [None]:
# positions of the CSV columns that together form the row index of the trait
# table (passed as `index_col` when the file is read)
INDEX_COL = [0,1,2,3,4]
# placeholder strings in the trait file that are parsed as missing values
# (passed as `na_values` when the file is read)
list_na_representations = ['not_present', 'failed_to_compute']

In [None]:
## read the per-curve trait table; the columns listed in INDEX_COL become the
## row index and the strings in list_na_representations are read as NaN
PCWS_TRAITS_WARRINGER = './output/df_M3_traits.csv'
df_warringer = pd.read_csv(
    PCWS_TRAITS_WARRINGER,
    header=0,
    index_col=INDEX_COL,
    float_precision=None,
    na_values=list_na_representations,
)

In [None]:
### assign wild-type label
def is_wildtype(name):
    """Tell whether an index label belongs to the wild-type strain.

    `name` is an index label whose first element is the genotype. The result
    is True exactly when that genotype equals 'BY4741', otherwise False.
    """
    return bool(name[0] == 'BY4741')
    

## flag every row (growth curve) whose index label carries the wild-type genotype
df_warringer['is_wildtype'] = list(map(is_wildtype, df_warringer.index))

### Estimate number of replicates for each genotype

In [None]:
## distinct genotypes found in the trait table (set order, i.e. arbitrary)
list_genes = list(set(df_warringer.reset_index()['genotype'].values))

## genotype -> number of rows (replicate growth curves) recorded for it
gene2n = {gene: df_warringer.loc[gene].shape[0] for gene in list_genes}

In [None]:
# distinct replicate counts that occur across genotypes (shown as the cell output)
set(gene2n.values())

In [None]:
## invert gene2n: replicate count -> list of genotypes with that many replicates
n2gene = dict()

for gene, n_replicates in gene2n.items():
    n2gene.setdefault(n_replicates, []).append(gene)

In [None]:
# report, for every replicate count k, how many genotypes (len of the list v)
# were measured that many times
for k,v in n2gene.items():
    print(f"number of genotypes with {k} replicates: {len(v)}")

### Plot for genotypes with two replicates

In [None]:
### order all measured growth curves by their metavariables (run, plate, well);
### this is done on a deep copy so that df_warringer itself stays untouched
df_tmp = df_warringer.copy(deep=True).sort_values(
    by=['run_no', 'plate_no', 'well_no'],
    ascending=True,
)

In [None]:
### rescale the time-based traits to hours
### NOTE: running this cell a second time rescales df_tmp again
df_tmp = df_tmp.assign(
    gmax=df_tmp['gmax'] * 60,  # growth rate per hour
    lag=df_tmp['lag'] / 60,    # lag time in hours
)



In [None]:
### collect the traits of both replicates for every genotype that was measured
### exactly twice: one row per genotype, columns <trait>1 / <trait>2
list_two = n2gene[2]

trait_names = ['gmax', 'lag', 'yield']
columns_two = ['gmax1', 'gmax2', 'lag1','lag2', 'yield1', 'yield2']

rows_two = []
for gene in list_two:
    replicates = df_tmp.loc[gene]
    # explicit check instead of `assert`, which is skipped when Python runs with -O
    if replicates.shape[0] != 2:
        raise ValueError(f'We expect exactly two replicates! (genotype: {gene!r})')
    row1 = replicates.iloc[0] # replicate that was measured first (df_tmp is sorted by run/plate/well)
    row2 = replicates.iloc[1] # replicate that was measured later

    # value order matches columns_two: trait by trait, replicate 1 before replicate 2
    rows_two.append([row[v] for v in trait_names for row in (row1, row2)])

# build the frame in one go with an explicit float dtype; filling an empty frame
# cell by cell leaves every column as object dtype, which the correlation and
# plotting code further down would then have to cope with
df_two = pd.DataFrame(rows_two, index = list_two, columns = columns_two, dtype = float)

In [None]:
### marker colour for the replicate scatter plots below; chosen to match the
### main plots for trait correlation
color = 'dimgrey'


In [None]:
from scipy.stats import pearsonr
from latex_format import float2latex

In [None]:
### compute statistics: Pearson correlation between replicate 1 and replicate 2
### for each trait; prints the coefficient r and its p-value

for var in ['gmax', 'lag', 'yield']:

    ### hand pearsonr explicit float data: df_two may carry object-dtype columns
    ### when it was filled cell by cell
    ### (NaN trait values are not removed here and propagate into r and p)
    x = df_two[var+'1'].astype(float)
    y = df_two[var+'2'].astype(float)
    r, p = pearsonr(x,y)

    print(var)
    print(fr'r={r:.2f} (p = {float2latex(p)})')



In [None]:
## plot: replicate 1 (x) against replicate 2 (y), one panel per trait

fig, axes = plt.subplots(1,3, figsize = (3*FIGHEIGHT_TRIPLET, FIGHEIGHT_TRIPLET))

## (column prefix in df_two, axis label) for the three panels, left to right
panels = [
    ('gmax', 'growth rate [per hour]'),
    ('lag', 'lag time [hours]'),
    ('yield', 'biomass yield [OD/mM glucose]'),
]

## every panel shows the same rows of df_two, so the title is computed once
n_points = df_two.shape[0]
title = f"subset of n = {n_points} knockouts"

for ax, (trait, label) in zip(axes, panels):
    sns.scatterplot(data = df_two, x = trait + '1', y = trait + '2', ax = ax,
                    color = color, rasterized = True)
    ax.set_xlabel(label + '\nreplicate 1')
    ax.set_ylabel('replicate 2\n' + label)

    ## use the same limits on both axes so that the panel is square
    xmin,xmax = ax.get_xlim()
    ymin,ymax = ax.get_ylim()
    xymin,xymax = np.min([xmin,ymin]), np.max([xmax,ymax])
    ax.set_xlim(xymin,xymax)
    ax.set_ylim(xymin,xymax)

    ### plot diagonal (perfect agreement between the two replicates)
    ax.plot([xymin,xymax], [xymin,xymax], ls = '--', color = 'black', label = 'x = y')

    ax.set_title(title, loc = 'left')


fig.tight_layout()
## the savefig keyword is lower-case `dpi`; the former `DPI = DPI` is not a
## savefig parameter (silently ignored or rejected, depending on the matplotlib version)
fig.savefig(FIG_DIR + "scatterplot_trait_replicate_measurements.pdf", dpi = DPI, bbox_inches = 'tight', pad_inches = PAD_INCHES)

### Define average traits

In [None]:
### average the traits over all replicates of each genotype
### (computed from df_warringer, i.e. without the conversion to hours that was
### applied to df_tmp)

## sorted, so that the row order -- and with it the exported file -- is the same
## in every run instead of following the arbitrary iteration order of a set
list_genes = sorted(set(df_warringer.reset_index()['genotype'].values))

trait_columns = ['gmax', 'lag', 'yield']

## NOTE(review): the frame is created empty, so its columns keep object dtype;
## cast with .astype(float) if float columns are expected downstream -- confirm.
df_averaged = pd.DataFrame(index = list_genes, columns = trait_columns)

for gene in list_genes:
    replicates = df_warringer.loc[gene]
    ## average the trait columns only; the remaining columns (e.g. the boolean
    ## `is_wildtype` flag) are not used for the averaged table
    averaged = replicates[trait_columns].mean(axis=0)

    for k in trait_columns:
        df_averaged.at[gene,k] = averaged[k]

In [None]:
### reset index: keep the genotype names as a regular column and replace the
### row index by a plain 0..n-1 range
## the guard keeps the cell safe to re-run: once the index has been reset,
## copying it again would overwrite the genotype names with integers
if 'genotype' not in df_averaged.columns:
    df_averaged['genotype'] = df_averaged.index
    ## reset_index returns a new frame; without the assignment the index is
    ## never actually reset
    df_averaged = df_averaged.reset_index(drop = True)

df_averaged  # shown as the cell output

In [None]:
### store dataset
### the averaged traits are written without the row index; `filename` is reused
### by the re-read check that follows
### NOTE(review): `float_format` is expected to affect float-dtype columns only,
### while the trait columns of df_averaged are created without an explicit dtype
### -- confirm that the precision in the stored file is the intended one.
filename = OUTPUT_DIR + "df_M3_traits_averaged.csv"
df_averaged.to_csv(filename, index = False, float_format= '%.6e')

In [None]:
## reread and test: load the file that was just written and compare it with the
## in-memory table, printing the outcome of each check

df = df_averaged

print('#####################################')
print('\nTesting the data stored in ' + filename)
df_reread = pd.read_csv(filename, header = 0, float_precision=None)
print("Testing stored float values.")
## classify the columns by the dtypes of the re-read table: the in-memory frame
## may hold its numbers in object-dtype columns, in which case a selection
## based on `df.dtypes` would be empty and nothing would be checked
float_columns = df_reread.dtypes == 'float64'

x = df_reread.loc[:,float_columns].to_numpy(dtype = 'float64')
y = df.loc[:,float_columns].to_numpy(dtype = 'float64')

try:
    ## the file is written with float_format '%.6e' (7 significant digits), so
    ## values can only be expected to agree up to that rounding, not exactly
    np.testing.assert_allclose(x, y, rtol = 1e-6)
    print("Success.")
except AssertionError as e:
    print(e)

print("\nTesting stored values of other type, mostly strings.")
other_columns = ~float_columns
## the file is written without the row index, so compare the values only:
## DataFrame.equals would otherwise also require identical index labels
x = df_reread.loc[:,other_columns].reset_index(drop = True)
y = df.loc[:,other_columns].reset_index(drop = True)

if x.equals(y):
    print("Success. All values of other type stored correctly.")
else:
    print("Fail. Check true datatypes for columns marked as other in dataframe.")
    