# Gather data for figure 5

## Import packages

In [1]:
import os, re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotnine as pn

from collections import OrderedDict

import scipy.optimize
import scipy.stats

## Define experimental parameters

In [2]:
# Prefix of the folder that contains the PreProcessed, DataSimulations and Data_export folders
folder_prefix = '../'

# Simulation types to consider
simulation_types = ['standard']

# One entry per growth condition, in plotting order. Each entry stores the path of
# the pickled experimental data, the value of the `period` column to keep and the
# display name of the condition.
datasource = OrderedDict(
    (key, {'datafile': folder_prefix+'PreProcessed/'+subfolder+'/colidata.pkl',
           'period': period, 'condition': condition})
    for key, subfolder, period, condition in (
        ('glycerolauto', '20170327_GW339_temp', 1, 'Glycerol'),
        ('glucose', '20180711_GW296_glucose37_1_MMStack', 0, 'Glucose'),
        ('glucose8aa', '20180709_GW296_glucose8aa37_1_MMStack', 0, 'Glucose +AA'),
    )
)
simulation_source = folder_prefix+'DataSimulations/'

# Scaling factors applied to the raw data
expinfo = {
    # Length per pixel. The original comment gave the unit as nm/px.
    # NOTE(review): 0.065 looks like µm/px (i.e. 65 nm/px) — confirm.
    'size_scale': 0.065,
    # Time per frame, in minutes
    'time_scale': 3,
}

# Conversion between numeric codes and the displayed condition / data type names
plot_names = dict(enumerate(['Glycerol', 'Glucose', 'Glucose +AA']))
type_names = dict(enumerate(['Experiments', 'Simulations']))

## Load experimental data, clean up and add cell cycle variables

In [3]:
# Columns that are multiplied by expinfo['size_scale'] (each listed once).
length_columns = ['DLi','Lb_fit','Ld_fit','Ld','Lb','Li','Li_fit','Li_old']

colidata = []
for i, source in enumerate(datasource.values()):
    # NOTE: unpickling executes arbitrary code; only load files from a trusted source.
    coli_exp = pd.read_pickle(source['datafile'])

    # Keep rows with pearson_log above 0.95 that belong to the period selected
    # for this condition.
    keep = (coli_exp.pearson_log > 0.95) & (coli_exp.period == source['period'])
    # Work on an explicit copy so that the column assignments below do not
    # write into a slice of the loaded frame (avoids SettingWithCopyWarning).
    coli_exp = coli_exp[keep].copy()

    # Numeric codes: the position in `datasource` identifies the condition,
    # datatype 0 marks experiments and simul_type -1 marks "not a simulation".
    coli_exp['condition'] = i
    coli_exp['datatype'] = 0
    coli_exp['simul_type'] = -1

    # Vectorized rescaling of all length columns.
    coli_exp[length_columns] = coli_exp[length_columns] * expinfo['size_scale']
    coli_exp['DeltaL'] = coli_exp['Ld_fit'] - coli_exp['Lb_fit']

    colidata.append(coli_exp)

colidata_plot = pd.concat(colidata)

## Load simulation data, clean up and add cell cycle variables

In [4]:
# Load one simulation file per (simulation type, condition) pair and tag it with
# the same numeric codes as the experimental data.
# NOTE: datatype is set to type index + 1, while type_names only defines the
# codes 0 and 1, so more than one entry in simulation_types needs extra names.
colidata_simul = []
for type_index, simul_name in enumerate(simulation_types):
    for condition_index, condition_key in enumerate(datasource):
        simul_path = simulation_source+condition_key+'_'+simul_name+'.pkl'
        print(simul_path)
        # NOTE: unpickling executes arbitrary code; only load trusted files.
        simul_frame = pd.read_pickle(simul_path).assign(
            Li_old=lambda frame: frame['mLi_fit'],
            datatype=type_index+1,
            condition=condition_index,
            simul_type=type_index,
            DeltaL=lambda frame: frame['Ld_fit']-frame['Lb_fit'],
        )
        colidata_simul.append(simul_frame)
colisimul_plot = pd.concat(colidata_simul)


../DataSimulations/glycerolauto_standard.pkl
../DataSimulations/glucose_standard.pkl
../DataSimulations/glucose8aa_standard.pkl


## Binned average

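Bin the cells by birth length and remove the cells that fall outside the range of well-populated bins (more than 15 cells per bin), so that binned averages are not computed on sparse bins.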

In [5]:
def trim_sparse_birth_length_bins(frame, min_count=15, bins=None):
    """Keep, per condition, the rows whose Lb_fit lies between well-populated bins.

    For every value of ``frame.condition`` the ``Lb_fit`` column is histogrammed
    on ``bins``. The rows kept are those with ``Lb_fit`` strictly between the
    left edge of the first and the left edge of the last bin that holds more
    than ``min_count`` rows.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``condition`` and ``Lb_fit``.
    min_count : int, optional
        A bin counts as populated when it holds more than this many rows.
    bins : array-like, optional
        Bin edges passed to ``np.histogram``. Defaults to ``np.arange(0,10,0.1)``.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the trimmed per-condition frames.

    Raises
    ------
    ValueError
        If a condition has no bin with more than ``min_count`` rows.
    """
    if bins is None:
        bins = np.arange(0,10,0.1)

    trimmed = []
    for cond in frame.condition.unique():
        cond_frame = frame[frame.condition == cond]
        counts, edges = np.histogram(cond_frame.Lb_fit, bins=bins)
        populated = np.flatnonzero(counts > min_count)
        if populated.size == 0:
            raise ValueError('condition %r has no Lb_fit bin with more than %d entries'
                             % (cond, min_count))
        minpos = edges[populated[0]]
        # NOTE(review): this is the LEFT edge of the last populated bin, so the
        # rows inside that bin are dropped as well. Kept unchanged to reproduce
        # the exported data; use edges[populated[-1]+1] to include that bin.
        maxpos = edges[populated[-1]]
        trimmed.append(cond_frame[(cond_frame.Lb_fit > minpos) & (cond_frame.Lb_fit < maxpos)])
    return pd.concat(trimmed)


colidata_plot_stats = trim_sparse_birth_length_bins(colidata_plot)
colisimul_plot_stats = trim_sparse_birth_length_bins(colisimul_plot)

## Combine simulation and experimental data for Figure 5AB and export

In [6]:
# Stack experimental and simulated data, restricted to the columns they share
shared_columns = ['Lb_fit','Ld_fit','Li_fit','Ti',
                  'condition','datatype','simul_type','DeltaL']
all_data = pd.concat([source_frame[shared_columns]
                      for source_frame in (colidata_plot_stats, colisimul_plot_stats)])

In [7]:
# Replace the numeric codes by their display names.
# NOTE: this cell is not idempotent — running it a second time raises a KeyError
# because the columns then already hold names instead of codes.
all_data['datatype'] = [type_names[code] for code in all_data.datatype]
all_data['condition'] = [plot_names[code] for code in all_data.condition]



In [100]:
# Write the columns used for Figure 5A/B to a CSV file (without the index)
fig5_ab_columns = ['Lb_fit','DeltaL','Li_fit','datatype','condition']
fig5_ab_path = folder_prefix+'Data_export/Fig5_AB.csv'
all_data[fig5_ab_columns].to_csv(fig5_ab_path, index=False)

## Add information on the number of origins for Figure 5D and export

In [8]:
# numori_born is 1 when Ti >= 0 and 2 otherwise (this includes missing Ti, since
# NaN >= 0 is False).
# NOTE(review): presumably Ti is the initiation time relative to birth, so a
# negative value means initiation happened before birth — confirm.
all_data['numori_born'] = [1 if ti >= 0 else 2 for ti in all_data.Ti]

In [9]:
# Fraction of cells per value of numori_born, for every condition, computed
# separately for experiments and simulations. The labels and the number of
# conditions come from type_names / plot_names instead of being hard-coded.
experiment_label, simulation_label = type_names[0], type_names[1]

all_frames = []
for c in range(len(plot_names)):
    in_condition = all_data.condition == plot_names[c]
    fractions = {}
    for label in (simulation_label, experiment_label):
        counts = all_data[(all_data.datatype == label) & in_condition].groupby('numori_born').size()
        fractions[label] = counts/np.sum(counts)
    # The two series are aligned on numori_born; a value present in only one
    # of them yields NaN in the other column.
    ori_frame = pd.DataFrame(fractions).reset_index()
    ori_frame['condition'] = plot_names[c]
    all_frames.append(ori_frame)

# Long format: one row per (numori_born, condition, data_type)
origins = pd.melt(pd.concat(all_frames), value_vars=[simulation_label, experiment_label],
                  id_vars=['numori_born','condition'], var_name='data_type', value_name='fraction')
origins['condition'] = origins.condition.astype('category')

In [103]:
# Write the origin fractions used for Figure 5D to a CSV file (without the index)
fig5_d_path = folder_prefix+'Data_export/Fig5_D.csv'
origins.to_csv(fig5_d_path, index=False)

## Define functions to calculate distribution parameters for Figure 5C

In [10]:
def fit_normal(colidata, field, fit_range):
    """Fit a single Gaussian to the histogram of one column by least squares.

    Parameters
    ----------
    colidata : pandas.DataFrame
        Data containing the column ``field``; missing values are ignored.
    field : str
        Name of the column to histogram.
    fit_range : array-like
        Bin edges passed to ``np.histogram``.

    Returns
    -------
    bin_pos : numpy.ndarray
        Centres of the histogram bins.
    valbins : numpy.ndarray
        Bin counts divided by the total count and multiplied by the width of
        the first bin.
    res_fit : scipy.optimize.OptimizeResult
        Result of the BFGS minimisation; ``res_fit.x`` holds
        ``[A0, x0, sigma]``. The model depends on ``sigma**2`` only, so the
        sign of the fitted ``sigma`` is not constrained.

    Raises
    ------
    ValueError
        If no value of ``field`` falls inside ``fit_range``.
    """
    values = colidata[field].dropna()
    valbins, binmean = np.histogram(values, bins=fit_range)
    total = np.sum(valbins)
    if total == 0:
        # Without this check the normalisation below is 0/0 and the fit
        # silently returns NaN.
        raise ValueError("no values of %r fall inside fit_range" % (field,))
    valbins = valbins/total*(binmean[1]-binmean[0])
    bin_pos = 0.5*(binmean[:-1]+binmean[1:])

    # Start from the histogram maximum, the sample mean and the sample standard
    # deviation. sigma is a width, so the standard deviation (not the variance)
    # is the matching initial guess.
    initial_guess = np.array([np.max(valbins), np.mean(values), np.std(values)])
    res_fit = scipy.optimize.minimize(fun=gauss_single_fit, args=(bin_pos, valbins),
                                      x0=initial_guess, method='BFGS')

    return bin_pos, valbins, res_fit

def fun_single_gauss(x, A0, x0, sigma):
    """Gaussian with amplitude A0, centre x0 and width sigma, evaluated at x."""
    return A0*np.exp(-((x-x0)**2)/(2*sigma**2))

def gauss_single_fit(p, *args):
    """Sum of squared residuals between a Gaussian and binned data.

    ``p`` holds ``[A0, x0, sigma]``; ``args[0]`` are the positions and
    ``args[1]`` the values to fit.
    """
    x, data = args[0], args[1]
    residuals = fun_single_gauss(x, p[0], p[1], p[2])-data
    return np.sum(residuals**2)



## Calculate mean and standard deviation of birth length and export for Figure 5C

In [11]:
# Gaussian fit of the birth-length distribution for every condition, for
# experiments (means, stdv) and for simulations (means_s, stdv_s).
means, stdv = [], []
means_s, stdv_s = [], []

# Bin edges used for the Lb_fit histograms
lb_fit_range = np.arange(0,3,0.1)

# (data type label, list receiving the fitted centre, list receiving the fitted width)
fit_targets = [(type_names[0], means, stdv),
               (type_names[1], means_s, stdv_s)]

for i in range(len(plot_names)):
    for datatype_label, mean_list, stdv_list in fit_targets:
        data = all_data[(all_data.condition==plot_names[i])&(all_data.datatype==datatype_label)]
        bin_pos_Lb, valbins_Lb, res_fit_Lb = fit_normal(data,'Lb_fit',lb_fit_range)
        mean_list.append(res_fit_Lb.x[1])
        # The Gaussian model only depends on sigma**2, so the optimiser may
        # return a negative sigma; report the width as a non-negative number.
        stdv_list.append(abs(res_fit_Lb.x[2]))
    

In [12]:
# Collect the fitted centres and widths in one table, experiments first.
# The condition labels follow the order of `datasource`.
condition_labels = [source['condition'] for source in datasource.values()]

exp = pd.DataFrame({'means': means, 'standard': stdv,
                    'datatype': 'exp', 'condition': condition_labels})
simul = pd.DataFrame({'means': means_s, 'standard': stdv_s,
                      'datatype': 'simul', 'condition': condition_labels})
means_stdv = pd.concat([exp, simul])



In [122]:
# Write the fitted centres and widths used for Figure 5C to a CSV file (without the index)
fig5_c_path = folder_prefix+'Data_export/Fig5_C.csv'
means_stdv.to_csv(fig5_c_path, index=False)