## Comparison of Raw Data Correction Methods
Methods compared:
1. Brianna's linear model with fixed and random effects (only single mutants and wild type grown on one flat [not four flats])

    For sets with many flats:
        
        formula = f'{col_name} ~ Genotype + (1|Column) + (1|Row) + (1|Flat)'
    
    For sets with one flat:
        
        formula = f'{col_name} ~ Genotype + (1|Column) + (1|Row)'

2. Estimation of marginal means for each genotype using lmer (single, double, and wild type)

    Per set, per flat:
    
        formula = TSC ~ Subline + (1|Column) + (1|Row)

3. Spatial Analysis with SpATS (single, double, and wild type)

In [1]:
import datatable as dt
import pandas as pd

### Read in the corrected raw datasets

In [5]:
# Results on single mutants (that were grown on only one flat) for the lmer model Brianna ran in python
og_bri = dt.fread('../data/brianna_comparemean_tolmer_df_withrelative.csv').to_pandas()
og_bri.head()

Unnamed: 0,Set,WT_avg,WT_fitlmer,MA_avg,MA_fitlmer,MB_avg,MB_fitlmer,MA,MB,MA/WT,MB/WT
0,845,30.79,30.28,41.66,42.3,31.21,31.23,AT1G06040,AT2G31380,1.396794,1.031056
1,845E,27.94,27.58,27.1,27.04,25.88,26.72,AT1G06040,AT2G31380,0.980661,0.968813
2,133,406.46,408.68,411.25,414.09,369.34,368.95,AT1G18620,AT1G74160,1.013246,0.902789
3,703,340.87,342.38,228.98,228.4,292.24,291.11,AT1G74160,AT1G18620,0.667103,0.850257
4,72,166.93,166.73,161.08,161.47,151.75,151.59,AT3G14020,AT1G54160,0.968407,0.909187


In [29]:
og_bri.loc[og_bri['Set'].str.contains('1'),:]

Unnamed: 0,Set,WT_avg,WT_fitlmer,MA_avg,MA_fitlmer,MB_avg,MB_fitlmer,MA,MB,MA/WT,MB/WT
2,133,406.46,408.68,411.25,414.09,369.34,368.95,AT1G18620,AT1G74160,1.013246,0.902789
14,791,61.32,60.7,71.38,71.87,66.67,65.13,AT1G07180,AT2G29990,1.184029,1.073125
19,61,332.35,333.34,335.11,345.25,359.52,351.93,AT1G10450,AT1G59890,1.03573,1.055769
22,71,90.37,90.36,15.25,14.19,90.21,89.16,AT1G10650,AT1G60610,0.15708,0.986719
28,761,101.93,98.62,86.71,90.74,116.02,108.8,AT1G17540,AT1G72760,0.92014,1.103183
33,771,102.53,101.68,104.87,104.32,81.56,82.26,AT1G21380,AT1G76970,1.025938,0.80904
46,741,102.17,103.0,113.92,114.62,100.68,100.79,AT1G52190,AT3G16180,1.112831,0.978543
47,712,180.59,181.53,172.47,171.68,161.08,161.35,AT1G52420,AT3G15940,0.945766,0.88886
49,719,53.5,52.22,65.55,65.9,118.06,116.88,AT1G54130,AT3G14050,1.262093,2.238322
52,812,136.65,136.78,139.11,139.78,136.5,136.14,AT1G66180,AT5G37540,1.021922,0.995302


In [6]:
# Results on single and double mutants for the lmer model I ran in R (should be emulating Brianna's results)
bri = dt.fread('../data/double_mutant_fitness_data_05312024_TSC_corrected_brianna.txt').to_pandas()
bri.head()

Unnamed: 0,Set,Flat,Column,Row,Number,Type,Genotype,Subline,MA,MB,...,WO,FN,SPF,TSC,SH,emmean,SE,df,lower.CL,upper.CL
0,1,1,4,1,4,BORDER,MB,001-MB-2,WT,MUT,...,1.0,2.0,21.666667,65.0,0.0,41.937112,4.673613,44.462631,32.520832,51.353393
1,1,1,6,1,6,BORDER,DM,001-DM-2,MUT,MUT,...,0.0,0.0,20.333333,61.0,0.0,45.630814,6.745214,123.521867,32.279637,58.981991
2,1,1,8,1,8,BORDER,MA,001-MA-2,MUT,WT,...,0.0,0.0,15.5,62.0,0.0,51.789349,5.033448,51.046084,41.684503,61.894195
3,1,1,10,1,10,BORDER,WT,001-WT-2,WT,WT,...,1.0,0.0,12.5,37.5,,57.691604,4.672418,44.391296,48.277308,67.105901
4,1,1,6,3,26,INSIDE,MB,001-MB-2,WT,MUT,...,0.0,0.0,16.333333,49.0,0.0,41.937112,4.673613,44.462631,32.520832,51.353393


In [9]:
# Results on single and double mutants for the lmer model ran per set per flat
lin = dt.fread('../data/double_mutant_fitness_data_05312024_TSC_corrected_linear.txt').to_pandas()
lin.head()

Unnamed: 0,Set,Flat,Column,Row,Number,Type,Genotype,Subline,MA,MB,...,WO,FN,SPF,TSC,SH,emmean,SE,df,lower.CL,upper.CL
0,1,1,4,1,4,BORDER,MB,001-MB-2,WT,MUT,...,1.0,2.0,21.666667,65.0,0.0,39.9264,6.587721,35.735639,26.562449,53.290351
1,1,1,6,1,6,BORDER,DM,001-DM-2,MUT,MUT,...,0.0,0.0,20.333333,61.0,0.0,43.56232,13.821447,44.863522,15.72216,71.40248
2,1,1,8,1,8,BORDER,MA,001-MA-2,MUT,WT,...,0.0,0.0,15.5,62.0,0.0,52.3658,7.992484,44.867068,36.266796,68.464805
3,1,1,10,1,10,BORDER,WT,001-WT-2,WT,WT,...,1.0,0.0,12.5,37.5,,48.111368,6.926374,38.047473,34.09023,62.132505
4,1,1,6,3,26,INSIDE,MB,001-MB-2,WT,MUT,...,0.0,0.0,16.333333,49.0,0.0,39.9264,6.587721,35.735639,26.562449,53.290351


In [10]:
# Results on single and double mutants for the spatial analysis model ran per set per flat
spa = dt.fread('../data/double_mutant_fitness_data_05312024_TSC_corrected_SpATS.txt').to_pandas()
spa.head()

Unnamed: 0,Set,Flat,Column,Row,Number,Type,Genotype,Subline,MA,MB,...,WO,FN,SPF,TSC,SH,R,C,geno,weights,fit.TSC$fitted
0,1,1,4,1,4,BORDER,MB,001-MB-2,WT,MUT,...,1.0,2.0,21.666667,65.0,0.0,1,4,001-MB-2,True,63.65474
1,1,1,6,1,6,BORDER,DM,001-DM-2,MUT,MUT,...,0.0,0.0,20.333333,61.0,0.0,1,6,001-DM-2,True,54.07806
2,1,1,8,1,8,BORDER,MA,001-MA-2,MUT,WT,...,0.0,0.0,15.5,62.0,0.0,1,8,001-MA-2,True,54.592371
3,1,1,10,1,10,BORDER,WT,001-WT-2,WT,WT,...,1.0,0.0,12.5,37.5,,1,10,001-WT-2,True,42.483856
4,1,1,6,3,26,INSIDE,MB,001-MB-2,WT,MUT,...,0.0,0.0,16.333333,49.0,0.0,3,6,001-MB-2,True,50.764047


In [12]:
og_bri.shape, bri.shape, lin.shape, spa.shape

((119, 11), (25795, 26), (25795, 26), (25795, 26))

In [17]:
# Reshape Brianna's data
og_Bri = og_bri[['Set', 'WT_fitlmer', 'MA_fitlmer', 'MB_fitlmer']].melt(id_vars = 'Set', value_name='TSC_corrected', var_name='Genotype')
og_Bri.Genotype = Bri.Genotype.str.split('_').str.get(0)
og_Bri.head()

Unnamed: 0,Set,Genotype,TSC_corrected
0,845,WT,30.28
1,845E,WT,27.58
2,133,WT,408.68
3,703,WT,342.38
4,72,WT,166.73


In [16]:
bri_raw = og_bri[['Set', 'WT_avg', 'MA_avg', 'MB_avg']].melt(id_vars = 'Set', value_name='TSC_avg_raw', var_name='Genotype')
bri_raw.Genotype = bri_raw.Genotype.str.split('_').str.get(0)
bri_raw.head()

Unnamed: 0,Set,Genotype,TSC_avg_raw
0,845,WT,30.79
1,845E,WT,27.94
2,133,WT,406.46
3,703,WT,340.87
4,72,WT,166.93


In [27]:
# Merge corrected values with the mean of the raw data for single mutants
corrected = pd.merge(bri[['Set', 'Genotype', 'emmean']].\
    groupby(['Set', 'Genotype']).mean(), lin[['Set', 'Genotype', 'emmean']].\
    groupby(['Set', 'Genotype']).mean(), left_on=['Set', 'Genotype'], right_index=True, how='left') # to compare with Brianna's

corrected = pd.merge(corrected, spa[['Set', 'Genotype', 'fit.TSC$fitted']].\
    groupby(['Set', 'Genotype']).mean(), 
    left_on=['Set', 'Genotype'], right_index=True, how='left')

corrected = pd.merge(corrected, og_Bri, left_index=True,
    right_on=['Set', 'Genotype'], how='left') # Brianna's python results

corrected = pd.merge(corrected, bri_raw, on=['Set', 'Genotype'], how='left') # Brianna's raw mean data

corrected = pd.merge(corrected, lin[['Set', 'Genotype', 'TSC']].groupby(['Set', 'Genotype']).mean(),
    left_on=['Set', 'Genotype'], right_index=True, how='left') # Raw mean data (to compare with Brianna's)

corrected.columns = ['Set', 'Genotype', 'Brianna_rerun', 'Linear', 'SpATS',
                     'Brianna_og', 'TSC_avg_raw', 'TSC_avg_raw_k']
corrected

Unnamed: 0,Set,Genotype,Brianna_rerun,Linear,SpATS,Brianna_og,TSC_avg_raw,TSC_avg_raw_k
0,40.161851,41.386552,40.386243,1,DM,,,40.386243
1,51.323940,51.398520,51.669540,1,MA,,,51.669540
2,38.587114,37.635531,38.492063,1,MB,,,38.492063
3,54.818423,56.606496,54.880556,1,WT,,,54.880556
4,611.318607,620.917549,616.974368,11,DM,,,616.974368
...,...,...,...,...,...,...,...,...
537,31.721810,31.721810,32.216129,845,MB,31.23,31.21,32.216129
538,30.348334,30.348334,30.793651,845,WT,30.28,30.79,30.793651
539,26.787721,26.787721,27.104478,845E,MA,27.04,27.10,27.104478
540,26.958152,26.958152,25.883333,845E,MB,26.72,25.88,25.883333


In [21]:
corrected.select_dtypes('float').corr(method='pearson')

Unnamed: 0,Brianna_rerun,Linear,SpATS,Brianna_og,TSC_avg_raw,TSC_avg_raw_k
Brianna_rerun,1.0,0.999905,0.999797,0.465277,0.45965,0.999797
Linear,0.999905,1.0,0.999843,0.465277,0.45965,0.999843
SpATS,0.999797,0.999843,1.0,0.467512,0.462071,1.0
Brianna_og,0.465277,0.465277,0.467512,1.0,0.999771,0.467512
TSC_avg_raw,0.45965,0.45965,0.462071,0.999771,1.0,0.462071
TSC_avg_raw_k,0.999797,0.999843,1.0,0.467512,0.462071,1.0
