In [14]:
import os
import pandas as pd
import numpy as np
from scipy.ndimage import maximum_filter
from tqdm import tqdm
from scipy.optimize import curve_fit

import re

from Utilities import *

In [67]:
# Choose a single experiment workbook.
# Run EITHER this cell or the folder-loading cell: both assign `dict_data`.
# The path is held in `experiment_path` rather than `dir` so the builtin
# `dir()` is not shadowed for the rest of the kernel session.
experiment_path = "../Data/experiments_all/20240305_EXP13.xlsm"
# experiment_path = "../new/data/20240612_Evaluation_BT2.xlsm"
dict_data = read_experiments(experiment_path)

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  2.64it/s]


In [60]:
# Choose a set of experiments: every workbook found in one folder.
# Run EITHER this cell or the single-experiment cell: both assign `dict_data`.
# The path is held in `experiments_dir` rather than `dir` so the builtin
# `dir()` is not shadowed for the rest of the kernel session.
experiments_dir = "../Data/experiments_all/"
dict_data = read_experiments(experiments_dir)

100%|█████████████████████████████████████████████████████████████████| 25/25 [00:19<00:00,  1.27it/s]


In [61]:
# Top-level keys of dict_data: one entry per loaded experiment.
dict_data.keys()

dict_keys(['REP9', 'REP5', 'REP18', 'REP4', 'REP14', 'REP8', 'RUS5', 'REP3', 'REP12', 'REP2', 'EXP14', 'REP11', 'EXP8', 'REP10', 'UK1', 'REP17', 'RUS6', 'EXP3', 'EXP12', 'REP7', 'REP6', 'EXP13', 'EXP2', 'RUS7', 'REP16'])

In [62]:
# Variant names available inside experiment 'EXP13' (second-level keys).
dict_data['EXP13'].keys()

dict_keys(['WT', 'R111X-R408W'])

In [63]:
# First table stored for the WT variant of EXP13. Per the processing loop
# further down, each variant holds four tables: indices 0-2 are the
# replicates and index 3 is the median one.
dict_data['EXP13']['WT'][0]

Unnamed: 0,0,25,50,75,100,125,150,200,500,750,1250,2500
0,60.772641,1169.965881,6396.290196,9310.596802,10150.878863,10529.368792,9876.086485,10598.957087,9289.28864,9903.970859,8351.392862,7479.269266
10,106.525198,1248.760261,8690.444034,15334.31275,18264.999301,18351.298897,20075.414623,19371.74872,20128.092311,19142.635795,16396.408233,10825.246557
25,70.621204,1474.416048,9680.644504,19461.553278,23640.065187,26265.610733,28557.484848,31370.329183,29932.571056,29734.5677,22106.102778,14722.560111
50,104.881411,1599.738788,10149.217987,21677.525803,28631.494116,32453.671425,35305.192257,38157.326134,37001.090631,34435.132356,27463.351804,16595.562317
75,93.8174,1741.217803,11509.39131,22213.508282,29404.183249,33006.365104,36378.922949,39862.476044,41102.4193,31393.751558,26830.285409,14444.327179
100,84.120388,1581.000115,11145.912816,22161.691519,29414.864457,34465.992415,36392.678073,42726.286326,39897.899653,32587.07317,24055.316984,14276.16532
150,100.308271,1746.026294,11442.704512,23335.645327,30556.06881,34741.866056,35634.858107,40701.040782,39879.469302,31558.623556,19931.155175,15402.094841
250,151.290998,1643.049867,10946.739099,21529.615902,29568.41748,32833.931655,33540.57571,37819.313068,32904.639818,26707.091406,19752.585726,12154.614971


In [68]:
# --- Output configuration ---------------------------------------------------
save_plot = False                            # True: also write landscape figures to save_image_dir
save_image_dir = "../Data/Landscapes_test"   # destination for figures
save_feature_dir = "../Data/Landscapes_test" # destination for the extracted-features CSV
tag = "v1"                                   # appended to the CSV file name; "" disables the suffix

# --- QC thresholds ------------------------------------------------------------
# Each pair is [pass_limit, fail_limit], as used by the QC check in the loop:
#   * 'Pass'    when ALL three metrics are <= their pass_limit,
#   * 'Fail'    when ANY metric is >= its fail_limit,
#   * 'ToCheck' otherwise.
qc_thr_rmse = [0.2, 0.25]
qc_thr_n_peaks = [5, 8]
qc_thr_variation = [0.1, 0.25]

# Create the output directories if needed. os.makedirs also creates missing
# parent directories and, with exist_ok=True, is safe to re-run (no
# check-then-create race as with os.path.exists + os.mkdir).
for out_dir in (save_image_dir, save_feature_dir):
    os.makedirs(out_dir, exist_ok=True)

In [65]:
# Build one feature record per (experiment, variant): fitted 2-D Gaussian
# parameters plus three QC metrics and the resulting QC verdict.
feature_columns = ['genotype', 'experiment', 'Max', 'Max_x', 'Max_y', 's_x', 's_y',
                   'rmse', 'n_peaks', 'variation', 'qc_result']
# Records are collected in a list and converted to a DataFrame once at the
# end; growing a DataFrame row by row with .loc is slow and leaves every
# column with object dtype.
feature_rows = []

# Curve-fit settings do not depend on the variant, so they are defined once.
# Parameter order follows the gaussian_2d call below: (a, mx, my, sx, sy).
initial_guess = (1.0, 5, 5, 2, 2)
bounds = ([0, eps, eps, 0.5, 0.5],                       # Lower bounds
          [120, np.log(3000), np.log(300), 1000, 1000])  # Upper bounds 1500, 150

# Loop over all experiments
for exp in tqdm(dict_data.keys()):
    variants = dict_data[exp]

    # Obtain the WT maximum value for normalization (index 3 = median table)
    WT_av = variants['WT'][3]
    max_wt = np.around(WT_av.max().max(), decimals=2)  # maximum value of WT to be used for rescaling

    # Loop over variants in a single experiment
    for var in variants.keys():
        ## For each variant there are 3 replicates, and the last entry (index 3) is the median one.
        ## The median table is used for the analysis; the 3 replicates give the QC variation.
        e1, e2, e3, data = variants[var]
        name = exp + "_" + var

        ### 1. QC: replicate variation
        # NOTE(review): presumably `reshape` returns one column per replicate so
        # that axis=1 compares the replicates point by point -- confirm in Utilities.
        e1, e2, e3 = reshape(e1/max_wt), reshape(e2/max_wt), reshape(e3/max_wt)
        exp_vals = np.concatenate([e1, e2, e3], axis=1)
        median = np.median(exp_vals, axis=1)
        # Named `value_range`, not `range`: rebinding the builtin `range` would
        # break every later `for i in range(...)` in this kernel.
        value_range = np.ptp(exp_vals, axis=1)
        variation = (value_range / (median + 1)).max()

        ### 2. QC: count the number of peaks
        # A cell counts as a peak when it equals the maximum of its 3x3 neighbourhood.
        z_np = data.to_numpy(dtype="float")
        filtered_z = maximum_filter(z_np, size=3)
        n_peaks = int(np.count_nonzero(z_np == filtered_z))

        ### 3. Gaussian modelling
        #### Get x, y, z
        x, y, z = transform_df(data, max_wt, rescale=True)
        if save_plot:
            plot_landscape(x, y, z, name=name,
                           show=False, save=True, save_dir=save_image_dir)
        # The fit is done on log-scaled axes; eps is added before taking the log.
        x, y = np.log(x + eps), np.log(y + eps)

        #### Curve fitting (the covariance estimate is not used)
        popt, _ = curve_fit(gaussian_2d, (x, y), z, p0=initial_guess, bounds=bounds)

        a, mx, my, sx, sy = tuple(popt)
        z_hat = gaussian_2d((x, y), a, mx, my, sx, sy)

        ### 4. Calculate RMSE between the max-normalised data and the max-normalised model
        mse = np.mean((z/z.max() - z_hat/z_hat.max())**2)
        rmse = np.sqrt(mse)
        if save_plot:
            plot_landscape(np.exp(x), np.exp(y), z_hat, name=name + "_model",
                           show=False, save=True, save_dir=save_image_dir)

        ### 5. QC check
        if (rmse <= qc_thr_rmse[0]) and (n_peaks <= qc_thr_n_peaks[0]) and (variation <= qc_thr_variation[0]):
            qc_result = 'Pass'
        elif (rmse >= qc_thr_rmse[1]) or (n_peaks >= qc_thr_n_peaks[1]) or (variation >= qc_thr_variation[1]):
            qc_result = 'Fail'
        else:
            qc_result = 'ToCheck'

        ### Save features
        feature_rows.append([var, exp, a, mx, my, sx, sy, rmse, n_peaks, variation, qc_result])

feature = pd.DataFrame(feature_rows, columns=feature_columns)

100%|█████████████████████████████████████████████████████████████████| 25/25 [00:02<00:00, 10.61it/s]


In [66]:
feature

Unnamed: 0,genotype,experiment,Max,Max_x,Max_y,s_x,s_y,rmse,n_peaks,variation,qc_result
0,WT,REP9,104.388339,5.856701,4.600635,1.183110,1.753429,0.056623,1,0.112588,ToCheck
1,A403V-A403V,REP9,23.962237,0.586828,4.190179,3.543133,2.387371,0.070678,1,0.077003,Pass
2,A403V-R408W,REP9,7.456938,0.000001,4.181969,4.308900,2.119291,0.076922,5,0.034479,Pass
3,I65T-R408W,REP9,5.110183,5.723828,4.307581,1.428694,2.109654,0.234226,8,0.070863,Fail
4,R158Q-R261Q,REP9,8.142141,6.479639,4.573848,1.201338,1.891704,0.042947,2,0.040015,Pass
...,...,...,...,...,...,...,...,...,...,...,...
155,R158Q-Y414C,REP16,5.043721,5.398553,4.684250,1.656301,2.603218,0.111255,6,0.016209,ToCheck
156,P211T-R408Q,REP16,22.682873,5.005239,4.793702,1.466451,2.431487,0.061461,4,0.165302,ToCheck
157,P225T-P281L,REP16,1.147267,7.307807,5.703782,2.629596,2.304331,0.169492,8,0.017675,Fail
158,P281L-A300S,REP16,5.789020,5.940680,4.681701,1.508286,2.201709,0.072170,1,0.028408,Pass


In [70]:
# Write the feature table to <save_feature_dir>/extracted_features[_<experiment>][_<tag>].csv
#
# When exactly one experiment was loaded, its key is added to the file name.
# The suffix is taken from dict_data itself rather than from the loop
# variable `name` left over by the processing cell: `name` holds
# "<experiment>_<last variant>", which mislabels a file containing every
# variant and is undefined if that cell has not been run in this kernel.
if len(dict_data) == 1:
    experiment_suffix = "_" + next(iter(dict_data))
else:
    experiment_suffix = ""

# An empty tag adds no suffix.
tag_suffix = "_" + tag if tag != "" else ""

Path_to_save = os.path.join(save_feature_dir,
                            "extracted_features" + experiment_suffix + tag_suffix + ".csv")
feature.to_csv(Path_to_save, index=False)
print("Results are saved as:")
print(Path_to_save)

Results are saved as:
../Data/Landscapes_test/extracted_features_v1.csv
