In [2]:
import os
import pandas as pd
import numpy as np
from scipy.interpolate import LinearNDInterpolator, RegularGridInterpolator
from scipy import ndimage
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from tqdm import tqdm
from scipy.optimize import curve_fit

In [6]:
class ExcelSheetReader:
    def __init__(self, file, sheets=None):
        self.file = file
        self.sheets = sheets
        # If no sheets are specified, read the names of all the sheets in the Excel file
        if self.sheets is None:
            self.sheets = pd.ExcelFile(self.file).sheet_names

    def read_averages(self, sheet_name):
        # Read the specified sheet from the Excel file and return the data
        excel_data = pd.read_excel(self.file, sheet_name=sheet_name, skiprows=51)
        self._process_excel(excel_data)
        return excel_data

    def read_replicates(self, sheet_name):
        # Read the specified sheet from the Excel file and return the data
        repl_excel = []
        start_row = {
            1: 4,
            2: 19,
            3: 34,
        }
        for row in start_row.values():
            excel_data = pd.read_excel(
                self.file, sheet_name=sheet_name, skiprows=row, nrows=9
            )
            self._process_excel(excel_data)
            repl_excel.append(excel_data)
        return repl_excel

    def _process_excel(self, excel_data):
        excel_data.dropna(axis=1, inplace=True)
        excel_data.set_index("Unnamed: 14", inplace=True)
        try:
            excel_data.rename(columns={"BLANK": 0}, inplace=True)
        except Exception:
            pass
        excel_data.columns = excel_data.columns.astype(float)
        # excel_data.columns = [0, 25, 50, 75, 100, 125, 150, 200, 500, 750, 1250, 2500]
        excel_data.index.name = "BH4"


def transform_df(df, max_wt, rescale=True):
    avgs = df.unstack().reset_index(level=[0, 1])
    avgs.columns = ["Phe", "BH4", "Enzyme Activity"]
    if rescale:
        avgs["Enzyme Activity"] = (avgs["Enzyme Activity"] / max_wt) * 100
    avgs[avgs < 0] = 0

    x = avgs.Phe.to_numpy()
    y = avgs.BH4.to_numpy()
    z = np.around(avgs["Enzyme Activity"].to_numpy(), decimals=2)
    return x, y, z


def plot_landscape(
    x,
    y,
    z,
    name=" ",
    method="regular_grid",
    show=True,
    save=False,
    save_dir = "",
    nbins=1000,
    rescale=True,
    max_val_scale=None,
    legend=[True, True, True, True],
):
    x_dense = np.linspace(0, x.max(), nbins)
    y_dense = np.linspace(0, y.max(), nbins)
    B1, B2 = np.meshgrid(x_dense, y_dense, indexing="xy")
    dense_points = np.stack([B1.ravel(), B2.ravel()], -1)  # shape (N, 2) in 2d

    try:
        if method == "linear_ndi":
            scattered_points = np.stack(
                [x.ravel(), y.ravel()], -1
            )  # shape (N, 2) in 2d
            smooth_z = LinearNDInterpolator(
                scattered_points,
                z.ravel(),
                rescale=True,
                fill_value=0.0,
            )
            z_smoothed = smooth_z(dense_points).reshape(B1.shape)
            z_smoothed = ndimage.gaussian_filter(z_smoothed, sigma=10)
            z_smoothed[z_smoothed < 0] = 0

        if method == "regular_grid":
            Z = z.reshape(len(np.unique(x)), len(np.unique(y)))
            rgi = RegularGridInterpolator(
                (np.unique(x), np.unique(y)),
                Z,
                method="linear",
                bounds_error=False,
                fill_value=0.0,
            )
            Z_rgi = rgi(np.array([B1.flatten(), B2.flatten()]).T).reshape(B1.shape)
            z_smoothed = ndimage.gaussian_filter(Z_rgi, sigma=7)
            z_smoothed[z_smoothed < 0] = 0
            # z_smoothed[z_smoothed > 100] = 100

        elif method not in ["linear_ndi", "regular_grid"]:
            raise NotImplementedError(
                "Smoothing method not implemented. Choose between 'regular_grid' or 'linear_ndi'."
            )

    except NotImplementedError as e:
        print("Error: ", e)
        sys.exit()

    fig, ax = plt.subplots(figsize=(6, 5))
    if rescale:
        im = ax.pcolormesh(
            B1,
            B2,
            z_smoothed,
            cmap="Spectral_r",
            norm=colors.PowerNorm(vmin=0, vmax=100, gamma=0.6),
        )
    else:
        if max_val_scale is None:
            max_val_scale = np.max(z_smoothed)

        im = ax.pcolormesh(
            B1,
            B2,
            z_smoothed,
            cmap="Spectral_r",
            norm=colors.PowerNorm(vmin=0, vmax=max_val_scale, gamma=0.6),
        )
    ax.set_xlim([np.min(x_dense), np.max(x_dense)])
    ax.set_ylim([np.min(y_dense), np.max(y_dense)])
    ax.set_xlabel("Phe [uM]")
    ax.set_ylabel("BH4 [uM]")
    ax.set_title(f"{name}")

    CS = ax.contour(
        B1, B2, z_smoothed, 5, colors=("lightgrey"), linewidths=1, origin="lower"
    )
    ax.clabel(CS, fmt="%.0f", colors="lightgrey", fontsize=9)
    cbar = fig.colorbar(im, shrink=0.7, ax=ax)
    if rescale:
        cbar.set_label("Enzyme Activity [%]")
    else:
        cbar.set_label("Enzyme Activity")

    df = pd.DataFrame(z_smoothed, columns=x_dense, index=y_dense)

    mx = np.around(df.max().max(), decimals=2)
    bh4 = df.stack().idxmax()[0]
    phe = df.stack().idxmax()[1]

    df_tmp = df.loc[bh4, :]
    phe_min = df_tmp[df_tmp >= mx * 0.5].index.min()
    phe_max = df_tmp[df_tmp >= mx * 0.5].index.max()
    df_tmp2 = df.loc[:, phe]
    bh4_min = df_tmp2[df_tmp2 >= mx * 0.5].index.min()
    bh4_max = df_tmp2[df_tmp2 >= mx * 0.5].index.max()

    output_vals = {
        "Max": mx,
        "Phe": phe,
        "BH4": bh4,
        "50% max BH4 min": bh4_min,
        "50% max BH4 max": bh4_max,
        "50% max Phe min": phe_min,
        "50% max Phe max": phe_max,
    }

    ax.plot(
        phe,
        bh4,
        marker="x",
        c="m",
        markersize=8,
        markeredgecolor="m",
        markeredgewidth=3,
    )

    C50 = ax.contour(
        B1,
        B2,
        z_smoothed,
        levels=[mx * 0.5],
        colors=("magenta",),
        linewidths=1,
        origin="lower",
    )
    ax.clabel(C50, fmt="%.0f", colors="magenta", fontsize=9)

    info_box, max_val, peak_coords, fifty_coords = legend
    if info_box:
        pro = dict(boxstyle="round", facecolor="w", alpha=0.5)
        textstr = ""
        if max_val:
            textstr += f"Max: {mx:.0f}\n"
        if peak_coords:
            textstr += f"Phe: {phe:.0f}\nBH4: {bh4:.0f}\n"
        if fifty_coords:
            textstr += f"50% max BH4:\n{bh4_min:.0f}-{bh4_max:.0f}\n50% max Phe:\n{phe_min:.0f}-{phe_max:.0f}"
        ax.text(
            0.97,
            0.97,
            textstr,
            transform=im.axes.transAxes,
            fontsize=7,
            verticalalignment="top",
            bbox=pro,
            ha="right",
            color="k",
        )
    # im.clim(0, 100)
    plt.tight_layout()

    if save:
        plt.savefig(os.path.join(save_dir, f"landscape_{name}.png"))
    if show:
        plt.show()
    plt.close()

    return output_vals


def gaussian_2d(xy, a, mx, my, sx, sy):
    x, y = xy
    z = a * np.exp( - 0.5 * ( ((x-mx)**2 / (sx**2)) + ((y-my)**2 / (sy**2)) ) )
    return z


eps = 0.000001

def gaussian_2d(xy, a, mx, my, sx, sy):
    x, y = xy
    z = a * np.exp( - 0.5 * ( ((x-mx)**2 / (sx**2)) + ((y-my)**2 / (sy**2)) ) )
    return z


eps = 0.000001

In [7]:
experiment_dir = "Data/experiments/"
files = os.listdir(experiment_dir)
do_plot = True

feature = pd.DataFrame(columns=['genotype', 'experiment', 'Max', 'Max_x', 'Max_y', 's_x', 's_y', 'mse'])

for f in files:
    print(f"file to read: {f}")
    experiment = f.split("_")[1].split(".")[0]

    # Read file: each file contains several sheets each for one variant including the WT.
    infile = os.path.join(experiment_dir, f)
    variants = [v for v in pd.ExcelFile(infile).sheet_names if v.endswith("av")] # read sheets
    xl = ExcelSheetReader(infile)
    max_wt = np.around(xl.read_averages('WT-av').max().max(), decimals=2) # maximum value of WT to be used for rescaling

    # Loop on sheets (variants)
    for v in variants:
        # Read table 
        data = xl.read_averages(v)
        data[data < 0] = 0
        name = "-".join(v.split("-")[:-1] + [infile.split("_")[1].split(".")[0]])
        
        # Get x, y, z
        x, y, z = transform_df(data, max_wt, rescale=True)
        if do_plot:
            plot_landscape(x, y, z, name = name, 
                           show = False, save = True, save_dir = "Data/Landscapes/")
        x, y, z = np.log(x + eps), np.log(y + eps), z

        # Curve fitting
        initial_guess = (1.0, 5, 5, 2, 2)
        bounds = ([0, eps, eps, 0.5, 0.5],                      # Lower bounds
                  [120, np.log(1500), np.log(150), 1000, 1000])  # Upper bounds
        popt, pcov = curve_fit(gaussian_2d, (x, y), z, p0=initial_guess, bounds=bounds)

        # Calculate mase
        a, mx, my, sx, sy = tuple(popt)
        z_hat = gaussian_2d((x, y), a, mx, my, sx, sy)
        mse = np.mean((z/z.max() - z_hat/z_hat.max())**2)
        if do_plot:
            plot_landscape(np.exp(x), np.exp(y), z_hat, name = name + "_model",
                          show = False, save = True, save_dir = "Data/Landscapes/")

        # Save the information
        feature.loc[len(feature)] = [name, experiment, a, mx, my, sx, sy, mse]

file to read: 20240207_REP12.xlsm
file to read: 20240207_REP4.xlsm
file to read: 20240207_REP5.xlsm
file to read: 20240207_REP18.xlsm
file to read: 20240207_REP2.xlsm
file to read: 20240207_REP14.xlsm
file to read: 20240208_RUS7.xlsm
file to read: 20240208_RUS6.xlsm
file to read: 20240208_UK1.xlsm
file to read: 20240207_REP3.xlsm
file to read: 20240208_REP7.xlsm
file to read: 20240207_EXP13.xlsm
file to read: 20240207_EXP8.xlsm
file to read: 20240208_RUS5.xlsm
file to read: 20240208_REP8.xlsm
file to read: 20240207_REP16.xlsm
file to read: 20240207_REP17.xlsm
file to read: 20240208_REP9.xlsm
file to read: 20240207_EXP12.xlsm
file to read: 20240207_REP10.xlsm
file to read: 20240207_EXP2.xlsm
file to read: 20240207_EXP3.xlsm
file to read: 20240207_EXP14.xlsm
file to read: 20240207_REP11.xlsm


In [8]:
feature

Unnamed: 0,genotype,experiment,Max,Max_x,Max_y,s_x,s_y,mse
0,WT-REP12,REP12,105.383761,5.792331,4.774166,1.168135,1.654667,0.002502
1,Y414C-Y414C-REP12,REP12,13.909202,5.165946,4.452004,1.205434,2.153034,0.003829
2,V388M-R408W-REP12,REP12,2.670232,5.795190,4.842806,1.327943,2.325990,0.009638
3,V388M-V388M-REP12,REP12,8.130517,5.686551,4.508648,1.239838,1.938980,0.013167
4,D222X-R261Q-REP12,REP12,7.098648,6.390191,4.523496,1.101674,1.945343,0.002593
...,...,...,...,...,...,...,...,...
112,R243X-R408W-EXP14,EXP14,3.409697,7.313220,5.010635,2.054027,2.359719,0.032863
113,WT-REP11,REP11,102.489746,5.730679,4.508451,1.257369,1.819037,0.005733
114,IVS10-E390G-REP11,REP11,10.094493,7.116501,4.905541,1.946253,2.338086,0.007796
115,IVS10-V388M-REP11,REP11,15.730187,5.744849,4.460699,1.351999,2.014435,0.018216


In [1]:
# feature.to_csv("Data/extracted_features_v1.csv", index=False)