In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from Kd_estimate import *
from multiprocessing import Pool

# Replicate labels per (antibody, antigen) pair; the string is indexed by replicate number.
replicates = {
    ("9114", "H1"): "abc",
    ("9114", "H3"): "def",
    ("9114", "FluB"): "ghi",
    ("6261", "H1"): "abc",
    ("6261", "H9"): "xyz",
}

# Concentration labels, in the order they are used as array columns.
# The same labels appear in the raw-data file names.
ccs_12_7 = ['0', '12', '115', '11', '105', '10', '95', '9', '85', '8', '75', '7']
ccs_11_6 = ['0', '11', '105', '10', '95', '9', '85', '8', '75', '7', '65', '6']

# Which concentration series was used for each (antibody, antigen) pair.
all_concentrations = {
    ("9114", "H1"): ccs_12_7,
    ("9114", "H3"): ccs_11_6,
    ("9114", "FluB"): ccs_11_6,
    ("6261", "H1"): ccs_12_7,
    ("6261", "H9"): ccs_12_7,
}

# Numeric value associated with each concentration label ('0' means no antigen).
cct_float = {
    '0': 0,
    '12': 10 ** -12.05,
    '115': 10 ** -11.55,
    '11': 10 ** -11.05,
    '105': 10 ** -10.55,
    '10': 10 ** -10.05,
    '95': 10 ** -9.55,
    '9': 10 ** -9.05,
    '85': 10 ** -8.55,
    '8': 10 ** -8.05,
    '75': 10 ** -7.55,
    '7': 10 ** -7.05,
    '65': 10 ** -6.55,
    '6': 10 ** -6.05,
}

# Default number of sorting bins.
B = 4

def load_counts(r, c, b, antibody):
    """Load the sequencing counts of one (concentration, bin) sample.

    The file is read as two whitespace-separated columns without a header:
    the sequence (kept as a string) and its count (int64).

    Parameters
    ----------
    r : str
        Replicate label.
    c : str
        Concentration label, as used in the file name.
    b : str
        Bin label, as used in the file name.
    antibody : str
        Antibody name; only used in the file name.

    Returns
    -------
    dict
        Mapping sequence -> count.
    """
    # NOTE(review): the directory is hard-coded to "9114{r}" while the file name
    # uses `antibody`; confirm the on-disk layout before loading another antibody.
    path = f"../Raw_data/seq_counts/Binding/9114{r}/counts_{antibody}_{r}_{c}_{b}.txt"
    # Raw string: "\s" is not a valid escape sequence in a normal string literal.
    counts = pd.read_csv(path, names=["seq", "cnt"], sep=r"\s+",
                         dtype={"seq": str, "cnt": np.int64})
    return counts.set_index("seq")["cnt"].to_dict()

def load_bins_borders(r, c, antibody):
    """Return the three borders between the four sorting bins (fluorescence coordinates).

    For every bin the lowest and highest fluorescence are collected. The border
    between two neighbouring bins is the midpoint between the upper edge of the
    lower bin and the lower edge of the upper bin.

    Parameters
    ----------
    r : str
        Replicate label.
    c : str
        Concentration label.
    antibody : str
        "6261" (edges read from a summary table) or "9114" (edges computed from
        the per-bin gate-event exports).

    Returns
    -------
    list
        Three border values, ordered from the bin 1/2 border to the bin 3/4 border.

    Raises
    ------
    ValueError
        If `antibody` is neither "6261" nor "9114".
    """
    bin_labels = "1234"
    edges = []  # flat list: [min_1, max_1, min_2, max_2, ...]
    if antibody == "6261":
        # NOTE(review): the mean/std loaders in this notebook read
        # "6261/Fluorescent_data.csv"; confirm this different file name is intended.
        summary = pd.read_csv("6261/Fluorescent_data_6261.csv").set_index("Sample")
        for b in bin_labels:
            row = summary.loc[f'6261_{r}_{c}_{b}']
            edges += [row.Minimum_A, row.Maximum_A]
    elif antibody == "9114":
        columns = ["FSC-A", "FSC-H", "FSC-W", "SSC-A",
                   "SSC-H", "SSC-W", "Comp-FITC-A", "Comp-PE-A", "Time"]
        for b in bin_labels:
            events = pd.read_csv(
                f"../Raw_data/gate_events/9114{r}/export_Sorted_9114{r}_{c}_PE{b}.csv",
                names=columns)
            pe = events["Comp-PE-A"]
            edges += [pe.min(), pe.max()]
    else:
        # Previously this fell through and failed with an unrelated IndexError.
        raise ValueError(f"unsupported antibody {antibody!r}; expected '6261' or '9114'")
    # Border i sits between max of bin i (odd index) and min of bin i+1 (next even index).
    return [np.mean([edges[2 * i + 1], edges[2 * i + 2]])
            for i in range(len(bin_labels) - 1)]


def _read_gate_pe(r, c, b):
    """Return the "Comp-PE-A" column of the 9114 gate-event export for (replicate, concentration, bin)."""
    events = pd.read_csv(
        f"../Raw_data/gate_events/9114{r}/export_Sorted_9114{r}_{c}_PE{b}.csv",
        names=["FSC-A", "FSC-H", "FSC-W", "SSC-A",
               "SSC-H", "SSC-W", "Comp-FITC-A", "Comp-PE-A", "Time"])
    return events["Comp-PE-A"]


def _log_shifted_fluorescence(r, c, concentrations):
    """Return, for each of the four bins at concentration `c`, log(f - minf + 1).

    `minf` is the smallest "Comp-PE-A" value found over every bin of every
    concentration in `concentrations`, so the argument of the log is >= 1.
    """
    minf = np.inf  # min fluo over all bins and all concentrations
    for cc in concentrations:
        for b in "1234":
            minf = min(_read_gate_pe(r, cc, b).min(), minf)
    # Vectorised log over the whole column instead of a per-element lambda.
    return [np.log(_read_gate_pe(r, c, b) - minf + 1) for b in "1234"]


def _summary_column_6261(r, c, column):
    """Return `column` of the 6261 summary table for the four bins of (replicate, concentration)."""
    summary = pd.read_csv("6261/Fluorescent_data.csv").set_index("Sample")
    return [summary.loc[f'6261_{r}_{c}_{b}'][column] for b in "1234"]


def load_bins_meanlogs(r, c, antibody, concentrations):
    """Return, for each of the four bins, the mean of log(fluorescence - min(fluorescence) + 1).

    Parameters
    ----------
    r : str
        Replicate label.
    c : str
        Concentration label of the sample to summarise.
    antibody : str
        "9114": computed from the gate-event exports, with the minimum taken over
        all bins of all `concentrations`.
        "6261": the stored `Mean_A` values are returned as-is
        (presumably already on the same scale; TODO confirm).
    concentrations : sequence of str
        All concentration labels of the replicate (only used for "9114").

    Returns
    -------
    list
        One value per bin; an empty list for any other antibody.
    """
    if antibody == "6261":
        return _summary_column_6261(r, c, "Mean_A")
    if antibody == "9114":
        return [logs.mean() for logs in _log_shifted_fluorescence(r, c, concentrations)]
    return []


def load_bins_stdlogs(r, c, antibody, concentrations):
    """Return, for each of the four bins, the standard deviation of log(fluorescence - min(fluorescence) + 1).

    Same inputs and data sources as `load_bins_meanlogs`. For "9114" the pandas
    sample standard deviation of the shifted log-fluorescence is returned; for
    "6261" the stored `Std_A` values are returned as-is
    (presumably already on the same scale; TODO confirm).

    Returns
    -------
    list
        One value per bin; an empty list for any other antibody.
    """
    if antibody == "6261":
        return _summary_column_6261(r, c, "Std_A")
    if antibody == "9114":
        return [logs.std() for logs in _log_shifted_fluorescence(r, c, concentrations)]
    return []

def load_cell_counts(r, antibody, bins, concentrations):
    """Build the (len(bins), len(concentrations)) int64 array of sorted-cell counts.

    The counts come from the replicate's `*_cellct.csv` table, whose rows are
    indexed by the "concen" column (keys of the form "_<concentration>_") and
    whose columns are the bin labels.
    """
    table = pd.read_csv(
        f"../Raw_data/cell_counts/Binding/{antibody}{r}_cellct.csv"
    ).set_index("concen")
    n_rows, n_cols = len(bins), len(concentrations)
    counts = np.zeros((n_rows, n_cols), dtype=np.int64)
    for row, bin_label in enumerate(bins):
        for col, conc_label in enumerate(concentrations):
            row_key = "_" + conc_label + "_"
            counts[row, col] = table.loc[row_key, bin_label]
    return counts

def from_np_array(array_string):
    """Parse the printed form of a numeric NumPy array back into an array.

    Accepts text such as "[ 1.  2. nan]" or "[[1 2]\n [3 4]]" (whitespace-separated
    entries inside brackets). `nan` and `inf` tokens are understood.

    Parameters
    ----------
    array_string : str
        Text representation of the array.

    Returns
    -------
    numpy.ndarray
    """
    # Join the whitespace-separated tokens with commas to obtain list-literal syntax.
    literal = ','.join(array_string.split())
    # Padding right after an opening bracket leaves a stray "[,": drop that comma.
    literal = literal.replace('[,', '[')
    # SECURITY: eval() evaluates arbitrary expressions, so only feed it text from
    # trusted files. Builtins are removed and only the names a printed array can
    # contain are exposed, which narrows (but does not eliminate) the risk.
    namespace = {"__builtins__": {}, "nan": np.nan, "inf": np.inf}
    return np.array(eval(literal, namespace))

## Choice of antigen/replicate

In [None]:
## choice #######
antigen = "H1"
replicate_nb = 2
#################

antibody = "9114"

# Look up the replicate label and the concentration series of the chosen experiment.
rep = replicates[(antibody, antigen)][replicate_nb]
concentrations = all_concentrations[(antibody, antigen)]
concentrations_num = np.array([cct_float[label] for label in concentrations])
bins = list("1234")

# Dimensions of the data arrays built in the following cells.
B = len(bins)            # number of bins
C = len(concentrations)  # number of concentrations
S = 2 ** {'9114': 16, '6261': 11}[antibody]  # number of variants

# Every variant is identified by its zero-padded binary string
# (11 characters for 6261, 16 otherwise).
if antibody == "6261":
    sequences = [format(s, "011b") for s in range(2 ** 11)]
else:
    sequences = [format(s, "016b") for s in range(2 ** 16)]

## Fetch the data

In [None]:
# Read counts indexed by (bin, concentration, variant); variants absent from a file count 0.
Rbcs = np.zeros((B, C, S))
# Borders between consecutive bins, one column per concentration.
bin_separations = np.zeros((B-1, C))
# tqdm wraps the list itself (not the enumerate object) so it knows the total
# and can display a real progress bar.
for ic, c in enumerate(tqdm(concentrations)):
    for ib, b in enumerate(bins):
        cnt = load_counts(rep, c, b, antibody)
        Rbcs[ib, ic, :] = [cnt.get(s, 0) for s in sequences]
    bin_separations[:, ic] = load_bins_borders(rep, c, antibody)

# Sorted-cell counts per (bin, concentration).
cell_counts = load_cell_counts(rep, antibody, bins, concentrations)

# Per-bin mean and standard deviation of the log-fluorescence, one column per concentration.
mean_log_fluorescence_bin = np.zeros((B, C))
std_log_fluorescence_bin = np.zeros((B, C))
for ic, c in enumerate(tqdm(concentrations)):
    mean_log_fluorescence_bin[:, ic] = load_bins_meanlogs(rep, c, antibody, concentrations)
    std_log_fluorescence_bin[:, ic] = load_bins_stdlogs(rep, c, antibody, concentrations)

# Total reads per (bin, concentration), summed over all variants.
tot_Rbc = np.sum(Rbcs, axis=2)

## Likelihood estimate

In [None]:
def _fun(s):
    """Run the Kd fit for variant index `s`.

    Passes the variant's (bin, concentration) count slice together with the
    shared arrays built in the previous cells to `Kd_gaussian_estimate`.
    """
    return Kd_gaussian_estimate(Rbcs[:, :, s], bin_separations,
                                concentrations_num, cell_counts,
                                tot_Rbc, mean_log_fluorescence_bin,
                                std_log_fluorescence_bin)

# Tasks are handed to the workers in batches rather than one at a time (the
# default), which cuts inter-process overhead for the tens of thousands of
# variants. 512 = 32 workers x 16 batches each; imap still yields in input order.
with Pool(32) as p:
    res = list(tqdm(p.imap(_fun, range(S), chunksize=max(1, S // 512)), total=S))

# save the result
μ = np.full((S, C), np.nan)
μerr = np.full((S, C), np.nan)
σ = np.full((S, C), np.nan)
logKd = np.full(S, np.nan)
Acoef = np.full(S, np.nan)
Bcoef = np.full(S, np.nan)
err = np.full(S, np.nan)
logKderr = np.full(S, np.nan)

# Each result is an 8-tuple; scatter its items into the per-variant arrays.
for s in tqdm(range(S)):
    (μ[s, :], μerr[s, :], σ[s, :], logKd[s],
     Acoef[s], Bcoef[s], err[s], logKderr[s]) = res[s]

# Dividing by ln(10) converts to base 10
# (presumably the fit returns natural logarithms; TODO confirm).
df = pd.DataFrame({
    "sequence": sequences,
    "log10_Kd": logKd/np.log(10),
    "A": Acoef,
    "B": Bcoef,
    "error_fit": err,
    "ste_log10Kd": logKderr/np.log(10),
})

df.to_csv(f"Kd_inference_{antibody}_{rep}_nova3.csv", index=False)