# In [3]:
#%%writefile ../../src/data/data_utils.py
# %load ../../src/data/data_utils.py
# %%writefile ../../src/data/data_utils.py

"""
Author: Jim Clauwaert
Created in the scope of my PhD
"""

import numpy as np

def BinaryOneHotEncoder(Y_bool):
    """ Encode a sequence of boolean labels as a two-column one-hot array
    Y_bool: sequence (list, numpy array, pandas Series, ...)
        Labels to encode. An element counts as positive when it compares
        equal to ``True`` (so ``1`` is positive; ``2`` or ``None`` are not).

    OUTPUT
    -------
    hot_array: numpy array of int8, shape (len(Y_bool), 2)
        Column 1 is set for positive elements, column 0 for all others.
    """
    n_samples = len(Y_bool)

    # Iterate over the values instead of indexing with Y_bool[i]: integer
    # indexing is label-based on containers such as a pandas Series and
    # fails when the index is not 0..n-1 (e.g. after filtering rows).
    # The comparison is deliberately an equality test, not a truthiness test.
    is_positive = np.fromiter(
        (bool(label == True) for label in Y_bool),  # noqa: E712
        dtype=bool,
        count=n_samples,
    )

    hot_array = np.zeros([n_samples, 2], dtype=np.int8)
    # Row r gets its 1 in column 0 (negative) or column 1 (positive).
    hot_array[np.arange(n_samples), is_positive.astype(np.intp)] = 1

    return hot_array


def GetDataLocations(sigma):
    """ Helper function for quick access of ChIP-chip data
    Only builds the relative file paths; no file is opened here.

    sigma: string
        Sigma-factor for which data is loaded. Must match one of the known
        keys ("RPOD", "RPOS", "RNAP") exactly; the lookup is case-sensitive.

    OUTPUT
    -------
    data_ip: list of string
        Paths "thesis/sorted/<sigma>_EXP_<k>_635.extr", one per experiment.
    data_mock_ip: list of string
        Paths "thesis/sorted/<sigma>_EXP_<k>_532.extr", one per experiment.
        Both lists are empty when sigma is not a known key.
    """
    # Number of experiments available per data set.
    experiments = {"RPOD": 3, "RPOS": 3, "RNAP": 3}

    n_experiments = experiments.get(sigma)
    if n_experiments is None:
        return [], []

    # Experiment numbers in the file names are 1-based.
    numbers = range(1, n_experiments + 1)
    data_ip = ["thesis/sorted/{}_EXP_{}_635.extr".format(sigma, k) for k in numbers]
    data_mock_ip = ["thesis/sorted/{}_EXP_{}_532.extr".format(sigma, k) for k in numbers]

    return data_ip, data_mock_ip

def LoadValidationData():
    """ Load the five external validation sets from data/external
    Each CSV file is read with pandas; its "PROBE_SEQUENCE" column is passed
    to CreateImageFromSequences and its "PM" column is returned unchanged.

    OUTPUT
    -------
    tuple of 10 elements
        (X, Y) pairs flattened in this order: anderson, brewster, rand_mut,
        mod_mut, davis -- i.e. A_X, A_Y, B_X, B_Y, R_X, R_Y, M_X, M_Y, D_X, D_Y.
    """
    csv_paths = (
        "data/external/anderson_NN.csv",
        "data/external/brewster_NN.csv",
        "data/external/rand_mut_NN.csv",
        "data/external/mod_mut_NN.csv",
        "data/external/davis_NN.csv",
    )

    loaded = []
    for csv_path in csv_paths:
        table = pd.read_csv(csv_path)
        # X (encoded sequences) first, then Y (the PM column), per data set.
        loaded.append(CreateImageFromSequences(table["PROBE_SEQUENCE"]))
        loaded.append(table["PM"])

    return tuple(loaded)

def _NormalizeLogColumns(log_values, n_columns):
    """ Centre each of the first n_columns columns on its median and divide by its robust.mad """
    normalised = []
    for u in range(n_columns):
        column = log_values[:, u]
        normalised.append((column - np.median(column)) / robust.mad(column))
    # One row per column was collected above; transpose back to (probes, columns).
    return np.array(normalised).T


def TransformDataSimple(data_ip, data_mock_ip):
    """ Compute a per-probe mean fold value from IP and mock-IP files
    Every file is read with pandas.read_csv and its "PM" column is taken as
    the signal. Signals are log2-transformed, each file (column) is centred
    on its median and scaled by robust.mad, and the normalised mock signal
    is subtracted from the normalised IP signal. The differences are then
    averaged over the files.
    NOTE(review): `robust` is not imported in the visible part of this file;
    presumably it is statsmodels' `robust` module -- confirm.

    data_ip: list of string
        Paths of the IP files. The first file also provides the
        "PROBE_SEQUENCE" and "PROBE_ID" columns that are returned.
    data_mock_ip: list of string
        Paths of the mock-IP files. Only as many files as there are IP
        files are used in the normalisation.

    OUTPUT
    -------
    sequences_img:
        Result of CreateImageFromSequences applied to the probe sequences.
    mean_fold: numpy array
        Mean over the files of (normalised IP - normalised mock IP), one
        value per probe.
    sequences: numpy array
        "PROBE_SEQUENCE" column of the first IP file.
    IDs: numpy array
        "PROBE_ID" column of the first IP file.
    """
    ip_frames = [pd.read_csv(datafile) for datafile in data_ip]
    list_mock_ip = [pd.read_csv(datafile)["PM"].values for datafile in data_mock_ip]
    list_ip = [frame["PM"].values for frame in ip_frames]

    # Reuse the first IP frame for the probe metadata instead of reading the
    # same file from disk a second time.
    first_frame = ip_frames[0]
    sequences = first_frame["PROBE_SEQUENCE"].values
    IDs = first_frame["PROBE_ID"].values

    # Stack to (probes, files) and move to log2 scale.
    log_list_ip = np.log2(np.vstack(list_ip).T)
    log_mock_list_ip = np.log2(np.vstack(list_mock_ip).T)

    # The number of IP files drives the normalisation of both sets.
    n_files = np.shape(log_list_ip)[1]
    ip_norm = _NormalizeLogColumns(log_list_ip, n_files)
    mock_ip_norm = _NormalizeLogColumns(log_mock_list_ip, n_files)

    fold = ip_norm - mock_ip_norm
    mean_fold = np.mean(fold, axis=1)

    sequences_img = CreateImageFromSequences(sequences)

    return sequences_img, mean_fold, sequences, IDs