In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import pickle

In [None]:
from workspace import nometools as nome
from workspace import utils

In [None]:
main_path = Path('..')
steric_path = main_path / 'superposition' / 'clash_1KX5'
intersect_path = main_path / 'Data' / 'intersect_regions'
sliding_path = main_path / 'Data' / 'sliding_window_1kx5'

#### Normalize Steric Clash

In [None]:
# Load the pickled clash dictionary, normalise it with the project helper, and
# build the list of "c<clash>m<meth>" threshold names used by every later cell.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
infile = steric_path / 'x_y_dict'
with infile.open('rb') as fin:
    x_y_clash_dict = pickle.load(fin)

x_y_clash_dict_norm_ = utils.normalize_clash_dict(x_y_clash_dict)

clash_thres_range = [5, 10, 20, 50]
meth_thres_range = [0, 10, 20]

# Clash threshold varies slowest: c5m0, c5m10, c5m20, c10m0, ...
params = [
    f"c{clash_thres}m{meth_thres}"
    for clash_thres in clash_thres_range
    for meth_thres in meth_thres_range
]

In [None]:
# Overlay the raw and the normalised clash values (in dict order) so the effect
# of the normalisation can be inspected; a legend and axis labels tell the two
# curves apart.
plt.figure(figsize=(15,5))
plt.plot(list(x_y_clash_dict.values()), marker='o', label='raw')
plt.plot(list(x_y_clash_dict_norm_.values()), marker='o', label='normalized')
plt.xticks(range(0, len(x_y_clash_dict) + 5, 5))
plt.xlabel('Entry index (dict order)')
plt.ylabel('Clash value')
plt.legend()
plt.show()

In [None]:
x_y_clash_dict

In [None]:
len(x_y_clash_dict.keys()), len(x_y_clash_dict_norm_.keys())

#### Sliding Window - This remains the same

In [None]:
# region = 'promoter'
# region = 'intron.1.start'
# region = 'intron.1.end'
# region = 'intron.2.start'

In [None]:
# infile = intersect_path / f'{region}.NDR.HCG.intersect.bed'
# df_NDR = nome.get_nuc_pos_methylation(infile, region=region)

# infile = intersect_path / f'{region}.NDR.HCG.random.intersect.bed'
# df_NDR_random = nome.get_nuc_pos_methylation(infile, region=region)

In [None]:
# df_NDR.shape, df_NDR_random.shape

In [None]:
# infile = intersect_path / f'{region}.NOR.HCG.intersect.bed'
# df_NOR = nome.get_nuc_pos_methylation(infile, region=region)

# infile = intersect_path / f'{region}.NOR.HCG.random.intersect.bed'
# df_NOR_random = nome.get_nuc_pos_methylation(infile, region=region)

In [None]:
# df_NOR.shape, df_NOR_random.shape

In [None]:
# df_NOR.head()

In [None]:
# df_NDR.head()

In [None]:
# df_NDR_sliding_windows = nome.make_sliding_windows_file(df_NDR, x_y_clash_dict, mask=True, region=region)
# df_NDR_sliding_windows.to_csv(sliding_path / f'{region}.df_NDR_sliding_windows.csv', index=False)

In [None]:
# df_NDR_sliding_windows_random = nome.make_sliding_windows_file(df_NDR_random, x_y_clash_dict, region=region)
# df_NDR_sliding_windows_random.to_csv(sliding_path / f'{region}.df_NDR_sliding_windows_random.csv', index=False)

In [None]:
# df_NDR_sliding_windows.shape, df_NDR_sliding_windows_random.shape

In [None]:
# df_NOR_sliding_windows = nome.make_sliding_windows_file(df_NOR, x_y_clash_dict, region=region)
# df_NOR_sliding_windows.to_csv(sliding_path / f'{region}.df_NOR_sliding_windows.csv', index=False)

In [None]:
# df_NOR_sliding_windows_random = nome.make_sliding_windows_file(df_NOR_random, x_y_clash_dict, region=region)
# df_NOR_sliding_windows_random.to_csv(sliding_path / f'{region}.df_NOR_sliding_windows_random.csv', index=False)

In [None]:
# df_NOR_sliding_windows.shape, df_NOR_sliding_windows_random.shape

In [None]:
# df_NDR_sliding_windows.head()

In [None]:
# df_NOR_sliding_windows.head()

#### Calculate Steric Clash

In [None]:
import re
from tqdm import tqdm

def calc_perc_exp_clash_ident(x_y_clash_dict_norm, meth_in_window_tmp, meth_thres, clash_thres):
    """Return the fraction of CpG positions whose methylation agrees with the clash expectation.

    A position counts as a match when either
      * its methylation rate is above ``meth_thres`` and its clash value is at
        most ``clash_thres``, or
      * its methylation rate is at most ``meth_thres`` and its clash value is
        above ``clash_thres``.

    Parameters
    ----------
    x_y_clash_dict_norm : mapping
        Clash value per position; must contain every key of ``meth_in_window_tmp``.
    meth_in_window_tmp : dict
        Methylation rate per CpG position of one window.
    meth_thres, clash_thres : number
        Thresholds for the two comparisons above.

    Returns
    -------
    float
        Number of matching positions divided by the number of positions.

    Raises
    ------
    KeyError
        If a position is missing from ``x_y_clash_dict_norm``.
    ZeroDivisionError
        If ``meth_in_window_tmp`` is empty.
    """
    matches = 0
    for position, rate in meth_in_window_tmp.items():
        clash = x_y_clash_dict_norm[position]
        # Methylated sites are expected to be clash-free, unmethylated ones to clash.
        agrees = (clash <= clash_thres) if rate > meth_thres else (clash > clash_thres)
        if agrees:
            matches += 1

    return float(matches) / float(len(meth_in_window_tmp))

def calc_score_lists(df_sliding_windows, x_y_clash_dict_norm, params):
    """Score every sliding window against every threshold combination.

    Parameters
    ----------
    df_sliding_windows : pandas.DataFrame
        One row per window. Must provide the columns ``refid``, ``NOR_nbr``,
        ``trans_id``, ``window_nbr``, ``nbr_meth_CpGs``, ``nuc_region_length``,
        ``nuc_rel_center`` and ``meth_rates_window`` (a dict of position ->
        methylation rate per row).
    x_y_clash_dict_norm : mapping
        Clash value per position, forwarded to ``calc_perc_exp_clash_ident``.
    params : sequence of str
        Threshold names such as ``"c5m0"``: the first number is the clash
        threshold, the second the methylation threshold.

    Returns
    -------
    pandas.DataFrame
        The window metadata (``nbr_meth_CpGs`` is exposed as ``nbr_CpGs``, and
        ``refid_NOR`` is ``"<refid>-<NOR_nbr>"``) followed by one score column
        per entry of ``params``.
    """
    column_names = ["refid_NOR", "trans_id", "refid", "NOR_nbr", "window_nbr", "nbr_CpGs",
                    "nuc_rel_center", "nuc_region_length"] + list(params)

    # Plain lists (not Series) so the result does not depend on the input index.
    all_refids = list(df_sliding_windows["refid"])
    all_NOR_nbrs = list(df_sliding_windows["NOR_nbr"])
    info_dict = {
        "refid_NOR": [str(ref) + "-" + str(nor) for ref, nor in zip(all_refids, all_NOR_nbrs)],
        "trans_id": list(df_sliding_windows["trans_id"]),
        "refid": all_refids,
        "NOR_nbr": all_NOR_nbrs,
        "window_nbr": list(df_sliding_windows["window_nbr"]),
        "nbr_CpGs": list(df_sliding_windows["nbr_meth_CpGs"]),
        "nuc_rel_center": list(df_sliding_windows["nuc_rel_center"]),
        "nuc_region_length": list(df_sliding_windows["nuc_region_length"]),
    }
    for param_str in params:
        info_dict[param_str] = []

    # Parse each threshold name once instead of once per window.
    thresholds = []
    for param_str in params:
        numbers = re.findall(r'\d+', param_str)  # "c5m0" -> ["5", "0"]
        clash_thres = float(numbers[0])
        meth_thres = float(numbers[1])
        thresholds.append((param_str, meth_thres, clash_thres))

    all_windows = list(df_sliding_windows["meth_rates_window"])  # e.g. {34: 0.0, 35: 0.0, ...}
    for meth_rates_window in tqdm(all_windows):
        for param_str, meth_thres, clash_thres in thresholds:
            perc_clash_ident = calc_perc_exp_clash_ident(
                x_y_clash_dict_norm, meth_rates_window, meth_thres, clash_thres
            )
            info_dict[param_str].append(perc_clash_ident)

    # Build the frame in one step, in the documented column order.
    return pd.DataFrame(info_dict, columns=column_names)

In [None]:
# region = 'promoter'
# region = 'intron.1.start'
# region = 'intron.1.end'
region = 'intron.2.start'

In [None]:
import ast

In [None]:
# Read the four pre-computed sliding-window tables for the selected region.
# NOTE(review): this reads from Data/sliding_window, not from `sliding_path`
# (Data/sliding_window_1kx5) — confirm that is intentional.
read_path = main_path / 'Data' / 'sliding_window'
(
    df_NDR_sliding_windows,
    df_NDR_sliding_windows_random,
    df_NOR_sliding_windows,
    df_NOR_sliding_windows_random,
) = (
    pd.read_csv(read_path / f'{region}.{stem}.csv')
    for stem in (
        'df_NDR_sliding_windows',
        'df_NDR_sliding_windows_random',
        'df_NOR_sliding_windows',
        'df_NOR_sliding_windows_random',
    )
)

In [None]:
# The CSV round trip stores each window's {position: rate} dict as a string;
# turn it back into a dict. Already-parsed cells are left alone so that
# re-running this cell does not raise from ast.literal_eval.
for _frame in (
    df_NDR_sliding_windows,
    df_NDR_sliding_windows_random,
    df_NOR_sliding_windows,
    df_NOR_sliding_windows_random,
):
    _frame['meth_rates_window'] = _frame['meth_rates_window'].apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )

In [None]:
df_NDR_sliding_windows.head()

In [None]:
df_NOR_sliding_windows.head()

In [None]:
# NOTE(review): the raw `x_y_clash_dict` is passed here (and in the three
# calc_score_lists calls below), although the parameter is named
# `x_y_clash_dict_norm` and `x_y_clash_dict_norm_` is only plotted, never used
# for scoring — confirm which dictionary is intended.
df_NDR_score_exp = calc_score_lists(df_NDR_sliding_windows, x_y_clash_dict, params)

In [None]:
df_NDR_score_random = calc_score_lists(df_NDR_sliding_windows_random, x_y_clash_dict, params)

In [None]:
df_NDR_score_exp.shape, df_NDR_score_random.shape

In [None]:
df_NOR_score_exp = calc_score_lists(df_NOR_sliding_windows, x_y_clash_dict, params)

In [None]:
df_NOR_score_random = calc_score_lists(df_NOR_sliding_windows_random, x_y_clash_dict, params)

In [None]:
df_NOR_score_exp.shape, df_NOR_score_random.shape

In [None]:
df_NDR_score_exp.head()

In [None]:
# Persist the four score tables as <region>.<name>.csv inside `sliding_path`.
for stem, scores in (
    ('df_NDR_score_exp', df_NDR_score_exp),
    ('df_NDR_score_random', df_NDR_score_random),
    ('df_NOR_score_exp', df_NOR_score_exp),
    ('df_NOR_score_random', df_NOR_score_random),
):
    scores.to_csv(sliding_path / f'{region}.{stem}.csv', index=False)

#### Cohen's D and P values

In [None]:
import scipy
from scipy import stats

def calculate_cohens_d(list_EXP,list_RAND):
    """Return Cohen's d effect size of ``list_EXP`` relative to ``list_RAND``.

    The difference of the two means is divided by the root of the averaged
    variances. Standard deviations come from ``np.std`` with its default
    ``ddof=0``. A constant 1e-6 is added to the denominator so that two
    zero-variance samples do not divide by zero.
    """
    mean_diff = np.mean(list_EXP) - np.mean(list_RAND)

    sd_exp = np.std(list_EXP)
    sd_rand = np.std(list_RAND)
    pooled_sd = np.sqrt(float(sd_exp**2 + sd_rand**2)/2.0)

    return float(mean_diff)/float(pooled_sd + 1e-6)

def make_df_p_vals_cohens_d(df_scores_EXP, df_scores_RAND, params):
    """Compare experimental and random scores per CpG count and parameter.

    For every distinct ``nbr_CpGs`` value found in ``df_scores_EXP`` and every
    name in ``params``, the rows of both tables with that CpG count are
    selected and summarised.

    Parameters
    ----------
    df_scores_EXP, df_scores_RAND : pandas.DataFrame
        Score tables with an ``nbr_CpGs`` column and one column per entry of
        ``params``.
    params : sequence of str
        Names of the score columns to compare.

    Returns
    -------
    pandas.DataFrame
        One row per (nbr_CpGs, parameter), ordered by increasing ``nbr_CpGs``
        and then by the order of ``params``. ``is_normal_*`` holds the
        ``scipy.stats.normaltest`` p-value of the respective sample, or -1
        when that sample has fewer than 8 values. ``pval_ttest`` is the
        two-sided Welch t-test p-value divided by two.
    """
    column_names = ["nbr_CpGs", "parameter", "N_EXP", "N_RAND", "mean_EXP", "median_EXP", "std_EXP", "mean_RAND",
                    "median_RAND", "std_RAND", "cohens_d", "is_normal_EXP", "is_normal_RAND", "pval_ttest",
                    "t_stat", "pval_ranksums", "pval_ks_2samp"]
    min_n_normaltest = 8  # normaltest is only run on samples with at least 8 values

    records = []
    # Sorted so that rows (and any line plotted from them) run in increasing CpG count.
    for nbr_CpGs in sorted(set(df_scores_EXP["nbr_CpGs"])):
        df_scores_EXP_tmp = df_scores_EXP.loc[df_scores_EXP["nbr_CpGs"] == nbr_CpGs]
        df_scores_RAND_tmp = df_scores_RAND.loc[df_scores_RAND["nbr_CpGs"] == nbr_CpGs]

        for par_name in params:
            EXP_scores = list(df_scores_EXP_tmp[par_name])
            RAND_scores = list(df_scores_RAND_tmp[par_name])

            # Normality: a small p-value means the sample is most likely not
            # normally distributed. Each sample is guarded by its OWN size.
            if len(EXP_scores) >= min_n_normaltest:
                pval_normal_EXP = stats.normaltest(EXP_scores)[1]
            else:
                pval_normal_EXP = -1
            if len(RAND_scores) >= min_n_normaltest:
                pval_normal_RAND = stats.normaltest(RAND_scores)[1]
            else:
                pval_normal_RAND = -1

            # Welch's t-test (equal_var=False). Halving the two-sided p-value
            # yields a one-sided p-value only for the direction given by the
            # sign of t_stat. NOTE(review): confirm this is the intended test.
            ttest_res = stats.ttest_ind(EXP_scores, RAND_scores, equal_var=False)

            records.append({
                "nbr_CpGs": nbr_CpGs,
                "parameter": par_name,
                "N_EXP": len(EXP_scores),
                "N_RAND": len(RAND_scores),
                "mean_EXP": np.mean(EXP_scores),
                "median_EXP": np.median(EXP_scores),
                "std_EXP": np.std(EXP_scores),
                "mean_RAND": np.mean(RAND_scores),
                "median_RAND": np.median(RAND_scores),
                "std_RAND": np.std(RAND_scores),
                "cohens_d": calculate_cohens_d(EXP_scores, RAND_scores),
                "is_normal_EXP": pval_normal_EXP,
                "is_normal_RAND": pval_normal_RAND,
                "pval_ttest": float(ttest_res[1])/2.0,
                "t_stat": ttest_res[0],
                "pval_ranksums": stats.ranksums(EXP_scores, RAND_scores)[1],
                "pval_ks_2samp": stats.ks_2samp(EXP_scores, RAND_scores)[1],
            })

    # Build the frame once from the collected rows.
    return pd.DataFrame(records, columns=column_names)

In [None]:
# region = 'promoter'
# region = 'intron.1.start'
# region = 'intron.1.end'
region = 'intron.2.start'

In [None]:
# Reload the four score tables written earlier for the selected region.
(
    df_NDR_score_exp,
    df_NDR_score_random,
    df_NOR_score_exp,
    df_NOR_score_random,
) = (
    pd.read_csv(sliding_path / f'{region}.{stem}.csv')
    for stem in (
        'df_NDR_score_exp',
        'df_NDR_score_random',
        'df_NOR_score_exp',
        'df_NOR_score_random',
    )
)

In [None]:
df_NDR_pvalues = make_df_p_vals_cohens_d(df_NDR_score_exp, df_NDR_score_random, params)

In [None]:
df_NOR_pvalues = make_df_p_vals_cohens_d(df_NOR_score_exp, df_NOR_score_random, params)

In [None]:
df_NDR_pvalues.head()

In [None]:
# Sample size (N_EXP) per CpG count for the NDR table, one scatter series per
# parameter. N_EXP is the number of experimental rows with that CpG count, so
# it does not depend on the parameter and the series overlap.
plt.figure(figsize=(15,7))
groups = df_NDR_pvalues.groupby(by='parameter')
for par, df_tmp in groups:
    plt.scatter(df_tmp['nbr_CpGs'], df_tmp['N_EXP'], label=par)

# plt.axhline(y=50)
plt.xlim(-2,30)
# plt.ylim((10**1,10**6))
# plt.legend()
plt.xlabel('Number of CpGs in sliding window', fontsize=22)
plt.ylabel('Sample size for Cohen’s D', fontsize=22)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
# Sample sizes are shown on a logarithmic y axis.
plt.yscale('log')

In [None]:
# Sample size (N_EXP) per CpG count for the NOR table, one scatter series per
# parameter. N_EXP is the number of experimental rows with that CpG count, so
# it does not depend on the parameter and the series overlap.
plt.figure(figsize=(15,7))
groups = df_NOR_pvalues.groupby(by='parameter')
for par, df_tmp in groups:
    plt.scatter(df_tmp['nbr_CpGs'], df_tmp['N_EXP'], label=par)
plt.xlim(-2,30)
# plt.legend()
plt.xlabel('Number of CpGs in sliding window', fontsize=22)
plt.ylabel('Sample size for Cohen’s D', fontsize=22)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
# Sample sizes are shown on a logarithmic y axis.
plt.yscale('log')

In [None]:
def plot_nbr_CpGs_cohensd(df_values, params):
    """Plot Cohen's d against the number of CpGs, one line per parameter.

    Parameters
    ----------
    df_values : pandas.DataFrame
        Table with the columns ``parameter``, ``nbr_CpGs`` and ``cohens_d``.
    params : sequence of str
        Parameter names such as ``"c5m0"``. The part before ``"m"`` selects
        the marker, the part from ``"m"`` on selects the colour. Names outside
        the known set get a default marker and matplotlib's default colour
        instead of raising or reusing the previous line's style.
    """
    # Exact-key lookups: substring tests would let "c5" also match "c50...".
    meth_colors = {"m0": "#117A65", "m10": "#45B39D", "m20": "#EB984E"}
    clash_markers = {"c5": ("*", 15), "c10": ("^", 10), "c20": ("s", 10), "c50": ("o", 10)}

    plt.figure(figsize=(15,7))
    ax = plt.subplot(1,1,1)

    for par_name in params:
        # Sort by CpG count so the connecting line does not zig-zag.
        df_values_tmp = df_values.loc[df_values["parameter"] == par_name].sort_values("nbr_CpGs")

        x_nbr_cpg_vals = list(df_values_tmp["nbr_CpGs"])
        y_cohens_d_vals = list(df_values_tmp["cohens_d"])

        clash_key, sep, meth_suffix = par_name.partition("m")  # "c5m0" -> "c5", "m", "0"
        color = meth_colors.get(sep + meth_suffix)  # None -> matplotlib colour cycle
        marker, markersize = clash_markers.get(clash_key, ("o", 10))

        plt.plot(x_nbr_cpg_vals, y_cohens_d_vals, linestyle="-", color=color, marker=marker,
                 markersize=markersize, label=par_name)

    # Reference lines at the conventional small / medium / large effect sizes.
    e = 0.02
    for level, text in ((0.2, "Small ES"), (0.5, "Medium ES"), (0.8, "Large ES")):
        plt.axhline(y=level, linewidth=1, color='#2C3E50', linestyle='--')
        ax.text(-1.8, level + e, text)

    ax.set_ylabel("Cohen's D", fontsize=22)
    ax.set_xlabel("Number of CpGs in sliding window", fontsize=22)

    plt.xlim(-2,30)
    plt.ylim(-5,5)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)

    legend = ax.legend(loc="lower left", ncol=4, frameon=True, prop={'size':18})
    legend.get_frame().set_facecolor('white')

In [None]:
plot_nbr_CpGs_cohensd(df_NDR_pvalues, params)

In [None]:
plot_nbr_CpGs_cohensd(df_NOR_pvalues, params)

In [None]:
def plot_pvals_cohensd(df_values,params):
    """Plot -log10 of the rank-sum p-value against the number of CpGs.

    Parameters
    ----------
    df_values : pandas.DataFrame
        Table with the columns ``parameter``, ``nbr_CpGs`` and
        ``pval_ranksums``.
    params : sequence of str
        Parameter names such as ``"c5m0"``. The part before ``"m"`` selects
        the marker, the part from ``"m"`` on selects the colour. Names outside
        the known set get a default marker and matplotlib's default colour
        instead of raising or reusing the previous line's style.
    """
    # Exact-key lookups: substring tests would let "c5" also match "c50...".
    meth_colors = {"m0": "#117A65", "m10": "#45B39D", "m20": "#EB984E"}
    clash_markers = {"c5": ("*", 15), "c10": ("^", 10), "c20": ("s", 10), "c50": ("o", 10)}

    plt.figure(figsize=(15,7))
    ax = plt.subplot(1,1,1)

    for par_name in params:
        # Sort by CpG count so the connecting line does not zig-zag.
        df_values_tmp = df_values.loc[df_values["parameter"] == par_name].sort_values("nbr_CpGs")

        x_nbr_cpg_vals = list(df_values_tmp["nbr_CpGs"])
        y_pvals_vals = list(df_values_tmp["pval_ranksums"])

        # A p-value of exactly 0 would give an infinite -log10; such points
        # are pinned to a fixed 310 so they stay on the plot.
        y_pvals_log = [-np.log10(p_val) if p_val != 0.0 else 310 for p_val in y_pvals_vals]

        clash_key, sep, meth_suffix = par_name.partition("m")  # "c5m0" -> "c5", "m", "0"
        color = meth_colors.get(sep + meth_suffix)  # None -> matplotlib colour cycle
        marker, markersize = clash_markers.get(clash_key, ("o", 10))

        plt.plot(x_nbr_cpg_vals, y_pvals_log, linestyle="-", color=color, marker=marker,
                 markersize=markersize, label=par_name)

    #plt.axhline(y=-np.log10(0.05), linewidth=1, color = '#5D6D7E',linestyle='-')
    #plt.axhline(y=-np.log10(0.01), linewidth=1, color = '#515A5A',linestyle='-')

    ax.set_ylabel("-log10(P-value)", fontsize=22)
    ax.set_xlabel("Number of CpGs in sliding window", fontsize=22)

    plt.xlim(-2, 30)
    plt.ylim(0, 450)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)

    legend = ax.legend(loc="upper right", ncol=4, frameon=True, prop={'size':18})
    legend.get_frame().set_facecolor('white')

In [None]:
plot_pvals_cohensd(df_NDR_pvalues, params)

In [None]:
plot_pvals_cohensd(df_NOR_pvalues, params)

#### Cohen's D Calculation Table

In [62]:
import scipy
from scipy import stats

def calculate_cohens_d(list_EXP,list_RAND):
    """Return Cohen's d effect size of ``list_EXP`` relative to ``list_RAND``.

    NOTE(review): this repeats the definition from the "Cohen's D and P
    values" section of this notebook; keep the two in sync or define it once.

    The mean difference is divided by the square root of the mean of the two
    variances (``np.std`` with its default ``ddof=0``), plus a constant 1e-6
    that keeps the division defined when both samples have zero variance.
    """
    sd_exp, sd_rand = np.std(list_EXP), np.std(list_RAND)
    denominator = np.sqrt(float(sd_exp**2 + sd_rand**2)/2.0)

    numerator = np.mean(list_EXP) - np.mean(list_RAND)
    return float(numerator)/float(denominator + 1e-6)

def make_df_cohens_d(df_scores_EXP, df_scores_RAND, params):
    """Compare experimental and random scores over windows with 10-20 CpGs.

    Rows whose ``nbr_CpGs`` lies between 10 and 20 (inclusive) are pooled,
    and each score column named in ``params`` is summarised.

    Parameters
    ----------
    df_scores_EXP, df_scores_RAND : pandas.DataFrame
        Score tables with an ``nbr_CpGs`` column and one column per entry of
        ``params``.
    params : sequence of str
        Names of the score columns to compare.

    Returns
    -------
    pandas.DataFrame
        One row per parameter, in the order of ``params``. ``is_normal_*``
        holds the ``scipy.stats.normaltest`` p-value of the respective sample,
        or -1 when that sample has fewer than 8 values. ``pval_ttest`` is the
        two-sided Welch t-test p-value divided by two.
    """
    column_names = ["parameter", "N_EXP", "N_RAND", "mean_EXP", "median_EXP", "std_EXP", "mean_RAND",
                    "median_RAND", "std_RAND", "cohens_d", "is_normal_EXP", "is_normal_RAND", "pval_ttest",
                    "t_stat", "pval_ranksums", "pval_ks_2samp"]
    min_n_normaltest = 8  # normaltest is only run on samples with at least 8 values

    df_scores_EXP_tmp = df_scores_EXP.loc[df_scores_EXP['nbr_CpGs'].between(10,20)]
    df_scores_RAND_tmp = df_scores_RAND.loc[df_scores_RAND['nbr_CpGs'].between(10,20)]

    records = []
    for par_name in params:
        EXP_scores = list(df_scores_EXP_tmp[par_name])
        RAND_scores = list(df_scores_RAND_tmp[par_name])

        # Normality: a small p-value means the sample is most likely not
        # normally distributed. Each sample is guarded by its OWN size.
        if len(EXP_scores) >= min_n_normaltest:
            pval_normal_EXP = stats.normaltest(EXP_scores)[1]
        else:
            pval_normal_EXP = -1
        if len(RAND_scores) >= min_n_normaltest:
            pval_normal_RAND = stats.normaltest(RAND_scores)[1]
        else:
            pval_normal_RAND = -1

        # Welch's t-test (equal_var=False). Halving the two-sided p-value
        # yields a one-sided p-value only for the direction given by the sign
        # of t_stat. NOTE(review): confirm this is the intended test.
        ttest_res = stats.ttest_ind(EXP_scores, RAND_scores, equal_var=False)

        records.append({
            "parameter": par_name,
            "N_EXP": len(EXP_scores),
            "N_RAND": len(RAND_scores),
            "mean_EXP": np.mean(EXP_scores),
            "median_EXP": np.median(EXP_scores),
            "std_EXP": np.std(EXP_scores),
            "mean_RAND": np.mean(RAND_scores),
            "median_RAND": np.median(RAND_scores),
            "std_RAND": np.std(RAND_scores),
            "cohens_d": calculate_cohens_d(EXP_scores, RAND_scores),
            "is_normal_EXP": pval_normal_EXP,
            "is_normal_RAND": pval_normal_RAND,
            "pval_ttest": float(ttest_res[1])/2.0,
            "t_stat": ttest_res[0],
            "pval_ranksums": stats.ranksums(EXP_scores, RAND_scores)[1],
            "pval_ks_2samp": stats.ks_2samp(EXP_scores, RAND_scores)[1],
        })

    # Build the frame once from the collected rows.
    return pd.DataFrame(records, columns=column_names)

In [63]:
# region = 'promoter'
# region = 'intron.1.start'
# region = 'intron.1.end'
# region = 'intron.2.start'

In [64]:
# df_NDR_score_exp = pd.read_csv(sliding_path / f'{region}.df_NDR_score_exp.csv')
# df_NDR_score_random = pd.read_csv(sliding_path / f'{region}.df_NDR_score_random.csv')
# df_NOR_score_exp = pd.read_csv(sliding_path / f'{region}.df_NOR_score_exp.csv')
# df_NOR_score_random = pd.read_csv(sliding_path / f'{region}.df_NOR_score_random.csv')

In [65]:
# df_NDR_pvalues = make_df_cohens_d(df_NDR_score_exp, df_NDR_score_random, params)

In [66]:
# df_NOR_pvalues = make_df_cohens_d(df_NOR_score_exp, df_NOR_score_random, params)

In [67]:
# df_NDR_pvalues

In [68]:
# df_NOR_pvalues

In [69]:
# For every region: load the four score tables, compute the pooled Cohen's d
# summary for the NDR and NOR sets, and keep the per-parameter effect sizes
# under the region's display label.
regions = ['promoter', 'intron.1.start', 'intron.1.end', 'intron.2.start']
labels = ['promoter', 'start of 1st intron', 'end of 1st intron', 'start of 2nd intron']
table = {}
for label, region in zip(labels, regions):
    print(region)
    (
        df_NDR_score_exp,
        df_NDR_score_random,
        df_NOR_score_exp,
        df_NOR_score_random,
    ) = (
        pd.read_csv(sliding_path / f'{region}.{stem}.csv')
        for stem in (
            'df_NDR_score_exp',
            'df_NDR_score_random',
            'df_NOR_score_exp',
            'df_NOR_score_random',
        )
    )

    df_NDR_pvalues = make_df_cohens_d(df_NDR_score_exp, df_NDR_score_random, params)
    df_NOR_pvalues = make_df_cohens_d(df_NOR_score_exp, df_NOR_score_random, params)

    # Both summaries must list the same parameters in the same order.
    assert len(df_NDR_pvalues) == len(df_NOR_pvalues)
    assert df_NDR_pvalues['parameter'].to_list() == df_NOR_pvalues['parameter'].to_list()

    # The NOR results fill the 'HNDRs' column, the NDR results the 'LNDRs' column.
    table[label] = dict(
        Parameter=df_NDR_pvalues['parameter'].to_list(),
        HNDRs=df_NOR_pvalues['cohens_d'].to_list(),
        LNDRs=df_NDR_pvalues['cohens_d'].to_list(),
    )

promoter
intron.1.start
intron.1.end
intron.2.start


In [70]:
# One row per region label with list-valued cells, then exploded so that every
# (region, parameter) pair becomes its own row. Exploding several columns at
# once needs pandas >= 1.3.
df_table = pd.DataFrame(table).T.explode(['Parameter', 'LNDRs', 'HNDRs'])

In [71]:
df_table

Unnamed: 0,Parameter,HNDRs,LNDRs
promoter,c5m0,0.844334,0.392026
promoter,c5m10,0.845001,0.390944
promoter,c5m20,0.864478,0.44387
promoter,c10m0,0.46329,0.216024
promoter,c10m10,0.463641,0.215351
promoter,c10m20,0.474219,0.243697
promoter,c20m0,-0.270655,-0.128855
promoter,c20m10,-0.270592,-0.128918
promoter,c20m20,-0.273856,-0.146388
promoter,c50m0,-1.395537,-0.652403


In [72]:
print(df_table.to_latex(float_format="{:.2f}".format))

\begin{tabular}{llll}
\toprule
{} & Parameter & HNDRs & LNDRs \\
\midrule
promoter            &      c5m0 &  0.84 &  0.39 \\
promoter            &     c5m10 &  0.85 &  0.39 \\
promoter            &     c5m20 &  0.86 &  0.44 \\
promoter            &     c10m0 &  0.46 &  0.22 \\
promoter            &    c10m10 &  0.46 &  0.22 \\
promoter            &    c10m20 &  0.47 &  0.24 \\
promoter            &     c20m0 & -0.27 & -0.13 \\
promoter            &    c20m10 & -0.27 & -0.13 \\
promoter            &    c20m20 & -0.27 & -0.15 \\
promoter            &     c50m0 & -1.40 & -0.65 \\
promoter            &    c50m10 & -1.40 & -0.65 \\
promoter            &    c50m20 & -1.43 & -0.74 \\
start of 1st intron &      c5m0 &  1.21 &  0.65 \\
start of 1st intron &     c5m10 &  1.21 &  0.65 \\
start of 1st intron &     c5m20 &  1.24 &  0.70 \\
start of 1st intron &     c10m0 &  0.67 &  0.36 \\
start of 1st intron &    c10m10 &  0.67 &  0.36 \\
start of 1st intron &    c10m20 &  0.70 &  0.39 \\
start of

  print(df_table.to_latex(float_format="{:.2f}".format))


In [73]:
df_table[df_table['Parameter'].isin(['c5m0', 'c5m10', 'c5m20'])]

Unnamed: 0,Parameter,HNDRs,LNDRs
promoter,c5m0,0.844334,0.392026
promoter,c5m10,0.845001,0.390944
promoter,c5m20,0.864478,0.44387
start of 1st intron,c5m0,1.205436,0.647242
start of 1st intron,c5m10,1.20527,0.647893
start of 1st intron,c5m20,1.244542,0.700976
end of 1st intron,c5m0,1.282802,0.996352
end of 1st intron,c5m10,1.276652,0.99559
end of 1st intron,c5m20,1.305215,1.021143
start of 2nd intron,c5m0,1.371871,0.968847


#### Histograms

In [None]:
region = 'promoter'
# region = 'intron.1.start'
# region = 'intron.1.end'
# region = 'intron.2.start'

In [None]:
# Load the four score tables of the region selected for the histograms.
(
    df_NDR_score_exp,
    df_NDR_score_random,
    df_NOR_score_exp,
    df_NOR_score_random,
) = (
    pd.read_csv(sliding_path / f'{region}.{stem}.csv')
    for stem in (
        'df_NDR_score_exp',
        'df_NDR_score_random',
        'df_NOR_score_exp',
        'df_NOR_score_random',
    )
)

In [None]:
df_NDR_score_exp.columns

In [None]:
# NDR score distributions (windows with 10-20 CpGs): random vs experimental,
# one panel per threshold combination.
fig, axs = plt.subplots(3, 4, figsize=(20,12), layout='constrained')
order = ['c5m0', 'c10m0', 'c20m0', 'c50m0', 'c5m10', 'c10m10', 'c20m10', 'c50m10', 'c5m20', 'c10m20', 'c20m20', 'c50m20']

hist_bins = np.linspace(0, 1, 20)
rand_rows = df_NDR_score_random.loc[df_NDR_score_random['nbr_CpGs'].between(10, 20)]
exp_rows = df_NDR_score_exp.loc[df_NDR_score_exp['nbr_CpGs'].between(10, 20)]

for ax, thresh in zip(axs.ravel(), order):
    # Random first, experimental drawn on top with a lighter alpha.
    for scores, alpha, label in (
        (rand_rows[thresh], 0.5, 'random'),
        (exp_rows[thresh], 0.3, 'experimental'),
    ):
        ax.hist(scores, density=True, alpha=alpha, bins=hist_bins, label=label)
    ax.set_title(thresh, fontsize=20)
    ax.set_xlabel('match-score', fontsize=16)
    ax.set_ylabel('density', fontsize=16)
    ax.legend(fontsize=12)

In [None]:
# NOR score distributions (windows with 10-20 CpGs): random vs experimental,
# one panel per threshold combination.
fig, axs = plt.subplots(3, 4, figsize=(20,12), layout='constrained')
order = ['c5m0', 'c10m0', 'c20m0', 'c50m0', 'c5m10', 'c10m10', 'c20m10', 'c50m10', 'c5m20', 'c10m20', 'c20m20', 'c50m20']

hist_bins = np.linspace(0, 1, 20)
rand_rows = df_NOR_score_random.loc[df_NOR_score_random['nbr_CpGs'].between(10, 20)]
exp_rows = df_NOR_score_exp.loc[df_NOR_score_exp['nbr_CpGs'].between(10, 20)]

for ax, thresh in zip(axs.ravel(), order):
    # Random first, experimental drawn on top with a lighter alpha.
    for scores, alpha, label in (
        (rand_rows[thresh], 0.5, 'random'),
        (exp_rows[thresh], 0.3, 'experimental'),
    ):
        ax.hist(scores, density=True, alpha=alpha, bins=hist_bins, label=label)
    ax.set_title(thresh, fontsize=20)
    ax.set_xlabel('match-score', fontsize=16)
    ax.set_ylabel('density', fontsize=16)
    ax.legend(fontsize=12)