In [None]:
import pandas as pd 
import pickle5
import numpy as np 
import scipy.stats
import glob 
import matplotlib.pyplot as plt 
import seaborn as sns 

In [None]:
def format_p_value_string(p):
    """Render a p-value as a short label for a plot annotation.

    Values below 1e-4 are written as mathtext scientific notation with a
    two-decimal mantissa (e.g. ``$1.23 x 10^{-5}$``); anything else is
    rounded to four decimal places and returned as plain text.
    """
    if not p < 0.0001:
        return str(round(p, 4))

    # "1.23e-05" -> mantissa "1.23", exponent "-05" (int() drops the padding).
    mantissa, _, raw_exponent = f"{p:.2e}".partition("e")
    return "$" + mantissa + " x 10^{" + str(int(raw_exponent)) + "}" + "$"

In [None]:
# Load the pickled inputs. Each file is opened in a `with` block so the handle
# is closed as soon as the object has been read.
# NOTE(review): unpickling executes arbitrary code -- only load files from a
# trusted source.
with open("checkpoint_completed_032521_9.pickle", "rb") as handle:
    variant_to_score = pickle5.load(handle)
with open("LDL_a_values.pickle", "rb") as handle:
    patient_to_adj_ldl = pickle5.load(handle)
with open("regression_model_9_3.pickle", "rb") as handle:
    model = pickle5.load(handle)

# Key sets used for membership tests in later cells.
scored_variants = set(variant_to_score.keys())
valid_patients = set(patient_to_adj_ldl.keys())

# All stored scores and their maximum.
score_dist = list(variant_to_score.values())
max_score = np.max(score_dist)

In [None]:
# Mean over every value in patient_to_adj_ldl; later cells subtract it from
# each carrier's value to plot a difference from the mean.
mean_adj_ldl = np.mean([*patient_to_adj_ldl.values()])

Genes reported as significant in Regeneron M3.001:

ABCA1, ABCA6, ABCG5, ADCK5, ALB, ANGPTL3, APOB, ARR3, ASGR1, CETP, CPEB4, 
CPT1A, DENND4C, FOXA3, GAS6, GCK, GPCPD1, HMGCR, HNF1A, IL1RAP, LDLR, MYLIP,
NPC1L1, NR1H4, OR2T6, PCSK9, PREB, RND1,  RRBP1, SOX15, STAB1, TET2, TEX38, TIMD4

In [None]:
# Gene symbols from the markdown list above, parsed into a list of strings.
regeneron_significant = [
    symbol.strip()
    for symbol in "ABCA1, ABCA6, ABCG5, ADCK5, ALB, ANGPTL3, APOB, ARR3, ASGR1, CETP, CPEB4, CPT1A, DENND4C, FOXA3, GAS6, GCK, GPCPD1, HMGCR, HNF1A, IL1RAP, LDLR, MYLIP, NPC1L1, NR1H4, OR2T6, PCSK9, PREB, RND1, RRBP1, SOX15, STAB1, TET2, TEX38, TIMD4".split(",")
]
# Optional exclusions, currently disabled:
# regeneron_significant.remove("APOB")
# regeneron_significant.remove("ABCA1")

# Mean of the adjusted LDL values, and a fixed reference LDL value.
mean_ldl_a = np.mean([*patient_to_adj_ldl.values()])
mean_ldl = 145.47

# Input directories (the earlier "./score_files/" directory is no longer used).
profile_path = "./profiles/"
score_path = "./score_files_2/"

# Column names of the two feature groups, and their concatenation.
regression_cols = ["CADD", "GERP", "allele_frequency", "Missense", "phyloP", "Deleterious"]
regression_cols_9 = [
    'VEST4_score',
    'M-CAP_score',
    'MPC_score',
    'PrimateAI_score',
    'GM12878_fitCons_score',
    'MutationAssessor_score',
    'FATHMM_score',
    'MetaLR_score',
    'MetaSVM_score',
]

r_cols = [*regression_cols, *regression_cols_9]

In [None]:
def format_labels(lst):
    """Turn numeric tick values into signed integer tick labels.

    Each value is truncated with ``int``. Values that are zero or negative
    are printed as-is; positive values get a leading ``+``.
    """
    return [
        str(int(tick)) if tick <= 0 else "+" + str(int(tick))
        for tick in lst
    ]

In [None]:
# Gene symbols that have a profile CSV on disk: the file name with the
# directory prefix and the ".csv" suffix stripped.
obtained_genes = [
    csv_path.replace("./profiles/", "").replace(".csv", "")
    for csv_path in glob.glob("./profiles/*.csv")
]
# Significant genes for which no profile CSV was found.
needed_genes = set(regeneron_significant) - set(obtained_genes)

In [None]:
# Workbook that later cells write their correlation tables to with
# df.to_excel(writer, ...); the last cell closes it. It must be defined here,
# otherwise those cells fail with NameError on a fresh kernel.
writer = pd.ExcelWriter('061522_Score_comparisons.xlsx', engine='xlsxwriter')

In [None]:
# Scores evaluated in this pass (the others are kept for reference).
score_cols = [
#     'CADD',
#     'GERP',
#     'phyloP',
    'VEST4_score',
#     'M-CAP_score',
#     'MPC_score',
#     'PrimateAI_score',
#     'GM12878_fitCons_score',
#     'MutationAssessor_score',
#     'FATHMM_score',
#     'MetaLR_score',
#     'MetaSVM_score'
]

# Genes analysed: significant genes that have a profile CSV, plus the nine
# genes plotted in the following cells. Sorted so the iteration order (and
# therefore any table built from these dicts) is the same on every run.
plotted_genes = ["ABCA1", "ACAT2", "ASGR1", "LDLR", "OTX2", "RAB10", "RABIF", "TRAPPC1", "VPS51"]
analysis_genes = sorted(set(regeneron_significant).difference(needed_genes).union(plotted_genes))

# score column -> gene -> |Spearman rho|
score_to_gene_values = {score_val: {} for score_val in score_cols}
# gene -> score column -> |Spearman rho|
gene_to_data = {g: {} for g in analysis_genes}
# gene -> score name without "_score" -> {"score": [...], "ldl_vals": [...]}
gene_to_points = {g: {} for g in analysis_genes}

all_variants = set()
for score_val in score_cols:
    # NOTE(review): absolute, machine-specific path; unpickling executes
    # arbitrary code, so only load trusted files.
    with open("/Volumes/Ian/ldl_score_distribution_pickles/" + score_val + ".pickle", "rb") as handle:
        score_dist = pickle5.load(handle)
    max_score = np.max(score_dist)
    score_key = score_val.replace("_score", "")

    for g in analysis_genes:
        valid_variants = pd.read_csv("./score_files_2/" + g + ".csv")
        delet_variants = set(valid_variants.loc[valid_variants["Deleterious"] == 1, "Name"].values)
        # Name -> score, keeping the first row per variant name. This matches
        # the former `.loc[Name == v].iloc[0]` lookup without rescanning the
        # table for every patient.
        variant_scores = (
            valid_variants.drop_duplicates(subset="Name")
            .set_index("Name")[score_val]
            .to_dict()
        )

        profiles = pd.read_csv("./profiles/" + g + ".csv", index_col="new_id")
        profiles = profiles.loc[
            (~profiles["most_severe_variant"].isna()) &
            (profiles["ldl"] > 0)
        ]

        x = []
        y = []
        for patient_id, variant in profiles["most_severe_variant"].items():
            if patient_id not in valid_patients:
                continue
            if variant in delet_variants:
                # Variants flagged Deleterious are placed at the top of the
                # score distribution instead of using their own score.
                x.append(max_score)
            elif variant in variant_scores:
                x.append(variant_scores[variant])
            else:
                continue
            y.append(patient_to_adj_ldl[patient_id])

        corr, p_value = scipy.stats.spearmanr(x, y)
        # Stored LDL values are differences from the mean adjusted LDL.
        y = [ldl - mean_adj_ldl for ldl in y]
        gene_to_data[g][score_val] = abs(corr)
        score_to_gene_values[score_val][g] = abs(corr)
        gene_to_points[g][score_key] = {"ldl_vals": y, "score": x}

In [None]:
# 3x3 grid, one panel per gene: every carrier's LDL difference (ldl_vals)
# against the VEST4 score, with a least-squares line and the Spearman
# correlation annotated in data coordinates.
fig, ((ax1, ax2, ax3), (ax4, ax5, ax6), (ax7, ax8, ax9)) = plt.subplots(3, 3, figsize=(35, 30))

plt.subplots_adjust(
    left=0.1,
    bottom=0.1,
    right=0.9,
    top=0.9,
    wspace=0.4,
    hspace=0.4
)

genes = ["ABCA1", "ACAT2", "ASGR1", "LDLR", "OTX2", "RAB10", "RABIF", "TRAPPC1", "VPS51"]
axes = [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9]
y_ticklabels = ["-100", "-50", "0", "+50", "+100", "+150", "+200", "+250", "+300"]
y_ticks = list(np.arange(-100, 301, 50))
x_ticks = list(np.arange(0, 1.01, 0.1))
x_ticklabels = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
for ax, gene in zip(axes, genes):
    x = gene_to_points[gene]["VEST4"]["score"]
    y = gene_to_points[gene]["VEST4"]["ldl_vals"]
    c, p = scipy.stats.spearmanr(x, y)
    ax.scatter(x, y)
    # First-degree polynomial fit evaluated at the distinct score values.
    ax.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)), c="red", lw=5)
    # Backslashes are doubled so the mathtext commands are not read as
    # (invalid) Python escape sequences.
    ax.set_title("$\\it{" + gene + "}$", fontsize=25)
    ax.set_yticks(y_ticks)
    ax.set_yticklabels(y_ticklabels, fontsize=20)
    ax.set_xticks(x_ticks)
    ax.set_xticklabels(x_ticklabels, fontsize=20)
    ax.text(0.05, 275, r"Spearman $ \rho $ = " + str(round(c, 2)), fontsize=20)
    ax.text(0.05, 250, r"p-value = " + format_p_value_string(p), fontsize=20)
    ax.set_xlabel("\nVEST4 Score", fontsize=20)

ax1.set_ylabel("$\\Delta$ Adjusted LDL\n", fontsize=20)
ax4.set_ylabel("$\\Delta$ Adjusted LDL\n", fontsize=20)
# fig.tight_layout()

In [None]:
# fig.savefig("LDL_values_Missense_Deleterious_three_genes.eps", fmt = "eps", bbox_inches = "tight")

In [None]:
# Same 3x3 scatter grid as before, with a seaborn regression band drawn
# underneath a thick least-squares line.
fig, ((ax1, ax2, ax3), (ax4, ax5, ax6), (ax7, ax8, ax9)) = plt.subplots(3, 3, figsize=(35, 30))

plt.subplots_adjust(
    left=0.1,
    bottom=0.1,
    right=0.9,
    top=0.9,
    wspace=0.4,
    hspace=0.4
)

genes = ["ABCA1", "ACAT2", "ASGR1", "LDLR", "OTX2", "RAB10", "RABIF", "TRAPPC1", "VPS51"]
axes = [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9]
y_ticklabels = ["-100", "-50", "0", "+50", "+100", "+150", "+200", "+250", "+300"]
y_ticks = list(np.arange(-100, 301, 50))
x_ticks = list(np.arange(0, 1.01, 0.1))
x_ticklabels = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
for ax, gene in zip(axes, genes):
    x = gene_to_points[gene]["VEST4"]["score"]
    y = gene_to_points[gene]["VEST4"]["ldl_vals"]
    c, p = scipy.stats.spearmanr(x, y)
    ax.scatter(x, y)
    # x and y are passed by keyword: newer seaborn releases no longer accept
    # them positionally.
    sns.regplot(x=x, y=y, color='green', ax=ax, scatter=False)
    ax.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)), c="green", lw=5)
    # Backslashes are doubled so the mathtext commands are not read as
    # (invalid) Python escape sequences.
    ax.set_title("$\\it{" + gene + "}$", fontsize=25)
    ax.set_yticks(y_ticks)
    ax.set_yticklabels(y_ticklabels, fontsize=20)
    ax.set_xticks(x_ticks)
    ax.set_xticklabels(x_ticklabels, fontsize=20)
    ax.text(0.05, 275, r"Spearman $ \rho $ = " + str(round(c, 2)), fontsize=20)
    ax.text(0.05, 250, r"p-value = " + format_p_value_string(p), fontsize=20)
    ax.set_xlabel("\nVEST4 Score", fontsize=20)

ax1.set_ylabel("$\\Delta$ LDL\n", fontsize=20)



In [None]:


# 3x3 grid of binned plots: carriers are grouped into VEST4 score bins of
# width bin_size, and each bin is drawn as one marker at the bin's upper edge
# whose area is proportional to the number of carriers in the bin. The
# regression line and Spearman statistics use the unbinned points.
# bin_size, x_ticks, x_ticklabels and gene_to_config are reused by the
# single-gene and small-panel figures in the following cells.
bin_size = 0.05
fig, ((ax1, ax2, ax3), (ax4, ax5, ax6), (ax7, ax8, ax9)) = plt.subplots(3, 3, figsize=(40, 30))

plt.subplots_adjust(
    left=0.1,
    bottom=0.1,
    right=0.9,
    top=0.9,
    wspace=0.8,
    hspace=0.4
)

genes = ["ABCA1", "ACAT2", "ASGR1", "LDLR", "OTX2", "RAB10", "RABIF", "TRAPPC1", "VPS51"]
axes = [ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9]

x_ticks = list(np.arange(0, 1.01, 0.1))
x_ticklabels = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Per-gene y axis: [start, stop, step] passed to np.arange (stop excluded).
gene_to_config = {
    "ABCA1": [-35, 25, 5],
    "ACAT2": [-50, 50, 10],
    "ASGR1": [-35, 25, 5],
    "LDLR": [-20, 100, 20],
    "OTX2": [-50, 50, 10],
    "RAB10": [-30, 90, 10],
    "RABIF": [-30, 90, 10],
    "TRAPPC1": [-50, 50, 10],
    "VPS51": [-50, 50, 10]
}

for ax, gene in zip(axes, genes):
    config = gene_to_config[gene]
    y_ticks = list(np.arange(config[0], config[1], config[2]))
    y_ticklabels = format_labels(np.arange(config[0], config[1], config[2]))
    x = gene_to_points[gene]["VEST4"]["score"]
    y = gene_to_points[gene]["VEST4"]["ldl_vals"]
    c, p = scipy.stats.spearmanr(x, y)
    temp_df = pd.DataFrame(np.transpose([x, y]), columns=["score", "ldl"])
    # x and y are passed by keyword: newer seaborn releases no longer accept
    # them positionally.
    sns.regplot(x=x, y=y, color='green', ax=ax, scatter=False)
    ax.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)), c="green", lw=5)

    valid_x = []
    valid_y = []
    sizes = []
    for i in np.arange(bin_size, 1.01, bin_size):
        # Bin (i - bin_size, i]; an empty bin yields a NaN mean and size 0.
        valid_x.append(i)
        subset = temp_df.loc[
            (temp_df["score"] > i - bin_size) &
            (temp_df["score"] <= i)
        ]
        valid_y.append(np.mean(subset["ldl"].values))
        sizes.append(len(subset) * 5)

    ax.scatter(valid_x, valid_y, s=sizes)
    # Backslashes are doubled so the mathtext commands are not read as
    # (invalid) Python escape sequences.
    ax.set_title("$\\it{" + gene + "}$", fontsize=25)
    ax.set_yticks(y_ticks)
    ax.set_yticklabels(y_ticklabels, fontsize=20)
    ax.set_xticks(x_ticks)
    ax.set_xticklabels(x_ticklabels, fontsize=20)
    ax.text(1.05, 0.9, r"Spearman $ \rho $ = " + str(round(c, 2)), fontsize=20, transform=ax.transAxes)
    ax.text(1.05, 0.85, r"p-value = " + format_p_value_string(p), fontsize=20, transform=ax.transAxes)
    # Unit label changed from mmol/L to mg/dL to match the other figures in
    # this notebook, which plot the same quantity on the same scale.
    ax.set_ylabel("$\\Delta$ Adjusted LDL-C\n mg/dL", fontsize=20)
    ax.set_xlabel("\nVEST4 Score", fontsize=20)
    ax.set_ylim(bottom=np.min(y_ticks), top=np.max(y_ticks))


In [None]:

# Single-panel binned figure for LDLR with a marker-size legend.
# Relies on bin_size, gene_to_config, x_ticks and x_ticklabels from the
# 3x3 binned-grid cell above.
fig, ax = plt.subplots(figsize=(10, 10))

genes = ["LDLR"]
axes = [ax]
for ax, gene in zip(axes, genes):
    config = gene_to_config[gene]
    y_ticks = list(np.arange(config[0], config[1], config[2]))
    y_ticklabels = format_labels(np.arange(config[0], config[1], config[2]))
    x = gene_to_points[gene]["VEST4"]["score"]
    y = gene_to_points[gene]["VEST4"]["ldl_vals"]
    c, p = scipy.stats.spearmanr(x, y)
    temp_df = pd.DataFrame(np.transpose([x, y]), columns=["score", "ldl"])
    # x and y are passed by keyword: newer seaborn releases no longer accept
    # them positionally.
    sns.regplot(x=x, y=y, color='green', ax=ax, scatter=False)
    ax.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)), c="green", lw=5)

    valid_x = []
    valid_y = []
    sizes = []
    for i in np.arange(bin_size, 1.01, bin_size):
        # Bin (i - bin_size, i]; an empty bin yields a NaN mean and size 0.
        valid_x.append(i)
        subset = temp_df.loc[
            (temp_df["score"] > i - bin_size) &
            (temp_df["score"] <= i)
        ]
        valid_y.append(np.mean(subset["ldl"].values))
        sizes.append(len(subset) * 5)

    print(gene, np.max(valid_y), np.min(valid_y))
    ax.axhline(y=0, color='black', lw=2, xmin=0, xmax=1, linestyle='--')
    scatter = ax.scatter(valid_x, valid_y, s=sizes)
    # Backslashes are doubled so the mathtext commands are not read as
    # (invalid) Python escape sequences.
    ax.set_title("$\\it{" + gene + "}$", fontsize=30)
    ax.set_yticks(y_ticks)
    ax.set_yticklabels(y_ticklabels, fontsize=20)
    ax.set_xticks(x_ticks)
    ax.set_xticklabels(x_ticklabels, fontsize=25)
    ax.text(1.05, 0.9, r"Spearman $ \rho $ = " + str(round(c, 2)), fontsize=20, transform=ax.transAxes)
    ax.text(1.05, 0.85, r"p-value = " + format_p_value_string(p), fontsize=20, transform=ax.transAxes)
    ax.set_ylabel("$\\Delta$ Adjusted LDL-C\n mg/dL", fontsize=25)
    ax.set_xlabel("\nVEST4 Score", fontsize=30)
    ax.set_ylim(bottom=np.min(y_ticks), top=np.max(y_ticks))

    # The legend is built from this panel's own bins. The previous code read
    # all_x / all_sizes, which no earlier cell defines, so it only worked
    # with variables left over in the kernel.
    handles, labels = scatter.legend_elements(prop="sizes", c="#1f77b4", num=len(valid_x))

    print(np.min(sizes))

    labels_2 = []
    handles_2 = []
    keeping = [10, 20, 30, 40, 50]
    for l, h in zip(labels, handles):
        # Marker areas are 5x the carrier count; convert the label back to a
        # count and keep only the chosen reference counts.
        numeric_filter = filter(str.isdigit, l)
        numeric_string = "".join(numeric_filter)
        actual_size = str(int(int(numeric_string) / 5))
        if int(actual_size) in keeping:
            labels_2.append('$\\mathdefault{' + actual_size + '}$')
            handles_2.append(h)
    legend2 = ax.legend(
        handles_2,
        labels_2,
        ncol=2,
        labelspacing=1.0,
        loc="upper right",
        fontsize=20,
        bbox_to_anchor=(1.45, 0.8)
    )

# `format` is the savefig keyword; `fmt` is not accepted.
fig.savefig("LDLR_legend_adjLDLC_Correlation.svg", format="svg", bbox_inches="tight")

In [None]:
# Binned figures for ABCA1 and ASGR1. The third axis ("ghost") replots every
# bin from both genes and exists only to carry the shared marker-size legend.
# Relies on bin_size, gene_to_config, x_ticks and x_ticklabels from the
# 3x3 binned-grid cell above.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(45, 10))

plt.subplots_adjust(
    left=0.1,
    bottom=0.1,
    right=0.9,
    top=0.9,
    wspace=0.8,
    hspace=0.4
)

genes = ["ABCA1", "ASGR1"]
axes = [ax1, ax2, ax3]  # zip() stops at the shorter list, so ax3 is not used in the loop
all_sizes = []
all_y = []
all_x = []
for ax, gene in zip(axes, genes):
    config = gene_to_config[gene]
    y_ticks = list(np.arange(config[0], config[1], config[2]))
    y_ticklabels = format_labels(np.arange(config[0], config[1], config[2]))
    x = gene_to_points[gene]["VEST4"]["score"]
    y = gene_to_points[gene]["VEST4"]["ldl_vals"]
    c, p = scipy.stats.spearmanr(x, y)
    temp_df = pd.DataFrame(np.transpose([x, y]), columns=["score", "ldl"])
    ax.axhline(y=0, color='black', lw=2, xmin=0, xmax=1, linestyle='--')
    # x and y are passed by keyword: newer seaborn releases no longer accept
    # them positionally.
    sns.regplot(x=x, y=y, color='green', ax=ax, scatter=False)
    ax.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)), c="green", lw=5)

    valid_x = []
    valid_y = []
    sizes = []
    for i in np.arange(bin_size, 1.01, bin_size):
        # Bin (i - bin_size, i]; an empty bin yields a NaN mean and size 0.
        valid_x.append(i)
        subset = temp_df.loc[
            (temp_df["score"] > i - bin_size) &
            (temp_df["score"] <= i)
        ]
        valid_y.append(np.mean(subset["ldl"].values))
        sizes.append(len(subset) * 5)

    # Collect every bin for the legend axis.
    all_sizes.extend(sizes)
    all_x.extend(valid_x)
    all_y.extend(valid_y)

    scatter = ax.scatter(valid_x, valid_y, s=sizes)
    # Backslashes are doubled so the mathtext commands are not read as
    # (invalid) Python escape sequences.
    ax.set_title("$\\it{" + gene + "}$", fontsize=30)
    ax.set_yticks(y_ticks)
    ax.set_yticklabels(y_ticklabels, fontsize=20)
    ax.set_xticks(x_ticks)
    ax.set_xticklabels(x_ticklabels, fontsize=25)
    ax.text(1.05, 0.9, r"Spearman $ \rho $ = " + str(round(c, 2)), fontsize=25, transform=ax.transAxes)
    ax.text(1.05, 0.85, r"p-value = " + format_p_value_string(p), fontsize=25, transform=ax.transAxes)
    ax.set_ylabel("$\\Delta$ Adjusted LDL-C\n mg/dL", fontsize=25)
    ax.set_xlabel("\nVEST4 Score", fontsize=30)
    ax.set_ylim(bottom=np.min(y_ticks), top=np.max(y_ticks))

# Legend-only axis; it reuses the y ticks of the last gene in the loop.
ax3.set_ylabel("$\\Delta$ Adjusted LDL-C\n mg/dL", fontsize=25)
ax3.set_xlabel("\nVEST4 Score", fontsize=30)
ax3.set_ylim(bottom=np.min(y_ticks), top=np.max(y_ticks))
ax3.set_title("$\\it{" + "ghost" + "}$", fontsize=30)
ax3.set_yticks(y_ticks)
ax3.set_yticklabels(y_ticklabels, fontsize=20)
ax3.set_xticks(x_ticks)
ax3.set_xticklabels(x_ticklabels, fontsize=25)

scatter = ax3.scatter(all_x, all_y, s=all_sizes)
handles, labels = scatter.legend_elements(prop="sizes", c="#1f77b4", num=len(all_x))

labels_2 = []
handles_2 = []
keeping = [4, 20, 40, 80, 120, 160]
for l, h in zip(labels, handles):
    # Marker areas are 5x the carrier count; convert the label back to a
    # count and keep only the chosen reference counts.
    numeric_filter = filter(str.isdigit, l)
    numeric_string = "".join(numeric_filter)
    actual_size = str(int(int(numeric_string) / 5))
    if int(actual_size) in keeping:
        labels_2.append('$\\mathdefault{' + actual_size + '}$')
        handles_2.append(h)
legend2 = ax3.legend(
    handles_2,
    labels_2,
    ncol=2,
    labelspacing=1.0,
    loc="upper right",
    fontsize=20,
    bbox_to_anchor=(1.5, 0.5)
)

# `format` is the savefig keyword; `fmt` is not accepted.
fig.savefig("ABCA1_ASGR1_legend_adjLDLC_Correlation.svg", format="svg", bbox_inches="tight")

In [None]:



# Scratch cell: redraws the pooled bins on the ghost axis of the figure
# created in the previous cell and prints the marker areas. It depends on
# ax3, all_x, all_y, all_sizes, y_ticks, y_ticklabels, x_ticks and
# x_ticklabels left behind by that cell.
ax3.scatter(all_x, all_y, s=all_sizes)
# Backslashes are doubled so the mathtext commands are not read as
# (invalid) Python escape sequences.
ax3.set_ylabel("$\\Delta$ Adjusted LDL-C\n mg/dL", fontsize=25)
ax3.set_xlabel("\nVEST4 Score", fontsize=30)
ax3.set_ylim(bottom=np.min(y_ticks), top=np.max(y_ticks))
ax3.set_title("$\\it{" + "ghost" + "}$", fontsize=30)
ax3.set_yticks(y_ticks)
ax3.set_yticklabels(y_ticklabels, fontsize=20)
ax3.set_xticks(x_ticks)
ax3.set_xticklabels(x_ticklabels, fontsize=25)


print(all_sizes)

In [None]:
# Binned figures for RAB10 and RABIF. The third axis ("ghost") replots every
# bin from both genes and exists only to carry the shared marker-size legend.
# Relies on bin_size, gene_to_config, x_ticks and x_ticklabels from the
# 3x3 binned-grid cell above.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(45, 10))

plt.subplots_adjust(
    left=0.1,
    bottom=0.1,
    right=0.9,
    top=0.9,
    wspace=0.8,
    hspace=0.4
)

genes = ["RAB10", "RABIF"]
axes = [ax1, ax2]

all_sizes = []
all_y = []
all_x = []

for ax, gene in zip(axes, genes):
    config = gene_to_config[gene]
    y_ticks = list(np.arange(config[0], config[1], config[2]))
    y_ticklabels = format_labels(np.arange(config[0], config[1], config[2]))
    x = gene_to_points[gene]["VEST4"]["score"]
    y = gene_to_points[gene]["VEST4"]["ldl_vals"]
    c, p = scipy.stats.spearmanr(x, y)
    temp_df = pd.DataFrame(np.transpose([x, y]), columns=["score", "ldl"])
    ax.axhline(y=0, color='black', lw=2, xmin=0, xmax=1, linestyle='--')
    # x and y are passed by keyword: newer seaborn releases no longer accept
    # them positionally.
    sns.regplot(x=x, y=y, color='green', ax=ax, scatter=False)
    ax.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)), c="green", lw=5)

    valid_x = []
    valid_y = []
    sizes = []
    for i in np.arange(bin_size, 1.01, bin_size):
        # Bin (i - bin_size, i]; an empty bin yields a NaN mean and size 0.
        valid_x.append(i)
        subset = temp_df.loc[
            (temp_df["score"] > i - bin_size) &
            (temp_df["score"] <= i)
        ]
        valid_y.append(np.mean(subset["ldl"].values))
        sizes.append(len(subset) * 5)

    # Largest bin count, then the range of the bin means.
    print(np.max(sizes) / 5)
    print(gene, np.max(valid_y), np.min(valid_y))

    # Collect every bin for the legend axis.
    all_sizes.extend(sizes)
    all_x.extend(valid_x)
    all_y.extend(valid_y)

    ax.scatter(valid_x, valid_y, s=sizes)
    # Backslashes are doubled so the mathtext commands are not read as
    # (invalid) Python escape sequences.
    ax.set_title("$\\it{" + gene + "}$", fontsize=30)
    ax.set_yticks(y_ticks)
    ax.set_yticklabels(y_ticklabels, fontsize=20)
    ax.set_xticks(x_ticks)
    ax.set_xticklabels(x_ticklabels, fontsize=25)
    ax.text(1.05, 0.9, r"Spearman $ \rho $ = " + str(round(c, 2)), fontsize=25, transform=ax.transAxes)
    ax.text(1.05, 0.85, r"p-value = " + format_p_value_string(p), fontsize=25, transform=ax.transAxes)
    ax.set_ylabel("$\\Delta$ Adjusted LDL-C\n mg/dL", fontsize=25)
    ax.set_xlabel("\nVEST4 Score", fontsize=30)
    ax.set_ylim(bottom=np.min(y_ticks), top=np.max(y_ticks))

print(np.min(all_sizes))

# Legend-only axis; it reuses the y ticks of the last gene in the loop.
ax3.set_ylabel("$\\Delta$ Adjusted LDL-C\n mg/dL", fontsize=25)
ax3.set_xlabel("\nVEST4 Score", fontsize=30)
ax3.set_ylim(bottom=np.min(y_ticks), top=np.max(y_ticks))
ax3.set_title("$\\it{" + "ghost" + "}$", fontsize=30)
ax3.set_yticks(y_ticks)
ax3.set_yticklabels(y_ticklabels, fontsize=20)
ax3.set_xticks(x_ticks)
ax3.set_xticklabels(x_ticklabels, fontsize=25)

scatter = ax3.scatter(all_x, all_y, s=all_sizes)
handles, labels = scatter.legend_elements(prop="sizes", c="#1f77b4", num=len(all_x))

print(labels)
print(np.min(all_sizes))

labels_2 = []
handles_2 = []
keeping = set([2, 4, 6, 8, 10])
for l, h in zip(labels, handles):
    # Marker areas are 5x the carrier count; convert the label back to a
    # count and keep only the chosen reference counts.
    numeric_filter = filter(str.isdigit, l)
    numeric_string = "".join(numeric_filter)
    actual_size = str(int(int(numeric_string) / 5))
    print(actual_size)
    if int(actual_size) in keeping:
        labels_2.append('$\\mathdefault{' + actual_size + '}$')
        handles_2.append(h)
        # Each count appears in the legend at most once.
        keeping.remove(int(actual_size))
legend2 = ax3.legend(
    handles_2,
    labels_2,
    ncol=2,
    labelspacing=1.0,
    loc="upper right",
    fontsize=20,
    bbox_to_anchor=(1.5, 0.5)
)

# `format` is the savefig keyword; `fmt` is not accepted.
fig.savefig("RAB10_RABIF_legend_adjLDLC_Correlation.svg", format="svg", bbox_inches="tight")

In [None]:
# Binned figures for ACAT2, OTX2 and VPS51. The fourth axis ("ghost") replots
# every bin from all three genes and exists only to carry the shared
# marker-size legend.
# Relies on bin_size, gene_to_config, x_ticks and x_ticklabels from the
# 3x3 binned-grid cell above.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(60, 10))

plt.subplots_adjust(
    left=0.1,
    bottom=0.1,
    right=0.9,
    top=0.9,
    wspace=0.8,
    hspace=0.4
)

all_sizes = []
all_y = []
all_x = []

genes = ["ACAT2", "OTX2", "VPS51"]
axes = [ax1, ax2, ax3]
for ax, gene in zip(axes, genes):
    config = gene_to_config[gene]
    y_ticks = list(np.arange(config[0], config[1], config[2]))
    y_ticklabels = format_labels(np.arange(config[0], config[1], config[2]))
    x = gene_to_points[gene]["VEST4"]["score"]
    y = gene_to_points[gene]["VEST4"]["ldl_vals"]
    c, p = scipy.stats.spearmanr(x, y)
    temp_df = pd.DataFrame(np.transpose([x, y]), columns=["score", "ldl"])
    ax.axhline(y=0, color='black', lw=2, xmin=0, xmax=1, linestyle='--')
    # x and y are passed by keyword: newer seaborn releases no longer accept
    # them positionally.
    sns.regplot(x=x, y=y, color='green', ax=ax, scatter=False)
    ax.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)), c="green", lw=5)

    valid_x = []
    valid_y = []
    sizes = []
    for i in np.arange(bin_size, 1.01, bin_size):
        # Bin (i - bin_size, i]; an empty bin yields a NaN mean and size 0.
        valid_x.append(i)
        subset = temp_df.loc[
            (temp_df["score"] > i - bin_size) &
            (temp_df["score"] <= i)
        ]
        valid_y.append(np.mean(subset["ldl"].values))
        sizes.append(len(subset) * 5)

    # Collect every bin for the legend axis.
    all_sizes.extend(sizes)
    all_x.extend(valid_x)
    all_y.extend(valid_y)

    ax.scatter(valid_x, valid_y, s=sizes)
    # Backslashes are doubled so the mathtext commands are not read as
    # (invalid) Python escape sequences.
    ax.set_title("$\\it{" + gene + "}$", fontsize=30)
    ax.set_yticks(y_ticks)
    ax.set_yticklabels(y_ticklabels, fontsize=20)
    ax.set_xticks(x_ticks)
    ax.set_xticklabels(x_ticklabels, fontsize=25)
    ax.text(1.05, 0.9, r"Spearman $ \rho $ = " + str(round(c, 2)), fontsize=25, transform=ax.transAxes)
    ax.text(1.05, 0.85, r"p-value = " + format_p_value_string(p), fontsize=25, transform=ax.transAxes)
    ax.set_ylabel("$\\Delta$ Adjusted LDL-C\n mg/dL", fontsize=25)
    ax.set_xlabel("\nVEST4 Score", fontsize=30)
    ax.set_ylim(bottom=np.min(y_ticks), top=np.max(y_ticks))

# Legend-only axis; it reuses the y ticks of the last gene in the loop.
ax4.set_ylabel("$\\Delta$ Adjusted LDL-C\n mg/dL", fontsize=25)
ax4.set_xlabel("\nVEST4 Score", fontsize=30)
ax4.set_ylim(bottom=np.min(y_ticks), top=np.max(y_ticks))
ax4.set_title("$\\it{" + "ghost" + "}$", fontsize=30)
ax4.set_yticks(y_ticks)
ax4.set_yticklabels(y_ticklabels, fontsize=20)
ax4.set_xticks(x_ticks)
ax4.set_xticklabels(x_ticklabels, fontsize=25)

scatter = ax4.scatter(all_x, all_y, s=all_sizes)
handles, labels = scatter.legend_elements(prop="sizes", c="#1f77b4", num=len(all_x))

labels_2 = []
handles_2 = []
keeping = set([2, 5, 10, 15, 20])
for l, h in zip(labels, handles):
    # Marker areas are 5x the carrier count; convert the label back to a
    # count and keep only the chosen reference counts.
    numeric_filter = filter(str.isdigit, l)
    numeric_string = "".join(numeric_filter)
    actual_size = str(int(int(numeric_string) / 5))
    print(actual_size)
    if int(actual_size) in keeping:
        labels_2.append('$\\mathdefault{' + actual_size + '}$')
        handles_2.append(h)
        # Each count appears in the legend at most once.
        keeping.remove(int(actual_size))
legend2 = ax4.legend(
    handles_2,
    labels_2,
    ncol=2,
    labelspacing=1.0,
    loc="upper right",
    fontsize=20,
    bbox_to_anchor=(1.5, 0.5)
)

# `format` is the savefig keyword; `fmt` is not accepted.
# NOTE(review): the file name says "VSP51" although the gene is VPS51; left
# unchanged in case something downstream expects this exact name.
fig.savefig("ACAT2_OTX2_VSP51_legend_adjLDLC_Correlation.svg", format="svg", bbox_inches="tight")

In [None]:
# ABCA1 only: bin carriers by VEST4 score, plot the mean LDL difference per
# bin (marker area = number of carriers), and fit the regression to the
# binned means rather than to the individual carriers.
fig, ax = plt.subplots(figsize=(10, 8))

top = 25
bottom = -25
step = 5
y_ticks = list(np.arange(bottom, top + .01, step))
y_ticklabels = [str(int(tick)) if int(tick) <= 0 else "+" + str(int(tick)) for tick in y_ticks]
x_ticks = list(np.arange(0, 1.01, 0.1))
x_ticklabels = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

gene = "ABCA1"
bin_size = 0.05
x = gene_to_points[gene]["VEST4"]["score"]
y = gene_to_points[gene]["VEST4"]["ldl_vals"]

temp_df = pd.DataFrame(np.transpose([x, y]), columns=["score", "ldl"])
valid_x = []
valid_y = []
sizes = []
for i in np.arange(0, 1.01, bin_size):
    # Bin [i, i + bin_size), plotted at its lower edge; an empty bin yields
    # a NaN mean and size 0.
    valid_x.append(i)
    subset = temp_df.loc[
        (temp_df["score"] >= i) &
        (temp_df["score"] < i + bin_size)
    ]
    valid_y.append(np.mean(subset["ldl"].values))
    sizes.append(len(subset))

ax.scatter(valid_x, valid_y, s=sizes)

# x and y are passed by keyword: newer seaborn releases no longer accept them
# positionally.
sns.regplot(x=valid_x, y=valid_y, color='red', scatter=False, ax=ax)

# np.polyfit cannot handle NaN, so empty bins are excluded from the fit.
bin_x = np.asarray(valid_x, dtype=float)
bin_y = np.asarray(valid_y, dtype=float)
has_data = ~np.isnan(bin_y)
ax.plot(
    np.unique(x),
    np.poly1d(np.polyfit(bin_x[has_data], bin_y[has_data], 1))(np.unique(x)),
    c="red",
    lw=5
)

# Backslashes are doubled so the mathtext commands are not read as (invalid)
# Python escape sequences.
ax.set_title("$\\it{" + gene + "}$", fontsize=25)
ax.set_yticks(y_ticks)
ax.set_yticklabels(y_ticklabels, fontsize=20)
ax.set_xticks(x_ticks)
ax.set_xticklabels(x_ticklabels, fontsize=20)
ax.set_ylabel("$\\Delta$ Adjusted LDL\n", fontsize=20)
ax.set_xlabel("\nVEST4 Score", fontsize=20)


In [None]:
# Heatmap of |Spearman rho| (rows: scores, columns: genes) for the
# deleterious+missense analysis, followed by each score's average rank
# across genes. Both tables are written to the Excel workbook `writer`.
df = pd.DataFrame(gene_to_data)

fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(df, annot=True, ax=ax, fmt=".2f")
# Pad the y limits by half a cell at both ends so the first and last heatmap
# rows are not cut in half.
b, t = ax.get_ylim()
b += 0.5
t -= 0.5
ax.set_ylim(b, t)
# `format` is the savefig keyword; `fmt` is not accepted.
fig.savefig("heatmap_deleterious_missense.eps", format="eps", bbox_inches="tight")

df.to_excel(writer, sheet_name="Delet+Mis Correlations", index=True)

# score -> list of its ranks (1 = highest correlation), one entry per gene.
s_to_ranks = {}
for score_val in score_cols:
    s_to_ranks[score_val] = []
# s_to_ranks['Deleteriousness Score'] = []
for c in df.columns:
    vals = df[c].sort_values(ascending=False).dropna()
    # Series.items() replaces iteritems(), which pandas 2.0 removed.
    for counter, (index, val) in enumerate(vals.items()):
        s_to_ranks[index].append(counter + 1)

lst = []
for k, v in s_to_ranks.items():
    lst.append([k, np.mean(v)])
    print(k, np.mean(v))

df = pd.DataFrame(lst, columns=["Score", "Average Rank"])
df = df.sort_values(by="Average Rank")
df.to_excel(writer, sheet_name="Avg Rank Delet+Mis Correlations", index=False)

In [None]:
# Missense-only analysis: for every score and gene, correlate the score of
# each carrier's most severe (missense) variant with the carrier's adjusted
# LDL. Replaces score_to_gene_values, gene_to_data and gene_to_points from
# the earlier deleterious+missense pass.
score_cols = [
    'CADD',
    'GERP',
    'phyloP',
    'VEST4_score',
    'M-CAP_score',
    'MPC_score',
    'PrimateAI_score',
    'GM12878_fitCons_score',
    'MutationAssessor_score',
    'FATHMM_score',
    'MetaLR_score',
    'MetaSVM_score'
]

# Sorted so the iteration order (and therefore the heatmap column order) is
# the same on every run.
analysis_genes = sorted(set(regeneron_significant).difference(needed_genes))

all_variants = set()

# score name (without "_score") -> gene -> |Spearman rho|
score_to_gene_values = {score_val.replace("_score", ""): {} for score_val in score_cols}
# gene -> score name -> |Spearman rho|
gene_to_data = {g: {} for g in analysis_genes}
# gene -> score name -> {"score": [...], "ldl_vals": [...]}
gene_to_points = {g: {} for g in analysis_genes}

# The per-gene tables do not depend on the score, so each CSV is read once
# here instead of once per score.
gene_inputs = {}
for g in analysis_genes:
    variants = pd.read_csv("./score_files_2/" + g + ".csv")
    valid_variants = variants.loc[variants["Missense"] == 1]
    all_variants = all_variants.union(set(valid_variants["Name"].values))

    profiles = pd.read_csv("./profiles/" + g + ".csv", index_col="new_id")
    profiles = profiles.loc[
        (~profiles["most_severe_variant"].isna()) &
        (profiles["ldl"] > 0)
    ]
    # (patient id, variant name) for carriers that have an adjusted LDL value.
    carriers = [
        (patient_id, variant)
        for patient_id, variant in profiles["most_severe_variant"].items()
        if patient_id in valid_patients
    ]
    # One row per variant name, first occurrence kept. This matches the
    # former `.loc[Name == v].iloc[0]` lookup.
    missense_by_name = valid_variants.drop_duplicates(subset="Name").set_index("Name")
    gene_inputs[g] = (missense_by_name, carriers)

for score_val in score_cols:
    score_key = score_val.replace("_score", "")
    # Number of genes whose correlation reaches p <= 0.05 for this score.
    p_val_counter = 0
    for g in analysis_genes:
        missense_by_name, carriers = gene_inputs[g]
        variant_scores = missense_by_name[score_val].to_dict()

        x = []
        y = []
        for patient_id, variant in carriers:
            if variant in variant_scores:
                x.append(variant_scores[variant])
                y.append(patient_to_adj_ldl[patient_id])

        c = scipy.stats.spearmanr(x, y)
        corr = c[0]
        if c[1] <= 0.05:
            p_val_counter += 1
        # Stored LDL values are differences from the mean adjusted LDL.
        y = [ldl - mean_adj_ldl for ldl in y]
        gene_to_points[g][score_key] = {"ldl_vals": y, "score": x}
        gene_to_data[g][score_key] = abs(corr)
        score_to_gene_values[score_key][g] = abs(corr)

In [None]:
# Missense-only scatter plots for three genes: every carrier's LDL difference
# against the VEST4 score, with a least-squares line and the Spearman
# correlation annotated in data coordinates.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(35, 8))

genes = ["ABCA1", "ASGR1", "LDLR"]
axes = [ax1, ax2, ax3]
y_ticklabels = ["-100", "-50", "0", "+50", "+100", "+150", "+200", "+250", "+300"]
y_ticks = list(np.arange(-100, 301, 50))
x_ticks = list(np.arange(0, 1.01, 0.1))
x_ticklabels = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
for ax, gene in zip(axes, genes):
    x = gene_to_points[gene]["VEST4"]["score"]
    y = gene_to_points[gene]["VEST4"]["ldl_vals"]

    c, p = scipy.stats.spearmanr(x, y)

    ax.scatter(x, y)
    # First-degree polynomial fit evaluated at the distinct score values.
    ax.plot(np.unique(x), np.poly1d(np.polyfit(x, y, 1))(np.unique(x)), c="red", lw=5)
    # Backslashes are doubled so the mathtext commands are not read as
    # (invalid) Python escape sequences.
    ax.set_title("$\\it{" + gene + "}$", fontsize=25)
    ax.set_yticks(y_ticks)
    ax.set_yticklabels(y_ticklabels, fontsize=20)
    ax.set_xticks(x_ticks)
    ax.set_xticklabels(x_ticklabels, fontsize=20)
    ax.text(0.05, 275, r"Spearman $ \rho $ = " + str(round(c, 2)), fontsize=20)
    ax.text(0.05, 250, r"p-value = " + format_p_value_string(p), fontsize=20)
    ax.set_xlabel("\nVEST4 Score", fontsize=20)

ax1.set_ylabel("$\\Delta$ LDL\n", fontsize=20)

In [None]:
# Save the figure from the previous cell. `format` is the savefig keyword;
# `fmt` is not accepted.
fig.savefig("LDL_values_Missense_three_genes.eps", format="eps", bbox_inches="tight")

In [None]:
# Missense-only binned plots for three genes: mean LDL difference per VEST4
# score bin, with marker area proportional to the number of carriers.
bin_size = 0.1

fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(35, 8))

genes = ["ABCA1", "ASGR1", "LDLR"]
axes = [ax1, ax2, ax3]

y_ticklabels = ["-100", "-50", "0", "+50", "+100"]
y_ticks = list(np.arange(-100, 101, 50))
x_ticks = list(np.arange(0, 1.01, 0.1))
x_ticklabels = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

for ax, gene in zip(axes, genes):
    x = gene_to_points[gene]["VEST4"]["score"]
    y = gene_to_points[gene]["VEST4"]["ldl_vals"]
    temp_df = pd.DataFrame(np.transpose([x, y]), columns=["score", "ldl"])

    valid_x = []
    valid_y = []
    sizes = []
    for i in np.arange(0, 1, bin_size):
        # Bin [i, i + bin_size), plotted at its lower edge; an empty bin
        # yields a NaN mean and size 0.
        # NOTE(review): a score of exactly 1.0 falls outside the last bin --
        # confirm this is intended.
        valid_x.append(i)
        subset = temp_df.loc[
            (temp_df["score"] >= i) &
            (temp_df["score"] < i + bin_size)
        ]
        valid_y.append(np.mean(subset["ldl"].values))
        sizes.append(len(subset) * 5)

    ax.scatter(valid_x, valid_y, s=sizes)
    # Backslashes are doubled so the mathtext commands are not read as
    # (invalid) Python escape sequences.
    ax.set_title("$\\it{" + gene + "}$", fontsize=25)
    ax.set_yticks(y_ticks)
    ax.set_yticklabels(y_ticklabels, fontsize=20)
    ax.set_xticks(x_ticks)
    ax.set_xticklabels(x_ticklabels, fontsize=20)
    ax.set_ylabel("$\\Delta$ LDL\n", fontsize=20)
    ax.set_xlabel("\nVEST4 Score", fontsize=20)


In [None]:
# Save the figure from the previous cell. `format` is the savefig keyword;
# `fmt` is not accepted.
fig.savefig("LDL_binned_values_Missense_three_genes.eps", format="eps", bbox_inches="tight")

In [None]:
# Heatmap of |Spearman rho| (rows: scores, columns: genes) for the
# missense-only analysis, followed by each score's average rank across
# genes. Both tables are written to the Excel workbook `writer`.
df = pd.DataFrame(gene_to_data)

df.to_excel(writer, sheet_name="Misense Correlations", index=True)

fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(df, annot=True, ax=ax, fmt=".2f")
# Pad the y limits by half a cell at both ends so the first and last heatmap
# rows are not cut in half.
b, t = ax.get_ylim()
b += 0.5
t -= 0.5
ax.set_ylim(b, t)
# Title typo fixed: "LCL-C" -> "LDL-C".
ax.set_title("Correlations of Missense Carrier Adjusted LDL-C \n and Computational Score Prediction \n", fontsize=15)
# `format` is the savefig keyword and takes the bare format name ("eps",
# not ".eps").
fig.savefig("heatmap_missense.eps", format="eps", bbox_inches="tight")

# score name -> list of its ranks (1 = highest correlation), one per gene.
s_to_ranks = {}
for score_val in score_cols:
    s_to_ranks[score_val.replace("_score", "")] = []
# s_to_ranks['Deleteriousness Score'] = []
for c in df.columns:
    vals = df[c].sort_values(ascending=False).dropna()
    # Series.items() replaces iteritems(), which pandas 2.0 removed.
    for counter, (index, val) in enumerate(vals.items()):
        s_to_ranks[index.replace("_score", "")].append(counter + 1)

lst = []
for k, v in s_to_ranks.items():
    lst.append([k, np.mean(v)])
    print(k, np.mean(v))

df = pd.DataFrame(lst, columns=["Score", "Average Rank"])
df = df.sort_values(by="Average Rank")
df.to_excel(writer, sheet_name="Avg Rank Mis Correlations", index=False)

In [None]:
# Close the Excel writer that the earlier cells passed to df.to_excel(...).
writer.close()