In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import scipy.stats
import operator

In [None]:
# Compare every non-control oligo against the pooled control measurements with
# a Mann-Whitney U test, apply Benjamini-Hochberg (BH) FDR control at
# alpha = 0.1, and export one row per oligo to CSV.
table = pd.read_excel("102720_PE115pep_analysis.xlsx")
table = table.loc[~table["Design category"].isna()]

pe_cols = [
    'Rep 1A frac PE',
    'Rep 1B frac PE',
    'Rep 2A frac PE',
    'Rep 2B frac PE',
    'Rep 3 frac PE',
    'Rep 4 frac PE',
    'Rep 5 frac PE',
    'Rep 6 frac PE',
    'Rep 7 frac PE',
    'Rep 8 frac PE',
    'Rep 9 frac PE'
]

# Rows whose design category contains "control" form the reference sample;
# all of their replicate values are pooled into one flat array.
is_control = table["Design category"].str.contains("control")
controls = table.loc[is_control]
control_frac_pe_vals = controls[pe_cols].values
all_control_vals = control_frac_pe_vals.astype(float).flatten()

non_controls = table.loc[~is_control]

# One test per oligo.  No `alternative` is passed, so scipy's default applies.
# Rows yielded by iterrows() are object-dtype, so the replicate values are
# cast to float before being handed to scipy.
name_to_p_val = {}
for index, row in non_controls.iterrows():
    vals = row[pe_cols].to_numpy(dtype=float)
    name_to_p_val[row['Oligo.name']] = scipy.stats.mannwhitneyu(all_control_vals, vals).pvalue

m = len(name_to_p_val)
alpha = 0.1
p_vals_sorted = dict(sorted(name_to_p_val.items(), key=operator.itemgetter(1)))
names = list(p_vals_sorted.keys())
p_vals = list(p_vals_sorted.values())

# BH step-up: find the largest rank k with p_(k) <= (k / m) * alpha and reject
# every hypothesis ranked 1..k.  Testing each rank only against its own
# threshold (as this cell previously did) can leave a smaller p-value
# "not significant" while a larger one is flagged.
ranks = np.arange(1, m + 1)
passes = np.asarray(p_vals, dtype=float) <= (ranks / m) * alpha
counter = int(ranks[passes].max()) if passes.any() else 0
signif = (ranks <= counter).tolist()

print(counter)

df = pd.DataFrame({
    "name": names,
    "p_val": p_vals,
    "-log 10 p_val": (-np.log10(np.asarray(p_vals, dtype=float))).tolist(),
    "significant fdr = 0.1": signif,
})
df.to_csv("Significance_values_102720_PE115pep.csv", index = False)



In [13]:
# Mann-Whitney U test of every non-control oligo (replicates r5-r9) against the
# pooled control values, with Benjamini-Hochberg (BH) FDR flags at alpha = 0.1
# and alpha = 0.05, exported to CSV together with the per-oligo median.
df = pd.read_excel("SuppTable3_PE_115peptide_analysis.xlsx")
df = df.loc[~df["Oligo.name"].isna()]
# NOTE(review): `pep` is not used anywhere else in this cell; kept in case
# another cell relies on it.
pep = df.loc[df["Oligo.name"] == "hblOligo252889_Cat3"].iloc[0]

exp_cols = ['r5','r6','r7','r8','r9']

is_control = df["Design category"].str.contains("control")
controls = df.loc[is_control]
non_controls = df.loc[~is_control]

all_control_vals = controls[exp_cols].values.astype(float).flatten()

# Per-oligo median and p-value.  Rows yielded by iterrows() are object-dtype,
# so the replicate values are cast to float before use.
name_to_p_val = {}
name_to_median = {}
for index, row in non_controls.iterrows():
    name = row['Oligo.name']
    vals = row[exp_cols].to_numpy(dtype=float)
    name_to_median[name] = np.median(vals)
    name_to_p_val[name] = scipy.stats.mannwhitneyu(all_control_vals, vals).pvalue

# Sort once and derive every output list from THIS cell's mapping.  The
# previous version stored the sorted mapping under a misspelled name and then
# iterated `p_vals_sorted`, i.e. whatever an earlier cell had left in the
# kernel (or a NameError on a fresh kernel).
m = len(name_to_p_val)
p_vals_sorted = dict(sorted(name_to_p_val.items(), key=operator.itemgetter(1)))
names = list(p_vals_sorted.keys())
p_vals = list(p_vals_sorted.values())
medians = [name_to_median[name] for name in names]

# BH step-up: for each alpha, find the largest rank k with
# p_(k) <= (k / m) * alpha and reject every hypothesis ranked 1..k.
ranks = np.arange(1, m + 1)
sorted_p = np.asarray(p_vals, dtype=float)
bh_flags = {}
for alpha in (0.1, 0.05):
    passes = sorted_p <= (ranks / m) * alpha
    counter = int(ranks[passes].max()) if passes.any() else 0
    bh_flags[alpha] = (ranks <= counter).tolist()
signif_10 = bh_flags[0.1]
signif_05 = bh_flags[0.05]

df = pd.DataFrame({
    "name": names,
    "median PE r5 - r9 reps": medians,
    "p_val": p_vals,
    "-log 10 p_val": (-np.log10(sorted_p)).tolist(),
    "significant fdr = 0.1": signif_10,
    "significant fdr = 0.05": signif_05,
})

df.to_csv("Significance_values_supp_table_3.csv", index = False)
# Number of oligos flagged at FDR = 0.1 (displayed as the cell output).
len(df.loc[df["significant fdr = 0.1"] == True])

51

In [37]:
# Count how many non-control peptide pairs in the mESC sheet score higher than
# the control pair (one-sided Mann-Whitney U) under Benjamini-Hochberg (BH)
# FDR control at alpha = 0.1.
df = pd.read_excel("PE_dual_peptide_analysis.xlsx", sheet_name = "mESC_CtrlnormfracPE")
df = df.drop(columns = ["Unnamed: 13"])

column_values = [
    "norm_fracPE_mESCrep1M",
    "norm_fracPE_mESCrep1R",
    "norm_fracPE_mESCrep2M",
    "norm_fracPE_mESCrep2R",
    "norm_fracPE_mESCrep3M",
    "norm_fracPE_mESCrep3R",
    "norm_fracPE_mESCrep4M",
    "norm_fracPE_mESCrep4R",
    "norm_fracPE_mESCrep5M",
    "norm_fracPE_mESCrep5R"
]

# Reference sample: the replicate values of the control/control pair.  Row
# slices are object-dtype, so they are cast to float before being given to scipy.
control_row = df.loc[df["Combined name"] == "VCPp8 (control)-VCPp8 (control)"].iloc[0]
control_vals = control_row[column_values].to_numpy(dtype=float)

# Only pairs in which neither peptide is the control are tested.
non_control_pairs = df.loc[
    (df["peptide 1 (read 1)"] != "VCPp8 (control)")&
    (df["peptide 2 (read 2)"] != "VCPp8 (control)")
]

pair_to_pval = {}
for index, row in non_control_pairs.iterrows():
    v = row[column_values].to_numpy(dtype=float)
    pair_to_pval[row["Combined name"]] = scipy.stats.mannwhitneyu(v, control_vals, alternative = "greater").pvalue

m = len(pair_to_pval)
alpha = 0.1
p_vals_sorted = dict(sorted(pair_to_pval.items(), key=operator.itemgetter(1)))

# BH step-up: find the largest rank k with p_(k) <= (k / m) * alpha and reject
# every hypothesis ranked 1..k (rather than testing each rank in isolation).
ranks = np.arange(1, m + 1)
passes = np.asarray(list(p_vals_sorted.values()), dtype=float) <= (ranks / m) * alpha
counter = int(ranks[passes].max()) if passes.any() else 0
signif = (ranks <= counter).tolist()

print(counter)



80


In [63]:
# Test every non-control peptide pair in the mESC sheet against the control
# pair (one-sided Mann-Whitney U), apply Benjamini-Hochberg (BH) FDR control at
# alpha = 0.01, and write the sheet back out with p-value / significance columns.
df = pd.read_excel("PE_dual_peptide_analysis.xlsx", sheet_name = "mESC_CtrlnormfracPE")
df = df.drop(columns = ["Unnamed: 13"])

column_values = [
    "norm_fracPE_mESCrep1M",
    "norm_fracPE_mESCrep1R",
    "norm_fracPE_mESCrep2M",
    "norm_fracPE_mESCrep2R",
    "norm_fracPE_mESCrep3M",
    "norm_fracPE_mESCrep3R",
    "norm_fracPE_mESCrep4M",
    "norm_fracPE_mESCrep4R",
    "norm_fracPE_mESCrep5M",
    "norm_fracPE_mESCrep5R"
]

# Reference sample: the replicate values of the control/control pair.  Row
# slices are object-dtype, so they are cast to float before being given to scipy.
control_row = df.loc[df["Combined name"] == "VCPp8 (control)-VCPp8 (control)"].iloc[0]
control_vals = control_row[column_values].to_numpy(dtype=float)

# Only pairs in which neither peptide is the control are tested.
non_control_pairs = df.loc[
    (df["peptide 1 (read 1)"] != "VCPp8 (control)") &
    (df["peptide 2 (read 2)"] != "VCPp8 (control)")
]

pair_to_pval = {}
for index, row in non_control_pairs.iterrows():
    v = row[column_values].to_numpy(dtype=float)
    pair_to_pval[row["Combined name"]] = scipy.stats.mannwhitneyu(v, control_vals, alternative = "greater").pvalue

m = len(pair_to_pval)
print(m)
alpha = 0.01
p_vals_sorted = dict(sorted(pair_to_pval.items(), key=operator.itemgetter(1)))
sorted_names = list(p_vals_sorted.keys())

# BH step-up: find the largest rank k with p_(k) <= (k / m) * alpha and reject
# every hypothesis ranked 1..k, so the significant pairs are always a prefix
# of the p-value ordering.
ranks = np.arange(1, m + 1)
passes = np.asarray(list(p_vals_sorted.values()), dtype=float) <= (ranks / m) * alpha
counter = int(ranks[passes].max()) if passes.any() else 0
signif = (ranks <= counter).tolist()
significant_vals = set(sorted_names[:counter])
insignificant_vals = set(sorted_names[counter:])

print(counter)

# Annotate every row of the sheet: tested pairs get their p-value and a bool,
# the control pair gets p = 1, and pairs containing exactly one control
# peptide were never tested and are marked as excluded.
is_signif = []
p_vals = []
for index, row in df.iterrows():
    pair = row["Combined name"]
    if pair in significant_vals:
        p_vals.append(p_vals_sorted[pair])
        is_signif.append(True)
    elif pair in insignificant_vals:
        p_vals.append(p_vals_sorted[pair])
        is_signif.append(False)
        print("insignif")
    elif pair == "VCPp8 (control)-VCPp8 (control)":
        p_vals.append(1)
        is_signif.append("N/A (Control)")
    else:
        p_vals.append(None)
        is_signif.append("N/A (Excluded)")
df["p-val"] = p_vals
df["Significant FDR = 0.01"] = is_signif

df.to_csv("mESC_CtrlnormfracPE_significant_dual_peptides.csv")

81
79
insignif
insignif


In [71]:
# Test every non-control peptide pair in the HCT116 sheet against the control
# pair (one-sided Mann-Whitney U), apply Benjamini-Hochberg (BH) FDR control at
# `alpha`, and write the sheet back out with p-value / significance columns.
df = pd.read_excel("PE_dual_peptide_analysis.xlsx", sheet_name = "HCT116_CtrlnormfracPE")

df["Combined name"] = df["peptide 1 (read 1)"] + "-" + df["peptide 2 (read 2)"]

column_values = ["norm_fracPE_HCT116rep1", "norm_fracPE_HCT116rep2"]

first_is_control = df["peptide 1 (read 1)"] == "VCPp8 (control)"
second_is_control = df["peptide 2 (read 2)"] == "VCPp8 (control)"

# Reference sample: the replicate values of the control/control pair.
control_vals = df.loc[first_is_control & second_is_control].iloc[0][column_values].values

print(control_vals)

# Only pairs in which neither peptide is the control are tested.
non_control_pairs = df.loc[~first_is_control & ~second_is_control]

# Row slices are object-dtype, so they are cast to float before being given to scipy.
control_floats = control_vals.astype(float)
pair_to_pval = {}
for index, row in non_control_pairs.iterrows():
    v = row[column_values].to_numpy(dtype=float)
    pair_to_pval[row["Combined name"]] = scipy.stats.mannwhitneyu(v, control_floats, alternative = "greater").pvalue

m = len(pair_to_pval)
print(m)
# NOTE(review): each test here is 2 replicates vs 2 control values; the
# recorded run shows p ~= 0.11 for the pairs and 0 significant results, so this
# comparison appears unable to pass alpha = 0.1 — confirm the design is intended.
alpha = 0.1
p_vals_sorted = dict(sorted(pair_to_pval.items(), key=operator.itemgetter(1)))
sorted_names = list(p_vals_sorted.keys())

# BH step-up: find the largest rank k with p_(k) <= (k / m) * alpha and reject
# every hypothesis ranked 1..k, so the significant pairs are always a prefix
# of the p-value ordering.
ranks = np.arange(1, m + 1)
passes = np.asarray(list(p_vals_sorted.values()), dtype=float) <= (ranks / m) * alpha
counter = int(ranks[passes].max()) if passes.any() else 0
signif = (ranks <= counter).tolist()
significant_vals = set(sorted_names[:counter])
insignificant_vals = set(sorted_names[counter:])

print(counter)

# Annotate every row of the sheet: tested pairs get their p-value and a bool,
# the control pair gets p = 1, and pairs containing exactly one control
# peptide were never tested and are marked as excluded.
is_signif = []
p_vals = []
for index, row in df.iterrows():
    pair = row["Combined name"]
    if pair in significant_vals:
        p_vals.append(p_vals_sorted[pair])
        is_signif.append(True)
    elif pair in insignificant_vals:
        p_vals.append(p_vals_sorted[pair])
        is_signif.append(False)
    elif pair == "VCPp8 (control)-VCPp8 (control)":
        p_vals.append(1)
        print("found")
        is_signif.append("N/A (Control)")
    else:
        p_vals.append(None)
        is_signif.append("N/A (Excluded)")
df["p-val"] = p_vals
# The column label is derived from `alpha`; it was previously hard-coded as
# "Significant FDR = 0.01" although this cell tests at alpha = 0.1.
df[f"Significant FDR = {alpha}"] = is_signif

df.to_csv("HCT116_CtrlnormfracPE_significant_dual_peptides.csv")




[1.0 1.0]
81
0
found


In [48]:
# Display the name -> p-value mapping (ordered by ascending p-value) that an
# analysis cell left in the kernel.  NOTE(review): this cell defines nothing
# itself, so what it shows depends on which analysis cell was executed last.
p_vals_sorted

{'NFATC2IPp1-IGF1pm1': 0.11033568095992341,
 'MSH4p2-NFATC2IPp1': 0.11033568095992341,
 'CDKN2Ap1-NFATC2IPp1': 0.11033568095992341,
 'POLNpm19-NFATC2IPp1': 0.11033568095992341,
 'NFATC2IPp1-ZBTB21p11': 0.11033568095992341,
 'IGF1pm1-IGF1pm1': 0.11033568095992341,
 'NFATC2IPp1-MSH4p2': 0.11033568095992341,
 'NFATC2IPp1-CDKN2Ap1': 0.11033568095992341,
 'POLNpm19-ZBTB21p11': 0.11033568095992341,
 'POLNpm19-IGF1pm1': 0.11033568095992341,
 'POLNpm19-CDKN2Ap1': 0.11033568095992341,
 'NFATC2IPp1-POLNpm19': 0.11033568095992341,
 'IGF1pm1-NFATC2IPp1': 0.11033568095992341,
 'IGF1pm1-MSH4p2': 0.11033568095992341,
 'ZBTB21p11-NFATC2IPp1': 0.11033568095992341,
 'CDKN2Ap1-IGF1pm1': 0.11033568095992341,
 'CDKN2Ap1-MSH4p2': 0.11033568095992341,
 'POLNpm19-MSH4p2': 0.11033568095992341,
 'MSH4p2-ZBTB21p11': 0.11033568095992341,
 'IGF1pm1-ZBTB21p11': 0.11033568095992341,
 'CDKN2Ap1-ZBTB21p11': 0.11033568095992341,
 'IGF1pm1-POLNpm19': 0.11033568095992341,
 'CDKN2Ap1-BRCA2p13': 0.11033568095992341,
 'CDKN