In [1]:
import pandas as pd

## Crosslinks

In [2]:
# Read the residue-level link table of each sample and keep only the rows in
# which neither link end is flagged as a decoy.
s1_links = pd.read_csv("S1_FP_RP/1perc_residue_Links_xiFDR2.2.1.csv").loc[
    lambda links: (links["Decoy1"] == False) & (links["Decoy2"] == False)
]
s2_links = pd.read_csv("S2_FP_RP/1perc_residue_Links_xiFDR2.2.1.csv").loc[
    lambda links: (links["Decoy1"] == False) & (links["Decoy2"] == False)
]
s3_links = pd.read_csv("S3_FP_RP/1perc_residue_Links_xiFDR2.2.1.csv").loc[
    lambda links: (links["Decoy1"] == False) & (links["Decoy2"] == False)
]

In [3]:
# Number of rows in s1_links (decoys already removed) per fdrGroup value.
s1_links["fdrGroup"].value_counts()

fdrGroup
Self       158
Between     47
Name: count, dtype: int64

In [4]:
# Number of rows in s2_links (decoys already removed) per fdrGroup value.
s2_links["fdrGroup"].value_counts()

fdrGroup
Self       255
Between     46
Name: count, dtype: int64

In [5]:
# Number of rows in s3_links (decoys already removed) per fdrGroup value.
s3_links["fdrGroup"].value_counts()

fdrGroup
Self       176
Between     34
Name: count, dtype: int64

In [6]:
def get_crosslink_types(xls, df):
    """Record the ``fdrGroup`` of each crosslink in ``df`` that ``xls`` does not hold yet.

    A crosslink is identified by its two link ends, each written as
    ``<protein>_<site>`` with surrounding whitespace stripped. The two ends are
    sorted and joined with ``--``, so the key is the same regardless of which
    end is stored as Protein1/fromSite and which as Protein2/ToSite.

    Args:
        xls: dict mapping crosslink key -> fdrGroup value. Updated in place;
            keys that are already present are never overwritten.
        df: DataFrame providing the columns ``Protein1``, ``fromSite``,
            ``Protein2``, ``ToSite`` and ``fdrGroup``.

    Returns:
        None. The result is the mutation of ``xls``.
    """
    for _, link in df.iterrows():
        end_a = "_".join((str(link["Protein1"]).strip(), str(link["fromSite"]).strip()))
        end_b = "_".join((str(link["Protein2"]).strip(), str(link["ToSite"]).strip()))
        key = "--".join(sorted((end_a, end_b)))
        if key in xls:
            # first occurrence wins
            continue
        xls[key] = link["fdrGroup"]
    return None

In [7]:
# Pool the unique crosslinks of the three link tables. The tables are processed
# in order, so the first table containing a crosslink sets its recorded fdrGroup.
xls = {}
for links in (s1_links, s2_links, s3_links):
    get_crosslink_types(xls, links)
len(xls)

435

In [8]:
# Tally the pooled crosslinks by group label: "Self" counts as intra,
# "Between" counts as inter (substring match on the stored fdrGroup value).
nr_intra = sum("Self" in group for group in xls.values())
nr_inter = sum("Between" in group for group in xls.values())
print(f"Nr of intra links: {nr_intra}\nNr of inter links: {nr_inter}")

Nr of intra links: 354
Nr of inter links: 81


## Export all crosslinks (XLs)

In [9]:
def _parse_peptide(row, marker):
    """Return the upper-case letters of the peptide token that follows ``marker``.

    Only the first entry of the ``;``-separated ``PSMIDs`` value is inspected.
    The peptide token is the text between ``marker`` and the next space. If the
    token contains a ``.``, only the part between the first and second ``.`` is
    used. All characters that are not upper-case letters are dropped.

    Raises:
        RuntimeError: if ``marker`` does not occur in the first PSM ID, or if
            no upper-case letter remains after filtering.
    """
    first_psm = str(row["PSMIDs"]).split(";")[0]
    parts = first_psm.split(marker)
    if len(parts) < 2:
        # Fail with the same exception type as the empty-sequence case below
        # instead of an uninformative IndexError.
        raise RuntimeError(f"Couldn't parse peptide sequence: '{marker}' not found in PSM ID '{first_psm}'!")
    token = parts[1].split(" ")[0]
    if "." in token:
        token = token.split(".")[1]
    seq = "".join(aa for aa in token if aa.isupper())
    if len(seq) == 0:
        raise RuntimeError("Couldn't parse peptide sequence!")
    return seq


def get_peptide1(row):
    """Return the sequence of the first peptide (``P1_`` token) of ``row``.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a ``PSMIDs`` entry.

    Returns:
        The peptide sequence as a string of upper-case letters.

    Raises:
        RuntimeError: if no sequence can be parsed from the first PSM ID.
    """
    return _parse_peptide(row, "P1_")


def get_peptide2(row):
    """Return the sequence of the second peptide (``P2_`` token) of ``row``.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a ``PSMIDs`` entry.

    Returns:
        The peptide sequence as a string of upper-case letters.

    Raises:
        RuntimeError: if no sequence can be parsed from the first PSM ID.
    """
    return _parse_peptide(row, "P2_")

In [10]:
def get_positions(row):
    """Return the two integers that follow the ``P2_`` peptide token.

    Only the first entry of the ``;``-separated ``PSMIDs`` value is used. The
    text after ``P2_`` is split on single spaces; field 0 is the peptide token
    itself, fields 1 and 2 are converted to ``int`` and returned.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a ``PSMIDs`` entry.

    Returns:
        A two-element list ``[int(field 1), int(field 2)]``.
    """
    after_p2 = str(row["PSMIDs"]).split(";")[0].split("P2_")[1].split(" ")
    raw_positions = (after_p2[1], after_p2[2])
    return [int(raw) for raw in raw_positions]

In [11]:
def get_crosslinks(xls, df):
    """Add an export record for each crosslink in ``df`` that ``xls`` lacks.

    Crosslinks are keyed by their two ``<protein>_<site>`` ends (whitespace
    stripped), sorted and joined with ``--``. For a new key, the peptide
    sequences and peptide-level positions are parsed from the row via
    ``get_peptide1``, ``get_peptide2`` and ``get_positions``. The protein
    accessions and protein-level sites are copied from the row unchanged.

    Args:
        xls: dict mapping crosslink key -> record dict. Updated in place;
            existing keys are left untouched.
        df: DataFrame providing the columns ``Protein1``, ``fromSite``,
            ``Protein2``, ``ToSite`` and ``PSMIDs``.

    Returns:
        None. The result is the mutation of ``xls``.
    """
    for _, link in df.iterrows():
        ends = sorted([
            str(link["Protein1"]).strip() + "_" + str(link["fromSite"]).strip(),
            str(link["Protein2"]).strip() + "_" + str(link["ToSite"]).strip(),
        ])
        key = "--".join(ends)
        if key in xls:
            continue
        pos1, pos2 = get_positions(link)
        # Build the record completely before storing it, so a parse error
        # leaves ``xls`` without a partial entry.
        record = {}
        record["Peptide 1"] = get_peptide1(link)
        record["Peptide 1 XL Pos"] = pos1
        record["Protein 1"] = link["Protein1"]
        record["Protein 1 XL Pos"] = link["fromSite"]
        record["Peptide 2"] = get_peptide2(link)
        record["Peptide 2 XL Pos"] = pos2
        record["Protein 2"] = link["Protein2"]
        record["Protein 2 XL Pos"] = link["ToSite"]
        xls[key] = record
    return None

In [12]:
# Build one export record per unique crosslink across the three link tables.
# A crosslink found in several tables keeps the record of the first table that
# contains it.
export = {}
for links in (s1_links, s2_links, s3_links):
    get_crosslinks(export, links)
len(export)

435

In [13]:
# Turn the per-crosslink records into one list per output column, keeping the
# column order of the final table.
export_dict = {
    column: [record[column] for record in export.values()]
    for column in (
        "Peptide 1", "Peptide 1 XL Pos", "Protein 1", "Protein 1 XL Pos",
        "Peptide 2", "Peptide 2 XL Pos", "Protein 2", "Protein 2 XL Pos",
    )
}

In [14]:
# Assemble the export table from the column lists; each entry of `export`
# contributed exactly one row. Show (rows, columns) as a sanity check.
export_df = pd.DataFrame(export_dict)
export_df.shape

(435, 8)

In [15]:
# Display the export table (pandas truncates long frames in the rendered output).
export_df

Unnamed: 0,Peptide 1,Peptide 1 XL Pos,Protein 1,Protein 1 XL Pos,Peptide 2,Peptide 2 XL Pos,Protein 2,Protein 2 XL Pos
0,SSNRFLKDAGSGVENASK,7,Q19057,577,SSNRFLKDAGSGVENASK,7,Q19057,577
1,KPVPEWCDEAVKPSEK,1,O76840,678,KPVPEWCDEAVKPSEK,1,O76840,678
2,TIAECLADELINAAKGSSNSYAIK,15,P52821,62,EVITYKLITPSVVSERLK,6,P49041,188
3,NLRAYEGVPAKYQK,11,Q27389,33,LLQGDKVVVLR,6,Q27389,112
4,RYVNVASGPGKK,11,Q20228,29,ERLDQELKLIGTFGLK,8,Q18231,122
...,...,...,...,...,...,...,...,...
430,VFFKAGVLAHLEDIRDEK,4,P02566,775,KGFPNR,1,Q21000;O62244;P12844;P02567;P02566;P12845,715;709;719;712;714;720
431,RIAKHR,4,Q23494,379,LRDTYKNHPEVR,6,Q23494,389
432,YLAEVASEDRAAVVEKSQK,16,P41932;N1NV25,145;23,MKGDYYR,1,N1NV25,1
433,GSAVEREISGQYAADKK,16,Q95XS1,103,GSAVEREISGQYAADKK,16,Q95XS1,103


In [16]:
# export_df.to_excel("high-confidence_crosslinks_all_replicates.xlsx", index = False)

## PPIs

In [17]:
# Read the PPI table of each sample and keep only the rows in which neither
# partner is flagged as a decoy.
s1_ppi = pd.read_csv("S1_FP_PPI_inter/1perc_ppi_inter_ppi_xiFDR2.2.1.csv").loc[
    lambda ppi: (ppi["isDecoy1"] == False) & (ppi["isDecoy2"] == False)
]
s2_ppi = pd.read_csv("S2_FP_PPI_inter/1perc_ppi_inter_ppi_xiFDR2.2.1.csv").loc[
    lambda ppi: (ppi["isDecoy1"] == False) & (ppi["isDecoy2"] == False)
]
s3_ppi = pd.read_csv("S3_FP_PPI_inter/1perc_ppi_inter_ppi_xiFDR2.2.1.csv").loc[
    lambda ppi: (ppi["isDecoy1"] == False) & (ppi["isDecoy2"] == False)
]

In [18]:
# Number of rows in s1_ppi (decoys already removed) per fdrGroup value.
s1_ppi["fdrGroup"].value_counts()

fdrGroup
Self       108
Between     29
Name: count, dtype: int64

In [19]:
# Number of rows in s2_ppi (decoys already removed) per fdrGroup value.
s2_ppi["fdrGroup"].value_counts()

fdrGroup
Self       149
Between     33
Name: count, dtype: int64

In [20]:
# Number of rows in s3_ppi (decoys already removed) per fdrGroup value.
s3_ppi["fdrGroup"].value_counts()

fdrGroup
Self       115
Between     31
Name: count, dtype: int64

In [21]:
def get_ppi_types(ppis, df):
    """Record the ``fdrGroup`` of each protein pair in ``df`` that ``ppis`` lacks.

    A pair is keyed by its two protein entries (whitespace stripped), sorted
    and joined with ``--``, so the key does not depend on which partner is
    stored as Protein1 and which as Protein2.

    Args:
        ppis: dict mapping pair key -> fdrGroup value. Updated in place;
            keys that are already present are never overwritten.
        df: DataFrame providing the columns ``Protein1``, ``Protein2`` and
            ``fdrGroup``.

    Returns:
        None. The result is the mutation of ``ppis``.
    """
    for _, pair in df.iterrows():
        partners = sorted((str(pair["Protein1"]).strip(), str(pair["Protein2"]).strip()))
        key = "--".join(partners)
        if key in ppis:
            # first occurrence wins
            continue
        ppis[key] = pair["fdrGroup"]

In [22]:
# Pool the unique protein pairs of the three PPI tables. The tables are
# processed in order, so the first table containing a pair sets its fdrGroup.
ppis = {}
for ppi_table in (s1_ppi, s2_ppi, s3_ppi):
    get_ppi_types(ppis, ppi_table)
len(ppis)

244

In [23]:
# Tally the pooled protein pairs by group label: "Self" counts as intra,
# "Between" counts as inter (substring match on the stored fdrGroup value).
nr_intra = sum("Self" in group for group in ppis.values())
nr_inter = sum("Between" in group for group in ppis.values())
print(f"Nr of intra PPIs: {nr_intra}\nNr of inter PPIs: {nr_inter}")

Nr of intra PPIs: 192
Nr of inter PPIs: 52


## Export to xiVIEW

In [24]:
# Link tables from the three *_FP_PPI_inter result folders. Unlike the tables
# loaded earlier, decoys are NOT removed here; rows are filtered one by one
# when the links are collected for the xiVIEW export.
s1_links_ppi_inter = pd.read_csv("S1_FP_PPI_inter/1perc_ppi_inter_Links_xiFDR2.2.1.csv")
s2_links_ppi_inter = pd.read_csv("S2_FP_PPI_inter/1perc_ppi_inter_Links_xiFDR2.2.1.csv")
s3_links_ppi_inter = pd.read_csv("S3_FP_PPI_inter/1perc_ppi_inter_Links_xiFDR2.2.1.csv")

In [25]:
# Tables to scan (in this order) and the column lists that hold the collected
# links: one list per output column, all kept at equal length.
data = [s1_links_ppi_inter, s2_links_ppi_inter, s3_links_ppi_inter]
inter_links_1perc_ppi_fdr = {"AbsPos1": [], "AbsPos2": [], "Protein1": [], "Protein2": []}

In [26]:
# Collect every unique, non-decoy, inter-protein link from the tables in `data`.
# Both accumulators are (re)created here so that re-running this cell starts
# from scratch; otherwise `seen_xl` would be reset while the column lists kept
# their old content, and every link would be appended a second time.
seen_xl = set()
inter_links_1perc_ppi_fdr = {"AbsPos1": [], "AbsPos2": [], "Protein1": [], "Protein2": []}
for df in data:
    for _, row in df.iterrows():
        # order-independent key: sorted "<protein>_<site>" ends joined by "--"
        current_xl = "--".join(sorted([str(row["Protein1"]).strip() + "_" + str(row["fromSite"]).strip(), str(row["Protein2"]).strip() + "_" + str(row["ToSite"]).strip()]))
        # skip if xl not unique
        if current_xl in seen_xl:
            continue
        # skip decoys
        if row["Decoy1"] == True or row["Decoy2"] == True:
            continue
        # skip if PPIFDR is greater than 1% (none should be skipped)
        if row["PPIFDR"] > 0.01:
            continue
        # skip if xl is intra
        if "Self" in row["fdrGroup"]:
            continue
        # only links that passed every filter are marked as seen and exported
        seen_xl.add(current_xl)
        inter_links_1perc_ppi_fdr["AbsPos1"].append(row["fromSite"])
        inter_links_1perc_ppi_fdr["AbsPos2"].append(row["ToSite"])
        inter_links_1perc_ppi_fdr["Protein1"].append(row["Protein1"])
        inter_links_1perc_ppi_fdr["Protein2"].append(row["Protein2"])

In [27]:
# Turn the collected column lists into the table intended for xiVIEW.
inter_links_1perc_ppi_fdr_df = pd.DataFrame(inter_links_1perc_ppi_fdr)

In [28]:
# Display the xiVIEW table (pandas truncates long frames in the rendered output).
inter_links_1perc_ppi_fdr_df

Unnamed: 0,AbsPos1,AbsPos2,Protein1,Protein2
0,62,188,P52821,P49041
1,122,29,Q18231,Q20228
2,194,146,O45012,Q21276
3,82,25,P91128,P49181
4,32;32;31;32,92,Q27484;Q27876;P04255;Q27894,P62784
...,...,...,...,...
71,87;149,116,I2HAF9;Q9NEN6,O01868
72,182,227,Q27389,O17536
73,118;118;117;118,11;11;11;11,Q27484;Q27876;P04255;Q27894,Q27485;J7S164;J7SA65;P09588
74,32,307,O01802,O02056


In [29]:
# inter_links_1perc_ppi_fdr_df.to_csv("inter_links_1perc_ppi_fdr_xiview.csv", index = False)