# Setup

First we load the datasets used in the analysis: the cluster datasets for both stages, where each row is a cluster, and the protein datasets for both stages, where each row is a protein.

In [None]:
import pandas as pd
# Cluster-level tables, each tagged with the stage it belongs to so the stage
# can still be told apart after the tables are combined later on.
s3_df = pd.read_csv("data/generated_tables/s3_network.tsv", sep="\t").assign(stage=3)
s5_df = pd.read_csv("data/generated_tables/s5_network.tsv", sep="\t").assign(stage=5)

# Protein-level essentiality tables, tagged with their stage in the same way.
s5_essentiality_df = pd.read_csv(
    "data/generated_tables/s5_essentiality_df.tsv", sep="\t"
).assign(stage=5)
s3_essentiality_df = pd.read_csv(
    "data/generated_tables/s3_essentiality_df.tsv", sep="\t"
).assign(stage=3)

# Stack both protein tables into one frame (stage 5 rows first).
protein_concat_df = pd.concat([s5_essentiality_df, s3_essentiality_df])

In [None]:
# Count the clusters flagged in the "significant" column of each stage table.
n_significant_s5 = sum(s5_df["significant"])
n_significant_s3 = sum(s3_df["significant"])
print("Number of clusters in s5 network greater than random: ", n_significant_s5)
print("Number of clusters in s3 network greater than random: ", n_significant_s3)

Number of clusters in s5 network greater than random:  59
Number of clusters in s3 network greater than random:  25


In [None]:
from ast import literal_eval

# function to convert str representation of Proteins cluster column in df to sets
def convert_data(df):
    """Parse the string-encoded "Proteins" column of ``df`` into Python objects.

    Each string cell is evaluated with ``ast.literal_eval``. Cells that are not
    strings (for example, sets produced by an earlier call) are left untouched,
    so re-running the cell that calls this function does not raise.

    Args:
        df: DataFrame with a "Proteins" column. It is modified in place.

    Returns:
        The same DataFrame object, with "Proteins" converted.
    """
    df["Proteins"] = df["Proteins"].apply(
        lambda cell: literal_eval(cell) if isinstance(cell, str) else cell
    )
    return df

print("data type of cluster before converting: ", type(s3_df["Proteins"][0]))
# convert_data mutates the frame it is given, so its return value is not needed.
for cluster_table in (s3_df, s5_df):
    convert_data(cluster_table)
print("data type of cluster after converting: ", type(s3_df["Proteins"][0]))


data type of cluster before converting:  <class 'str'>
data type of cluster after converting:  <class 'set'>


## Tagging Protein Essentiality to Clusters

In [None]:
# tagging protein essentiality to clusters
def calculate_cluster_essentiality(cluster, essentiality_df=None):
    """Return the fraction of proteins in ``cluster`` that are flagged essential.

    Args:
        cluster: Collection of protein accession IDs.
        essentiality_df: Table with "Accession ID" and "essential" columns used
            for the lookup. When omitted, the notebook-level
            ``s5_essentiality_df`` is used, which is the table the original
            implementation always consulted.

    Returns:
        The sum of the "essential" values of the cluster's proteins divided by
        the cluster size. Proteins absent from the table contribute 0. An empty
        cluster yields NaN instead of raising ``ZeroDivisionError``.
    """
    if essentiality_df is None:
        essentiality_df = s5_essentiality_df
    if len(cluster) == 0:
        return float("nan")
    # Build the accession -> flag lookup once instead of scanning the whole
    # table for every protein. For a duplicated accession the first row wins,
    # matching the previous `.values[0]` behaviour.
    essential_by_accession = (
        essentiality_df.drop_duplicates(subset="Accession ID", keep="first")
        .set_index("Accession ID")["essential"]
        .to_dict()
    )
    total = sum(essential_by_accession.get(protein, 0) for protein in cluster)
    return total / len(cluster)

# Score every cluster of both stages with its fraction of essential proteins.
# NOTE(review): calculate_cluster_essentiality looks proteins up in
# s5_essentiality_df, so stage-3 proteins missing from that table count as
# non-essential here - confirm this is intended.
s5_df["essentiality"] = [
    calculate_cluster_essentiality(cluster) for cluster in s5_df["Proteins"]
]
s3_df["essentiality"] = [
    calculate_cluster_essentiality(cluster) for cluster in s3_df["Proteins"]
]

## Analyzing Between Clusters

In [None]:
# Prefix the shared column names with their stage so both tables can sit side
# by side in one cross-joined frame without colliding.
s3_column_names = {
    "Number": "s3_number",
    "Proteins": "s3_proteins",
    "avg_spearman": "s3_avg_spearman",
    "size": "s3_size",
    "essentiality": "s3_essentiality",
}
s5_column_names = {
    "Number": "s5_number",
    "Proteins": "s5_proteins",
    "avg_spearman": "s5_avg_spearman",
    "size": "s5_size",
    "essentiality": "s5_essentiality",
}
s3_df.rename(columns=s3_column_names, inplace=True)
s5_df.rename(columns=s5_column_names, inplace=True)

In [None]:
# Pair every stage-3 cluster with every stage-5 cluster (Cartesian product).
s3_join_columns = ["s3_number", "s3_proteins", "s3_avg_spearman", "s3_size", "s3_essentiality"]
s5_join_columns = ["s5_number", "s5_proteins", "s5_avg_spearman", "s5_size", "s5_essentiality"]
# Interleave the s3/s5 columns; the essentiality columns are not part of this
# selection, so they are dropped from the crossed frame.
paired_column_order = [
    "s3_number", "s5_number",
    "s3_proteins", "s5_proteins",
    "s3_avg_spearman", "s5_avg_spearman",
    "s3_size", "s5_size",
]
crossed_df = (
    s3_df[s3_join_columns]
    .merge(s5_df[s5_join_columns], how="cross")
    .loc[:, paired_column_order]
)
# Combined number of proteins of the pair (shared proteins are counted twice).
crossed_df["total_size"] = crossed_df["s3_size"] + crossed_df["s5_size"]

In [None]:
# For every (s3 cluster, s5 cluster) pair, list the s3 proteins that also
# occur in the s5 cluster.
common_proteins = [
    [protein for protein in s3_cluster if protein in s5_cluster]
    for s3_cluster, s5_cluster in zip(crossed_df["s3_proteins"], crossed_df["s5_proteins"])
]
crossed_df["common_proteins"] = common_proteins

In [None]:
# look for common proteins across clusters
def get_common_proteins(proteins1, proteins2, df=None):
    """List, for every row of a cluster-pair table, the proteins both clusters share.

    Args:
        proteins1: Name of the column holding the first cluster's proteins.
        proteins2: Name of the column holding the second cluster's proteins.
        df: Cluster-pair table to scan. When omitted, the notebook-level
            ``crossed_df`` is used, which is the only table the original
            implementation could work on.

    Returns:
        A list with one entry per row of ``df``: the proteins from the
        ``proteins1`` column that are also in the ``proteins2`` column, in the
        iteration order of the first collection.
    """
    if df is None:
        df = crossed_df
    # zip over the two columns avoids building a Series per row via iterrows.
    return [
        [protein for protein in first_cluster if protein in second_cluster]
        for first_cluster, second_cluster in zip(df[proteins1], df[proteins2])
    ]

In [None]:
# Number of proteins shared by each cluster pair.
crossed_df["num_overlap"] = crossed_df["common_proteins"].apply(len)
# Keep only the pairs sharing at least one protein, then show the five pairs
# with the largest overlap.
filtered_crossed_df = crossed_df.loc[crossed_df["num_overlap"].gt(0)]
(
    filtered_crossed_df
    .sort_values("num_overlap", ascending=False)
    .head(5)
)

Unnamed: 0,s3_number,s5_number,s3_proteins,s5_proteins,s3_avg_spearman,s5_avg_spearman,s3_size,s5_size,total_size,common_proteins,num_overlap
509,6,5,"{Q8IAR3, Q8IC01, C0H5H0, C6KST3, Q8IBI3, Q8IJN...","{Q8IAR3, Q8I6T3, C6KST3, Q8IBI3, Q8IJN9, Q8IDG...",0.70285,0.744068,24,35,59,"[Q8IAR3, C6KST3, Q8IBI3, Q8IJN9, Q8IDG2, Q8IDG...",15
89,1,5,"{Q8IAR3, Q8IC01, C6KST3, Q8IBI3, Q8IJN9, Q8IDG...","{Q8IAR3, Q8I6T3, C6KST3, Q8IBI3, Q8IJN9, Q8IDG...",0.697981,0.744068,24,35,59,"[Q8IAR3, C6KST3, Q8IBI3, Q8IJN9, Q8IDG2, Q8IDG...",15
2,0,2,"{Q8IKF0, Q8I0P6, Q8IIV2, Q8I0V2, Q8IB24, Q8IBV...","{Q8IL58, Q8IIV2, Q8I4U5, Q8IIU8, Q8I487, Q8IEQ...",0.246667,0.539301,16,65,81,"[Q8I0P6, Q8IIV2, Q8I0V2, Q8IB24, Q8IBV7, Q8IET...",11
1178,14,2,"{Q8IAX8, Q8IIV2, Q8I0V2, Q8IB24, Q8IBV7, Q8I60...","{Q8IL58, Q8IIV2, Q8I4U5, Q8IIU8, Q8I487, Q8IEQ...",0.277058,0.539301,14,65,79,"[Q8IAX8, Q8IIV2, Q8I0V2, Q8IB24, Q8IBV7, Q8IJX...",10
1514,18,2,"{Q8IKF0, Q7KQL5, C6KTA4, Q8IK89, Q8I0P6, O9615...","{Q8IL58, Q8IIV2, Q8I4U5, Q8IIU8, Q8I487, Q8IEQ...",0.356469,0.539301,21,65,86,"[Q8I0P6, O97285, K7NTP5, Q8I0V2, Q8IET7, Q8IKR...",7


In [None]:
def calculate_overlap(row, size_col1, size_col2):
    """Return the row's shared-protein count relative to the smaller cluster.

    Args:
        row: Mapping-like row providing "num_overlap" and both size columns.
        size_col1: Name of the column holding the first cluster's size.
        size_col2: Name of the column holding the second cluster's size.

    Returns:
        ``row["num_overlap"]`` divided by the smaller of the two sizes.
    """
    smaller_size = min(row[size_col1], row[size_col2])
    return row["num_overlap"] / smaller_size

def calculate_jaccard_index(row, proteins_col1, proteins_col2):
    """Return the Jaccard index of the two protein collections of a row.

    Args:
        row: Mapping-like row providing "common_proteins" and both protein columns.
        proteins_col1: Name of the column holding the first cluster's proteins.
        proteins_col2: Name of the column holding the second cluster's proteins.

    Returns:
        Number of entries in ``row["common_proteins"]`` divided by the number
        of distinct proteins found in either collection.
    """
    all_proteins = set(row[proteins_col1]) | set(row[proteins_col2])
    shared_count = len(row["common_proteins"])
    return shared_count / len(all_proteins)

# Overlap relative to the smaller cluster of each pair.
crossed_df["percent_overlap"] = crossed_df.apply(
    lambda row: calculate_overlap(row, "s3_size", "s5_size"), axis=1
)
# Overlap relative to the union of both clusters.
crossed_df["jaccard_index"] = crossed_df.apply(
    lambda row: calculate_jaccard_index(row, "s3_proteins", "s5_proteins"), axis=1
)
# Similarity score = plain average of the two measures above.
crossed_df["similarity_score"] = (
    crossed_df["percent_overlap"].add(crossed_df["jaccard_index"]).div(2)
)

In [None]:
# Five most similar cluster pairs, without the bulky protein columns.
(
    crossed_df[
        [
            "s3_number", "s5_number",
            "s3_avg_spearman", "s5_avg_spearman",
            "s3_size", "s5_size", "total_size",
            "num_overlap", "jaccard_index", "percent_overlap", "similarity_score",
        ]
    ]
    .sort_values("similarity_score", ascending=False)
    .head(5)
)

Unnamed: 0,s3_number,s5_number,s3_avg_spearman,s5_avg_spearman,s3_size,s5_size,total_size,num_overlap,jaccard_index,percent_overlap,similarity_score
1848,22,0,0.333005,0.326273,2,6,8,2,0.333333,1.0,0.666667
1466,17,38,0.568391,0.601248,4,6,10,3,0.428571,0.75,0.589286
996,11,72,0.6033,0.605468,5,5,10,3,0.428571,0.6,0.514286
946,11,22,0.6033,0.231954,5,6,11,3,0.375,0.6,0.4875
509,6,5,0.70285,0.744068,24,35,59,15,0.340909,0.625,0.482955


In [None]:
# Five cluster pairs with the highest percent overlap, restricted to pairs
# that share more than two proteins.
(
    crossed_df[crossed_df["num_overlap"].gt(2)]
    .sort_values("percent_overlap", ascending=False)
    .head(5)
)

Unnamed: 0,s3_number,s5_number,s3_proteins,s5_proteins,s3_avg_spearman,s5_avg_spearman,s3_size,s5_size,total_size,common_proteins,num_overlap,percent_overlap,jaccard_index,similarity_score
1466,17,38,"{Q76NM4, Q8IHR8, Q8I3W9, A0A5K1K8H7}","{A0A5K1K8H7, Q8IIK8, C0H516, Q8I274, Q76NM4, Q...",0.568391,0.601248,4,6,10,"[Q76NM4, Q8I3W9, A0A5K1K8H7]",3,0.75,0.428571,0.589286
1178,14,2,"{Q8IAX8, Q8IIV2, Q8I0V2, Q8IB24, Q8IBV7, Q8I60...","{Q8IL58, Q8IIV2, Q8I4U5, Q8IIU8, Q8I487, Q8IEQ...",0.277058,0.539301,14,65,79,"[Q8IAX8, Q8IIV2, Q8I0V2, Q8IB24, Q8IBV7, Q8IJX...",10,0.714286,0.144928,0.429607
87,1,3,"{Q8IAR3, Q8IC01, C6KST3, Q8IBI3, Q8IJN9, Q8IDG...","{Q8I2H3, Q8IDS0, Q8IEP9, Q6ZMA8, Q76NM6, Q8I28...",0.697981,0.641215,24,7,31,"[Q8I280, Q6ZMA8, Q8I2H3, Q76NM6, Q8IE84]",5,0.714286,0.192308,0.453297
2,0,2,"{Q8IKF0, Q8I0P6, Q8IIV2, Q8I0V2, Q8IB24, Q8IBV...","{Q8IL58, Q8IIV2, Q8I4U5, Q8IIU8, Q8I487, Q8IEQ...",0.246667,0.539301,16,65,81,"[Q8I0P6, Q8IIV2, Q8I0V2, Q8IB24, Q8IBV7, Q8IET...",11,0.6875,0.157143,0.422321
2270,27,2,"{Q8IAX8, Q8IIV2, Q8ILG8, Q8IBV7, Q8IIV1, Q8I5H...","{Q8IL58, Q8IIV2, Q8I4U5, Q8IIU8, Q8I487, Q8IEQ...",0.285536,0.539301,9,65,74,"[Q8IAX8, Q8IIV2, Q8IBV7, Q8IIV1, Q8I5H4, C6KT18]",6,0.666667,0.088235,0.377451


In [None]:
# Five cluster pairs with the lowest percent overlap; ties are broken by
# showing the pairs with the largest combined size first.
(
    crossed_df
    .sort_values(["percent_overlap", "total_size"], ascending=[True, False])
    .head(5)
)

Unnamed: 0,s3_number,s5_number,s3_proteins,s5_proteins,s3_avg_spearman,s5_avg_spearman,s3_size,s5_size,total_size,common_proteins,num_overlap,percent_overlap,jaccard_index,similarity_score
590,7,2,"{Q8IBN4, Q8IEU2, O77310, Q8I484, Q8IAV1, Q8I4R...","{Q8IL58, Q8IIV2, Q8I4U5, Q8IIU8, Q8I487, Q8IEQ...",0.470468,0.539301,11,65,76,[],0,0.0,0.0,0.0
1346,16,2,"{Q8IC01, Q8II24, C0H5H0, Q8I2X4, Q8I2J3, Q8IJN...","{Q8IL58, Q8IIV2, Q8I4U5, Q8IIU8, Q8I487, Q8IEQ...",0.574636,0.539301,10,65,75,[],0,0.0,0.0,0.0
2354,28,2,"{Q8IC01, Q8I2X4, Q8I2J3, O97227, Q8IEK1, Q8II3...","{Q8IL58, Q8IIV2, Q8I4U5, Q8IIU8, Q8I487, Q8IEQ...",0.472813,0.539301,7,65,72,[],0,0.0,0.0,0.0
1262,15,2,"{Q8IDG9, Q8IFM0, Q8I206, Q8IIX5, Q8I490, Q8I2F4}","{Q8IL58, Q8IIV2, Q8I4U5, Q8IIU8, Q8I487, Q8IEQ...",0.633005,0.539301,6,65,71,[],0,0.0,0.0,0.0
758,9,2,"{P61074, Q8ILB6, Q8IEJ6, O97227, Q8I0X1}","{Q8IL58, Q8IIV2, Q8I4U5, Q8IIU8, Q8I487, Q8IEQ...",0.6567,0.539301,5,65,70,[],0,0.0,0.0,0.0


In [None]:
# Spot check for stage-3 cluster 1: True only when none of its pairings with a
# stage-5 cluster shares more than one protein.
pairs_for_cluster = crossed_df.loc[crossed_df["s3_number"].eq(1)]
shares_multiple = pairs_for_cluster["num_overlap"].gt(1)
bool(shares_multiple.sum() == 0)

False

In [None]:
# A cluster is flagged "unique between stages" when none of its pairings with
# a cluster of the other stage shares more than one protein.
s3_df["unique between stages"] = [
    bool((crossed_df.loc[crossed_df["s3_number"] == number, "num_overlap"] > 1).sum() == 0)
    for number in s3_df["s3_number"]
]

s5_df["unique between stages"] = [
    bool((crossed_df.loc[crossed_df["s5_number"] == number, "num_overlap"] > 1).sum() == 0)
    for number in s5_df["s5_number"]
]

In [None]:
# Five largest stage-5 clusters flagged as unique between stages.
(
    s5_df.loc[s5_df["unique between stages"]]
    .sort_values("s5_size", ascending=False)
    .head(5)
)

Unnamed: 0,s5_number,s5_proteins,s5_avg_spearman,s5_size,avg_spearman_random,significant,large,stage,s5_essentiality,unique between stages
17,17,"{Q8I3X4, Q8IE66, P50250, Q8IKT2, Q8I5B6, Q76NM...",0.328216,9,0.418033,False,True,5,0.777778,True
23,23,"{Q8IKK7, O97249, Q8IJ34, Q8IB14, Q8I463, A0A14...",0.471949,9,0.396058,True,True,5,0.888889,True
15,15,"{Q8IJM0, Q8I1V1, Q8I5M9, Q8IC05, Q8IAR6, Q8IEQ...",0.786224,8,0.421476,True,True,5,0.75,True
63,63,"{Q8I3X4, P50250, Q8IKT2, Q8IJN9, Q8IIR8, Q8I6U...",0.372379,8,0.399449,False,True,5,0.625,True
30,30,"{Q8ILS7, Q8I3Y6, Q8II82, C0H4C7, Q8IBR6, Q8I3A...",0.759633,7,0.407794,True,True,5,0.428571,True


In [None]:
# Stage-5 clusters unique between stages with more than two proteins, ordered
# by average Spearman correlation and then size (both descending).
(
    s5_df.loc[s5_df["unique between stages"] & s5_df["s5_size"].gt(2)]
    .sort_values(["s5_avg_spearman", "s5_size"], ascending=False)
    .head(5)
)

Unnamed: 0,s5_number,s5_proteins,s5_avg_spearman,s5_size,avg_spearman_random,significant,large,stage,s5_essentiality,unique between stages
1,1,"{Q8I246, Q8IDZ9, Q8IIW2, Q8ILP6, Q8IIA4, Q8IBS3}",0.913026,6,0.401052,True,True,5,1.0,True
66,66,"{Q8II42, P61074, Q8I3A1, C6S3I6, Q8II92}",0.873885,5,0.408008,True,True,5,0.6,True
31,31,"{Q8IL48, C6KSV2, A0A5K1K967, C6KTA3}",0.858456,4,0.407547,True,True,5,0.5,True
15,15,"{Q8IJM0, Q8I1V1, Q8I5M9, Q8IC05, Q8IAR6, Q8IEQ...",0.786224,8,0.421476,True,True,5,0.75,True
41,41,"{A0A5K1K8Y8, Q8IDE7, Q8IHY0, Q8IC01}",0.786125,4,0.428807,True,True,5,0.75,True


## Comparing Within Cluster

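To compare clusters within a stage, we cross join the stage 3 cluster table with itself (and likewise the stage 5 table with itself)
and check how many proteins each pair of clusters shares. A cluster is tagged 'unique within stage' when it shares at most one protein with every other cluster of the same stage.

Additionally, we recompute the 'unique between stages' attribute from the stage 3 × stage 5 cross join built above.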

In [None]:
# Cross join the stage-3 clusters with themselves so that each row is an
# ordered pair (cluster 1, cluster 2) of stage-3 clusters.
s3_crossed_df = pd.merge(
    s3_df[["s3_number", "s3_proteins", "s3_avg_spearman", "s3_size"]].rename(columns={"s3_number": "number_1",
                                                                                        "s3_proteins": "proteins_1",
                                                                                        "s3_avg_spearman": "avg_spearman_1",
                                                                                        "s3_size": "size_1"}),
    s3_df[["s3_number", "s3_proteins", "s3_avg_spearman", "s3_size"]].rename(columns={"s3_number": "number_2",
                                                                                        "s3_proteins": "proteins_2",
                                                                                        "s3_avg_spearman": "avg_spearman_2",
                                                                                        "s3_size": "size_2"}),
    how="cross")
# drop rows where the two clusters are the same
s3_crossed_df.drop(s3_crossed_df[s3_crossed_df["number_1"] == s3_crossed_df["number_2"]].index, inplace=True)
# calculating total size
s3_crossed_df["total_size"] = s3_crossed_df["size_1"] + s3_crossed_df["size_2"]

# proteins of cluster 1 that also occur in cluster 2, for every pair
s3_crossed_df["common_proteins"] = [
    [protein for protein in proteins_1 if protein in proteins_2]
    for proteins_1, proteins_2 in zip(s3_crossed_df["proteins_1"], s3_crossed_df["proteins_2"])
]

# add number of overlapping proteins
s3_crossed_df["num_overlap"] = s3_crossed_df["common_proteins"].apply(len)
# Stage-3 pairs sharing at least one protein. This used to be built from
# crossed_df (the s3 x s5 table) by copy-paste; it is now taken from the
# within-stage table and stored under its own name so it does not overwrite
# the between-stage filtered_crossed_df.
s3_filtered_crossed_df = s3_crossed_df[s3_crossed_df["num_overlap"] > 0]

# get clusters with proteins unique WITHIN stage 3: no pairing with another
# stage-3 cluster shares more than one protein
lst = []
for i in s3_df["s3_number"]:
    x = s3_crossed_df[s3_crossed_df["number_1"] == i]
    y = x["num_overlap"] > 1
    lst.append(bool(y.sum() == 0))
s3_df["unique within stage"] = lst

# get clusters with proteins unique BETWEEN stages
lst = []
# loop over each s3 cluster
for i in s3_df["s3_number"]:
    # get list of (s3, s5) cluster pairs for the current s3 cluster
    x = crossed_df[crossed_df["s3_number"] == i]
    # flag the pairs that share more than one protein
    y = x["num_overlap"] > 1
    # if no pair shares more than one protein, the cluster counts as unique
    # between stages (a single shared protein is tolerated)
    lst.append(bool(y.sum() == 0))
s3_df["unique between stages"] = lst

In [None]:
# Cross join the stage-5 clusters with themselves so that each row is an
# ordered pair (cluster 1, cluster 2) of stage-5 clusters.
s5_crossed_df = pd.merge(
    s5_df[["s5_number", "s5_proteins", "s5_avg_spearman", "s5_size"]].rename(columns={"s5_number": "number_1",
                                                                                        "s5_proteins": "proteins_1",
                                                                                        "s5_avg_spearman": "avg_spearman_1",
                                                                                        "s5_size": "size_1"}),
    s5_df[["s5_number", "s5_proteins", "s5_avg_spearman", "s5_size"]].rename(columns={"s5_number": "number_2",
                                                                                        "s5_proteins": "proteins_2",
                                                                                        "s5_avg_spearman": "avg_spearman_2",
                                                                                        "s5_size": "size_2"}),
    how="cross")
# drop rows where the two clusters are the same
s5_crossed_df.drop(s5_crossed_df[s5_crossed_df["number_1"] == s5_crossed_df["number_2"]].index, inplace=True)
# calculating total size
s5_crossed_df["total_size"] = s5_crossed_df["size_1"] + s5_crossed_df["size_2"]

# proteins of cluster 1 that also occur in cluster 2, for every pair
s5_crossed_df["common_proteins"] = [
    [protein for protein in proteins_1 if protein in proteins_2]
    for proteins_1, proteins_2 in zip(s5_crossed_df["proteins_1"], s5_crossed_df["proteins_2"])
]

# add number of overlapping proteins
s5_crossed_df["num_overlap"] = s5_crossed_df["common_proteins"].apply(len)
# Stage-5 pairs sharing at least one protein. This used to be built from
# crossed_df (the s3 x s5 table) by copy-paste; it is now taken from the
# within-stage table and stored under its own name so it does not overwrite
# the between-stage filtered_crossed_df.
s5_filtered_crossed_df = s5_crossed_df[s5_crossed_df["num_overlap"] > 0]

# get clusters with proteins unique WITHIN stage 5: no pairing with another
# stage-5 cluster shares more than one protein
lst = []
for i in s5_df["s5_number"]:
    x = s5_crossed_df[s5_crossed_df["number_1"] == i]
    y = x["num_overlap"] > 1
    lst.append(bool(y.sum() == 0))
s5_df["unique within stage"] = lst

# get clusters with proteins unique BETWEEN stages: no pairing with a stage-3
# cluster shares more than one protein
lst = []
for i in s5_df["s5_number"]:
    x = crossed_df[crossed_df["s5_number"] == i]
    y = x["num_overlap"] > 1
    lst.append(bool(y.sum() == 0))
s5_df["unique between stages"] = lst

In [None]:
# Stage-5 clusters that are unique within the stage and between stages and
# are also flagged large and significant, strongest average correlation first.
unique_s5_clusters = s5_df.loc[
    lambda frame: frame["unique within stage"]
    & frame["unique between stages"]
    & frame["large"]
    & frame["significant"]
].sort_values("s5_avg_spearman", ascending=False)
print(unique_s5_clusters.head(5))
# Persist the full selection (not only the five printed rows).
unique_s5_clusters.to_csv(
    "data/generated_tables/unique_s5_clusters.tsv", sep="\t", index=False
)

    s5_number                                        s5_proteins  \
31         31               {Q8IL48, C6KSV2, A0A5K1K967, C6KTA3}   
15         15  {Q8IJM0, Q8I1V1, Q8I5M9, Q8IC05, Q8IAR6, Q8IEQ...   
40         40                   {Q8IIT3, Q8I3Q7, C0H4W2, Q8I5V6}   
19         19               {Q8I1S0, A0A143ZY58, Q8IM66, Q8IKH3}   
44         44                   {Q8IJ56, Q8I544, Q8I2G1, Q8ILP3}   

    s5_avg_spearman  s5_size  avg_spearman_random  significant  large  stage  \
31         0.858456        4             0.407547         True   True      5   
15         0.786224        8             0.421476         True   True      5   
40         0.772332        4             0.409747         True   True      5   
19         0.756486        4             0.437104         True   True      5   
44         0.703811        4             0.387597         True   True      5   

    s5_essentiality  unique between stages  unique within stage  
31             0.50                   True  

In [None]:
# Stage-3 clusters passing all four filters. The recorded run below returned a
# single cluster: {Q8I3M5, C6KTB3, O77312}
s3_df.loc[
    s3_df["unique within stage"]
    & s3_df["unique between stages"]
    & s3_df["large"]
    & s3_df["significant"]
]

Unnamed: 0,s3_number,s3_proteins,s3_avg_spearman,s3_size,avg_spearman_random,significant,large,stage,s3_essentiality,unique between stages,unique within stage
19,19,"{Q8I3M5, C6KTB3, O77312}",0.87931,3,0.384645,True,True,3,1.0,True,True


In [None]:
# Same four filters for stage 5, showing the five clusters with the highest
# average Spearman correlation.
(
    s5_df.loc[
        s5_df["unique within stage"]
        & s5_df["unique between stages"]
        & s5_df["large"]
        & s5_df["significant"]
    ]
    .sort_values("s5_avg_spearman", ascending=False)
    .head(5)
)

Unnamed: 0,s5_number,s5_proteins,s5_avg_spearman,s5_size,avg_spearman_random,significant,large,stage,s5_essentiality,unique between stages,unique within stage
31,31,"{Q8IL48, C6KSV2, A0A5K1K967, C6KTA3}",0.858456,4,0.407547,True,True,5,0.5,True,True
15,15,"{Q8IJM0, Q8I1V1, Q8I5M9, Q8IC05, Q8IAR6, Q8IEQ...",0.786224,8,0.421476,True,True,5,0.75,True,True
40,40,"{Q8IIT3, Q8I3Q7, C0H4W2, Q8I5V6}",0.772332,4,0.409747,True,True,5,0.5,True,True
19,19,"{Q8I1S0, A0A143ZY58, Q8IM66, Q8IKH3}",0.756486,4,0.437104,True,True,5,1.0,True,True
44,44,"{Q8IJ56, Q8I544, Q8I2G1, Q8ILP3}",0.703811,4,0.387597,True,True,5,0.0,True,True


In [None]:
# Re-export the per-interaction CSV files as tab-separated copies.
s3_ppi_clusters = pd.read_csv("data/Stage_3_PPI_predicted_features_with_clusters1.csv", sep=",")
s3_ppi_clusters.to_csv("data/Stage_3_PPI_predicted_features_with_clusters1.tsv", sep="\t", index=False)

s5_ppi_clusters = pd.read_csv("data/Stage_5_PPI_predicted_features_with_clusters1.csv", sep=",")
s5_ppi_clusters.to_csv("data/Stage_5_PPI_predicted_features_with_clusters1.tsv", sep="\t", index=False)

# Map each protein ID to its description. The first description seen wins:
# stage 5 is scanned before stage 3, and within a row Protein1 is handled
# before Protein2.
protein_desc_map = {}
for ppi_table in (s5_ppi_clusters, s3_ppi_clusters):
    interaction_columns = zip(
        ppi_table["Protein1"],
        ppi_table["Description1"],
        ppi_table["Protein2"],
        ppi_table["Description2"],
    )
    for protein_1, description_1, protein_2, description_2 in interaction_columns:
        protein_desc_map.setdefault(protein_1, description_1)
        protein_desc_map.setdefault(protein_2, description_2)

def get_protein_desc_from_acccession(cluster):
    """Look up the description of every protein in ``cluster``.

    Args:
        cluster: Iterable of protein IDs that are keys of ``protein_desc_map``.

    Returns:
        List of descriptions, in the iteration order of ``cluster``.

    Raises:
        KeyError: If a protein has no entry in ``protein_desc_map``.
    """
    descriptions = []
    for accession in cluster:
        descriptions.append(protein_desc_map[accession])
    return descriptions


In [None]:
# Only the last expression of a cell is rendered, so the stage-5 result is
# computed but not shown; the displayed list belongs to the stage-3 cluster
# stored under index label 4.
get_protein_desc_from_acccession(s5_df.loc[0, "s5_proteins"])
get_protein_desc_from_acccession(s3_df.loc[4, "s3_proteins"])

['Aminopeptidase P',
 'Triosephosphate isomerase',
 '20 kDa chaperonin',
 'Heat shock protein 70']

In [None]:
# flag clusters that contain at least one putative protein
def append_putative(row, protein_col):
    """Tell whether any protein of the row's cluster has a 'putative' description.

    Args:
        row: Mapping-like row holding the cluster's proteins under ``protein_col``.
        protein_col: Name of the column with the cluster's proteins.

    Returns:
        True if the substring 'putative' (case-sensitive) occurs in at least
        one description returned by ``get_protein_desc_from_acccession``,
        otherwise False.
    """
    descriptions = get_protein_desc_from_acccession(row[protein_col])
    return any('putative' in description for description in descriptions)

# Row-wise flag: does the cluster contain at least one putative protein?
s5_df["contains_putative"] = s5_df.apply(
    lambda row: append_putative(row, "s5_proteins"), axis=1
)
s3_df["contains_putative"] = s3_df.apply(
    lambda row: append_putative(row, "s3_proteins"), axis=1
)

In [None]:
# Give both stage tables their stage-neutral column names back, stack them
# (stage 3 rows first) and keep only clusters with more than two proteins.
# The original row labels are kept, so labels repeat across the two stages.
cluster_concat_df = pd.concat(
    [
        s3_df.rename(
            columns={
                "s3_number": "Number",
                "s3_proteins": "Proteins",
                "s3_avg_spearman": "avg_spearman",
                "s3_size": "size",
                "s3_essentiality": "essentiality",
            }
        ),
        s5_df.rename(
            columns={
                "s5_number": "Number",
                "s5_proteins": "Proteins",
                "s5_avg_spearman": "avg_spearman",
                "s5_size": "size",
                "s5_essentiality": "essentiality",
            }
        ),
    ]
).loc[lambda frame: frame["size"] > 2]
# Human-readable description of every protein in each cluster.
cluster_concat_df["descriptions"] = cluster_concat_df["Proteins"].apply(
    get_protein_desc_from_acccession
)

In [None]:
# Display the combined stage-3 / stage-5 cluster table.
cluster_concat_df

Unnamed: 0,Number,Proteins,avg_spearman,size,avg_spearman_random,significant,large,stage,essentiality,unique between stages,unique within stage,contains_putative,descriptions
0,0,"{Q8IKF0, Q8I0P6, Q8IIV2, Q8I0V2, Q8IB24, Q8IBV...",0.246667,16,0.379478,False,True,3,0.562500,False,False,True,"[RNA helicase, Elongation factor 1-alpha, Hist..."
1,1,"{Q8IAR3, Q8IC01, C6KST3, Q8IBI3, Q8IJN9, Q8IDG...",0.697981,24,0.378678,True,True,3,0.833333,False,False,True,"[Proteasome subunit alpha type-6, putative, He..."
2,2,"{Q8IKF0, A0A143ZY58, Q7KQL5, C6KTA4, Q76NM3, Q...",0.392119,26,0.374136,True,True,3,0.576923,False,False,True,"[RNA helicase, ADP-ribosylation factor 1, Tubu..."
3,3,"{Q76NN6, Q8ILI6, Q8ILB6, Q8II61, Q8I5B6, Q8I2W...",0.520338,8,0.388409,True,True,3,0.625000,False,False,True,"[Ran-specific GTPase-activating protein 1, put..."
4,4,"{A0A144A2H0, Q7KQM0, Q8IDZ8, Q8II24}",0.777833,4,0.381746,True,True,3,0.750000,True,False,False,"[Aminopeptidase P, Triosephosphate isomerase, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,77,"{Q8IKF0, C0H4H6, Q8IBG1, Q8I463, Q9TY94, Q7KQL...",0.230131,9,0.418307,False,True,5,0.888889,False,False,True,"[RNA helicase, Cytochrome b-c1 complex subunit..."
78,78,"{Q8IL75, Q8II72, Q8IBN4, Q8IEU2, Q8IJ76, Q8IC4...",0.287340,8,0.388058,False,True,5,0.125000,False,False,True,"[Cytochrome b-c1 complex subunit Rieske, putat..."
80,80,"{Q8IK89, O97306, Q8ILZ7, Q8I3S3}",0.390790,4,0.401394,False,True,5,0.250000,True,False,True,"[Trailer hitch homolog, putative, Uncharacteri..."
81,81,"{Q8IL80, Q8IAM2, Q7KQL8, Q9NLB2}",0.651888,4,0.377906,True,True,5,0.500000,True,False,False,"[thioredoxin-dependent peroxiredoxin, 1-cys pe..."


## Functions for analysis

#### For Clusters

In [None]:
# By clusters
def get_clusters_for_protein(protein, cluster_concat_df=cluster_concat_df):
    """Return the clusters whose protein collection contains ``protein``.

    The protein's description is printed first; it is read from
    ``protein_desc_map``, so an ID without an entry there raises ``KeyError``
    before any filtering happens.

    Args:
        protein (str): Protein ID, used both as a key of ``protein_desc_map``
            and for membership tests against the "Proteins" column.
        cluster_concat_df (pd.DataFrame, optional): Table with one cluster per
            row and a "Proteins" column. Defaults to the notebook-level
            ``cluster_concat_df`` as it existed when this cell was run.

    Returns:
        pd.DataFrame: The rows of ``cluster_concat_df`` whose "Proteins" entry
        contains ``protein``.
    """
    description = protein_desc_map[protein]
    print("get clusters for protein with function: ", description)
    # Boolean mask: True for every cluster that has the protein as a member.
    contains_protein = cluster_concat_df["Proteins"].apply(
        lambda members: protein in members
    )
    return cluster_concat_df[contains_protein]

In [None]:
# All clusters (both stages) that contain protein Q8I0V2.
get_clusters_for_protein(protein="Q8I0V2", cluster_concat_df=cluster_concat_df)

get clusters for protein with function:  ATP synthase subunit beta


Unnamed: 0,Number,Proteins,avg_spearman,size,avg_spearman_random,significant,large,stage,essentiality,unique between stages,unique within stage,contains_putative,descriptions
0,0,"{Q8IKF0, Q8I0P6, Q8IIV2, Q8I0V2, Q8IB24, Q8IBV...",0.246667,16,0.379478,False,True,3,0.5625,False,False,True,"[RNA helicase, Elongation factor 1-alpha, Hist..."
2,2,"{Q8IKF0, A0A143ZY58, Q7KQL5, C6KTA4, Q76NM3, Q...",0.392119,26,0.374136,True,True,3,0.576923,False,False,True,"[RNA helicase, ADP-ribosylation factor 1, Tubu..."
14,14,"{Q8IAX8, Q8IIV2, Q8I0V2, Q8IB24, Q8IBV7, Q8I60...",0.277058,14,0.366131,False,True,3,0.5,False,False,True,"[DNA/RNA-binding protein ALBA1, Histone H4, AT..."
18,18,"{Q8IKF0, Q7KQL5, C6KTA4, Q8IK89, Q8I0P6, O9615...",0.356469,21,0.394985,False,True,3,0.571429,False,False,True,"[RNA helicase, Tubulin beta chain, Pyruvate ki..."
24,24,"{Q8IKF0, Q76NN8, Q8I0V2, Q8IBD1, Q8I5A9, Q8IKG...",0.313951,13,0.387974,False,True,3,0.461538,False,False,True,"[RNA helicase, Calcium-transporting ATPase, AT..."
2,2,"{Q8IL58, Q8IIV2, Q8I4U5, Q8IIU8, Q8I487, Q8IEQ...",0.539301,65,0.404399,True,True,5,0.692308,False,False,True,"[60S ribosomal protein L1, putative, Histone H..."
59,59,"{Q8IBN4, Q8I0V2, A0A5K1K8W5, Q8I2X3, Q8IM15, A...",0.15422,6,0.40673,False,True,5,0.333333,True,False,True,"[Secreted ookinete protein, putative, ATP synt..."


#### For Proteins

In [None]:
# By proteins
def get_essential_proteins(protein_concat_df=protein_concat_df):
    """Return the rows of the protein table whose "essential" value equals True.

    Args:
        protein_concat_df (pd.DataFrame, optional): Table with one protein per
            row and an "essential" column. Defaults to the notebook-level
            ``protein_concat_df`` as it existed when this cell was run.

    Returns:
        pd.DataFrame: The subset of rows flagged as essential.
    """
    is_essential = protein_concat_df["essential"].eq(True)
    return protein_concat_df[is_essential]

In [None]:
essential = get_essential_proteins()
# Essential proteins whose degree lies strictly between 5 and 10.
(
    essential[essential["Degree"].lt(10) & essential["Degree"].gt(5)]
    .head()
)

Unnamed: 0.1,Unnamed: 0,Accession ID,Gene ID,Product Description,Neighbours,essential,Degree,Betweenness Centrality,Closeness Centrality,stage
4,4,Q8IIA4,PF3D7_1126000,threonine--tRNA ligase,"['Q8IBS3', 'Q8IE10', 'Q8IDK7', 'Q8IDZ9', 'Q8IL...",True,7,0.00829,0.312109,5
6,6,C0H571,PF3D7_0929400,high molecular weight rhoptry protein 2,"['Q8IKC8', 'Q8IBN4', 'Q6ZMA7', 'Q8I4T3', 'C6KS...",True,9,0.002461,0.310253,5
21,21,Q8IKF0,PF3D7_1468700,eukaryotic initiation factor 4A,"['Q8I0V4', 'Q9TY94', 'C6KT23', 'Q8IDB0', 'Q8IC...",True,7,0.000639,0.316163,5
25,25,Q8I3I6,PF3D7_0528100,"AP-1/2 complex subunit beta, putative","['Q8IB24', 'Q8ILG6', 'Q8I2X4', 'C0H5H0', 'Q8I3...",True,6,0.005161,0.338432,5
28,28,Q8IIJ6,PF3D7_1117100,ubiquitin carboxyl-terminal hydrolase UCH54,"['Q8II71', 'C0H5H0', 'Q8IJW0', 'A0A5K1K9F3', '...",True,9,0.000112,0.33906,5


In [None]:
# Stage-3 / stage-5 cluster pairs that share exactly one protein, restricted
# to pairs in which both clusters have more than two proteins.
crossed_df[
    crossed_df["num_overlap"].eq(1)
    & crossed_df["s3_size"].gt(2)
    & crossed_df["s5_size"].gt(2)
]

Unnamed: 0,s3_number,s5_number,s3_proteins,s5_proteins,s3_avg_spearman,s5_avg_spearman,s3_size,s5_size,total_size,common_proteins,num_overlap,percent_overlap,jaccard_index,similarity_score
8,0,8,"{Q8IKF0, Q8I0P6, Q8IIV2, Q8I0V2, Q8IB24, Q8IBV...","{Q8IKF0, A0A5K1K910, Q8III5, Q9TY94, Q7KQL5, Q...",0.246667,0.446076,16,6,22,[Q8IKF0],1,0.166667,0.047619,0.107143
14,0,14,"{Q8IKF0, Q8I0P6, Q8IIV2, Q8I0V2, Q8IB24, Q8IBV...","{Q8IJ28, Q8IKJ0, Q8IBV7, Q8IJM9}",0.246667,0.496880,16,4,20,[Q8IBV7],1,0.250000,0.052632,0.151316
20,0,20,"{Q8IKF0, Q8I0P6, Q8IIV2, Q8I0V2, Q8IB24, Q8IBV...","{O97306, Q8IJX3, Q8ILZ7, Q8IK89}",0.246667,0.251806,16,4,20,[Q8IK89],1,0.250000,0.052632,0.151316
33,0,33,"{Q8IKF0, Q8I0P6, Q8IIV2, Q8I0V2, Q8IB24, Q8IBV...","{Q8I5B3, O96221, Q8I5L6, Q8ILX1, Q8IB60, Q8I1S0}",0.246667,0.643875,16,6,22,[Q8I5L6],1,0.166667,0.047619,0.107143
59,0,59,"{Q8IKF0, Q8I0P6, Q8IIV2, Q8I0V2, Q8IB24, Q8IBV...","{Q8IBN4, Q8I0V2, A0A5K1K8W5, Q8I2X3, Q8IM15, A...",0.246667,0.154220,16,6,22,[Q8I0V2],1,0.166667,0.047619,0.107143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399,28,47,"{Q8IC01, Q8I2X4, Q8I2J3, O97227, Q8IEK1, Q8II3...","{C0H516, Q76NM4, Q8IHR8, Q7K6A5}",0.472813,0.490805,7,4,11,[Q7K6A5],1,0.250000,0.100000,0.175000
2534,30,14,"{Q8IAX8, Q8IIV2, Q8IBV7, Q8IIV1, Q8I5H4, Q8IJX...","{Q8IJ28, Q8IKJ0, Q8IBV7, Q8IJM9}",0.277737,0.496880,9,4,13,[Q8IBV7],1,0.250000,0.083333,0.166667
2540,30,20,"{Q8IAX8, Q8IIV2, Q8IBV7, Q8IIV1, Q8I5H4, Q8IJX...","{O97306, Q8IJX3, Q8ILZ7, Q8IK89}",0.277737,0.251806,9,4,13,[Q8IJX3],1,0.250000,0.083333,0.166667
2544,30,24,"{Q8IAX8, Q8IIV2, Q8IBV7, Q8IIV1, Q8I5H4, Q8IJX...","{Q8ILZ7, Q8IKP1, O97285, O97306, Q8I5H4, Q8IK89}",0.277737,0.268440,9,6,15,[Q8I5H4],1,0.166667,0.071429,0.119048


In [None]:
# Clusters containing Q8IKF0 that also include at least one putative protein.
q8ikf0_clusters = get_clusters_for_protein("Q8IKF0")
q8ikf0_clusters[q8ikf0_clusters["contains_putative"]]

get clusters for protein with function:  RNA helicase


Unnamed: 0,Number,Proteins,avg_spearman,size,avg_spearman_random,significant,large,stage,essentiality,unique between stages,unique within stage,contains_putative,descriptions
0,0,"{Q8IKF0, Q8I0P6, Q8IIV2, Q8I0V2, Q8IB24, Q8IBV...",0.246667,16,0.379478,False,True,3,0.5625,False,False,True,"[RNA helicase, Elongation factor 1-alpha, Hist..."
2,2,"{Q8IKF0, A0A143ZY58, Q7KQL5, C6KTA4, Q76NM3, Q...",0.392119,26,0.374136,True,True,3,0.576923,False,False,True,"[RNA helicase, ADP-ribosylation factor 1, Tubu..."
10,10,"{Q8IKF0, Q8I0V4, Q8II24, Q8IDZ8, Q8IC05, A0A14...",0.584989,9,0.35859,True,True,3,0.777778,False,False,True,"[RNA helicase, Endoplasmin, putative, Heat sho..."
18,18,"{Q8IKF0, Q7KQL5, C6KTA4, Q8IK89, Q8I0P6, O9615...",0.356469,21,0.394985,False,True,3,0.571429,False,False,True,"[RNA helicase, Tubulin beta chain, Pyruvate ki..."
24,24,"{Q8IKF0, Q76NN8, Q8I0V2, Q8IBD1, Q8I5A9, Q8IKG...",0.313951,13,0.387974,False,True,3,0.461538,False,False,True,"[RNA helicase, Calcium-transporting ATPase, AT..."
8,8,"{Q8IKF0, A0A5K1K910, Q8III5, Q9TY94, Q7KQL5, Q...",0.446076,6,0.399085,True,True,5,0.833333,False,False,True,"[RNA helicase, aspartate carbamoyltransferase,..."
77,77,"{Q8IKF0, C0H4H6, Q8IBG1, Q8I463, Q9TY94, Q7KQL...",0.230131,9,0.418307,False,True,5,0.888889,False,False,True,"[RNA helicase, Cytochrome b-c1 complex subunit..."


In [None]:
# Clusters with a putative protein that are unique between stages, ranked by
# essentiality and then average Spearman correlation (both descending).
(
    cluster_concat_df[
        cluster_concat_df["contains_putative"]
        & cluster_concat_df["unique between stages"]
    ]
    .sort_values(["essentiality", "avg_spearman"], ascending=False)
    .head(5)
)

Unnamed: 0,Number,Proteins,avg_spearman,size,avg_spearman_random,significant,large,stage,essentiality,unique between stages,unique within stage,contains_putative,descriptions
19,19,"{Q8I3M5, C6KTB3, O77312}",0.87931,3,0.384645,True,True,3,1.0,True,True,True,"[Karyopherin beta, Transportin, Exportin-1, pu..."
46,46,"{Q8IL86, Q8IEC8, Q8I2W2, Q8IAU7}",0.777422,4,0.39321,True,True,5,1.0,False,True,True,"[Translocation protein SEC62, Translocation pr..."
19,19,"{Q8I1S0, A0A143ZY58, Q8IM66, Q8IKH3}",0.756486,4,0.437104,True,True,5,1.0,True,True,True,"[Small GTP-binding protein sar1, ADP-ribosylat..."
54,54,"{Q7KQK6, Q8IIR9, Q8I5H2, Q8I608}",0.528079,4,0.376658,True,True,5,1.0,False,False,True,"[GTP-binding nuclear protein, Casein kinase 2,..."
23,23,"{Q8IKK7, O97249, Q8IJ34, Q8IB14, Q8I463, A0A14...",0.471949,9,0.396058,True,True,5,0.888889,True,False,True,"[Glyceraldehyde-3-phosphate dehydrogenase, Sma..."
