In [22]:
import pandas as pd
import numpy as np
import mygene
import joblib
import graph_tool as gt

mg = mygene.MyGeneInfo()


def create_ppi_graph(edges_df: pd.DataFrame):
    edges_df = edges_df.loc[edges_df.iloc[:, 0] != edges_df.iloc[:, 1], :]

    nodes = list(set(edges_df.iloc[:, 0].unique().tolist() + edges_df.iloc[:, 1].unique().tolist()))

    edges_df.loc[:, 'source'] = edges_df.iloc[:, 0].apply(lambda x: nodes.index(x) if x in nodes else np.nan)
    edges_df.loc[:, 'target'] = edges_df.iloc[:, 1].apply(lambda x: nodes.index(x) if x in nodes else np.nan)

    edges_df = edges_df.dropna()

    edges_df.loc[:, 'union'] = edges_df.apply(lambda x: ", ".join(list({str(x['source'])}.union({str(x['target'])}))),
                                              axis=1)
    edges_df.loc[:, 'union'] = edges_df.apply(lambda x: ", ".join(list({str(x['source'])}.union({str(x['target'])}))),
                                              axis=1)
    edges_df = edges_df.drop_duplicates(subset=['union'])

    ppi_g = gt.Graph()

    for node in nodes:
        v = ppi_g.add_vertex()

    for index, row in edges_df.iterrows():
        ppi_g.add_edge(row['source'], row['target'], add_missing=False)

    print('Number of nodes:', ppi_g.num_vertices(), 'Number of edges:', ppi_g.num_edges())
    return ppi_g, nodes

In [23]:
large = pd.read_csv('../0_data/ppi_networks/human_interactome_wang.txt', sep="\t", header=0).drop(
    ['EntrezID', 'EntrezID.1'], axis=1)
large.to_csv('large_network.csv')
large_ppi, large_nodes = create_ppi_graph(large)
large_ppi = {'network': large_ppi, 'nodes': large_nodes}
joblib.dump(large_ppi, 'large_ppi.sav')

Number of nodes: 16454 Number of edges: 233957


['large_ppi.sav']

In [10]:
biogrid = pd.read_csv('../0_data/ppi_networks/biogrid.zip', sep='\t', skiprows=35,
                      usecols=['OFFICIAL_SYMBOL_A', 'OFFICIAL_SYMBOL_B', 'ALIASES_FOR_A', 'ALIASES_FOR_B',
                               'ORGANISM_A_ID', 'ORGANISM_B_ID'], compression='zip')
biogrid = biogrid[biogrid['ORGANISM_A_ID'] == 9606]
biogrid = biogrid[biogrid['ORGANISM_B_ID'] == 9606]
biogrid = biogrid.drop(['ORGANISM_A_ID', 'ORGANISM_B_ID'], axis=1)
biogrid = biogrid.drop_duplicates()
display(biogrid)
print(
    f"{len(set(biogrid['OFFICIAL_SYMBOL_A'].unique().tolist() + biogrid['OFFICIAL_SYMBOL_B'].unique().tolist()))} Proteins with {biogrid.shape[0]} interactions")
biogrid.loc[:, ['OFFICIAL_SYMBOL_A', 'OFFICIAL_SYMBOL_B']].to_csv('biogrid_network.csv', index=False)
biogrid_ppi, biogrid_nodes = create_ppi_graph(biogrid)
biogrid_ppi = {'network': biogrid_ppi, 'nodes': biogrid_nodes}
joblib.dump(biogrid_ppi, 'biogrid_ppi.sav')

Unnamed: 0,OFFICIAL_SYMBOL_A,OFFICIAL_SYMBOL_B,ALIASES_FOR_A,ALIASES_FOR_B
0,MAP2K4,FLNC,JNKK|JNKK1|MAPKK4|MEK4|MKK4|PRKMK4|SAPKK-1|SAP...,ABP-280|ABP280A|ABPA|ABPL|FLN2|MFM5|MPD4
1,MYPN,ACTN2,CMD1DD|CMH22|MYOP|RCM4,CMD1AA
2,ACVR1,FNTA,ACTRI|ACVR1A|ACVRLK2|ALK2|FOP|SKR1|TSRI,FPTA|PGGT1A|PTAR2
3,GATA2,PML,DCML|IMD21|MONOMAC|NFE1B,MYL|PP8675|RNF71|TRIM19
4,RPA2,STAT3,REPA2|RP-A p32|RP-A p34|RPA32,ADMIO|APRF|HIES
...,...,...,...,...
2379603,RNF43,ZNF638,RNF124|URCC,NP220|ZFML|Zfp638
2379608,ALDH7A1,BMI1,ATQ1|EPD|PDE,FLVI2/BMI1|PCGF4|RNF51
2379625,CDK1,SLBP,CDC2|CDC28A|P34CDC2,HBP
2379626,CDK2,SLBP,CDKN2|p33(CDK2),HBP


19826 Proteins with 773955 interactions


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edges_df.loc[:, 'source'] = edges_df.copy().iloc[:, 0].apply(lambda x: nodes.index(x) if x in nodes else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edges_df.loc[:, 'target'] = edges_df.copy().iloc[:, 1].apply(lambda x: nodes.index(x) if x in nodes else np.nan)


Number of nodes: 19823 Number of edges: 728607


['biogrid_ppi.sav']

In [11]:
iid = pd.read_csv('../0_data/ppi_networks/iid.gz', sep='\t', compression='gzip', low_memory=False,
                  usecols=['symbol1', 'symbol2']).drop_duplicates()
display(iid)
print(
    f"{len(set(iid['symbol1'].unique().tolist() + iid['symbol2'].unique().tolist()))} Proteins with {iid.shape[0]} interactions")
iid.to_csv('iid_network.csv', index=False)
iid_ppi, iid_nodes = create_ppi_graph(iid)
iid_ppi = {'network': iid_ppi, 'nodes': iid_nodes}
joblib.dump(iid_ppi, 'iid_ppi.sav')

Unnamed: 0,symbol1,symbol2
0,MAP2K4,FLNC
1,ACTN2,MYPN
2,FNTA,ACVR1
3,GATA2,PML
4,RPA2,STAT3
...,...,...
1209529,SNAI1,LOXL1
1209530,FBLN1,COL18A1
1209531,COMP,COMP
1209532,SNAI1,LOXL3


19552 Proteins with 1206598 interactions


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edges_df.loc[:, 'source'] = edges_df.copy().iloc[:, 0].apply(lambda x: nodes.index(x) if x in nodes else np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  edges_df.loc[:, 'target'] = edges_df.copy().iloc[:, 1].apply(lambda x: nodes.index(x) if x in nodes else np.nan)


Number of nodes: 19548 Number of edges: 1198994


['iid_ppi.sav']