<a href="https://colab.research.google.com/github/jarasch/Human-Protein-Protein-Interaction-Link-Prediction/blob/main/Human_Protein_Protein_Interaction_Link_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Human: Protein-Protein Interaction Link Prediction

Adapted from the [Graphable example](https://www.graphable.ai/videos/link-prediction-python-example-protein-protein-interactions/) to work with the latest GDS version.

Data: the PP-Pathways protein-protein interaction dataset from [SNAP](https://snap.stanford.edu/biodata/datasets/10000/10000-PP-Pathways.html).

In [None]:
!pip install graphdatascience



In [None]:
import os

import pandas as pd
from graphdatascience import GraphDataScience
from neo4j import GraphDatabase

# Connection settings. Each value is read from the environment variable of the
# same name when it is set, so real credentials never have to be typed into
# (and saved with) the notebook. The fallbacks are the original placeholders
# and must be edited by hand if the environment variables are not used.
NEO4J_URI = os.environ.get('NEO4J_URI', '<uri>')
NEO4J_USERNAME = os.environ.get('NEO4J_USERNAME', 'neo4j')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD', '<password>')
NEO4J_DATABASE = os.environ.get('NEO4J_DATABASE', 'neo4j')

# Plain Neo4j driver plus a session bound to the target database.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD))
n4j = driver.session(database=NEO4J_DATABASE)

# Graph Data Science client pointed at the same server and database.
# aura_ds=True is passed because the target is expected to be an AuraDS instance.
gds = GraphDataScience(NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD), aura_ds=True)
gds.set_database(NEO4J_DATABASE)

In [None]:
# Names of the in-memory graph, the training pipeline and the trained model.
GRAPH_NAME = 'HumanPPI'
PIPELINE_NAME = 'pipe'
MODEL_NAME = 'ppi'

# Drop anything left over from a previous run so the notebook can be executed
# again from the top. Each `exists` call returns a pandas Series; its boolean
# `exists` field is read explicitly instead of reducing the whole Series with
# `.all()`, which would also fold in the truthiness of the name/type fields.
if gds.graph.exists(GRAPH_NAME)['exists']:
    gds.graph.drop(gds.graph.get(GRAPH_NAME))
if gds.pipeline.exists(PIPELINE_NAME)['exists']:
    gds.pipeline.drop(gds.pipeline.get(PIPELINE_NAME))
if gds.model.exists(MODEL_NAME)['exists']:
    gds.model.drop(gds.model.get(MODEL_NAME))

In [None]:
# Load the PPI edge list. The file has no header row, so the two columns are
# named here; these are the names later handed to gds.graph.construct.
relationships = pd.read_csv(
    'https://snap.stanford.edu/biodata/datasets/10000/files/PP-Pathways_ppi.csv.gz',
    header=None,
    names=['sourceNodeId', 'targetNodeId'],
)
# Every edge gets the same type. Assigning the scalar broadcasts it to all
# rows, avoiding a Python-level function call per row.
relationships['relationshipType'] = 'INTERACTS'
relationships.head()

Unnamed: 0,sourceNodeId,targetNodeId,relationshipType
0,1394,2778,INTERACTS
1,6331,17999,INTERACTS
2,122704,54460,INTERACTS
3,2597,2911,INTERACTS
4,4790,79155,INTERACTS


In [None]:
# Node table: one row per distinct protein id that appears on either end of an
# edge. Sorting gives a stable row order that does not depend on set iteration.
node_ids = sorted(set(relationships['sourceNodeId']) | set(relationships['targetNodeId']))
nodes = pd.DataFrame({
    'nodeId': node_ids,
    # Each node carries its own single-element label list.
    'labels': [['Protein'] for _ in node_ids],
})
nodes

Unnamed: 0,nodeId,labels
0,1,[Protein]
1,2,[Protein]
2,131076,[Protein]
3,9,[Protein]
4,10,[Protein]
...,...,...
21552,130872,[Protein]
21553,130916,[Protein]
21554,130940,[Protein]
21555,130951,[Protein]


In [None]:
# Project the node and relationship DataFrames into an in-memory GDS graph
# named GRAPH_NAME; G is the handle used by the cells below.
G = gds.graph.construct(
    GRAPH_NAME,
    nodes,
    relationships,
)

Uploading Nodes:   0%|          | 0/21557 [00:00<?, ?Records/s]

Uploading Relationships:   0%|          | 0/342353 [00:00<?, ?Records/s]

In [None]:
# Add an undirected copy of the INTERACTS relationships to the in-memory graph
# under the new type UINTERACTS (the type later used as the training target).
gds.graph.relationships.toUndirected(
    G,
    relationship_type='INTERACTS',
    mutate_relationship_type='UINTERACTS',
)

inputRelationships                                                 342353
relationshipsWritten                                               684706
mutateMillis                                                            0
postProcessingMillis                                                    0
preProcessingMillis                                                     0
computeMillis                                                          42
configuration           {'relationshipType': 'INTERACTS', 'jobId': '5a...
Name: 0, dtype: object

In [None]:
# Create the link prediction training pipeline named PIPELINE_NAME; P is the
# pipeline handle that the following cells configure step by step.
P = gds.lp_pipe(PIPELINE_NAME)

In [None]:
# Node property step 1: PageRank, stored in the node property `pageRank`.
P.addNodeProperty(
    'pageRank',
    mutateProperty='pageRank',
)

name                                                              pipe
nodePropertySteps    [{'name': 'gds.pageRank.mutate', 'config': {'m...
featureSteps                                                        []
splitConfig          {'testFraction': 0.1, 'validationFolds': 3, 't...
autoTuningConfig                                     {'maxTrials': 10}
parameterSpace       {'MultilayerPerceptron': [], 'RandomForest': [...
Name: 0, dtype: object

In [None]:
# Node property step 2: betweenness centrality, stored in the node property
# `betweenness`.
P.addNodeProperty(
    'betweenness',
    mutateProperty='betweenness',
)

name                                                              pipe
nodePropertySteps    [{'name': 'gds.pageRank.mutate', 'config': {'m...
featureSteps                                                        []
splitConfig          {'testFraction': 0.1, 'validationFolds': 3, 't...
autoTuningConfig                                     {'maxTrials': 10}
parameterSpace       {'MultilayerPerceptron': [], 'RandomForest': [...
Name: 0, dtype: object

In [None]:
# Node property step 3: FastRP node embeddings, stored in the node property
# `embedding`. The random seed is fixed so repeated runs use the same seed.
P.addNodeProperty(
    'fastRP',
    mutateProperty='embedding',
    embeddingDimension=256,
    iterationWeights=[0.8, 1, 1, 1],
    normalizationStrength=0.5,
    randomSeed=42,
)

name                                                              pipe
nodePropertySteps    [{'name': 'gds.pageRank.mutate', 'config': {'m...
featureSteps                                                        []
splitConfig          {'testFraction': 0.1, 'validationFolds': 3, 't...
autoTuningConfig                                     {'maxTrials': 10}
parameterSpace       {'MultilayerPerceptron': [], 'RandomForest': [...
Name: 0, dtype: object

In [None]:
# Split configuration: test and train fractions of 0.3 each, with 7
# validation folds. negativeSamplingRatio is not passed here, so the
# pipeline's own default for it stays in effect.
P.configureSplit(
    testFraction=0.3,
    trainFraction=0.3,
    validationFolds=7,
)

name                                                              pipe
nodePropertySteps    [{'name': 'gds.pageRank.mutate', 'config': {'m...
featureSteps                                                        []
splitConfig          {'testFraction': 0.3, 'validationFolds': 7, 't...
autoTuningConfig                                     {'maxTrials': 10}
parameterSpace       {'MultilayerPerceptron': [], 'RandomForest': [...
Name: 0, dtype: object

In [None]:
# Register the logistic-regression model candidates: every combination of the
# penalties and tolerances below, each capped at 500 epochs. The nested
# comprehension yields the combinations penalty-major, i.e. in the same order
# as listing the six calls by hand.
lr_candidates = [
    (penalty, tolerance)
    for penalty in (0, 0.01, 0.1)
    for tolerance in (0.001, 0.01)
]
for penalty, tolerance in lr_candidates:
    pipeline_info = P.addLogisticRegression(
        penalty=penalty,
        tolerance=tolerance,
        maxEpochs=500,
    )
# Show the pipeline summary returned by the last registration.
pipeline_info

name                                                              pipe
nodePropertySteps    [{'name': 'gds.pageRank.mutate', 'config': {'m...
featureSteps                                                        []
splitConfig          {'testFraction': 0.3, 'validationFolds': 7, 't...
autoTuningConfig                                     {'maxTrials': 10}
parameterSpace       {'MultilayerPerceptron': [], 'RandomForest': [...
Name: 0, dtype: object

In [None]:
# Link feature: Hadamard combination of the `embedding` and `betweenness`
# node properties of the two endpoint nodes.
# Alternative property sets kept for reference:
#   ['embedding', 'pageRank', 'betweenness']
#   ['embedding', 'pageRank']
#   ['embedding']
# NOTE(review): `pageRank` is still added as a node property step earlier in
# the pipeline even though the active feature does not use it.
P.addFeature(
    'hadamard',
    nodeProperties=['embedding', 'betweenness'],
)

name                                                              pipe
nodePropertySteps    [{'name': 'gds.pageRank.mutate', 'config': {'m...
featureSteps         [{'name': 'HADAMARD', 'config': {'nodeProperti...
splitConfig          {'testFraction': 0.3, 'validationFolds': 7, 't...
autoTuningConfig                                     {'maxTrials': 10}
parameterSpace       {'MultilayerPerceptron': [], 'RandomForest': [...
Name: 0, dtype: object

In [None]:
# Apply the split settings once more; the values are the same as in the
# earlier configureSplit cell (0.3 test, 0.3 train, 7 validation folds).
P.configureSplit(
    testFraction=0.3,
    trainFraction=0.3,
    validationFolds=7,
)

name                                                              pipe
nodePropertySteps    [{'name': 'gds.pageRank.mutate', 'config': {'m...
featureSteps         [{'name': 'HADAMARD', 'config': {'nodeProperti...
splitConfig          {'testFraction': 0.3, 'validationFolds': 7, 't...
autoTuningConfig                                     {'maxTrials': 10}
parameterSpace       {'MultilayerPerceptron': [], 'RandomForest': [...
Name: 0, dtype: object

In [None]:
# Train the pipeline on graph G with UINTERACTS as the relationship type to
# predict, and register the resulting model under MODEL_NAME. The cell output
# is the value returned by train().
P.train(
    G,
    targetRelationshipType='UINTERACTS',
    modelName=MODEL_NAME,
)

Link Prediction Train Pipeline:   0%|          | 0/100 [00:00<?, ?%/s]

(LPModel({'modelName': {0: 'ppi'}, 'modelType': {0: 'LinkPrediction'}, 'modelInfo': {0: {'modelName': 'ppi', 'modelType': 'LinkPrediction', 'metrics': {'AUCPR': {'test': 0.8301995681053607, 'validation': {'min': 0.8302289157768568, 'max': 0.8336041261298948, 'avg': 0.8314714426236858}, 'outerTrain': 0.8314695906771933, 'train': {'min': 0.8311148786845121, 'max': 0.8316769054110157, 'avg': 0.8314697023207751}}}, 'pipeline': {'nodePropertySteps': [{'name': 'gds.pageRank.mutate', 'config': {'mutateProperty': 'pageRank', 'contextRelationshipTypes': [], 'contextNodeLabels': []}}, {'name': 'gds.betweenness.mutate', 'config': {'mutateProperty': 'betweenness', 'contextRelationshipTypes': [], 'contextNodeLabels': []}}, {'name': 'gds.fastRP.mutate', 'config': {'randomSeed': 42, 'contextRelationshipTypes': [], 'mutateProperty': 'embedding', 'iterationWeights': [0.8, 1, 1, 1], 'normalizationStrength': 0.5, 'embeddingDimension': 256, 'contextNodeLabels': []}}], 'featureSteps': [{'name': 'HADAMARD',

In [None]:
# Look up the trained model by name to obtain a handle for prediction.
model = gds.model.get(MODEL_NAME)

In [None]:
# Stream predicted links for graph G, requesting the top 100 predictions with
# a probability threshold of 0.3.
model.predict_stream(
    G,
    topN=100,
    threshold=0.3,
)

Link Prediction Predict Pipeline:   0%|          | 0/100 [00:00<?, ?%/s]

Unnamed: 0,node1,node2,probability
0,54566,54575,1.0
1,54566,54577,1.0
2,54566,54583,1.0
3,54566,54585,1.0
4,54566,54596,1.0
...,...,...,...
95,54566,54658,1.0
96,54566,54622,1.0
97,54566,54584,1.0
98,54566,54576,1.0
