<a href="https://colab.research.google.com/github/jinhangjiang/JJ_Freq_Used_Code_Library/blob/main/graph/node2vec_template.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up environment

#### MIT License

MIT License

Copyright (c) [2021] [Jinhang Jiang]

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

#### Install node2vec (not installed by default)

In [None]:
!pip install node2vec

Collecting node2vec
  Downloading node2vec-0.4.3.tar.gz (4.6 kB)
Building wheels for collected packages: node2vec
  Building wheel for node2vec (setup.py) ... done
  Created wheel for node2vec: filename=node2vec-0.4.3-py3-none-any.whl size=5978 sha256=9bd900ed702af93163caeb6474c5f41b6786295c95ca2d7e921298781e4d155a
  Stored in directory: /root/.cache/pip/wheels/07/62/78/5202cb8c03cbf1593b48a8a442fca8ceec2a8c80e22318bae9
Successfully built node2vec
Installing collected packages: node2vec
Successfully installed node2vec-0.4.3


#### Load packages

In [None]:
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder
import networkx as nx
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Define functions

In [None]:
def convert_graph(edgelist,name):
  """Build an undirected, weighted NetworkX graph from a 3-column edge list.

  Parameters
  ----------
  edgelist : pandas.DataFrame
      Edge list whose columns are, by position, (source, target, weight).
      Column names are ignored; only the column order matters.
  name : str
      Label stored on the graph as ``graph.name`` and shown in the summary.

  Returns
  -------
  networkx.Graph
      Graph whose edges carry the third column as the ``weight`` attribute.

  Raises
  ------
  ValueError
      If ``edgelist`` is non-empty and does not have exactly three columns.
  """
  rows = edgelist.values
  # Fail early with a readable message instead of an unpacking error from
  # deep inside NetworkX.
  if len(rows) and (rows.ndim != 2 or rows.shape[1] != 3):
    raise ValueError(
        "edgelist must have exactly 3 columns (source, target, weight); "
        "got shape {}".format(rows.shape))

  # create an empty graph and load the (source, target, weight) triples into it
  graph = nx.Graph()
  graph.add_weighted_edges_from(tuple(row) for row in rows)
  graph.name = name

  # nx.info() was removed in NetworkX 3.0, so the summary is printed explicitly;
  # this works on both old and new NetworkX releases.
  print("Name:", graph.name)
  print("Type:", type(graph).__name__)
  print("Number of nodes:", graph.number_of_nodes())
  print("Number of edges:", graph.number_of_edges())
  print("Graph density:",nx.density(graph))
  print("------------------------------------")
  print("************************************")
  print("------------------------------------")
  return graph

In [None]:
def fitnode2vce(graph, walklength=30, numwalks=10, windows=10, dimensions=None, workers=4):
  """Generate node2vec random walks on ``graph`` and fit the embedding model.

  Parameters
  ----------
  graph : networkx.Graph
      Graph to embed; edge weights are read from the ``"weight"`` attribute.
  walklength : int, default 30
      Passed to ``Node2Vec`` as ``walk_length``.
  numwalks : int, default 10
      Passed to ``Node2Vec`` as ``num_walks``.
  windows : int, default 10
      Passed to ``fit`` as ``window``.
  dimensions : int or None, default None
      Embedding size. When ``None`` it is derived from the node count with the
      fourth-root rule of thumb described below.
  workers : int, default 4
      Passed to ``Node2Vec`` as ``workers``.

  Returns
  -------
  The fitted model returned by ``Node2Vec.fit``.
  """
  if dimensions is None:
    # The fourth-root heuristic is inspired by the Machine Learning Crash Course
    # from Google (embeddings lecture, slide 32 on dimensions/vector size):
    # https://developers.google.com/machine-learning/crash-course/embeddings/video-lecture
    # Clamp to 1 so an empty graph never requests a zero-width embedding.
    dimensions = max(1, round(len(graph.nodes)**0.25))
  setup = Node2Vec(graph, dimensions=dimensions, walk_length=walklength,
                   num_walks=numwalks, workers=workers, weight_key="weight")
  # min_count=1 is forwarded to the underlying fit call unchanged.
  model = setup.fit(window=windows, min_count=1)
  return model

In [None]:
def getnodeemb(model):
  """Return the node embeddings of a fitted model as a DataFrame.

  Parameters
  ----------
  model
      Fitted model exposing ``model.wv`` with a ``vectors`` array and either
      ``key_to_index`` (gensim >= 4) or the legacy ``vocab`` mapping whose
      values have an ``index`` attribute (gensim < 4).

  Returns
  -------
  pandas.DataFrame
      One row per node, indexed by node name, one column per embedding
      dimension (columns are labelled 0..d-1).
  """
  keyed_vectors = model.wv

  # gensim 4 removed `wv.vocab` in favour of `wv.key_to_index`; prefer the new
  # attribute and fall back to the legacy one so both versions work.
  key_to_index = getattr(keyed_vectors, "key_to_index", None)
  if key_to_index is None:
    key_to_index = {key: entry.index for key, entry in keyed_vectors.vocab.items()}

  # node names and the row of `vectors` that holds each node's embedding
  names = list(key_to_index)
  rows = np.fromiter((key_to_index[key] for key in names), dtype=int, count=len(names))

  # init dataframe using embedding vectors and set index as node name
  node2vec_output = pd.DataFrame(keyed_vectors.vectors[rows])
  node2vec_output.index = names
  return node2vec_output

In [None]:
def _split_edge_token(token):
  """Recover (source, target) node names from an edge key like "('a', 'b')"."""
  import ast
  import re
  try:
    parsed = ast.literal_eval(token)
  except (ValueError, SyntaxError, TypeError):
    parsed = None
  if isinstance(parsed, tuple) and len(parsed) == 2:
    # Exact parse: keeps punctuation and spaces inside node names intact.
    return str(parsed[0]), str(parsed[1])
  # Fallback for keys that are not a 2-tuple literal: strip punctuation and
  # split on the first space.
  stripped = re.sub(r'[^\w\s]', '', str(token))
  head, _, tail = stripped.partition(' ')
  return head, (tail if tail else None)


def getedgeemb(edge_emb):
  """Return all edge embeddings of an edge embedder as a DataFrame.

  Parameters
  ----------
  edge_emb
      Object exposing ``as_keyed_vectors()`` (e.g. the ``HadamardEmbedder``
      used in this notebook). The returned keyed vectors are expected to be
      keyed by the string form of a node pair, e.g. ``str(('i255', 'z95810'))``.

  Returns
  -------
  pandas.DataFrame
      Columns ``Source``, ``Target`` followed by one column per embedding
      dimension (labelled 0..d-1); one row per edge key.
  """
  edges_kv = edge_emb.as_keyed_vectors()

  # Older gensim exposed a `.wv` alias on keyed vectors; newer gensim does not,
  # so use the alias when present and the object itself otherwise.
  keyed_vectors = getattr(edges_kv, "wv", edges_kv)

  # gensim 4 replaced `vocab` with `key_to_index`; support both.
  key_to_index = getattr(keyed_vectors, "key_to_index", None)
  if key_to_index is None:
    key_to_index = {key: entry.index for key, entry in keyed_vectors.vocab.items()}

  # edge keys and the row of `vectors` that holds each edge's embedding
  tokens = list(key_to_index)
  rows = np.fromiter((key_to_index[token] for token in tokens), dtype=int, count=len(tokens))

  # init dataframe using embedding vectors, then prepend the two endpoint columns
  edge_output = pd.DataFrame(keyed_vectors.vectors[rows])
  endpoints = [_split_edge_token(token) for token in tokens]
  edge_output.insert(0, "Target", [target for _, target in endpoints])
  edge_output.insert(0, "Source", [source for source, _ in endpoints])
  return edge_output

# Load Data

In [None]:
# Read the data. The rest of the notebook assumes a 3-column weighted edge list
# in the order (Source, Target, Weight).
# A sample file (edge1.csv) is available at: https://raw.githubusercontent.com/jinhangjiang/Disnet_ERGM/main/Data/0.2/edge1.csv
# NOTE(review): the call below reads "Edgelist_all.csv" with a "Weight20192" column,
# not edge1.csv -- confirm which file/column is intended.
# `usecols` does not control column order (the file's order is kept), and
# convert_graph reads columns by position, so the columns are re-selected explicitly.
edge = pd.read_csv("Edgelist_all.csv", usecols=["Source","Target","Weight20192"])[["Source","Target","Weight20192"]]

# Creating Graphs

In [None]:
%%time
# Build the graph from the edge list loaded above; convert_graph also prints a
# short summary and the density. %%time reports how long this cell takes.
g = convert_graph(edge, "disease network of 20192")

# Fit node2vec

In [None]:
# You may change the parameters here; the defaults for walklength, numwalks, and windows are 30, 10, and 10, respectively.
# The number of embedding dimensions is derived from the node count inside fitnode2vce (see the "Define functions" section); change it there if needed.
# Increasing numwalks can increase the fitting time considerably.

model = fitnode2vce(g, walklength=10,numwalks=5,windows=10)

# Get node embeddings as a DataFrame

In [None]:
# One row per node (indexed by node name), one column per embedding dimension.
output = getnodeemb(model)

In [None]:
# Display the node-embedding DataFrame built in the previous cell.
output

In [None]:
#output.to_csv("dis_embedding.csv",index=True)

# Node Similarity

In [None]:
# Five nodes most similar to node "j449" in the embedding space.
# NOTE(review): "j449" must be a node of the fitted graph -- replace it when using other data.
model.wv.most_similar("j449",topn=5)

# Get edge embeddings as a DataFrame

In [None]:
# Build an edge embedder on top of the fitted node vectors.
# NOTE(review): going by its name, HadamardEmbedder combines the two endpoint
# vectors with an element-wise (Hadamard) product -- confirm in the node2vec docs.
edges_embs = HadamardEmbedder(keyed_vectors=model.wv)

In [None]:
# Columns: Source, Target, then one column per embedding dimension.
edge_output = getedgeemb(edges_embs)

In [None]:
# Display the edge-embedding DataFrame built in the previous cell.
edge_output

In [None]:
#edge_output.to_csv("dis_edge_embedding.csv")

# Edge Similarity

In [None]:
# Keyed vectors over edges, needed for similarity queries below.
# Note: getedgeemb() makes the same as_keyed_vectors() call internally, so this
# repeats that computation.
edges_kv = edges_embs.as_keyed_vectors()

In [None]:
# Edges are looked up by the string form of a node-pair tuple.
# NOTE(review): both nodes must exist in the graph, and the pair order presumably
# has to match the order used when the keys were generated -- verify for other data.
edges_kv.most_similar(str(('i255', 'z95810')))