In [1]:
import pandas as pd
import networkx as nx
from gensim.models import Word2Vec
import stellargraph as sg
from stellargraph import StellarGraph
from stellargraph.data import BiasedRandomWalk
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix, classification_report
from __future__ import division
import numpy as np
from numpy.linalg import inv
from scipy.spatial.distance import pdist, squareform
import pickle

2022-12-08 00:41:55.798549: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Supporting Functions

In [2]:
def DSD_calculator(adjacency, walk_length, restart_p):
    """
    adjacency - adjacency matrix represented as a numpy array
                assumes graph is fully connected.
    walk_length - the length of random walks used to calculate DSD
                  if walk_length = -1, then calculate DSD at convergence
    restart_p - the restart probability
        if p = 0, then it's a traditional random walk
    returns DSD matrix represented as a numpy array
    """
    adjacency = np.asmatrix(adjacency)
    n = adjacency.shape[0]
    degree = adjacency.sum(axis=1)
    p = adjacency / degree
    if walk_length >= 0:
        c = np.eye(n)
        for i in range(walk_length):
            c = (1 - restart_p) * np.dot(c, p) + restart_p * np.eye(n)
        return squareform(pdist(c,metric='cityblock'))
    else:
        pi = degree / degree.sum()
        return squareform(pdist(inv(np.eye(n) - p - pi.T),metric='cityblock'))

In [3]:
def DSD_calculator(adjacency, walk_length, restart_p):
    """
    adjacency - adjacency matrix represented as a numpy array
                assumes graph is fully connected.
    walk_length - the length of random walks used to calculate DSD
                  if walk_length = -1, then calculate DSD at convergence
    restart_p - the restart probability
        if p = 0, then it's a traditional random walk
    returns DSD matrix represented as a numpy array
    """
    adjacency = np.asmatrix(adjacency)
    n = adjacency.shape[0]
    degree = adjacency.sum(axis=1)
    p = adjacency / degree
    if walk_length >= 0:
        c = np.eye(n)
        for i in range(walk_length):
            c = (1 - restart_p) * np.dot(c, p) + restart_p * np.eye(n)
        return squareform(pdist(c,metric='cityblock'))
    else:
        pi = degree / degree.sum()
        return squareform(pdist(inv(np.eye(n) - p - pi.T),metric='cityblock'))

# Load Dataset & Construct Graph

In [4]:
edges_df = pd.read_csv("data/git_web_ml/musae_git_edges.csv")
nodes_df = pd.read_csv("data/git_web_ml/musae_git_target.csv")
G_github = nx.from_pandas_edgelist(edges_df, source="id_1", target='id_2')
print('-----------------------')
print('Original Graph:')
print(nx.info(G_github))

sample_num = 3000
sample_nodes = random.choices(list(G_github.nodes()), k=sample_num)
G = G_github.subgraph(sample_nodes)
print('-----------------------')
print('Picked graph with 3000 nodes:')
print(nx.info(G))
largest_cc = max(nx.connected_components(G), key=len)
G = G.subgraph(largest_cc)
print('-----------------------')
print('Fully Connected smaller graph:')
print(nx.info(G))
G_github = G
# print(G_github.nodes())
G = StellarGraph.from_networkx(G_github)

-----------------------
Original Graph:
Graph with 37700 nodes and 289003 edges
-----------------------
Picked graph with 3000 nodes:
Graph with 2883 nodes and 1525 edges
-----------------------
Fully Connected smaller graph:
Graph with 963 nodes and 1423 edges


In [5]:
# Calculate DSD
edges = nx.to_numpy_matrix(G_github)
DSD = DSD_calculator(edges, 20, 0.2)

In [6]:
test = DSD
rows = len(DSD)
columns = len(DSD[0])
for i in range(rows):
    for j in range(columns):
        if DSD[i][j] == 0:
            test[i][j] = DSD[i][j]
        else:
            test[i][j] = 1.0 / DSD[i][j]

print(len(test))
node_list = list(G.nodes())
nodes_dftest = nodes_df
source_list = []
target_list = []
weight_list = []

for edges in G.edges():
    node1= edges[0]
    node2 = edges[1]
    index1 = node_list.index(node1)
    index2 = node_list.index(node2)
    weight = test[index1][index2]
    source_list.append(node1)
    target_list.append(node2)
    weight_list.append(weight)
    
weighted_edges = pd.DataFrame(source_list, columns=['source'])
weighted_edges['target'] = target_list
weighted_edges['weight'] = weight_list
G = StellarGraph(edges=weighted_edges)
# print(G.info())

963


In [7]:
print(nodes_df.head())

   id          name  ml_target
0   0        Eiryyy          0
1   1    shawflying          0
2   2   JpMCarrilho          1
3   3     SuhwanCha          0
4   4  sunilangadi2          1


# Random Walk

In [8]:
rw = BiasedRandomWalk(G)

walks = rw.run(
    nodes=G.nodes(),  # root nodes
    length=80,  # maximum length of a random walk
    n=10,  # number of random walks per root node
    p=0.5,  # Defines (unormalised) probability, 1/p, of returning to source node
    q=2,  # Defines (unormalised) probability, 1/q, for moving away from source node
    weighted=True,  # for weighted random walks
    seed=42,  # random seed fixed for reproducibility
)

In [9]:
print(walks[110])

[36890, 26595, 11685, 26595, 11685, 26595, 36890, 26595, 36890, 26595, 36890, 26595, 36890, 26595, 11685, 26595, 11685, 26595, 11685, 26595, 11685, 26595, 11685, 26595, 11685, 26595, 36890, 26595, 15195, 27173, 24894, 27173, 24894, 27173, 15828, 27173, 24894, 27173, 24894, 27173, 37505, 27173, 15195, 27173, 24894, 27173, 36819, 15875, 27925, 15875, 35089, 15875, 35089, 33863, 36576, 33863, 36576, 33863, 36576, 33863, 35089, 33863, 1737, 19876, 19987, 19876, 23098, 19876, 19987, 19876, 19859, 19876, 19987, 19876, 25498, 19876, 25498, 16282, 12630, 25498]


# Node Embedding

In [10]:
str_walks = [[str(n) for n in walk] for walk in walks]

model = Word2Vec(str_walks, vector_size = 16, window=10, min_count=1, sg=1, workers=4)

In [11]:
emb_df = (
    pd.DataFrame(
        [model.wv.get_vector(str(n)) for n in G.nodes()],
        index = G.nodes()
    )
)

In [12]:
test = emb_df.merge(
    nodes_df[['id', 'ml_target']].set_index('id'),
    left_index = True,
    right_index = True
)
print(test.head())

              0         1         2         3         4         5         6  \
10240 -2.670354 -0.894849 -1.434834 -2.842315  1.434970 -2.563948 -1.325668   
20486 -1.364016 -0.573321 -0.775646  0.205674 -0.042246 -0.654841 -1.750834   
10    -0.753459 -0.888934 -0.741287 -0.837583 -0.469711 -0.395925 -0.576131   
16396 -1.072069  0.501650 -1.001305 -1.274024  1.769316 -0.956774  0.250618   
24590 -0.875853 -0.131014 -0.121556 -1.087274  2.637292 -0.253239 -0.069273   

              7         8         9        10        11        12        13  \
10240  0.993446  3.734835  0.659477  2.740249 -2.967784  0.411044 -0.901932   
20486  1.570562  0.310480 -0.143882  0.707318 -1.362441  0.438214  0.482301   
10     0.649408  1.365623  0.930770  1.512255 -0.412556 -0.453594 -0.500055   
16396  0.939766  0.253950  0.506356 -0.859607 -2.262495  0.305720 -0.370935   
24590  0.104321 -0.452859 -0.136926  1.611903 -2.000164 -0.097123 -0.207105   

             14        15  ml_target  
10240 -0.59

# Train Classifier

In [13]:
x = emb_df.values
y = test['ml_target'].values

x_train, x_test, y_train, y_test = train_test_split(
    x, 
    y,
    test_size = 0.3
)

# GBC classifier
clf = GradientBoostingClassifier()

# train the model
clf.fit(x_train, y_train)

# Evaluate Classifier

In [14]:
print(clf.score(x_test, y_test))

0.8650519031141869


In [15]:
def DSD_calculator(adjacency, walk_length, restart_p):
    """
    adjacency - adjacency matrix represented as a numpy array
                assumes graph is fully connected.
    walk_length - the length of random walks used to calculate DSD
                  if walk_length = -1, then calculate DSD at convergence
    restart_p - the restart probability
        if p = 0, then it's a traditional random walk
    returns DSD matrix represented as a numpy array
    """
    adjacency = np.asmatrix(adjacency)
    n = adjacency.shape[0]
    degree = adjacency.sum(axis=1)
    p = adjacency / degree
    if walk_length >= 0:
        c = np.eye(n)
        for i in range(walk_length):
            c = (1 - restart_p) * np.dot(c, p) + restart_p * np.eye(n)
        return squareform(pdist(c,metric='cityblock'))
    else:
        pi = degree / degree.sum()
        return squareform(pdist(inv(np.eye(n) - p - pi.T),metric='cityblock'))

In [26]:
X = np.array([[0,2,3], [1,0,3], [3,4,0]])
dist_matrix = pdist(X, metric = 'cityblock')
# dist_matrix = pdist(X)
# print(dist_matrix)
print(squareform(dist_matrix))

[[0. 3. 8.]
 [3. 0. 9.]
 [8. 9. 0.]]
