In [2]:
import pandas as pd
import networkx as nx
from gensim.models import Word2Vec
import stellargraph as sg
from stellargraph import StellarGraph
from stellargraph.data import BiasedRandomWalk
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix, classification_report
from __future__ import division
import numpy as np
from numpy.linalg import inv
from scipy.spatial.distance import pdist, squareform
import pickle



2024-04-22 12:05:43.948175: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-04-22 12:05:47.005785: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Supporting Functions

In [3]:
def DSD_calculator(adjacency, walk_length, restart_p):
    """
    Compute the pairwise Diffusion State Distance (DSD) matrix of a graph.

    adjacency - square adjacency matrix (numpy array or anything
                np.asarray accepts); the graph is assumed to be connected,
                so every node must have a non-zero degree.
    walk_length - the length of random walks used to calculate DSD
                  if walk_length is negative (conventionally -1), then
                  calculate DSD at convergence
    restart_p - the restart probability (only used when walk_length >= 0)
        if restart_p = 0, then it's a traditional random walk
    returns DSD matrix (n x n) represented as a numpy array
    raises ValueError if adjacency is not a square 2-D matrix or if any
        node has zero degree (its transition row would be undefined)
    """
    # Plain ndarrays instead of np.matrix: same arithmetic, but NumPy no
    # longer recommends the matrix class.
    adjacency = np.asarray(adjacency, dtype=float)
    if adjacency.ndim != 2 or adjacency.shape[0] != adjacency.shape[1]:
        raise ValueError(
            "adjacency must be a square 2-D matrix, got shape %s" % (adjacency.shape,)
        )
    n = adjacency.shape[0]

    # Row sums kept as an (n, 1) column so the division below normalises each row.
    degree = adjacency.sum(axis=1, keepdims=True)
    if np.any(degree == 0):
        # Dividing by a zero degree would silently fill the result with NaN/inf.
        raise ValueError("adjacency contains a node with zero degree; DSD is undefined")

    # Row-stochastic transition matrix of the random walk.
    p = adjacency / degree

    if walk_length >= 0:
        # Finite walk: start from the identity and apply walk_length steps,
        # each step jumping back to the start node with probability restart_p.
        identity = np.eye(n)
        c = identity
        for _ in range(walk_length):
            c = (1 - restart_p) * np.dot(c, p) + restart_p * identity
        # DSD(u, v) is the L1 distance between rows u and v.
        return squareform(pdist(c, metric='cityblock'))

    # Convergence: pi is the degree-proportional distribution; pi.T (1, n)
    # broadcasts to a matrix whose rows are all equal. Subtracting it shifts
    # every row of the inverse by the same vector, so the pairwise row
    # distances are unaffected by that constant shift.
    pi = degree / degree.sum()
    return squareform(pdist(inv(np.eye(n) - p - pi.T), metric='cityblock'))

# Load Dataset & Construct Graph

In [4]:
# Source CSVs, read from the data/git_web_ml folder.
edges_csv_path = "data/git_web_ml/musae_git_edges.csv"
targets_csv_path = "data/git_web_ml/musae_git_target.csv"

edges_df = pd.read_csv(edges_csv_path)    # edge table
nodes_df = pd.read_csv(targets_csv_path)  # node table

In [5]:
# Load the pre-built graph from disk.
# NOTE: unpickling can execute arbitrary code -- only load graph files that
# come from a trusted source.
# The context manager closes the file handle once the graph is loaded.
with open('graph.pickle10k', 'rb') as graph_file:
    G = pickle.load(graph_file)
print(G)

Graph with 6123 nodes and 16591 edges


In [6]:
# Positional node indices 0 .. N-1, one per node in G.
num_nodes = len(G.nodes())
node_list = np.arange(num_nodes)
print(node_list)

# Shuffle in place with a dedicated, fixed-seed generator so the split is
# reproducible and independent of the global `random` state.
shuffler = random.Random(42)
shuffler.shuffle(node_list)
print(node_list)

# The first 4900 shuffled indices form the training set, the rest the test set.
trainNum, testNum = node_list[:4900], node_list[4900:]
print(trainNum)
print(testNum)

[   0    1    2 ... 6120 6121 6122]
[4336 2119 4940 ...  204  912 5238]
[4336 2119 4940 ... 1794 2886 5885]
[2344 4069  401 ...  204  912 5238]


In [7]:
# Calculate DSD
# Dense adjacency matrix of G; this is the input the commented-out
# DSD_calculator call below expects.
# edges = nx.to_numpy_array(G_github)
edges = nx.to_numpy_array(G)
# The DSD matrix is loaded from disk instead of being recomputed.
# NOTE(review): presumably DSD.npy was produced by the call below -- confirm.
# DSD = DSD_calculator(edges, -1, 0)
DSD = np.load("DSD.npy")

In [8]:
# test = DSD
# rows = len(DSD)
# columns = len(DSD[0])
# for i in range(rows):
#     for j in range(columns):
#         if DSD[i][j] == 0:
#             test[i][j] = DSD[i][j]
#         else:
#             test[i][j] = 1.0 / DSD[i][j]

# print(len(test))
# node_list = list(G.nodes())
# nodes_dftest = nodes_df
# source_list = []
# target_list = []
# weight_list = []

# for edges in G.edges():
#     node1= edges[0]
#     node2 = edges[1]
#     index1 = node_list.index(node1)
#     index2 = node_list.index(node2)
#     weight = test[index1][index2]
#     source_list.append(node1)
#     target_list.append(node2)
#     weight_list.append(weight)
    
# weighted_edges = pd.DataFrame(source_list, columns=['source'])
# weighted_edges['target'] = target_list
# weighted_edges['weight'] = weight_list
# G = StellarGraph(edges=weighted_edges)
# print(G.info())

In [9]:
# print(nodes_df.head())

# Random Walk

In [10]:
# rw = BiasedRandomWalk(G)

# walks = rw.run(
#     nodes=G.nodes(),  # root nodes
#     length=80,  # maximum length of a random walk
#     n=10,  # number of random walks per root node
#     p=0.5,  # Defines (unnormalised) probability, 1/p, of returning to source node
#     q=2,  # Defines (unnormalised) probability, 1/q, for moving away from source node
#     weighted=True,  # for weighted random walks
#     seed=42,  # random seed fixed for reproducibility
# )

In [11]:
# print(walks[110])

# Node Embedding

In [12]:
# str_walks = [[str(n) for n in walk] for walk in walks]

# model = Word2Vec(str_walks, vector_size = 16, window=10, min_count=1, sg=1, workers=4)

In [13]:
# emb_df = (
#     pd.DataFrame(
#         [model.wv.get_vector(str(n)) for n in G.nodes()],
#         index = G.nodes()
#     )
# )

# test = emb_df.merge(
#     nodes_df[['id', 'ml_target']].set_index('id'),
#     left_index = True,
#     right_index = True
# )

# test = test.sort_index()

# test.to_csv("emd_DSD.csv", index=False)

# Train Classifier

In [14]:
# Embedding table with an 'ml_target' label column.
# NOTE(review): presumably written by the commented-out embedding cell -- confirm.
test = pd.read_csv("emd_DSD.csv")

# Select rows by position using the shuffled index arrays trainNum / testNum.
trainData, testData = test.iloc[trainNum], test.iloc[testNum]

# Features are every column except the label; labels are the 'ml_target' column.
x_train, y_train = trainData.drop(columns='ml_target').values, trainData['ml_target'].values
x_test, y_test = testData.drop(columns='ml_target').values, testData['ml_target'].values

# Alternative (unused): a random 80/20 split via
# train_test_split(x, y, test_size=0.2) on the full feature/label arrays.

# Persist the split so later cells can reload it from disk.
for npy_name, split_array in (
    ('x_train.npy', x_train),
    ('x_test.npy', x_test),
    ('y_train.npy', y_train),
    ('y_test.npy', y_test),
):
    np.save(npy_name, split_array)

In [15]:
# Show the embedding table loaded from emd_DSD.csv (pandas truncates long frames).
print(test)

             0         1         2         3         4         5         6  \
0    -0.233754  1.497649 -1.516337  0.455211  0.974691 -0.707808  0.723249   
1    -0.065850  0.162517  0.513102  0.143610  0.117548 -0.749876  0.279704   
2    -0.153111  0.117002  0.484812  0.018099  0.043449 -0.734681  0.107581   
3     0.863016  1.925486  0.256789 -0.041229  0.086725 -0.332766  0.508763   
4    -0.559341  0.053292  0.166469 -0.678945  0.964164 -0.592612 -0.341839   
...        ...       ...       ...       ...       ...       ...       ...   
6118  0.725966  0.922044  0.257779  0.301038 -0.098568 -1.283566 -0.702538   
6119  0.098618  0.304279  1.126940  0.659740  1.027492 -1.311324  0.740979   
6120 -0.089114 -0.758385  0.504484 -0.116139 -0.249253 -1.261058  0.266612   
6121  0.865673 -0.178038  1.012662  0.528514  0.571117 -0.281545 -0.289072   
6122  0.974369  0.072033  0.628774  0.616380 -0.178352 -1.094258  0.301362   

             7         8         9        10        11        1

In [31]:
# Reload the persisted train/test arrays from disk.
x_train, x_test, y_train, y_test = (
    np.load(npy_name)
    for npy_name in ('x_train.npy', 'x_test.npy', 'y_train.npy', 'y_test.npy')
)

# Fit one MLP per random seed (0..99) and record its test-set accuracy,
# so the spread caused by initialisation can be summarised afterwards.
score_list = []
for random_state in range(100):
    # fit() returns the estimator itself, so construction and training chain.
    clf = MLPClassifier(max_iter=500, random_state=random_state).fit(x_train, y_train)
    score_list.append(clf.score(x_test, y_test))

print(score_list)







[0.830744071954211, 0.8331970564186427, 0.8421913327882257, 0.8397383483237939, 0.8323793949304987, 0.8364677023712184, 0.83892068683565, 0.8233851185609158, 0.8372853638593623, 0.8454619787408013, 0.8323793949304987, 0.8413736713000818, 0.8421913327882257, 0.8364677023712184, 0.8372853638593623, 0.8372853638593623, 0.8446443172526574, 0.8397383483237939, 0.8364677023712184, 0.8421913327882257, 0.8421913327882257, 0.8421913327882257, 0.8413736713000818, 0.8405560098119379, 0.8323793949304987, 0.8356500408830744, 0.8454619787408013, 0.8430089942763695, 0.83892068683565, 0.8331970564186427, 0.8356500408830744, 0.8397383483237939, 0.8356500408830744, 0.8381030253475061, 0.8340147179067866, 0.8438266557645135, 0.8372853638593623, 0.8340147179067866, 0.8462796402289452, 0.8340147179067866, 0.8242027800490597, 0.8356500408830744, 0.8364677023712184, 0.8176614881439084, 0.8405560098119379, 0.83892068683565, 0.8454619787408013, 0.8430089942763695, 0.8421913327882257, 0.8528209321340965, 0.8413



In [32]:
# Mean and (population) standard deviation of the per-seed test accuracies.
scores = np.asarray(score_list)
print(scores.mean())
print(scores.std())

0.8379067865903517
0.005958413210488855


In [15]:
# NOTE: this cell re-defines DSD_calculator, which is already defined in the
# "Supporting Functions" section; the later definition shadows the earlier one.
# Keep the two in sync, or delete this cell.
def DSD_calculator(adjacency, walk_length, restart_p):
    """
    Compute the pairwise Diffusion State Distance (DSD) matrix of a graph.

    adjacency - square adjacency matrix (numpy array or anything
                np.asarray accepts); the graph is assumed to be connected,
                so every node must have a non-zero degree.
    walk_length - the length of random walks used to calculate DSD
                  if walk_length is negative (conventionally -1), then
                  calculate DSD at convergence
    restart_p - the restart probability (only used when walk_length >= 0)
        if restart_p = 0, then it's a traditional random walk
    returns DSD matrix (n x n) represented as a numpy array
    raises ValueError if adjacency is not a square 2-D matrix or if any
        node has zero degree (its transition row would be undefined)
    """
    # Plain ndarrays instead of np.matrix: same arithmetic, but NumPy no
    # longer recommends the matrix class.
    adjacency = np.asarray(adjacency, dtype=float)
    if adjacency.ndim != 2 or adjacency.shape[0] != adjacency.shape[1]:
        raise ValueError(
            "adjacency must be a square 2-D matrix, got shape %s" % (adjacency.shape,)
        )
    n = adjacency.shape[0]

    # Row sums kept as an (n, 1) column so the division below normalises each row.
    degree = adjacency.sum(axis=1, keepdims=True)
    if np.any(degree == 0):
        # Dividing by a zero degree would silently fill the result with NaN/inf.
        raise ValueError("adjacency contains a node with zero degree; DSD is undefined")

    # Row-stochastic transition matrix of the random walk.
    p = adjacency / degree

    if walk_length >= 0:
        # Finite walk: start from the identity and apply walk_length steps,
        # each step jumping back to the start node with probability restart_p.
        identity = np.eye(n)
        c = identity
        for _ in range(walk_length):
            c = (1 - restart_p) * np.dot(c, p) + restart_p * identity
        # DSD(u, v) is the L1 distance between rows u and v.
        return squareform(pdist(c, metric='cityblock'))

    # Convergence: pi is the degree-proportional distribution; pi.T (1, n)
    # broadcasts to a matrix whose rows are all equal. Subtracting it shifts
    # every row of the inverse by the same vector, so the pairwise row
    # distances are unaffected by that constant shift.
    pi = degree / degree.sum()
    return squareform(pdist(inv(np.eye(n) - p - pi.T), metric='cityblock'))

In [38]:
# Small worked example of the distance step used in DSD_calculator.
X = np.array([
    [0.0, 2, 3],
    [1, 0, 3],
    [3, 4, 0],
])
print(X)

# pdist returns the condensed (upper-triangle) vector of pairwise row
# distances; 'cityblock' is the L1 / Manhattan metric (the default would be
# Euclidean).
dist_matrix = pdist(X, metric='cityblock')

# squareform expands the condensed vector into the full symmetric matrix.
print(squareform(dist_matrix))

[[0. 2. 3.]
 [1. 0. 3.]
 [3. 4. 0.]]
[[0. 3. 8.]
 [3. 0. 9.]
 [8. 9. 0.]]
