In [15]:
import pandas as pd
import networkx as nx
from gensim.models import Word2Vec
import stellargraph as sg
from stellargraph import StellarGraph
from stellargraph.data import BiasedRandomWalk
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neural_network import MLPClassifier

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix, classification_report
from __future__ import division
import numpy as np
from numpy.linalg import inv
from scipy.spatial.distance import pdist, squareform
import pickle

# Supporting Functions

In [2]:
def DSD_calculator(adjacency, walk_length, restart_p):
    """
    Compute the pairwise DSD matrix (presumably "Diffusion State Distance")
    of a graph from its adjacency matrix.

    adjacency - square adjacency matrix (anything np.asarray accepts).
                Every node must have non-zero degree; the graph is assumed
                to be connected.
    walk_length - number of random-walk steps used to calculate DSD.
                  Any negative value (conventionally -1) selects the
                  closed-form DSD at convergence.
    restart_p - the restart probability; if 0, it's a traditional random
                walk. Only used when walk_length >= 0 (the converged branch
                does not read it).
    returns the (n, n) DSD matrix as a plain numpy array (L1 / cityblock
            distance between the per-node rows).
    raises ValueError if adjacency is not square or a node has zero degree.
    """
    # Plain ndarray instead of np.matrix: NumPy no longer recommends the
    # matrix class, and keepdims below gives the same (n, 1) broadcasting.
    adjacency = np.asarray(adjacency, dtype=float)
    if adjacency.ndim != 2 or adjacency.shape[0] != adjacency.shape[1]:
        raise ValueError("adjacency must be a square 2-D matrix")

    n = adjacency.shape[0]
    degree = adjacency.sum(axis=1, keepdims=True)
    # A zero row sum would make the row normalisation divide by zero and
    # silently fill the result with NaN, so fail loudly instead.
    if np.any(degree == 0):
        raise ValueError("adjacency contains a node with zero degree")

    # Row-normalised adjacency: one-step transition probabilities.
    p = adjacency / degree
    identity = np.eye(n)  # hoisted: reused on every iteration below

    if walk_length >= 0:
        c = identity
        for _ in range(walk_length):
            c = (1 - restart_p) * np.dot(c, p) + restart_p * identity
        return squareform(pdist(c, metric='cityblock'))

    # Converged form: pi is the degree-proportional distribution; pi.T is a
    # (1, n) row that broadcasts across every row of the matrix.
    pi = degree / degree.sum()
    return squareform(pdist(inv(identity - p - pi.T), metric='cityblock'))

# Load Dataset & Construct Graph

In [3]:
# Read the edge table and the node-target table from data/git_web_ml/.
edges_df, nodes_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/git_web_ml/musae_git_edges.csv",
        "data/git_web_ml/musae_git_target.csv",
    )
)

In [4]:
# Load the pre-built graph object from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load graph
# pickles that come from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('graph.pickle10k', 'rb') as graph_file:
    G = pickle.load(graph_file)
print(G)

Graph with 6123 nodes and 16591 edges


In [24]:
# Positional indices 0..N-1, one per node in G.
node_list = np.arange(len(G.nodes()))
print(node_list)

# Shuffle in place with a dedicated RNG seeded at 42 so the split is repeatable.
random.Random(42).shuffle(node_list)
print(node_list)

# The first 4900 shuffled positions form the training set; the rest are held out.
trainNum, testNum = node_list[:4900], node_list[4900:]
print(trainNum)
print(testNum)

[   0    1    2 ... 6120 6121 6122]
[4336 2119 4940 ...  204  912 5238]
[4336 2119 4940 ... 1794 2886 5885]
[2344 4069  401 ...  204  912 5238]


# Random Walk

In [6]:
# G = StellarGraph.from_networkx(G)
# rw = BiasedRandomWalk(G)

# walks = rw.run(
#     nodes=G.nodes(),  # root nodes
#     length=80,  # maximum length of a random walk
#     n=10,  # number of random walks per root node
#     p=0.5,  # Defines (unnormalised) probability, 1/p, of returning to source node
#     q=2,  # Defines (unnormalised) probability, 1/q, for moving away from source node
#     weighted=False,  # for weighted random walks
#     seed=42,  # random seed fixed for reproducibility
# )

In [7]:
# print(walks[110])

[32821, 30814, 32821, 30814, 32821, 30554, 32821, 1767, 10281, 1767, 36289, 14614, 19405, 2856, 2281, 21877, 26649, 30387, 13638, 27723, 27803, 21114, 32206, 21114, 32206, 7394, 37627, 7394, 21364, 7394, 21364, 7394, 32206, 24346, 32206, 24346, 37627, 24346, 37627, 7394, 37627, 7394, 19573, 16702, 19573, 16702, 19573, 16702, 19573, 7394, 29844, 7394, 29844, 7394, 31785, 22442, 10830, 24950, 10830, 6639, 9633, 21312, 31722, 27803, 14727, 27803, 4183, 18149, 27803, 16985, 26842, 35009, 15921, 33471, 15921, 7896, 15921, 23310, 23297, 10503]


# Node Embedding

In [8]:
# str_walks = [[str(n) for n in walk] for walk in walks]

# model = Word2Vec(str_walks, vector_size = 16, window=10, min_count=1, sg=1, workers=4)

In [11]:
# emb_df = (
#     pd.DataFrame(
#         [model.wv.get_vector(str(n)) for n in G.nodes()],
#         index = G.nodes()
#     )
# )

# test = emb_df.merge(
#     nodes_df[['id', 'ml_target']].set_index('id'),
#     left_index = True,
#     right_index = True
# )


# test = test.sort_index()

# test.to_csv("emd.csv", index=False)

# Train Classifier

In [25]:
# Load the saved table: feature columns plus the 'ml_target' label column.
test = pd.read_csv("emd.csv")

# Select rows by position using the shuffled train/test index arrays.
trainData = test.iloc[trainNum]
testData = test.iloc[testNum]

# Split each partition into a feature matrix and a label vector.
x_train = trainData.drop('ml_target', axis=1).values
y_train = trainData['ml_target'].values
x_test = testData.drop('ml_target', axis=1).values
y_test = testData['ml_target'].values

# Full feature matrix and labels. Both are derived from `test`, so their rows
# are aligned and this cell does not depend on `emb_df`, which is only built
# in a commented-out cell and is undefined on a fresh kernel.
x = test.drop('ml_target', axis=1).values
y = test['ml_target'].values

# Persist the split so the training cell can reload it without recomputing.
np.save('x_trainN.npy', x_train)
np.save('x_testN.npy', x_test)
np.save('y_trainN.npy', y_train)
np.save('y_testN.npy', y_test)

In [28]:
# Reload the persisted train/test split.
x_train, x_test, y_train, y_test = (
    np.load(npy_path)
    for npy_path in ('x_trainN.npy', 'x_testN.npy', 'y_trainN.npy', 'y_testN.npy')
)

# Fit one MLP per seed (0..99) and record its accuracy on the held-out set.
# (A linear-kernel SVC was previously tried here as an alternative classifier.)
score_list = []
for seed in range(100):
    clf = MLPClassifier(max_iter=500, random_state=seed).fit(x_train, y_train)
    score_list.append(clf.score(x_test, y_test))

print(score_list)







[0.8364677023712184, 0.8282910874897792, 0.8356500408830744, 0.8266557645134914, 0.8348323793949305, 0.83892068683565, 0.8299264104660671, 0.8430089942763695, 0.8340147179067866, 0.8348323793949305, 0.8348323793949305, 0.830744071954211, 0.8315617334423548, 0.8266557645134914, 0.8348323793949305, 0.8274734260016353, 0.830744071954211, 0.8233851185609158, 0.8331970564186427, 0.8331970564186427, 0.8372853638593623, 0.8397383483237939, 0.8356500408830744, 0.8282910874897792, 0.8315617334423548, 0.8258381030253475, 0.8381030253475061, 0.8266557645134914, 0.8258381030253475, 0.8323793949304987, 0.8372853638593623, 0.8340147179067866, 0.8348323793949305, 0.8315617334423548, 0.8299264104660671, 0.8364677023712184, 0.8340147179067866, 0.8291087489779232, 0.8274734260016353, 0.8323793949304987, 0.8356500408830744, 0.8258381030253475, 0.830744071954211, 0.8152085036794767, 0.8430089942763695, 0.8397383483237939, 0.8372853638593623, 0.8331970564186427, 0.8381030253475061, 0.8413736713000818, 0.83



In [29]:
# Mean and standard deviation (numpy default, ddof=0) of the per-seed scores,
# printed one per line.
print(np.mean(score_list), np.std(score_list), sep='\n')

0.83227309893704
0.005399854049129125


In [None]:
# Sanity check of pdist/squareform on a tiny 3x3 example using the
# cityblock (L1) metric; calling pdist(X) with no metric would use the
# default Euclidean distance instead.
X = np.array([
    [0, 2, 3],
    [1, 0, 3],
    [3, 4, 0],
])
dist_matrix = pdist(X, 'cityblock')  # condensed vector of pairwise distances
print(squareform(dist_matrix))       # expanded to the full symmetric matrix