In [1]:
# import all libraries 
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.manifold import TSNE
import random
import warnings
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, f1_score
import community as comm
import pandas as pd
warnings.filterwarnings('ignore')
%run ./helper.ipynb

In [None]:
pip install littleballoffur

In [2]:
def node_classification(embeddings, label,
                        train_fractions=(0.05, 0.1, 0.2, 0.3, .4, .5, .6, .7, .8)):
    """Evaluate a node classifier on the embeddings at several training-set sizes.

    Args:
        embeddings: Node embeddings; passed unchanged to ``split_train_evaluate``.
        label: Label source handed to ``read_node_label`` (called with
            ``skip_head=True``).
        train_fractions: Iterable of training fractions; one
            ``split_train_evaluate`` call is made per value. The default is the
            list of fractions that used to be hard-coded in the body.

    Returns:
        list: The value returned by ``split_train_evaluate`` for each fraction,
        in the order the fractions were given.
    """
    X, Y = read_node_label(label, skip_head=True)

    results = []
    for tf in train_fractions:
        print("Training classifier using {:.2f}% nodes...".format(tf * 100))
        results.append(split_train_evaluate(X, Y, embeddings, tf))
    return results


        
        
def _lookup_embedding(embeddings, node):
    """Return the vector stored for ``node``, trying the raw id and then ``str(node)``."""
    try:
        return embeddings[node]
    except KeyError:
        return embeddings[str(node)]


def _build_link_samples(graph, embeddings):
    """Build edge / non-edge embedding-difference vectors and their 1 / 0 labels."""
    # Work with the real node ids: a sampled subgraph keeps the ids of the
    # parent graph, so they are generally not 0..n-1.
    nodes = sorted(graph.nodes())
    n_nodes = len(nodes)
    vectors = {node: np.asarray(_lookup_embedding(embeddings, node)) for node in nodes}

    features, labels = [], []
    for u in nodes:
        neighbor_list = list(graph.neighbors(u))
        neighbor_set = set(neighbor_list)  # O(1) membership tests while sampling

        # Positive samples: every edge is emitted once, from its smaller endpoint.
        n_pos = 0
        for v in neighbor_list:
            if v > u:
                features.append(vectors[u] - vectors[v])
                labels.append(1)
                n_pos += 1

        # Negative samples: as many as positives, fewer for very dense nodes.
        n_neg = n_pos
        if len(neighbor_list) > n_nodes // 2:
            n_neg = (n_nodes - len(neighbor_list)) // 2
        # u itself is never a valid non-neighbor (it would yield a zero vector),
        # and we can never draw more distinct non-neighbors than exist.
        excluded = neighbor_set | {u}
        n_neg = min(n_neg, n_nodes - len(excluded))

        picked = set()
        while len(picked) < n_neg:
            candidate = random.choice(nodes)
            if candidate in excluded or candidate in picked:
                continue
            picked.add(candidate)
            features.append(vectors[u] - vectors[candidate])
            labels.append(0)

    return np.array(features), np.array(labels)


def makeLinkPredictionData(graph, embeddings, train_frac=0.75):
    """Train and score a logistic-regression link predictor on node embeddings.

    For every edge (u, v) with ``v > u`` the feature vector
    ``embedding[u] - embedding[v]`` is labelled 1; for every node a matching
    number of randomly drawn non-neighbors is labelled 0. The samples are
    shuffled, split into train / test parts, a ``LogisticRegression`` is fitted
    on the train part and its accuracy on the test part is printed.

    Args:
        graph: Graph object providing ``nodes()`` and ``neighbors(node)``; node
            ids must be mutually comparable (they are sorted and compared).
        embeddings: Mapping from node id to embedding vector. Each node is
            looked up by its id as-is and, if that key is missing, by
            ``str(id)``.
        train_frac: Fraction of the shuffled samples used for training.

    Returns:
        The accuracy on the held-out samples, as returned by ``accuracy_score``.

    Raises:
        ValueError: If the graph yields no samples (it has no edges).
        KeyError: If a node of ``graph`` has no embedding.
    """
    Xd, Yd = _build_link_samples(graph, embeddings)
    if len(Yd) == 0:
        raise ValueError("graph has no edges; cannot build link-prediction samples")

    indices = np.array(range(len(Yd)))
    np.random.shuffle(indices)
    Xt = Xd[indices]
    Yt = Yd[indices]

    CV = int(len(Yt) * train_frac)
    trainX, testX = Xt[0:CV], Xt[CV:]
    trainY, testY = Yt[0:CV], Yt[CV:]

    modelLR = LogisticRegression().fit(trainX, trainY)
    predictedY = modelLR.predict(testX)
    # accuracy_score is documented as (y_true, y_pred); accuracy is symmetric,
    # so the value is the same as with the arguments swapped.
    acc = accuracy_score(testY, predictedY)
    print("Link predictions:", train_frac, ":Accuracy:", acc)
    return acc



def cluster_eval(G, embeddings, max_clusters=30):
    """Score k-means clusterings of the embeddings by graph modularity.

    For each cluster count ``k`` from 2 up to (but excluding) ``max_clusters``,
    and never more than the number of nodes, the embeddings are clustered with
    ``KMeans(n_clusters=k, random_state=0)``. The resulting node -> cluster
    assignment is scored with ``comm.community_louvain.modularity`` on ``G``.
    Every score is printed and all scores are shown in a scatter plot.

    Args:
        G: Graph object providing ``nodes()``; also passed to the modularity
            function.
        embeddings: Mapping from node id to embedding vector. Each node is
            looked up by its id as-is and, if that key is missing, by
            ``str(id)``.
        max_clusters: Exclusive upper bound on the number of clusters tried.

    Returns:
        tuple: ``(best_k, best_modularity)`` for the highest-scoring cluster
        count (the smallest such ``k`` on ties), or ``(None, None)`` when no
        cluster count was evaluated.

    Raises:
        ValueError: If ``G`` has fewer than 2 nodes.
        KeyError: If a node of ``G`` has no embedding.
    """
    nodes = list(G.nodes())
    if len(nodes) < 2:
        raise ValueError("cluster_eval needs a graph with at least 2 nodes")

    def _vector_of(node):
        try:
            return embeddings[node]
        except KeyError:
            return embeddings[str(node)]

    # Row i of X belongs to nodes[i]; the partition below is keyed by the same
    # node ids so that it lines up with the nodes of G.
    X = np.array([_vector_of(node) for node in nodes])

    # k-means cannot produce more clusters than there are samples.
    cluster_counts = list(range(2, min(max_clusters, len(nodes) + 1)))
    allmodularity = []
    for cls in cluster_counts:
        # find clusters using a kmeans clustering algorithm on the embedding
        clusters = KMeans(n_clusters=cls, random_state=0).fit(X)
        predG = {node: int(lbl) for node, lbl in zip(nodes, clusters.labels_)}

        # compute the modularity score of the Kmeans clustering
        modularity = comm.community_louvain.modularity(predG, G)
        allmodularity.append(modularity)
        print("Number of clusters: ", cls, "  Modularity: ", modularity)

    plt.scatter(cluster_counts, allmodularity)
    plt.xlabel("Number of clusters")
    plt.ylabel("Modularity score")
    plt.show()

    if not allmodularity:
        return None, None
    # max() keeps the first maximum, i.e. the smallest k among equal scores;
    # modularity may be negative, so no 0 baseline is assumed.
    best_idx = max(range(len(allmodularity)), key=allmodularity.__getitem__)
    return cluster_counts[best_idx], allmodularity[best_idx]

    
    
def plot_embeddings(embeddings, label):
    """Scatter-plot a 2-D t-SNE projection of the labelled nodes' embeddings.

    ``read_node_label(label, skip_head=True)`` supplies the node keys and their
    labels. Each key is looked up in ``embeddings``, the vectors are projected
    with ``TSNE(n_components=2)``, and one scatter series is drawn per distinct
    first label, followed by a legend.

    Args:
        embeddings: Mapping indexed by the node keys returned by
            ``read_node_label``.
        label: Label source handed to ``read_node_label``.
    """
    node_keys, node_labels = read_node_label(label, skip_head=True)

    vectors = np.array([embeddings[key] for key in node_keys])
    coords = TSNE(n_components=2).fit_transform(vectors)

    # Collect the row positions of all nodes sharing the same first label.
    rows_by_label = {}
    for row in range(len(node_keys)):
        rows_by_label.setdefault(node_labels[row][0], []).append(row)

    for group, rows in rows_by_label.items():
        plt.scatter(coords[rows, 0], coords[rows, 1], label=group)
    plt.legend()
    plt.show()

In [3]:
### Read the twitch streamers dataset as a networkx graph
# `graphfile` is the single place the edge-list path is defined; the file is
# parsed into an undirected nx.Graph with integer node ids (nodetype=int).
graphfile = 'edges_twich_gamers.txt'
G = nx.read_edgelist(graphfile, create_using=nx.Graph(), nodetype=int)
print("Number of nodes: ", G.number_of_nodes())
print("Number of edges: ", G.number_of_edges())


Number of nodes:  168114
Number of edges:  6797557


In [5]:
### Embedding on the entire graph using DeepWalk
# Generate the walks on G first, then learn the embedding vectors from them.
# Both helpers are defined outside this notebook's visible cells.
walks_deepwalk = deepwalk_walks(
    G,
    walk_length=6,
    num_walks=10,
)
embeddings_deepwalk = get_embedding(G, walks_deepwalk)

Learning embedding vectors...
Learning embedding vectors done!


In [None]:
# Draw a 100-node sample of G with littleballoffur's random-walk sampler
# and report the size of the sample.
import networkx as nx
from littleballoffur import RandomWalkSampler

model = RandomWalkSampler(number_of_nodes=100)
new_graph_rw_100 = model.sample(G)
print(
    "Number of nodes: ",
    new_graph_rw_100.number_of_nodes(),
)
print(
    "Number of edges: ",
    new_graph_rw_100.number_of_edges(),
)

In [6]:
# Draw a 10000-node sample of G with littleballoffur's random-walk sampler
# and report the size of the sample.
import networkx as nx
from littleballoffur import RandomWalkSampler

model = RandomWalkSampler(number_of_nodes=10000)
new_graph_rw = model.sample(G)
print(
    "Number of nodes: ",
    new_graph_rw.number_of_nodes(),
)
print(
    "Number of edges: ",
    new_graph_rw.number_of_edges(),
)

Number of nodes:  10000
Number of edges:  468545


In [7]:
# node2vec walks and embeddings on the random-walk sample.
# NOTE(review): the last recorded run of this cell ended with
# `KeyError: (14513, 10097)`; the cause lies in helper code that is not
# visible here and still needs to be investigated.
walks_node2vec = node2vec_walks(
    new_graph_rw,
    p=0.25,
    q=2,
    walk_length=6,
    num_walks=10,
)
embeddings_node2vec = get_embedding(new_graph_rw, walks_node2vec)

KeyError: (14513, 10097)

In [10]:
# Sizes of the connected components of G, largest first.
cc_sorted = sorted(
    (len(component) for component in nx.connected_components(G)),
    reverse=True,
)
cc_sorted

[168114]

In [None]:
### Embedding using deepwalk
# Walks and embeddings are computed on the random-walk sample `new_graph_rw`.
# NOTE(review): the recorded output of this cell shows exceptions raised in
# gensim word2vec worker threads; the run should be re-checked.
walks_deepwalk = deepwalk_walks(
    new_graph_rw,
    walk_length=6,
    num_walks=10,
)
embeddings_deepwalk = get_embedding(new_graph_rw, walks_deepwalk)

Exception in thread Exception in thread Exception in thread Thread-7Thread-6:
Traceback (most recent call last):
:
Traceback (most recent call last):
  File "C:\Users\bhakt\anaconda3\lib\threading.py", line 973, in _bootstrap_inner
Thread-5  File "C:\Users\bhakt\anaconda3\lib\threading.py", line 973, in _bootstrap_inner
:
Traceback (most recent call last):
  File "C:\Users\bhakt\anaconda3\lib\threading.py", line 973, in _bootstrap_inner
        self.run()
  File "C:\Users\bhakt\anaconda3\lib\threading.py", line 910, in run
    self.run()
  File "C:\Users\bhakt\anaconda3\lib\threading.py", line 910, in run
self.run()
  File "C:\Users\bhakt\anaconda3\lib\threading.py", line 910, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\bhakt\anaconda3\lib\site-packages\gensim\models\word2vec.py", line 1162, in _worker_loop
    self._target(*self._args, **self._kwargs)
      File "C:\Users\bhakt\anaconda3\lib\site-packages\gensim\models\word2vec.py", line 1162, in _worker_loop

Learning embedding vectors...


In [None]:
# Report the size of the random-walk sample.
print(
    "Number of nodes: ",
    new_graph_rw.number_of_nodes(),
)
print(
    "Number of edges: ",
    new_graph_rw.number_of_edges(),
)

In [None]:
### Random Neighbor sampling
# Sample G with littleballoffur's RandomNodeNeighborSampler (default settings)
# and report the size of the sample.
from littleballoffur.exploration_sampling import RandomNodeNeighborSampler

sampler = RandomNodeNeighborSampler()
new_graph_rn = sampler.sample(G)
print(
    "Number of nodes: ",
    new_graph_rn.number_of_nodes(),
)
print(
    "Number of edges: ",
    new_graph_rn.number_of_edges(),
)

In [None]:
# Take the first node yielded by the sample and show the Python type of its id.
first_node = next(
    iter(new_graph_rn)
)
print(
    type(first_node)
)


In [None]:
### Sample the graph using littleballoffur
# Metropolis-Hastings random-walk sample sized at 10% of G's node count
# (truncated to an integer); the size of the sample is printed afterwards.
from littleballoffur import MetropolisHastingsRandomWalkSampler

number_of_nodes = int(0.1 * G.number_of_nodes())
sampler = MetropolisHastingsRandomWalkSampler(
    number_of_nodes=number_of_nodes,
)
new_graph_exp = sampler.sample(G)
print(
    "Number of nodes: ",
    new_graph_exp.number_of_nodes(),
)
print(
    "Number of edges: ",
    new_graph_exp.number_of_edges(),
)

In [None]:
# Breadth first search sampler example: sample G with the sampler's default settings.
from littleballoffur.exploration_sampling import BreadthFirstSearchSampler

sampler = BreadthFirstSearchSampler()
new_graph = sampler.sample(G)


In [None]:
# G.nodes()

In [None]:
# new_graph_exp.nodes()

In [None]:
# Dump `new_graph_exp` to a space-delimited text file, formatting entries with "%s".
# NOTE(review): a networkx graph iterates over its node ids, so this presumably
# writes one node id per line rather than an edge list — confirm that is intended.
np.savetxt("new_graph_exp.txt", new_graph_exp, delimiter=" ", fmt="%s")

In [None]:
# Dump `G` to a space-delimited text file, formatting entries with "%s".
# NOTE(review): a networkx graph iterates over its node ids, so this presumably
# writes one node id per line rather than an edge list — confirm that is intended.
np.savetxt("G.txt", G, delimiter=" ", fmt="%s")


In [None]:
### Embedding using deepwalk
# Walks and embeddings are computed on the random-neighbor sample `new_graph_rn`;
# this rebinds `walks_deepwalk` and `embeddings_deepwalk`.
walks_deepwalk = deepwalk_walks(
    new_graph_rn,
    walk_length=6,
    num_walks=10,
)
embeddings_deepwalk = get_embedding(new_graph_rn, walks_deepwalk)

In [None]:
# Evaluate link prediction on the graph the embeddings belong to:
# `embeddings_deepwalk` is most recently computed from `new_graph_rn`, so it
# must not be paired with the different sample `new_graph_rw`.
makeLinkPredictionData(new_graph_rn, embeddings_deepwalk)