In [29]:
import itertools
import random

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from gensim.models import Word2Vec
from joblib import Parallel,delayed
from sklearn.linear_model import LogisticRegression
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score, f1_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [25]:
def partition_num(num, workers):
    """Split ``num`` units of work into per-worker chunk sizes.

    Returns ``workers`` chunks of ``num // workers`` each. When ``num`` is not
    evenly divisible, one extra trailing chunk holding the remainder is
    appended, so the list then has ``workers + 1`` entries.
    """
    base, leftover = divmod(num, workers)
    chunks = [base] * workers
    if leftover:
        # Uneven split: the remainder becomes its own additional chunk.
        chunks.append(leftover)
    return chunks

class RandomWalker:
    """Generate random walks over a graph for DeepWalk / node2vec training.

    Parameters
    ----------
    G : graph object
        Must provide ``nodes()`` and ``neighbors(node)``; ``has_edge(u, v)``
        is additionally required when ``p != 1`` or ``q != 1``.
    p : float
        node2vec return parameter (weight ``1/p`` for stepping back to the
        previous node). Must be positive.
    q : float
        node2vec in-out parameter (weight ``1/q`` for stepping to a node that
        is not adjacent to the previous node). Must be positive.

    With ``p == q == 1`` the walker uses plain uniform (DeepWalk) walks.
    """

    def __init__(self,G,p=1,q=1):
        if p <= 0 or q <= 0:
            # 1/p and 1/q are used as sampling weights in node2vec_walk.
            raise ValueError("p and q must be positive, got p={!r}, q={!r}".format(p, q))
        self.G = G
        self.p = p
        self.q = q

    def deepwalk_walk(self,walk_length,start_node):
        """Return one uniform random walk of at most ``walk_length`` nodes.

        The walk stops early if it reaches a node without neighbors.
        """
        walk = [start_node]

        while len(walk) < walk_length:
            cur = walk[-1]
            cur_nbrs = list(self.G.neighbors(cur))
            if len(cur_nbrs) > 0:
                walk.append(random.choice(cur_nbrs))
            else:
                break
        return walk

    def node2vec_walk(self,walk_length,start_node):
        """Return one second-order (node2vec-style) biased random walk.

        The first step is uniform. Afterwards, with ``prev`` the node visited
        before ``cur``, each neighbor ``x`` of ``cur`` is sampled with weight
        ``1/p`` if ``x == prev``, ``1`` if ``G.has_edge(x, prev)``, and
        ``1/q`` otherwise. Edge weights are ignored, matching
        ``deepwalk_walk``. Stops early at a node without neighbors.
        """
        G = self.G
        back_weight = 1.0 / self.p
        far_weight = 1.0 / self.q
        walk = [start_node]

        while len(walk) < walk_length:
            cur = walk[-1]
            cur_nbrs = list(G.neighbors(cur))
            if not cur_nbrs:
                break
            if len(walk) == 1:
                # No previous node yet, so there is nothing to bias against.
                walk.append(random.choice(cur_nbrs))
                continue
            prev = walk[-2]
            weights = []
            for nbr in cur_nbrs:
                if nbr == prev:
                    weights.append(back_weight)
                elif G.has_edge(nbr, prev):
                    weights.append(1.0)
                else:
                    weights.append(far_weight)
            walk.append(random.choices(cur_nbrs, weights=weights, k=1)[0])
        return walk

    def simulate_walks(self,num_walks,walk_length,workers=1,verbose=0):
        """Run ``num_walks`` walks from every node, split across joblib workers.

        Returns a flat list of walks (each a list of nodes).
        """
        G = self.G

        nodes = list(G.nodes())

        results = Parallel(n_jobs=workers,verbose=verbose,)(
            delayed(self._simulate_walks)(nodes,num,walk_length)
            for num in partition_num(num_walks,workers))

        walks = list(itertools.chain(*results))

        return walks

    def _simulate_walks(self,nodes,num_walks,walk_length,):
        """Worker body: ``num_walks`` passes over ``nodes`` in shuffled order.

        Each pass starts one walk from every node. The walk type is chosen
        from ``p``/``q``: uniform when both equal 1, node2vec otherwise.
        """
        # Shuffle a private copy so the caller's list is not reordered.
        nodes = list(nodes)
        # The walk type is invariant across the loop, so pick it once.
        if self.p == 1 and self.q == 1:
            walk_fn = self.deepwalk_walk
        else:
            walk_fn = self.node2vec_walk

        walks = []
        for _ in range(num_walks):
            random.shuffle(nodes)
            for v in nodes:
                walks.append(walk_fn(walk_length=walk_length,start_node=v))
        return walks

In [9]:
class DeepWalk:
    """DeepWalk embedding: random walks over a graph fed to skip-gram Word2Vec.

    Construction immediately generates the walk corpus (``self.sentences``);
    call ``train`` to fit Word2Vec and ``get_embeddings`` to read the vectors.
    """

    def __init__(self,graph,walk_length,num_walks,workers=1):
        """Generate ``num_walks`` uniform walks of ``walk_length`` per node."""
        self.graph = graph
        self.w2v_model = None
        self._embeddings = {}

        # p = q = 1 selects the unbiased (DeepWalk) walk in RandomWalker.
        self.walker = RandomWalker(graph,p=1,q=1,)
        self.sentences = self.walker.simulate_walks(num_walks=num_walks,walk_length=walk_length,workers=workers,verbose=1)

    def train(self,embed_size=128,window_size=5,workers=3,iter=5,**kwargs):
        """Fit a Word2Vec model on the generated walks and return it.

        ``embed_size``, ``window_size``, ``workers`` and ``iter`` are forwarded
        as the ``size``, ``window``, ``workers`` and ``iter`` keyword arguments
        of ``Word2Vec``; any extra ``kwargs`` are passed through unchanged.
        ``min_count`` defaults to 0 unless the caller supplies it.

        NOTE(review): ``size`` and ``iter`` are the gensim 3.x keyword names;
        gensim 4 renamed them to ``vector_size`` and ``epochs`` -- confirm the
        installed gensim version.
        """
        kwargs["sentences"] = self.sentences
        kwargs.setdefault("min_count", 0)
        kwargs["size"] = embed_size
        kwargs["sg"] = 1                # skip gram
        kwargs["hs"] = 1                # deepwalk use Hierarchical Softmax
        kwargs["workers"] = workers
        kwargs["window"] = window_size
        kwargs["iter"] = iter

        print("Learning embedding vectors...")
        model = Word2Vec(**kwargs)
        print("Learning embedding vertors done!")

        self.w2v_model = model
        return model

    def get_embeddings(self,):
        """Return ``{node: vector}`` for every graph node.

        Returns an empty dict (after printing a notice) if ``train`` has not
        been called yet.
        """
        # Identity check: `in None` would raise TypeError instead of testing.
        if self.w2v_model is None:
            print("model not train")
            return {}

        self._embeddings = {}
        for word in self.graph.nodes():
            self._embeddings[word] = self.w2v_model.wv[word]

        return self._embeddings

In [7]:
def read_node_label(filename,skip_head=False):
    """Read a whitespace-separated node/label file.

    Each non-blank line is ``<node> <label> [<label> ...]``.

    Parameters
    ----------
    filename : str
        Path of the label file.
    skip_head : bool
        When True, the first line is treated as a header and discarded.

    Returns
    -------
    (X, Y) : tuple of lists
        ``X[i]`` is the node id (str) and ``Y[i]`` the list of its labels
        (list of str, possibly empty) for the i-th data line.
    """
    X = []
    Y = []
    # Context manager closes the file even if parsing raises.
    with open(filename,'r') as fin:
        if skip_head:
            # Drop the header exactly once, before the data loop.
            fin.readline()
        for line in fin:
            vec = line.split()
            if not vec:
                # Blank line: nothing to record.
                continue
            X.append(vec[0])
            Y.append(vec[1:])
    return X,Y

class TopKRanker(OneVsRestClassifier):
    """One-vs-rest classifier that predicts a fixed number of labels per sample."""

    def predict(self,X,top_k_list):
        """Return a 0/1 indicator matrix with the top-k labels set per row.

        Parameters
        ----------
        X : array-like
            Feature rows passed to ``predict_proba``.
        top_k_list : sequence of int
            ``top_k_list[i]`` is how many labels to assign to row ``i``.

        Returns
        -------
        numpy.ndarray
            One row per entry of ``top_k_list``; the columns holding the
            ``k`` highest probabilities are 1, all others 0.
        """
        probs = np.asarray(super(TopKRanker,self).predict_proba(X))
        all_labels = []
        for i,k in enumerate(top_k_list):
            probs_ = probs[i,:]
            n_cols = probs_.shape[0]
            # argsort is ascending, so the k best columns are the last k.
            # Slicing from an explicit start keeps k == 0 empty ([-0:] would
            # select every column).
            top_cols = probs_.argsort()[max(n_cols - k, 0):]
            probs_[:] = 0
            # Mark by column position so the output stays aligned with the
            # predict_proba columns regardless of the values in classes_.
            probs_[top_cols] = 1
            all_labels.append(probs_)
        return np.asarray(all_labels)

class Classifier(object):
    """Multi-label node classifier evaluated on top of node embeddings.

    Parameters
    ----------
    embeddings : mapping
        ``embeddings[node]`` yields that node's feature vector.
    clf : estimator
        Base estimator wrapped in ``TopKRanker``.
    """

    def __init__(self,embeddings,clf):
        self.embeddings = embeddings
        self.clf = TopKRanker(clf)
        self.binarizer = MultiLabelBinarizer(sparse_output=True)

    def train(self,X,Y,Y_all):
        """Fit the label binarizer on ``Y_all`` and the classifier on ``(X, Y)``.

        ``X`` is a list of node ids, ``Y`` the matching list of label lists.
        """
        self.binarizer.fit(Y_all)
        X_train = [self.embeddings[x] for x in X]
        Y = self.binarizer.transform(Y)
        self.clf.fit(X_train,Y)

    def predict(self,X,top_k_list):
        """Predict ``top_k_list[i]`` labels for node ``X[i]``.

        Returns whatever the wrapped classifier's ``predict`` returns.
        """
        X_ = np.asarray([self.embeddings[x] for x in X])
        return self.clf.predict(X_,top_k_list=top_k_list)

    def evaluate(self,X,Y):
        """Score predictions for nodes ``X`` against true label lists ``Y``.

        Each node is assigned as many labels as it truly has. Prints and
        returns a dict with micro/macro/samples/weighted F1 and ``'acc'``.
        """
        top_k_list = [len(l) for l in Y]
        # Keep predictions (Y_) separate from the binarized ground truth (Y).
        Y_ = self.predict(X,top_k_list)
        Y = self.binarizer.transform(Y)
        averages = ["micro","macro","samples","weighted"]
        results = {}
        for average in averages:
            results[average] = f1_score(Y,Y_,average=average)
        results['acc'] = accuracy_score(Y,Y_)
        print('------------------------------')
        print(results)
        return results

    def split_train_evaluate(self,X,Y,train_precent,seed=0):
        """Shuffle, split, train on the first part and evaluate on the rest.

        Parameters
        ----------
        X, Y : list
            Node ids and their label lists.
        train_precent : float
            Fraction of samples used for training.
        seed : int
            Seed for the shuffle. The global NumPy RNG state is saved before
            seeding and restored afterwards.

        Returns
        -------
        dict
            The result of ``evaluate`` on the held-out samples.
        """
        state = np.random.get_state()
        try:
            training_size = int(train_precent * len(X))
            np.random.seed(seed)
            shuffle_indices = np.random.permutation(np.arange(len(X)))
            train_idx = shuffle_indices[:training_size]
            test_idx = shuffle_indices[training_size:]
            X_train = [X[i] for i in train_idx]
            Y_train = [Y[i] for i in train_idx]
            X_test = [X[i] for i in test_idx]
            Y_test = [Y[i] for i in test_idx]

            self.train(X_train,Y_train,Y)
        finally:
            # Restore the caller's RNG state even if training fails.
            np.random.set_state(state)
        return  self.evaluate(X_test,Y_test)

def evaluate_embeddings(embeddings,label_path='./data/wiki/wiki_labels.txt',tr_frac=0.8):
    """Train and score a logistic-regression node classifier on ``embeddings``.

    Parameters
    ----------
    embeddings : mapping
        ``embeddings[node]`` yields the node's vector.
    label_path : str
        Node-label file read with ``read_node_label``.
    tr_frac : float
        Fraction of labeled nodes used for training.

    Returns
    -------
    The value returned by ``Classifier.split_train_evaluate``.
    """
    X,Y = read_node_label(label_path)
    print("Training classifier using {:.2f}% nodes...".format(tr_frac * 100))
    clf = Classifier(embeddings=embeddings,clf=LogisticRegression())
    return clf.split_train_evaluate(X,Y,tr_frac)
    
def plot_embeddings(embeddings,label_path='./data/wiki/wiki_labels.txt'):
    """Project the embeddings to 2-D with t-SNE and scatter-plot them by label.

    Parameters
    ----------
    embeddings : mapping
        ``embeddings[node]`` yields the node's vector.
    label_path : str
        Node-label file read with ``read_node_label``. The default uses the
        same ``./data`` location as the rest of the notebook.

    Each point is coloured by the node's first label; nodes that have no
    label are projected but not drawn.
    """
    X,Y = read_node_label(label_path)

    emb_list = np.array([embeddings[k] for k in X])

    model = TSNE(n_components=2)
    node_pos = model.fit_transform(emb_list)

    # Group row indices of node_pos by the first label of each node.
    color_idx = {}
    for i,labels in enumerate(Y):
        if not labels:
            # No label to colour by; skip instead of raising IndexError.
            continue
        color_idx.setdefault(labels[0],[]).append(i)

    for c,idx in color_idx.items():
        plt.scatter(node_pos[idx,0],node_pos[idx,1],label=c)
    plt.legend()
    plt.show()

In [None]:
if __name__ == "__main__":
    # Load the edge list as a directed graph; node ids are kept as read
    # (nodetype=None) and an optional third column is parsed as an int weight.
    G = nx.read_edgelist(
        './data/wiki/Wiki_edgelist.txt',
        create_using=nx.DiGraph(),
        nodetype=None,
        data=[('weight',int)],
    )

    # Generate the walks, fit Word2Vec, then pull out the node vectors.
    model = DeepWalk(G, walk_length=10, num_walks=80, workers=1)
    model.train(window_size=5, iter=3)
    embeddings = model.get_embeddings()

    # Downstream checks: node classification scores and a t-SNE scatter plot.
    evaluate_embeddings(embeddings)
    plot_embeddings(embeddings)