In [1]:
import numpy as np
import re
from collections import defaultdict
import networkx as nx
from tqdm import tqdm
from node2vec import Node2Vec
import pickle
import multiprocessing
from gensim.models import Word2Vec
import pandas
from abc import ABC, abstractmethod



# Load serialzed graph

In [2]:
with open('py_data/nx_fc_graph.pickle', 'rb') as f:
    G = pickle.load(f)

# Load nodes

In [3]:
nodes = pandas.read_csv('py_data/functions.csv').set_index('id')

# Defime Function embedding

In [4]:
class WalkGenerator(ABC):
    @abstractmethod
    def generate(self, G: nx.Graph) -> list:
        pass

In [5]:
class N2VWalkGenerator(WalkGenerator):
    
    def __init__(self, 
                 p = 0.8, 
                 q = 0.5, 
                 num_walks = 30, 
                 walk_length = 10, 
                 workers = 1):
        self.args = {'p' : p, 
                     'q' : q, 
                     'num_walks' : num_walks, 
                     'walk_length' : walk_length, 
                     'workers' : workers}
    
    def generate(self, G: nx.Graph) -> list:
        node2vec = Node2Vec(G, **self.args)
        return node2vec.walks

In [6]:
class FunctionEmbedding(object):
    
    def __init__(self, 
                 G: nx.Graph,
                 nodes: pandas.DataFrame,
                 walk_generator: WalkGenerator,
                 p_pos: float = 0.1, 
                 p_neg: float = 0.1,
                 embedding_dim = 128,
                 random_state: int = 0):
        
        self.G = G.copy()
        self.p_pos = p_pos
        self.p_neg = p_neg
        self.rnd = np.random.RandomState(seed = random_state)
        self.embedding_dim = embedding_dim
        self.nodes = nodes
        self.walk_generator = walk_generator
        self._generate_pos_neg_edges()
        self._generate_walks()
        
    def _generate_neg_edges(self, n_neg: int):
        
        nodes = np.array(G.nodes())
        neg_edges = set()
        for i in tqdm(range(n_neg), desc = 'Generating negative edges'):
            while True:
                u, v = self.rnd.choice(nodes, 2, replace=False)
                if not self.G.has_edge(u, v) and (u, v) not in neg_edges:
                    neg_edges.add((u, v))
                    break
        return [edge for edge in neg_edges]
    
    def _generate_pos_neg_edges(self):
        n_edges = G.number_of_edges()
        n_nodes = G.number_of_nodes()

        n_pos = int(self.p_pos * n_edges)
        n_neg = int(self.p_neg * n_edges)

        neg_edges = self._generate_neg_edges(n_neg)

        pos_edges = np.array(self.G.edges())
        ids = self.rnd.choice(n_nodes, n_pos, replace=False)
        pos_edges = pos_edges[ids]
        
        G.remove_edges_from(pos_edges)

        self.pos_edges = pos_edges
        self.neg_edges = neg_edges
        
    def _generate_walks(self):
        self.walks = self.walk_generator.generate(self.G)
    
    def fit_word2vec(self,
                     window = 5, 
                     epochs=10, 
                     workers = multiprocessing.cpu_count() - 1):
        
        self.model = Word2Vec(size = self.embedding_dim, window=window, min_count=0, workers = workers, sg = 1)
        import logging  # Setting up the loggings to monitor gensim
        from time import time  # To time our operations
        logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)
        self.model.build_vocab(self.walks, progress_per=10000)
        t = time()
        self.model.train(self.walks, total_examples=self.model.corpus_count, epochs=epochs, report_delay=1)
        print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))
        
        in_vecs = []
        out_vecs = []
        for node_id, val in self.model.wv.vocab.items():
            id = val.index
            in_vecs.append((int(node_id), self.model.wv.vectors[id]))
            out_vecs.append((int(node_id), self.model.trainables.syn1neg[id]))
        in_vecs = pandas.DataFrame(in_vecs, columns=['id', 'in_vec']).set_index('id')
        out_vecs = pandas.DataFrame(out_vecs, columns=['id', 'out_vec']).set_index('id')
        vecs = in_vecs.merge(out_vecs, left_index=True, right_index=True)
        self.nodes = self.nodes.merge(vecs, left_index=True, right_index=True)
    
    def _get_score(self, src_node_id: int, trg_node_id: int):
        src_vec = self.nodes.loc[src_node_id].in_vec
        trg_vec = self.nodes.loc[src_node_id].out_vec
        score = src_vec @ trg_vec / (np.linalg.norm(src_vec) * np.linalg.norm(trg_vec))
        return score
    
    def get_accuracy(self, T = 0.5):
        success = 0
        for src, trg in self.pos_edges:
            if self._get_score(src, trg) >= T:
                success += 1
        for src, trg in self.neg_edges:
            if self._get_score(src, trg) < T:
                success += 1
        return success / (len(self.pos_edges) + len(self.neg_edges))

In [51]:
func_emb = FunctionEmbedding(G, nodes, N2VWalkGenerator(), 0.2, 0.2)

Generating negative edges: 100%|██████████████████████████████████████████████| 102594/102594 [07:30<00:00, 227.60it/s]
Computing transition probabilities: 100%|███████████████████████████████████| 198960/198960 [00:08<00:00, 23781.53it/s]
Generating walks (CPU: 1): 100%|███████████████████████████████████████████████████████| 30/30 [02:01<00:00,  4.46s/it]


In [52]:
func_emb.fit_word2vec()

INFO - 12:13:43: collecting all words and their counts
INFO - 12:13:43: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 12:13:43: PROGRESS: at sentence #10000, processed 14698 words, keeping 11964 word types
INFO - 12:13:43: PROGRESS: at sentence #20000, processed 29526 words, keeping 23358 word types
INFO - 12:13:43: PROGRESS: at sentence #30000, processed 44336 words, keeping 34324 word types
INFO - 12:13:43: PROGRESS: at sentence #40000, processed 58877 words, keeping 45096 word types
INFO - 12:13:43: PROGRESS: at sentence #50000, processed 73530 words, keeping 55635 word types
INFO - 12:13:43: PROGRESS: at sentence #60000, processed 88321 words, keeping 66058 word types
INFO - 12:13:43: PROGRESS: at sentence #70000, processed 103013 words, keeping 76306 word types
INFO - 12:13:43: PROGRESS: at sentence #80000, processed 117655 words, keeping 86358 word types
INFO - 12:13:43: PROGRESS: at sentence #90000, processed 132478 words, keeping 96362 word types
INFO

INFO - 12:13:44: PROGRESS: at sentence #830000, processed 1219032 words, keeping 198960 word types
INFO - 12:13:44: PROGRESS: at sentence #840000, processed 1233667 words, keeping 198960 word types
INFO - 12:13:44: PROGRESS: at sentence #850000, processed 1248268 words, keeping 198960 word types
INFO - 12:13:44: PROGRESS: at sentence #860000, processed 1262969 words, keeping 198960 word types
INFO - 12:13:44: PROGRESS: at sentence #870000, processed 1277677 words, keeping 198960 word types
INFO - 12:13:44: PROGRESS: at sentence #880000, processed 1292276 words, keeping 198960 word types
INFO - 12:13:44: PROGRESS: at sentence #890000, processed 1307009 words, keeping 198960 word types
INFO - 12:13:44: PROGRESS: at sentence #900000, processed 1321602 words, keeping 198960 word types
INFO - 12:13:44: PROGRESS: at sentence #910000, processed 1336406 words, keeping 198960 word types
INFO - 12:13:44: PROGRESS: at sentence #920000, processed 1350918 words, keeping 198960 word types
INFO - 12:

INFO - 12:13:45: PROGRESS: at sentence #1650000, processed 2423323 words, keeping 198960 word types
INFO - 12:13:45: PROGRESS: at sentence #1660000, processed 2437996 words, keeping 198960 word types
INFO - 12:13:45: PROGRESS: at sentence #1670000, processed 2452685 words, keeping 198960 word types
INFO - 12:13:45: PROGRESS: at sentence #1680000, processed 2467478 words, keeping 198960 word types
INFO - 12:13:45: PROGRESS: at sentence #1690000, processed 2482077 words, keeping 198960 word types
INFO - 12:13:45: PROGRESS: at sentence #1700000, processed 2496738 words, keeping 198960 word types
INFO - 12:13:45: PROGRESS: at sentence #1710000, processed 2511357 words, keeping 198960 word types
INFO - 12:13:45: PROGRESS: at sentence #1720000, processed 2526153 words, keeping 198960 word types
INFO - 12:13:45: PROGRESS: at sentence #1730000, processed 2540953 words, keeping 198960 word types
INFO - 12:13:45: PROGRESS: at sentence #1740000, processed 2555520 words, keeping 198960 word types


INFO - 12:13:46: PROGRESS: at sentence #2460000, processed 3613367 words, keeping 198960 word types
INFO - 12:13:46: PROGRESS: at sentence #2470000, processed 3628025 words, keeping 198960 word types
INFO - 12:13:46: PROGRESS: at sentence #2480000, processed 3642727 words, keeping 198960 word types
INFO - 12:13:46: PROGRESS: at sentence #2490000, processed 3657473 words, keeping 198960 word types
INFO - 12:13:46: PROGRESS: at sentence #2500000, processed 3672245 words, keeping 198960 word types
INFO - 12:13:46: PROGRESS: at sentence #2510000, processed 3686989 words, keeping 198960 word types
INFO - 12:13:46: PROGRESS: at sentence #2520000, processed 3701733 words, keeping 198960 word types
INFO - 12:13:46: PROGRESS: at sentence #2530000, processed 3716319 words, keeping 198960 word types
INFO - 12:13:46: PROGRESS: at sentence #2540000, processed 3731060 words, keeping 198960 word types
INFO - 12:13:46: PROGRESS: at sentence #2550000, processed 3745735 words, keeping 198960 word types


INFO - 12:13:47: PROGRESS: at sentence #3270000, processed 4803217 words, keeping 198960 word types
INFO - 12:13:47: PROGRESS: at sentence #3280000, processed 4817964 words, keeping 198960 word types
INFO - 12:13:47: PROGRESS: at sentence #3290000, processed 4832725 words, keeping 198960 word types
INFO - 12:13:47: PROGRESS: at sentence #3300000, processed 4847327 words, keeping 198960 word types
INFO - 12:13:47: PROGRESS: at sentence #3310000, processed 4861917 words, keeping 198960 word types
INFO - 12:13:47: PROGRESS: at sentence #3320000, processed 4876602 words, keeping 198960 word types
INFO - 12:13:47: PROGRESS: at sentence #3330000, processed 4891435 words, keeping 198960 word types
INFO - 12:13:47: PROGRESS: at sentence #3340000, processed 4906369 words, keeping 198960 word types
INFO - 12:13:47: PROGRESS: at sentence #3350000, processed 4920967 words, keeping 198960 word types
INFO - 12:13:47: PROGRESS: at sentence #3360000, processed 4935538 words, keeping 198960 word types


INFO - 12:13:48: PROGRESS: at sentence #4080000, processed 5992663 words, keeping 198960 word types
INFO - 12:13:48: PROGRESS: at sentence #4090000, processed 6007362 words, keeping 198960 word types
INFO - 12:13:48: PROGRESS: at sentence #4100000, processed 6022129 words, keeping 198960 word types
INFO - 12:13:48: PROGRESS: at sentence #4110000, processed 6036890 words, keeping 198960 word types
INFO - 12:13:48: PROGRESS: at sentence #4120000, processed 6051471 words, keeping 198960 word types
INFO - 12:13:48: PROGRESS: at sentence #4130000, processed 6066318 words, keeping 198960 word types
INFO - 12:13:48: PROGRESS: at sentence #4140000, processed 6081182 words, keeping 198960 word types
INFO - 12:13:48: PROGRESS: at sentence #4150000, processed 6095900 words, keeping 198960 word types
INFO - 12:13:48: PROGRESS: at sentence #4160000, processed 6110674 words, keeping 198960 word types
INFO - 12:13:48: PROGRESS: at sentence #4170000, processed 6125161 words, keeping 198960 word types


INFO - 12:13:49: PROGRESS: at sentence #4890000, processed 7182662 words, keeping 198960 word types
INFO - 12:13:49: PROGRESS: at sentence #4900000, processed 7197388 words, keeping 198960 word types
INFO - 12:13:49: PROGRESS: at sentence #4910000, processed 7212051 words, keeping 198960 word types
INFO - 12:13:49: PROGRESS: at sentence #4920000, processed 7226574 words, keeping 198960 word types
INFO - 12:13:49: PROGRESS: at sentence #4930000, processed 7241088 words, keeping 198960 word types
INFO - 12:13:49: PROGRESS: at sentence #4940000, processed 7255735 words, keeping 198960 word types
INFO - 12:13:49: PROGRESS: at sentence #4950000, processed 7270371 words, keeping 198960 word types
INFO - 12:13:49: PROGRESS: at sentence #4960000, processed 7285228 words, keeping 198960 word types
INFO - 12:13:49: PROGRESS: at sentence #4970000, processed 7299814 words, keeping 198960 word types
INFO - 12:13:49: PROGRESS: at sentence #4980000, processed 7314548 words, keeping 198960 word types


INFO - 12:13:50: PROGRESS: at sentence #5700000, processed 8372539 words, keeping 198960 word types
INFO - 12:13:50: PROGRESS: at sentence #5710000, processed 8387018 words, keeping 198960 word types
INFO - 12:13:50: PROGRESS: at sentence #5720000, processed 8401668 words, keeping 198960 word types
INFO - 12:13:50: PROGRESS: at sentence #5730000, processed 8416204 words, keeping 198960 word types
INFO - 12:13:50: PROGRESS: at sentence #5740000, processed 8430863 words, keeping 198960 word types
INFO - 12:13:50: PROGRESS: at sentence #5750000, processed 8445762 words, keeping 198960 word types
INFO - 12:13:50: PROGRESS: at sentence #5760000, processed 8460466 words, keeping 198960 word types
INFO - 12:13:50: PROGRESS: at sentence #5770000, processed 8475096 words, keeping 198960 word types
INFO - 12:13:50: PROGRESS: at sentence #5780000, processed 8489768 words, keeping 198960 word types
INFO - 12:13:50: PROGRESS: at sentence #5790000, processed 8504473 words, keeping 198960 word types


INFO - 12:15:00: EPOCH 3 - PROGRESS: at 39.23% examples, 546434 words/s, in_qsize 14, out_qsize 1
INFO - 12:15:01: EPOCH 3 - PROGRESS: at 45.85% examples, 548197 words/s, in_qsize 13, out_qsize 0
INFO - 12:15:02: EPOCH 3 - PROGRESS: at 52.35% examples, 548083 words/s, in_qsize 12, out_qsize 1
INFO - 12:15:03: EPOCH 3 - PROGRESS: at 58.74% examples, 545860 words/s, in_qsize 13, out_qsize 1
INFO - 12:15:04: EPOCH 3 - PROGRESS: at 64.33% examples, 538639 words/s, in_qsize 11, out_qsize 6
INFO - 12:15:05: EPOCH 3 - PROGRESS: at 71.17% examples, 541741 words/s, in_qsize 13, out_qsize 1
INFO - 12:15:06: EPOCH 3 - PROGRESS: at 77.90% examples, 543468 words/s, in_qsize 14, out_qsize 0
INFO - 12:15:07: EPOCH 3 - PROGRESS: at 84.52% examples, 543796 words/s, in_qsize 13, out_qsize 0
INFO - 12:15:08: EPOCH 3 - PROGRESS: at 91.01% examples, 543969 words/s, in_qsize 14, out_qsize 0
INFO - 12:15:09: EPOCH 3 - PROGRESS: at 97.52% examples, 544005 words/s, in_qsize 13, out_qsize 0
INFO - 12:15:09: wor

INFO - 12:16:00: EPOCH 7 - PROGRESS: at 26.46% examples, 541020 words/s, in_qsize 13, out_qsize 1
INFO - 12:16:01: EPOCH 7 - PROGRESS: at 33.30% examples, 545809 words/s, in_qsize 13, out_qsize 0
INFO - 12:16:02: EPOCH 7 - PROGRESS: at 39.92% examples, 547590 words/s, in_qsize 13, out_qsize 0
INFO - 12:16:03: EPOCH 7 - PROGRESS: at 46.31% examples, 545173 words/s, in_qsize 14, out_qsize 1
INFO - 12:16:04: EPOCH 7 - PROGRESS: at 52.92% examples, 544870 words/s, in_qsize 12, out_qsize 2
INFO - 12:16:05: EPOCH 7 - PROGRESS: at 59.65% examples, 547116 words/s, in_qsize 14, out_qsize 0
INFO - 12:16:06: EPOCH 7 - PROGRESS: at 65.93% examples, 544694 words/s, in_qsize 13, out_qsize 1
INFO - 12:16:07: EPOCH 7 - PROGRESS: at 72.54% examples, 545254 words/s, in_qsize 9, out_qsize 4
INFO - 12:16:08: EPOCH 7 - PROGRESS: at 79.16% examples, 546334 words/s, in_qsize 13, out_qsize 0
INFO - 12:16:09: EPOCH 7 - PROGRESS: at 85.32% examples, 544348 words/s, in_qsize 13, out_qsize 0
INFO - 12:16:10: EPOC

INFO - 12:16:57: training on a 87669380 raw words (84881639 effective words) took 153.7s, 552374 effective words/s


Time to train the model: 2.56 mins


In [57]:
func_emb.get_accuracy(0.01)

0.5334863637249742

In [64]:
with open('n2v-model.pickle', 'wb') as f:
    pickle.dump(func_emb, f)

In [19]:
with open('n2v-model.pickle', 'rb') as f:
    func_emb = pickle.load(f)

# Link prediction 2

In [58]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics, pipeline
from sklearn.preprocessing import StandardScaler

In [59]:
def prepare_features(model, edges, fnc = lambda a, b: 0.5 * (a + b)):
    X = []
    for src, trg in edges:
        src_vec = model.nodes.loc[src].in_vec
        trg_vec = model.nodes.loc[trg].in_vec
        vec = fnc(src_vec, trg_vec)
        X.append(vec)
    return X

In [60]:
pos_edges = func_emb.pos_edges
neg_edges = func_emb.neg_edges

In [61]:
X = prepare_features(func_emb, np.concatenate((pos_edges, neg_edges)))
y = [1] * len(pos_edges) + [0] * len(neg_edges)

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [63]:
scaler = StandardScaler()
lin_clf = LogisticRegression(C=1)
clf = pipeline.make_pipeline(scaler, lin_clf)

In [64]:
clf.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregression',
                 LogisticRegression(C=1, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [65]:
print('Accuracy:', metrics.accuracy_score(clf.predict(X_test), y_test))
print('Roc-auc score:', metrics.scorer.roc_auc_scorer(clf, X_test, y_test))

Accuracy: 0.8111993761879234
Roc-auc score: 0.8845040366085203
