In [21]:
import argparse,os,logging,psutil,time
from joblib import Parallel,delayed

#import utils file
%run utils.ipynb
%run make_graph2vec_corpus.ipynb
%run train_utils.ipynb

# Grab the root logger and let INFO-and-above records through, so the
# logging.info(...) progress messages in later cells are shown.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [2]:
# Command-line interface definition for graph2vec.
# `args` is the ArgumentParser itself (not a parsed namespace); every option
# has a default, so parsing an empty argument list yields a full configuration.
args = argparse.ArgumentParser("graph2vec")

args.add_argument("-c", "--corpus", default="../data/kdd_datasets/ptc",
                  help="Path to directory containing graph files to be used for graph classification or clustering")

args.add_argument('-l', '--class_labels_file_name', default='../data/kdd_datasets/ptc.Labels',
                  help='File name containing the name of the sample and the class labels')

args.add_argument('-o', "--output_dir", default="../embeddings",
                  help="Path to directory for storing output embeddings")

args.add_argument('-b', "--batch_size", default=128, type=int,
                  help="Number of samples per training batch")

args.add_argument('-e', "--epochs", default=1000, type=int,
                  help="Number of iterations the whole dataset of graphs is traversed")

args.add_argument('-d', "--embedding_size", default=1024, type=int,
                  help="Intended graph embedding size to be learnt")

args.add_argument('-neg', "--num_negsample", default=10, type=int,
                  help="Number of negative samples to be used for training")

args.add_argument('-lr', "--learning_rate", default=0.3, type=float,
                  help="Learning rate to optimize the loss function")

args.add_argument("--wlk_h", default=3, type=int, help="Height of WL kernel (i.e., degree of rooted subgraph "
                                                       "features to be considered for representation learning)")

# The option is spelled "label_filed_name" (sic). The spelling is kept because
# main() reads it back as `args.label_filed_name`.
args.add_argument('-lf', '--label_filed_name', default='Label', help='Label field to be used '
                                                                     'for coloring nodes in graphs using WL kernel')

# NOTE(review): parsing is left disabled in the notebook -- presumably because
# the kernel's own sys.argv would be rejected by the parser; confirm. Use
# args.parse_args([]) to obtain the defaults interactively.
#args.parse_args()

_StoreAction(option_strings=['-lf', '--label_filed_name'], dest='label_filed_name', nargs=None, const=None, default='Label', type=None, choices=None, help='Label field to be used for coloring nodes in graphs using WL kenrel', metavar=None)

In [7]:
# Manual run configuration: these names stand in for the parsed command-line
# options while working interactively.
# NOTE(review): project_root is specific to one machine; point it at your own
# checkout before running. All three paths below are derived from it.
project_root = "/home/ipsita/BTP/graph2vec"
corpus_dir = project_root + "/data/kdd_datasets/ptc"
output_dir = project_root + "/embeddings"
batch_size = 128
epochs = 100            # note: the --epochs parser default is 1000
embedding_size = 1024
num_negsample = 10
learning_rate = 0.3
wlk_h = 3
label_filed_name = 'Label'
class_labels_fname = project_root + "/data/kdd_datasets/ptc.Labels"

# Tag built from the WL height; handed to train_skipgram / perform_classification.
wl_extn = 'g2v'+str(wlk_h)    # wlk_h is height to be considered

# Both locations must be directories. os.path.exists() would also accept a
# regular file, which only fails later and less clearly.
assert os.path.isdir(corpus_dir), "Dir {} does not exist".format(corpus_dir)
assert os.path.isdir(output_dir), "Dir {} does not exist".format(output_dir)

In [8]:
# Gather the .gexf graph file names from the corpus directory and report the count.
graph_files = get_files(
    dirname=corpus_dir,
    extn='.gexf',
    max_files=0,
)
logging.info(f'Loaded {len(graph_files)} graph file names form {corpus_dir}')

INFO:root:Loaded 344 graph file names form /home/ipsita/BTP/graph2vec/data/kdd_datasets/ptc


In [9]:
# Time the WL relabelling step over all graph files.
# NOTE(review): `time` has to be the function (as in `from time import time`),
# not the `time` module imported in the first cell -- presumably one of the
# %run notebooks rebinds it; confirm.
t0 = time()
wlk_relabel_and_dump_memory_version(
    graph_files,
    max_h=wlk_h,
    node_label_attr_name=label_filed_name,
)
logging.info(f'dumped sg2vec sentences in {time() - t0} sec.')

loaded all graphs in 0.33 sec
initial relabeling done in 0.06 sec
WL iteration 1 done in 0.06 sec.
num of WL rooted subgraphs in iter 1 is 160
WL iteration 2 done in 0.07 sec.
num of WL rooted subgraphs in iter 2 is 1038


INFO:root:dumped sg2vec sentences in 0.6182849407196045 sec.


WL iteration 3 done in 0.09 sec.
num of WL rooted subgraphs in iter 3 is 2624
dumped sg2vec sentences in 0.0


In [22]:
# Train the skipgram model and keep the value it returns (used as the
# embedding file name by the classification step); log the wall-clock time.
t0 = time()
embedding_fname = train_skipgram(
    corpus_dir, wl_extn,
    learning_rate, embedding_size, num_negsample,
    epochs, batch_size,
    output_dir,
)
logging.info(f'Trained the skipgram model in {round(time() - t0, 2)} sec.')

INFO:root:Initializing SKIPGRAM...
INFO:root:number of graphs: 344
INFO:root:subgraph vocabulary size: 3804
INFO:root:total number of subgraphs to be trained: 34837
INFO:root:Epoch: 0 : Average loss for step: 100 : 44.053448
INFO:root:Epoch: 0 : Average loss for step: 200 : 37.702990
INFO:root:#########################   Epoch: 0 :  35.445183, 1.09 sec.  #####################
INFO:root:Epoch: 1 : Average loss for step: 100 : 25.386242
INFO:root:Epoch: 1 : Average loss for step: 200 : 24.069743
INFO:root:#########################   Epoch: 1 :  22.902555, 0.97 sec.  #####################
INFO:root:Epoch: 2 : Average loss for step: 100 : 15.687668
INFO:root:Epoch: 2 : Average loss for step: 200 : 16.182734
INFO:root:#########################   Epoch: 2 :  15.780944, 0.97 sec.  #####################
INFO:root:Epoch: 3 : Average loss for step: 100 : 13.728236
INFO:root:Epoch: 3 : Average loss for step: 200 : 12.342629
INFO:root:#########################   Epoch: 3 :  12.033786, 0.96 sec.  #

INFO:root:Epoch: 38 : Average loss for step: 100 : 1.192453
INFO:root:Epoch: 38 : Average loss for step: 200 : 1.208227
INFO:root:#########################   Epoch: 38 :  1.186123, 0.95 sec.  #####################
INFO:root:Epoch: 39 : Average loss for step: 100 : 1.231534
INFO:root:Epoch: 39 : Average loss for step: 200 : 1.195807
INFO:root:#########################   Epoch: 39 :  1.178413, 0.94 sec.  #####################
INFO:root:Epoch: 40 : Average loss for step: 100 : 1.222653
INFO:root:Epoch: 40 : Average loss for step: 200 : 1.195820
INFO:root:#########################   Epoch: 40 :  1.207544, 0.95 sec.  #####################
INFO:root:Epoch: 41 : Average loss for step: 100 : 1.234114
INFO:root:Epoch: 41 : Average loss for step: 200 : 1.215157
INFO:root:#########################   Epoch: 41 :  1.199390, 0.94 sec.  #####################
INFO:root:Epoch: 42 : Average loss for step: 100 : 1.153730
INFO:root:Epoch: 42 : Average loss for step: 200 : 1.154644
INFO:root:##############

INFO:root:#########################   Epoch: 76 :  0.919826, 0.94 sec.  #####################
INFO:root:Epoch: 77 : Average loss for step: 100 : 0.904998
INFO:root:Epoch: 77 : Average loss for step: 200 : 0.923373
INFO:root:#########################   Epoch: 77 :  0.934361, 0.93 sec.  #####################
INFO:root:Epoch: 78 : Average loss for step: 100 : 0.918342
INFO:root:Epoch: 78 : Average loss for step: 200 : 0.922286
INFO:root:#########################   Epoch: 78 :  0.915841, 0.93 sec.  #####################
INFO:root:Epoch: 79 : Average loss for step: 100 : 0.942286
INFO:root:Epoch: 79 : Average loss for step: 200 : 0.933504
INFO:root:#########################   Epoch: 79 :  0.910514, 0.93 sec.  #####################
INFO:root:Epoch: 80 : Average loss for step: 100 : 0.955701
INFO:root:Epoch: 80 : Average loss for step: 200 : 0.930187
INFO:root:#########################   Epoch: 80 :  0.925473, 0.94 sec.  #####################
INFO:root:Epoch: 81 : Average loss for step: 100 :

In [23]:
%run classify.ipynb

In [24]:
# Run the classification step on the trained embeddings and the class labels file.
perform_classification(
    corpus_dir,
    wl_extn,
    embedding_fname,
    class_labels_fname,
)

INFO:root:Y (label) matrix shape: (344,)
INFO:root:Train and Test matrix shapes: (309, 1024), (35, 1024), (309,), (35,) 


Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    8.8s finished
INFO:root:best classifier model's hyperparamters
INFO:root:Linear SVM accuracy: 0.4857142857142857
INFO:root:             precision    recall  f1-score   support

         -1       0.40      0.57      0.47        14
          1       0.60      0.43      0.50        21

avg / total       0.52      0.49      0.49        35



In [1]:
import argparse,os,logging,psutil,time
from joblib import Parallel,delayed

from utils import get_files
from train_utils import train_skipgram
from classify import perform_classification
from make_graph2vec_corpus import *
from time import time

# Grab the root logger and let INFO-and-above records through, so the
# logging.info(...) progress messages emitted by main() are shown.
logger = logging.getLogger()
logger.setLevel(logging.INFO)


def main(args):
    '''
    Run the graph2vec pipeline on one corpus directory.

    Steps, in order:
    1. collect the graph file names with get_files (extension '.gexf')
    2. run wlk_relabel_and_dump_memory_version on them up to height ``wlk_h``
    3. call train_skipgram and keep its return value as the embedding file name
    4. call perform_classification with that embedding file and the labels file

    :param args: parsed options; the attributes read are corpus, output_dir,
        batch_size, epochs, embedding_size, num_negsample, learning_rate,
        wlk_h, label_filed_name and class_labels_file_name
    :return: None
    :raises AssertionError: if ``args.corpus`` or ``args.output_dir`` is not
        an existing directory
    '''
    corpus_dir = args.corpus
    output_dir = args.output_dir
    batch_size = args.batch_size
    epochs = args.epochs
    embedding_size = args.embedding_size
    num_negsample = args.num_negsample
    learning_rate = args.learning_rate
    wlk_h = args.wlk_h
    label_filed_name = args.label_filed_name
    class_labels_fname = args.class_labels_file_name

    # Tag built from the WL height; passed on to training and classification.
    wl_extn = 'g2v'+str(wlk_h)    # wlk_h is height to be considered

    # Both paths must be directories. os.path.exists() would also accept a
    # regular file and defer the failure to a later, less obvious place.
    assert os.path.isdir(corpus_dir), "Dir {} does not exist".format(corpus_dir)
    assert os.path.isdir(output_dir), "Dir {} does not exist".format(output_dir)

    graph_files = get_files(dirname=corpus_dir, extn='.gexf', max_files=0)
    logging.info('Loaded {} graph file names form {}'.format(len(graph_files),corpus_dir))

    # Stage 1: WL relabelling, timed.
    t0 = time()
    wlk_relabel_and_dump_memory_version(graph_files, max_h=wlk_h, node_label_attr_name=label_filed_name)
    logging.info('dumped sg2vec sentences in {} sec.'.format(time() - t0))

    # Stage 2: skipgram training, timed.
    t0 = time()
    embedding_fname = train_skipgram(corpus_dir, wl_extn, learning_rate, embedding_size, num_negsample,
                                     epochs, batch_size, output_dir)
    logging.info('Trained the skipgram model in {} sec.'.format(round(time()-t0, 2)))

    # Stage 3: classification on the embeddings produced above.
    perform_classification (corpus_dir, wl_extn, embedding_fname, class_labels_fname)

ModuleNotFoundError: No module named 'utils'