In [1]:
import numpy as np
from dotenv import load_dotenv

# Read the .env file that sits one directory above the notebook into the process environment.
# NOTE(review): presumably this supplies the settings pg.connect() relies on later — confirm.
load_dotenv(dotenv_path='../.env')

True

In [2]:
import itertools

# Every (walk algorithm, strategy code) pair in product order: the four strategy codes for
# 'deepwalk' first, then the same four for 'node2vec'.
embedding_methods = list(
    itertools.product(
        ['deepwalk', 'node2vec'],
        ['FC', 'VC', 'LC', 'NC'],
    )
)

embedding_methods

[('deepwalk', 'FC'),
 ('deepwalk', 'VC'),
 ('deepwalk', 'LC'),
 ('deepwalk', 'NC'),
 ('node2vec', 'FC'),
 ('node2vec', 'VC'),
 ('node2vec', 'LC'),
 ('node2vec', 'NC')]

In [None]:
from typing import Optional, Iterable, Generator
from pathlib import Path
from psycopg.rows import namedtuple_row
from psycopg import Connection
import psycopg as pg
import tensorflow as tf
from gensim.models import Word2Vec
import csv
from logging import debug
import logging

# Root logger at INFO: logging.info(...) is shown, the debug(...) calls below stay silent.
logging.getLogger().setLevel('INFO')
# Switch tf.data into its debug mode.
tf.data.experimental.enable_debug_mode()
# Add node training dataset

# Directory layout, all relative to the notebook's working directory.
save_dir = Path.cwd().joinpath('save')
model_dir = Path.cwd().joinpath('..', 'data', 'models')
score_dir = Path.cwd().joinpath('..', 'data', 'scores')

# CSV consumed by get_scores, and the program_id -> score cache it fills on first use.
score_fname = score_dir.joinpath('codeworkout_program_scores.csv')
score_mapper = {}

strats = ['VC', 'LC', 'FC', 'NC']


def get_scores(programs: Iterable[str]) -> list[float]:
    """Return the score of every program id in ``programs``, in iteration order.

    The module-level ``score_mapper`` cache is filled from the CSV at ``score_fname``
    the first time this is called with an empty cache; the CSV must have
    ``program_id`` and ``score`` columns.

    Args:
        programs: program ids exactly as they appear in the CSV's ``program_id`` column.

    Returns:
        One float per id, in the order the ids were supplied.

    Raises:
        KeyError: if an id has no row in the score file.
        ValueError: if a ``score`` cell cannot be parsed as a float.
    """
    if not score_mapper:
        # newline='' is the csv module's documented way to open files, so that newlines
        # embedded in quoted fields are parsed correctly.
        with open(score_fname, 'r', newline='') as score_file:
            for row in csv.DictReader(score_file):
                score_mapper[row['program_id']] = float(row['score'])
    return [score_mapper[program_id] for program_id in programs]


def content_string(node_type: str, label: Optional[str], contents: Optional[str]) -> str:
    """Build the text key that identifies a node.

    'Source' and 'Sink' nodes become ``'<node_type> <contents>'`` (the label is ignored
    and ``contents`` is formatted as-is, even when it is None). Every other node becomes
    ``'<label>: <contents>'``, where a falsy label drops the ``'<label>: '`` prefix and
    falsy contents contribute an empty string.
    """
    if node_type in ('Source', 'Sink'):
        return f'{node_type} {contents}'
    prefix = f'{label}: ' if label else ''
    body = contents or ''
    return f'{prefix}{body}'


def get_graph_batch_cursor(conn: Connection, strat: str, limit: int, last_row: Optional[str]):
    """Run the query for one page of programs and return the cursor over its node rows.

    The inner sub-select picks up to ``limit`` program ids that start with ``'<strat>-'``
    and sort strictly after ``last_row`` (or from the beginning when ``last_row`` is
    None). The outer query returns one row per node of those programs with the columns
    ``program``, ``graph``, ``content``, ``label`` and ``ntype``.

    Rows are ordered by program id and then graph id. Without an ORDER BY the database
    may return them in any order, and a caller that uses the last row's ``program`` as
    the next ``last_row`` would then skip or repeat programs.

    Args:
        conn: open psycopg connection.
        strat: strategy prefix of the program ids to select.
        limit: maximum number of programs (not rows) in the page.
        last_row: last program id of the previous page, or None for the first page.

    Returns:
        The cursor returned by ``conn.execute``.
    """
    return conn.execute('''
select
    p.program_id as program,
    g.graph_id as graph,
    n.contents as content,
    n.label as label,
    n.node_type as ntype
from
    programs as p
        join
    graphs as g on p.id = g.program_id
        join
    nodes as n on g.id = n.graph_id
where
    p.program_id in (
        select
            program_id
        from
            programs
        where
                    program_id like %(strat)s
          and
                    ( cast(%(last_row)s as varchar) IS NULL or program_id > %(last_row)s)
        order by
            program_id
        limit
            %(limit)s
    )
order by
    p.program_id,
    g.graph_id
''', {'strat': f'{strat}-%', 'limit': limit, 'last_row': last_row}, prepare=True)


# Cache for get_db_dataset_shapes: strategy prefix -> (dim0, dim1, dim2, dim3).
db_dataset_shapes = {}


def get_db_dataset_shapes(conn: Connection) -> dict[str, tuple[int, int, int, int]]:
    """Return the per-strategy dataset shape, querying the database only once.

    The strategy is the first two characters of ``programs.program_id``. For each
    strategy the tuple holds:

    * dim0 - number of programs,
    * dim1 - largest number of graphs belonging to a single program,
    * dim2 - largest number of node rows belonging to a single program,
    * dim3 - the constant 50 (node embedding size).

    NOTE(review): dim2 is counted per program (summed over all of its graphs), not per
    graph, so it looks like an upper bound on the nodes of any one graph - confirm that
    this is intended.

    The result is stored in the module-level ``db_dataset_shapes`` dict; later calls
    return that dict without using ``conn``.

    Args:
        conn: open psycopg connection.

    Returns:
        The cache dict mapping strategy prefix to ``(dim0, dim1, dim2, dim3)``.
    """
    if len(db_dataset_shapes) != 0:
        return db_dataset_shapes
    # dim2 must be joined on its own strat column. Joining it on dim0.strat = dim1.strat
    # (always true after the first join) paired every strategy with every dim2 row, so
    # the dim2 that ended up in the dict depended on row order.
    cursor = conn.execute("""
-- dimension 0 - programs
with dim0 as (
    select
        count(*) as dim, substring(p.program_id for 2) as strat
    from
        programs as p
    group by
        substring(p.program_id for 2)
),

-- dimension 1 - max number of graphs
dim1 as (
    select
        max(graph_counts.num_graphs) as dim, graph_counts.strat
    from
        (select count(*) as num_graphs, substring(p.program_id for 2) as strat
        from
            programs as p
                join graphs as g on p.id = g.program_id
        group by p.program_id) as graph_counts
    group by
        graph_counts.strat
),

-- dimension 2 - max number of nodes (dimension 3 is the node embeddings which are size 50)
dim2 as (
    select
        max(node_counts.num_nodes) as dim, node_counts.strat
    from
        (
            select
                count(*) as num_nodes, substring(p.program_id for 2) as strat
            from
                programs as p
                    join graphs g on p.id = g.program_id
                    join nodes as n on g.id = n.graph_id
            group by
                p.program_id
        ) as node_counts
    group by
        node_counts.strat
)

select
    dim0.strat as strat, dim0.dim as dim0, dim1.dim as dim1, dim2.dim as dim2, 50 as dim3
from
    dim0
        join dim1 on dim0.strat = dim1.strat
        join dim2 on dim0.strat = dim2.strat
    """)
    cursor.row_factory = namedtuple_row

    shapes = cursor.fetchall()
    for shape in shapes:
        db_dataset_shapes[shape.strat] = (shape.dim0, shape.dim1, shape.dim2, shape.dim3)
    return db_dataset_shapes


def get_graph_batches(conn: Connection, w2v: Word2Vec, shapes: dict[str, tuple[int, int, int, int]], strat: str, batch_size: int) -> Generator[dict, None, None]:
    """Yield the node embeddings of one strategy's programs, one page of programs at a time.

    Each yielded value is a dict ``{program_id: {graph_id: [node_embedding, ...]}}``
    built from the rows of ``get_graph_batch_cursor``. A node's embedding is looked up
    in ``w2v.wv`` under the key produced by ``content_string``; keys the model does not
    contain get an all-zero vector instead.

    Paging is keyset based: the ``program`` of the last row read becomes the
    ``last_row`` argument of the next query.
    NOTE(review): that is only correct when rows arrive ordered by program id - confirm
    that get_graph_batch_cursor guarantees this ordering.

    Args:
        conn: open psycopg connection.
        w2v: trained model whose ``wv`` maps node strings to vectors.
        shapes: per-strategy shapes; ``shapes[strat][0]`` is the number of programs.
        strat: strategy prefix to read.
        batch_size: maximum number of programs per yielded dict.
    """
    # Zero stand-in sized from the model rather than a hard-coded 50, and float32 because
    # the TFRecord parser in this notebook reads the values back as tf.float32.
    unknown_embedding = np.zeros(w2v.wv.vector_size, dtype=np.float32)
    processed = 0
    last_row = None
    while processed < shapes[strat][0]:
        processed += batch_size
        programs = {}
        cursor = get_graph_batch_cursor(conn, strat, batch_size, last_row)
        cursor.row_factory = namedtuple_row
        for row in cursor:
            nodes = programs.setdefault(row.program, {}).setdefault(row.graph, [])
            content_str = content_string(row.ntype, row.label, row.content)
            if content_str in w2v.wv:
                nodes.append(w2v.wv[content_str])
            else:
                nodes.append(unknown_embedding)
            last_row = row.program
        if not programs:
            # Nothing sorts after last_row any more; further queries would be empty too.
            break
        yield programs


def _bytes_feature(value):
    """Wrap a string / bytes value in a tf.train.Feature carrying a one-element bytes_list."""
    eager_tensor_type = type(tf.constant(0))
    # BytesList won't unpack a string from an EagerTensor, so pull the raw value out first.
    raw_value = value.numpy() if isinstance(value, eager_tensor_type) else value
    wrapped = tf.train.BytesList(value=[raw_value])
    return tf.train.Feature(bytes_list=wrapped)


def _float_feature(value):
    """Wrap a single float in a tf.train.Feature carrying a one-element float_list."""
    wrapped = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=wrapped)


def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in a tf.train.Feature carrying a one-element int64_list."""
    wrapped = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=wrapped)


# Schema of one stored example: 'program' is a scalar string holding the serialized
# tensor, 'score' is a scalar float that defaults to 0.0 when absent.
feature_description = {
    'program': tf.io.FixedLenFeature((), tf.string),
    'score': tf.io.FixedLenFeature((), tf.float32, default_value=0.0)
}


def _parse_graph_tfrecord(example_proto):
    """Decode one serialized example into ``(dense_program_tensor, score)``.

    The 'program' feature is parsed into a string tensor whose elements 0, 1 and 2 are
    themselves serialized tensors: the int64 indices, float32 values and int64 dense
    shape of a sparse tensor. The sparse tensor is rebuilt from them and densified.

    Args:
        example_proto: serialized example matching ``feature_description``.

    Returns:
        A tuple of the dense program tensor and the example's 'score' value.
    """
    example = tf.io.parse_single_example(example_proto, feature_description)
    # logging %-formats the message with the extra arguments, so each labelled call
    # needs a %s placeholder; without one the record fails to format under DEBUG.
    debug("Example: %s", example)
    wrapper = tf.io.parse_tensor(example['program'], out_type=tf.string)
    debug("Wrapper: %s", wrapper)
    indices = tf.io.parse_tensor(wrapper[0], out_type=tf.int64)
    debug("Indices: %s", indices)
    values = tf.io.parse_tensor(wrapper[1], out_type=tf.float32)
    debug(values)
    shape = tf.io.parse_tensor(wrapper[2], out_type=tf.int64)
    debug(shape)
    sparse = tf.sparse.SparseTensor(indices, values, shape)
    debug(sparse)
    dense = tf.sparse.to_dense(sparse)
    debug(dense)
    return dense, example['score']


def get_graph_dataset(conn: Connection, ds_loc: Path, shapes: dict[str, tuple[int, int, int, int]], strat: str, embed_method: str, batch_size: int):
    """Return the batched dataset stored at ``ds_loc``, building the TFRecord file first if missing.

    Building loads the Word2Vec model ``<strat>-<embed_method>.model`` from ``model_dir``
    and streams programs from the database via ``get_graph_batches``. Each program is
    turned into a sparse tensor with dense shape ``shapes[strat][1:]``, serialized, and
    written together with its score (looked up by the program id without the
    ``'<strat>-'`` prefix) as one example.

    The file is written under a temporary ``.partial`` name and renamed once complete,
    so an interrupted build is never mistaken for a finished cache on the next run.

    Args:
        conn: open psycopg connection, used only when the file has to be built.
        ds_loc: path of the TFRecord file.
        shapes: per-strategy shapes as returned by ``get_db_dataset_shapes``.
        strat: strategy prefix of the programs to include.
        embed_method: embedding method name, part of the model file name.
        batch_size: programs per database page and per dataset batch.

    Returns:
        A dataset of ``(dense_program_tensors, scores)`` batches.
    """
    if not ds_loc.exists():
        w2v = Word2Vec.load(str(model_dir / f'{strat}-{embed_method}.model'))
        strat_strip_length = len(f'{strat}-')
        ds_loc.parent.mkdir(parents=True, exist_ok=True)
        partial_loc = ds_loc.with_name(f'{ds_loc.name}.partial')
        with tf.io.TFRecordWriter(str(partial_loc)) as data_writer:
            for programs in get_graph_batches(conn, w2v, shapes, strat, batch_size):
                scores = get_scores(key[strat_strip_length:] for key in programs)
                for graphs, score in zip(programs.values(), scores):
                    # Pin float32: the parser reads the values back with out_type=tf.float32.
                    ragged_tensor = tf.ragged.constant(list(graphs.values()), dtype=tf.float32)
                    debug("Ragged tensor: %s", ragged_tensor)
                    sparse_tensor = ragged_tensor.to_sparse()
                    debug("Sparse tensor: %s", sparse_tensor)
                    # Same indices and values, but with the strategy-wide dense shape so
                    # every program densifies to the same dimensions.
                    sparse_tensor = tf.sparse.SparseTensor(sparse_tensor.indices, sparse_tensor.values, shapes[strat][1:])
                    debug("Reshaped Sparse Tensor %s", sparse_tensor)
                    sparse_tensor_tensor = tf.io.serialize_sparse(sparse_tensor)
                    debug("Sparse tensor tensor: %s", sparse_tensor_tensor)
                    serialized_program_tensor = tf.io.serialize_tensor(sparse_tensor_tensor)
                    debug("Serialized program tensor: %s", serialized_program_tensor)
                    example_proto = tf.train.Example(
                        features=tf.train.Features(feature={
                            'program': _bytes_feature(serialized_program_tensor),
                            'score': _float_feature(score)
                        })
                    )

                    debug(example_proto)
                    data_writer.write(example_proto.SerializeToString())
        # Only a fully written file gets the final name.
        partial_loc.replace(ds_loc)

    # Prefetch goes last so that it overlaps parsing and batching with the consumer.
    return tf.data.TFRecordDataset([str(ds_loc)]).map(_parse_graph_tfrecord).batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)


    #
    # ds = tf.data.Dataset.from_tensor_slices((programs, scores), name=f'{strat}_{embed_method}')
    # ds.save(str(dataset_loc))
    # with open(f'{dataset_loc}.shape', 'w') as dataset_shape_file:
    #     dataset_shape_file.write(f'{programs.shape}')
    # return ds

# TODO: Calculate the size of the tensor, batch the creation of the tensors into multiple steps.
# Consider using a generator for the dataset instead of loading it all into memory before saving it.
# If we use a generator, it should be able to automatically delete anything that takes up too much
# memory while it saves to disk.
def get_graph_datasets(batch_size: int) -> Generator[tuple[tf.data.Dataset, tuple[int, int, int, int]], None, None]:
    """Yield ``(dataset, shape)`` for every ``(embed_method, strat)`` pair in ``embedding_methods``.

    One database connection is opened for the whole iteration and closed when the
    generator finishes. Each dataset is backed by
    ``save_dir / '<strat>_<embed_method>.tfrecord'``; ``shape`` is the strategy's entry
    from ``get_db_dataset_shapes``.

    Args:
        batch_size: forwarded to ``get_graph_dataset``.
    """
    with pg.connect() as conn:
        # The shapes do not depend on the embedding method, so look them up once.
        db_shapes = get_db_dataset_shapes(conn)
        for embed_method, strat in embedding_methods:
            ds_loc = save_dir / f'{strat}_{embed_method}.tfrecord'
            dataset = get_graph_dataset(conn, ds_loc, db_shapes, strat, embed_method, batch_size)
            yield dataset, db_shapes[strat]


# Passed to get_graph_datasets as its batch_size and reused when the model is built.
BATCH_SIZE = 100
# Drain the generator now so every (dataset, shape) pair exists before the model cells run.
datasets_info = list(get_graph_datasets(BATCH_SIZE))

2022-09-11 19:56:35.758159: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-11 19:56:36.009199: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-11 19:56:36.723200: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-11 19:56:36.723275: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [None]:
# test = ''
# for element in datasets_info[0].take(1):
#     test = element
# test

In [None]:
from tensorflow import keras

In [None]:
# Graph Embedding Model
# 1 ex. input:
# [
#   [1 .. 1],
#   ...,
#   [0 .. 2]
# ]
class AttentionEmbedding(keras.layers.Layer):
    """Collapse a stack of row vectors into one vector with self-attention followed by a sum.

    ``call`` adds a leading axis of size 1, runs multi-head attention with the input as
    both query and value, removes the leading axis again, and adds up the resulting
    rows (the slices along axis 0) with an ``Add`` layer.
    """

    def __init__(self, ishape):
        """Create the attention and summing sub-layers.

        Args:
            ishape: shape of the expected input; stored and logged only.
        """
        super().__init__()
        self.ishape = ishape
        # logging %-formats the message with its arguments, hence the %s placeholder.
        debug('AE ishape: %s', ishape)
        self.mha = keras.layers.MultiHeadAttention(num_heads=2, key_dim=2, name='embedding_attention_layer')
        self.summer = keras.layers.Add()

    def call(self, inputs, *args, **kwargs):
        """Embed ``inputs`` into a single vector.

        Args:
            inputs: tensor whose axis 0 enumerates the rows to combine.
            **kwargs: may carry ``training``; it is forwarded to the attention layer and
                treated as None when absent.

        Returns:
            The element-wise sum of the attended rows.
        """
        # .get() instead of [] so a call without an explicit training flag does not raise.
        training = kwargs.get('training')
        inputs = tf.expand_dims(inputs, axis=0)
        debug("AE - call - self.ishape: %s", self.ishape)
        debug("AE - call - inputs.shape: %s", inputs.shape)
        tensors = self.mha(inputs, inputs, training=training)
        return self.summer(tf.unstack(tf.squeeze(tensors, axis=0)))

In [None]:
class ProgramsEmbedding(keras.layers.Layer):
    """Embed a batch of programs: each graph is embedded first, then each program's graphs.

    For program ``i`` and graph ``j``, ``inputs[i][j]`` is passed through the graph-level
    AttentionEmbedding; the resulting graph vectors of a program are stacked and passed
    through the program-level AttentionEmbedding.
    """

    def __init__(self, ishape, nodes_shape, graphs_shape):
        """Create the two embedding sub-layers.

        Args:
            ishape: input shape; ``ishape[0]`` is the number of programs iterated over
                and ``ishape[1]`` the number of graphs per program.
            nodes_shape: shape handed to the graph-level AttentionEmbedding.
            graphs_shape: shape handed to the program-level AttentionEmbedding.
        """
        super().__init__()
        self.ishape = ishape
        self.embed_graph = AttentionEmbedding(nodes_shape)
        self.embed_program = AttentionEmbedding(graphs_shape)

    def call(self, inputs, *args, **kwargs):
        """Return one embedding per program, stacked along axis 0.

        Args:
            inputs: tensor indexed as ``inputs[program][graph]``.
            **kwargs: may carry ``training``; it is forwarded to both sub-layers and
                treated as None when absent.
        """
        training = kwargs.get('training')
        programs = tf.TensorArray(size=self.ishape[0], dtype=tf.float32)
        # logging %-formats the message with its arguments, hence the %s placeholders.
        debug('PE - call - input_shape: %s', self.ishape)
        for i in tf.range(self.ishape[0]):
            graphs = tf.TensorArray(dtype=tf.float32, size=self.ishape[1])
            for j in tf.range(self.ishape[1]):
                graph = self.embed_graph(inputs[i][j], training=training)
                debug('PE - call - graph.shape: %s', graph.shape)
                # write() returns the TensorArray that contains the write; outside eager
                # execution the original object does not, so the result must be kept.
                graphs = graphs.write(j, graph)
            stacked_graphs = graphs.stack()
            debug('PE - call - graphs.shape: %s', stacked_graphs.shape)
            programs = programs.write(i, self.embed_program(stacked_graphs, training=training))
        return programs.stack()


# Program Embedding Model
# 1 ex. input:
# [
#   [
#     [0 .. 1],
#     [1 .. 4]
#   ],
#   [
#     [1 .. 2],
#     [0 .. 0]
#   ]
# ]
# Build, compile and fit one model per (dataset, shape) pair. `program_model` is rebound
# on every iteration, so only the last model remains available after the loop.
for dataset, dataset_shape in datasets_info:
    logging.info('Creating program input layer')
    # dataset_shape[0] is the number of programs in the whole dataset; one input example
    # has the remaining dimensions.
    program_input = keras.Input(shape=dataset_shape[1:], ragged=True, name='programs')

    debug(program_input.shape)
    logging.info('Creating programs embedding layer')
    # ProgramsEmbedding loops over ishape[0] programs and ishape[1] graphs, so the batch
    # size must be followed by the per-example dimensions. Passing the full dataset_shape
    # put the dataset's program count where the graph count belongs.
    program_embeddings = ProgramsEmbedding((BATCH_SIZE, *dataset_shape[1:]), dataset_shape[2:], (dataset_shape[1], dataset_shape[3]))

    logging.info('Connecting program input layer to program embeddings layer')
    embedded = program_embeddings(program_input)

    logging.info('Creating an output layer and connecting it to the program embedding layer')
    program_output = keras.layers.Dense(units=1, name='scores')(embedded)

    logging.info('Creating the model that we will be training')
    program_model = keras.Model(inputs=program_input, outputs=program_output, name='program_embedding_model')

    logging.info('Printing program model summary')
    program_model.summary()

    logging.info('Compile the model')
    program_model.compile(optimizer='adam',
                          loss='mse')

    logging.info('Fit the model to the dataset')
    program_model.fit(dataset, epochs=2)