In [45]:
from stellargraph import datasets, StellarGraph
import networkx as nx
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
import json

In [4]:
# dataset = datasets.Cora()
# G, node_subjects = dataset.load(largest_connected_component_only=True)


In [35]:
def normalize_graph_features(graph, features, train_node_ids=[]):
    if not train_node_ids:
        train_node_ids = graph.nodes
    funcs = []
    for feature in features:
        data = np.array([graph.nodes[node_id][feature] for node_id in train_node_ids])
        mean = np.mean(data)
        std = np.std(data)
        funcs.append((mean, std))
    for node_id in graph.nodes:
        node = graph.nodes[node_id]
        for ix, feature in enumerate(features):
            node[feature] = (node[feature] - funcs[ix][0]) / funcs[ix][1]
    return funcs

In [78]:
# embeddings = KeyedVectors.load("../test_data/pgfam_and_pfam_wv_embedding.pkl", mmap='r')
embeddings = KeyedVectors.load("../test_data/pgfam_wv_embedding.pkl", mmap='r')


In [125]:
# nx_graph = nx.readwrite.read_gexf("../test_data/test-families-and-dists-only.gexf")
nx_graph = nx.readwrite.read_gexf("../test_data/test-families-and-dists-only-pgfams.gexf")

class_tups = []
start_stop = []
for node_id in list(nx_graph.nodes):
    class_tups.append((node_id, 1 if nx_graph.nodes[node_id]["BGC"] else 0))
    start_stop.append((node_id, nx_graph.nodes[node_id]["dist_to_last"], nx_graph.nodes[node_id]["dist_to_next"]))
    
bgc_labels = pd.DataFrame.from_records(class_tups, columns=["id", "BGC"]).set_index("id")["BGC"]
start_stops = pd.DataFrame.from_records(start_stop, columns=["id", "start", "stop"]).set_index("id")
train_bgcs, test_bgcs, train_starts, test_starts = model_selection.train_test_split(
    bgc_labels, start_stops, test_size=0.1, stratify=bgc_labels, random_state=42
)

In [126]:
funcs = normalize_graph_features(nx_graph, ["dist_to_last", "dist_to_next"], train_node_ids = list(train_bgcs.index))

for node_id in nx_graph.nodes:
    node = nx_graph.nodes[node_id]
    family = node["family"]
    to_last = node["dist_to_last"]
    to_next = node["dist_to_next"]
    if type(family) == str:
        family = family.replace('"', "")
#         print(type(embeddings.wv.get_vector(family)))
        node["family"] = np.concatenate(([to_last, to_next],  embeddings.wv.get_vector(family)))
    else:
#         print(family)
        pass

for node_id in nx_graph.nodes:
    node = nx_graph.nodes[node_id]
    if "label" in node.keys():
        del node["label"]

In [88]:
funcs[1]

(130.12934782608696, 356.25938333578256)

In [127]:
graph = StellarGraph.from_networkx(nx_graph, node_features="family")

In [121]:
print(graph.info())

StellarGraph: Undirected multigraph
 Nodes: 73609, Edges: 92630

 Node types:
  default: [73609]
    Features: float32 vector, length 102
    Edge types: default-default->default

 Edge types:
    default-default->default: [92630]
        Weights: range=[0.0212766, 1], mean=0.0569955, std=0.125069
        Features: none


### Same Internal Network Test
#### Using https://stellargraph.readthedocs.io/en/stable/demos/node-classification/graphsage-node-classification.html

In [91]:
import networkx as nx
import pandas as pd
import os

import stellargraph as sg
from stellargraph.mapper import GraphSAGENodeGenerator
from stellargraph.layer import GraphSAGE

from tensorflow.keras import layers, optimizers, losses, metrics, Model
from tensorflow.keras.callbacks import Callback
from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.utils import class_weight
from stellargraph import datasets
from IPython.display import display, HTML
import matplotlib.pyplot as plt
%matplotlib inline

In [100]:
bgc_metrics = [
#     metrics.FalseNegatives(name="fn"),
#     metrics.FalsePositives(name="fp"),
#     metrics.TrueNegatives(name="tn"),
#     metrics.TruePositives(name="tp"),
    metrics.Precision(name="precision"),
#     metrics.Recall(name="recall"),
]

#From https://medium.com/@thongonary/how-to-compute-f1-score-for-each-epoch-in-keras-a1acd17715a2
    
class Metrics(Callback):
    def on_train_begin(self, logs={}):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs={}):
        val_predict = (np.asarray(self.model.predict(self.model.validation_data[0]))).round()
        val_targ = self.model.validation_data[1]
        _val_f1 = f1_score(val_targ, val_predict)
        _val_recall = recall_score(val_targ, val_predict)
        _val_precision = precision_score(val_targ, val_predict)
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print("— val_f1: {} — val_precision: {} — val_recall {}".format(_val_f1, _val_precision, _val_recall))
#         return
 
f1metrics = Metrics()

In [152]:
for ix, i in enumerate(train_gen):
    print(sum(i[1]))
    if ix > 3:
        break

0
0
2
0
3


In [150]:
train_bgcs[150:200].sum()

0

In [151]:
# train_bgcs, test_bgcs = model_selection.train_test_split(
#     bgc_labels, train_size=0.1, test_size=None, stratify=bgc_labels
# )
# target_encoding = preprocessing.LabelBinarizer()

# train_targets = target_encoding.fit_transform(train_bgcs)
# test_targets = target_encoding.transform(test_bgcs)
batch_size = 50
num_samples = [10, 5]
generator = GraphSAGENodeGenerator(graph, batch_size, num_samples)
train_gen = generator.flow(train_bgcs.index, train_bgcs, shuffle=False)

graphsage_model = GraphSAGE(
    layer_sizes=[32, 32], generator=generator, bias=True, dropout=0.5
)

start_stop_input = layers.Input(shape=(2,), name="start_stop")

x_inp, x_out = graphsage_model.in_out_tensors()
# prediction = layers.Dense(units=1, activation="sigmoid")(x_out)
graph_dense = layers.Dense(units=50, activation="sigmoid", name="graph")(x_out)
graph_and_start_stop = layers.concatenate([graph_dense, start_stop_input])
prediction = layers.Dense(units=1, activation="sigmoid", name="BGC")(graph_and_start_stop)

model = Model(inputs=[x_inp, start_stop_input], outputs=prediction)
model.compile(
    optimizer=optimizers.Adam(lr=0.005),
    loss=losses.binary_crossentropy,
    metrics=bgc_metrics,
)

test_gen = generator.flow(test_bgcs.index, test_bgcs)

# Calculate the weights for each class so that we can balance the data
weights = class_weight.compute_class_weight('balanced',
                                            np.unique(train_bgcs),
                                            train_bgcs)
weights = {i:weights[i] for i in range(2)}
history = model.fit(
    train_gen, epochs=5, validation_data=test_gen, verbose=2, shuffle=False, class_weight=weights
)

19899    0
37956    0
3325     0
50436    0
53141    0
        ..
40321    0
49348    0
24072    0
42073    0
30197    0
Name: BGC, Length: 66248, dtype: int64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


Epoch 1/5


AssertionError: in user code:

    /sfs/lustre/bahamut/scratch/jho5ze/bionets/BGCs/venvs/stellar/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:805 train_function  *
        return step_function(self, iterator)
    /sfs/lustre/bahamut/scratch/jho5ze/bionets/BGCs/venvs/stellar/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /sfs/lustre/bahamut/scratch/jho5ze/bionets/BGCs/venvs/stellar/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /sfs/lustre/bahamut/scratch/jho5ze/bionets/BGCs/venvs/stellar/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /sfs/lustre/bahamut/scratch/jho5ze/bionets/BGCs/venvs/stellar/lib/python3.7/site-packages/tensorflow/python/distribute/distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    /sfs/lustre/bahamut/scratch/jho5ze/bionets/BGCs/venvs/stellar/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:788 run_step  **
        outputs = model.train_step(data)
    /sfs/lustre/bahamut/scratch/jho5ze/bionets/BGCs/venvs/stellar/lib/python3.7/site-packages/tensorflow/python/keras/engine/training.py:754 train_step
        y_pred = self(x, training=True)
    /sfs/lustre/bahamut/scratch/jho5ze/bionets/BGCs/venvs/stellar/lib/python3.7/site-packages/tensorflow/python/keras/engine/base_layer.py:1012 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    /sfs/lustre/bahamut/scratch/jho5ze/bionets/BGCs/venvs/stellar/lib/python3.7/site-packages/tensorflow/python/keras/engine/functional.py:425 call
        inputs, training=training, mask=mask)
    /sfs/lustre/bahamut/scratch/jho5ze/bionets/BGCs/venvs/stellar/lib/python3.7/site-packages/tensorflow/python/keras/engine/functional.py:569 _run_internal_graph
        assert x_id in tensor_dict, 'Could not compute output ' + str(x)

    AssertionError: Could not compute output KerasTensor(type_spec=TensorSpec(shape=(None, 1), dtype=tf.float32, name=None), name='BGC/Sigmoid:0', description="created by layer 'BGC'")


In [107]:
y_pred = model.predict(test_gen)


In [110]:
len(y_pred)

7361

In [108]:
from sklearn.metrics import classification_report

print(classification_report(test_bgcs, [round(i[0]) for i in y_pred]))

              precision    recall  f1-score   support

           0       1.00      0.88      0.94      7239
           1       0.10      0.80      0.18       122

    accuracy                           0.88      7361
   macro avg       0.55      0.84      0.56      7361
weighted avg       0.98      0.88      0.92      7361



In [39]:
train_subjects, test_subjects = model_selection.train_test_split(
    node_subjects, train_size=0.1, test_size=None, stratify=node_subjects
)
target_encoding = preprocessing.LabelBinarizer()

train_targets = target_encoding.fit_transform(train_subjects)
test_targets = target_encoding.transform(test_subjects)

batch_size = 50
num_samples = [10, 5]
generator = GraphSAGENodeGenerator(G, batch_size, num_samples)
train_gen = generator.flow(train_subjects.index, train_targets, shuffle=True)



Epoch 1/20


InvalidArgumentError:  Matrix size-incompatible: In[0]: [50,7], In[1]: [32,1]
	 [[node gradient_tape/model/dense/MatMul (defined at <ipython-input-39-bc9c07c649f0>:31) ]] [Op:__inference_train_function_1699]

Function call stack:
train_function
