In [25]:
from stellargraph import datasets, StellarGraph
import networkx as nx
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd

In [4]:
# Load the Cora dataset, keeping only its largest connected component.
# NOTE(review): a later cell calls dataset.load() again without this flag and
# rebinds G / node_subjects, so the result of this load is overwritten.
dataset = datasets.Cora()
G, node_subjects = dataset.load(largest_connected_component_only=True)


In [3]:
# Load the saved embedding model, memory-mapping its arrays read-only (mmap='r').
# Vectors are later looked up by family name via embeddings.wv.get_vector(...).
# NOTE(review): this file is a pickle; gensim's load() unpickles it, so only load
# embedding files that come from a trusted source.
embeddings = KeyedVectors.load("../test_data/pgfam_and_pfam_wv_embedding.pkl", mmap='r')

In [16]:
# Read the GEXF graph, then replace every string-valued "family" attribute with a
# numeric feature vector: [dist_to_last, dist_to_next] followed by the family embedding.
nx_graph = nx.readwrite.read_gexf("../test_data/test-families-and-dists-only.gexf")

for node_id, attrs in nx_graph.nodes(data=True):
    family_name = attrs["family"]
    neighbour_dists = [attrs["dist_to_last"], attrs["dist_to_next"]]
    if type(family_name) is not str:
        # Nodes whose family is not a string keep their original attribute value.
        continue
    # Family names can contain literal double quotes; strip them before the lookup.
    family_vector = embeddings.wv.get_vector(family_name.replace('"', ""))
    attrs["family"] = np.concatenate((neighbour_dists, family_vector))

# Drop the "label" attribute from every node that has one.
for node_id, attrs in nx_graph.nodes(data=True):
    attrs.pop("label", None)

In [17]:
# Wrap the NetworkX graph as a StellarGraph, taking each node's feature vector
# from its "family" attribute (filled in by the graph-building cell).
graph = StellarGraph.from_networkx(nx_graph, node_features="family")

In [63]:
# Pair each node id with a binary class: 1 when its "BGC" attribute is truthy, else 0.
class_tups = [
    (node_id, 1 if attrs["BGC"] else 0)
    for node_id, attrs in nx_graph.nodes(data=True)
]

In [64]:
# Turn the (id, BGC) tuples into a Series of BGC labels indexed by node id.
bgc_labels = (
    pd.DataFrame.from_records(class_tups, columns=["id", "BGC"])
    .set_index("id")
    .loc[:, "BGC"]
)

In [18]:
# Print a summary of the StellarGraph built from the GEXF file
# (node/edge counts, feature vector length, edge-weight statistics).
print(graph.info())

StellarGraph: Undirected multigraph
 Nodes: 132112, Edges: 171172

 Node types:
  default: [132112]
    Features: float32 vector, length 102
    Edge types: default-default->default

 Edge types:
    default-default->default: [171172]
        Weights: range=[0.0212766, 0.978723], mean=0.0568194, std=0.0965053
        Features: none


In [6]:
# Show the description text that ships with the Cora dataset object.
dataset.description

'The Cora dataset consists of 2708 scientific publications classified into one of seven classes. The citation network consists of 5429 links. Each publication in the dataset is described by a 0/1-valued word vector indicating the absence/presence of the corresponding word from the dictionary. The dictionary consists of 1433 unique words.'

In [9]:
# Re-create the Cora dataset, render its description as HTML, and load it again.
# NOTE(review): load() is called here without largest_connected_component_only,
# so G and node_subjects are rebound and replace the values from the earlier load.
from IPython.display import display, HTML
dataset = datasets.Cora()
display(HTML(dataset.description))
G, node_subjects = dataset.load()

In [11]:
# Print a summary of the Cora graph currently bound to G.
print(G.info())

StellarGraph: Undirected multigraph
 Nodes: 2708, Edges: 5429

 Node types:
  paper: [2708]
    Features: float32 vector, length 1433
    Edge types: paper-cites->paper

 Edge types:
    paper-cites->paper: [5429]
        Weights: all 1 (default)
        Features: none


In [71]:
# Display the per-node subject labels returned by dataset.load();
# these are the classification targets used in the Cora split further down.
node_subjects

31336             Neural_Networks
1061127             Rule_Learning
1106406    Reinforcement_Learning
13195      Reinforcement_Learning
37879       Probabilistic_Methods
                    ...          
1128975        Genetic_Algorithms
1128977        Genetic_Algorithms
1128978        Genetic_Algorithms
117328                 Case_Based
24043             Neural_Networks
Name: subject, Length: 2708, dtype: object

### Same Internal Network Test
#### Based on the StellarGraph GraphSAGE node-classification demo: https://stellargraph.readthedocs.io/en/stable/demos/node-classification/graphsage-node-classification.html

In [46]:
import networkx as nx
import pandas as pd
import os

import stellargraph as sg
from stellargraph.mapper import GraphSAGENodeGenerator
from stellargraph.layer import GraphSAGE

from tensorflow.keras import layers, optimizers, losses, metrics, Model
from tensorflow.keras.callbacks import Callback
from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.utils import class_weight
from stellargraph import datasets
from IPython.display import display, HTML
import matplotlib.pyplot as plt
%matplotlib inline

In [53]:
# Keras metrics tracked while training the BGC classifier: the four raw
# confusion-matrix counts plus precision and recall.
bgc_metrics = [
    metric_cls(name=metric_name)
    for metric_cls, metric_name in (
        (metrics.FalseNegatives, "fn"),
        (metrics.FalsePositives, "fp"),
        (metrics.TrueNegatives, "tn"),
        (metrics.TruePositives, "tp"),
        (metrics.Precision, "precision"),
        (metrics.Recall, "recall"),
    )
]

#From https://towardsdatascience.com/implementing-macro-f1-score-in-keras-what-not-to-do-e9f1aa04029d
# class F1Metric(Callback):
#     def __init__(self, validation):   
#         super(Metrics, self).__init__()
#         self.validation = validation    
            
#         print('validation shape', len(self.validation[0]))
        
#     def on_train_begin(self, logs={}):        
#         self.val_f1s = []
#         self.val_recalls = []
#         self.val_precisions = []
     
#     def on_epoch_end(self, epoch, logs={}):
#         val_targ = self.validation[1]   
#         val_predict = (np.asarray(self.model.predict(self.validation[0]))).round()        
    
#         val_f1 = f1_score(val_targ, val_predict)
#         val_recall = recall_score(val_targ, val_predict)         
#         val_precision = precision_score(val_targ, val_predict)
        
#         self.val_f1s.append(round(val_f1, 6))
#         self.val_recalls.append(round(val_recall, 6))
#         self.val_precisions.append(round(val_precision, 6))
 
#         print(f' — val_f1: {val_f1} — val_precision: {val_precision}, — val_recall: {val_recall}')
    
#From https://medium.com/@thongonary/how-to-compute-f1-score-for-each-epoch-in-keras-a1acd17715a2
    
class Metrics(Callback):
    """Keras callback that records validation F1, precision and recall per epoch.

    Parameters
    ----------
    validation_data : tuple, optional
        ``(inputs, targets)`` to score at the end of every epoch. TF2 Keras
        models do not carry a ``validation_data`` attribute, so the data must be
        supplied here; when omitted, the callback falls back to a
        ``validation_data`` attribute on the model if one exists.

    Attributes
    ----------
    val_f1s, val_recalls, val_precisions : list of float
        One entry per completed epoch, reset at the start of training.
    """

    def __init__(self, validation_data=None):
        super().__init__()
        self.validation_data = validation_data

    def on_train_begin(self, logs=None):
        """Reset the per-epoch score histories."""
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation data with the current model and log the result.

        Raises
        ------
        ValueError
            If no validation data was given to the callback and the model does
            not expose any either.
        """
        # Imported here so the callback does not depend on a separate import cell.
        from sklearn.metrics import f1_score, precision_score, recall_score

        validation_data = self.validation_data
        if validation_data is None:
            # Legacy Keras attached the validation arrays to the model.
            validation_data = getattr(self.model, "validation_data", None)
        if validation_data is None:
            raise ValueError(
                "Metrics callback has no validation data; "
                "construct it as Metrics(validation_data=(inputs, targets))."
            )

        val_inputs, val_targ = validation_data[0], validation_data[1]
        # Round the predicted probabilities to hard 0/1 labels before scoring.
        val_predict = np.asarray(self.model.predict(val_inputs)).round()
        _val_f1 = f1_score(val_targ, val_predict)
        _val_recall = recall_score(val_targ, val_predict)
        _val_precision = precision_score(val_targ, val_predict)
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print("— val_f1: {} — val_precision: {} — val_recall {}".format(_val_f1, _val_precision, _val_recall))


f1metrics = Metrics()

In [65]:
# Split the labelled nodes 10% train / 90% test, stratified on the BGC label
# so both splits keep the same class ratio.
train_bgcs, test_bgcs = model_selection.train_test_split(
    bgc_labels, train_size=0.1, test_size=None, stratify=bgc_labels
)

# GraphSAGE neighbourhood sampling: 10 first-hop and 5 second-hop neighbours per node.
batch_size = 50
num_samples = [10, 5]
generator = GraphSAGENodeGenerator(graph, batch_size, num_samples)
train_gen = generator.flow(train_bgcs.index, train_bgcs, shuffle=True)

graphsage_model = GraphSAGE(
    layer_sizes=[32, 32], generator=generator, bias=True, dropout=0.5,
)

# Binary classifier head: one sigmoid unit on top of the GraphSAGE node embedding.
x_inp, x_out = graphsage_model.in_out_tensors()
prediction = layers.Dense(units=1, activation="sigmoid")(x_out)

model = Model(inputs=x_inp, outputs=prediction)
model.compile(
    # `learning_rate` replaces the deprecated `lr` alias.
    optimizer=optimizers.Adam(learning_rate=0.005),
    loss=losses.binary_crossentropy,
    metrics=bgc_metrics,
)

test_gen = generator.flow(test_bgcs.index, test_bgcs)

# Calculate the weights for each class so that we can balance the data.
# Keyword arguments are required by newer scikit-learn releases (positional use
# triggers the deprecation warning seen in the saved output).
bgc_classes = np.unique(train_bgcs)
balanced_weights = class_weight.compute_class_weight(
    class_weight="balanced", classes=bgc_classes, y=train_bgcs
)
# Map each class label to its weight instead of assuming exactly classes 0 and 1.
weights = {
    int(label): float(weight)
    for label, weight in zip(bgc_classes, balanced_weights)
}
history = model.fit(
    train_gen, epochs=20, validation_data=test_gen, verbose=2, shuffle=False, class_weight=weights
)

122927    0
45224     1
26305     1
89373     0
29862     0
         ..
85828     0
26003     0
64854     0
117608    0
6695      0
Name: BGC, Length: 13211, dtype: int64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


Epoch 1/20
265/265 - 37s - loss: 0.6944 - fn: 276.0000 - fp: 106158.0000 - tn: 23233.0000 - tp: 2445.0000 - precision: 0.0225 - recall: 0.8986 - val_loss: 0.7167 - val_fn: 751.0000 - val_fp: 66170.0000 - val_tn: 50282.0000 - val_tp: 1698.0000 - val_precision: 0.0250 - val_recall: 0.6933
Epoch 2/20
265/265 - 35s - loss: 0.6921 - fn: 112.0000 - fp: 6368.0000 - tn: 6571.0000 - tp: 160.0000 - precision: 0.0245 - recall: 0.5882 - val_loss: 0.7069 - val_fn: 768.0000 - val_fp: 60565.0000 - val_tn: 55887.0000 - val_tp: 1681.0000 - val_precision: 0.0270 - val_recall: 0.6864
Epoch 3/20
265/265 - 35s - loss: 0.6936 - fn: 119.0000 - fp: 6425.0000 - tn: 6514.0000 - tp: 153.0000 - precision: 0.0233 - recall: 0.5625 - val_loss: 0.6724 - val_fn: 1092.0000 - val_fp: 45228.0000 - val_tn: 71224.0000 - val_tp: 1357.0000 - val_precision: 0.0291 - val_recall: 0.5541
Epoch 4/20
265/265 - 35s - loss: 0.6773 - fn: 99.0000 - fp: 6174.0000 - tn: 6765.0000 - tp: 173.0000 - precision: 0.0273 - recall: 0.6360 - val

In [67]:
# Run the trained model over the held-out nodes; the model's final layer is a
# single sigmoid unit, so each prediction is one value per node.
y_pred = model.predict(test_gen)


In [74]:
from sklearn.metrics import classification_report

# Round each sigmoid output to a hard label, then compare with the held-out BGC labels.
predicted_labels = [round(row[0]) for row in y_pred]
print(classification_report(test_bgcs, predicted_labels))

              precision    recall  f1-score   support

           0       0.99      0.88      0.93    116452
           1       0.08      0.47      0.13      2449

    accuracy                           0.87    118901
   macro avg       0.53      0.68      0.53    118901
weighted avg       0.97      0.87      0.92    118901



In [39]:
# Split the Cora subject labels 10% train / 90% test, stratified by subject.
train_subjects, test_subjects = model_selection.train_test_split(
    node_subjects, train_size=0.1, test_size=None, stratify=node_subjects
)
# Binarize the subject labels; the encoder is fit on the training split only
# and then reused for the test split.
target_encoding = preprocessing.LabelBinarizer()

train_targets = target_encoding.fit_transform(train_subjects)
test_targets = target_encoding.transform(test_subjects)

# NOTE(review): batch_size, num_samples, generator and train_gen are also defined by
# the BGC training cell; this cell rebinds them, so running cells out of order will
# mix the two experiments.
# NOTE(review): the saved traceback under this cell ("In[0]: [50,7], In[1]: [32,1]")
# was raised by code that is no longer in the cell. It looks like 7-column targets
# reaching a 1-unit Dense layer, i.e. the BGC model being fit on Cora targets —
# confirm, and build a separate model with a 7-unit output for Cora.
batch_size = 50
num_samples = [10, 5]
generator = GraphSAGENodeGenerator(G, batch_size, num_samples)
train_gen = generator.flow(train_subjects.index, train_targets, shuffle=True)



Epoch 1/20


InvalidArgumentError:  Matrix size-incompatible: In[0]: [50,7], In[1]: [32,1]
	 [[node gradient_tape/model/dense/MatMul (defined at <ipython-input-39-bc9c07c649f0>:31) ]] [Op:__inference_train_function_1699]

Function call stack:
train_function
