In [1]:
!pip install tensorflow_gnn gwpy &> /dev/null

In [2]:
!pip install flair gwpy &> /dev/null

In [3]:
!pip install basemap gwpy &> /dev/null

In [4]:
!python --version

Python 3.10.11


In [5]:
import pandas as pd
import numpy as np
import networkx as nx
import tensorflow as tf
import tensorflow_gnn as tfgnn
import urllib.request
import matplotlib.pyplot as plt
import io
import zipfile
import sys
import datasets as ds
import tensorflow as tf
import re
import gensim
import nltk
import gensim.corpora as corpora
nltk.download('stopwords')
import nltk

from difflib import SequenceMatcher
from mpl_toolkits.basemap import Basemap
from sklearn.model_selection import train_test_split
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from math import sqrt, pow, exp
from wordcloud import WordCloud
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.cluster import KMeansClusterer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
from flair.embeddings import BertEmbeddings
from flair.data import Sentence
bert_embedding = BertEmbeddings()

  bert_embedding = BertEmbeddings()


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
"""
  Contains Modular Classes for:

  DataPreProcessor: Preprocesses the data
  -- Builds the node dataframe which contains:
    -- node name
    -- each of it's attributes
  -- Builds the edge dataframe which contains:
    -- node source
    -- node target
    -- each edge type

  DataFrameAdaptor: Adapting Data For the Graph
  -- Defines nodes by index
  -- Updates graph edge node references to point to ids by index
  -- Generates training and test sets
  -- Generates a full adjacency matrix for testing

  TFGraph: Builds the graph structure and deep learning layers
  -- Builds the graph structure using tensors
  -- Builds the deep learning layers using Keras

  TFGNN: Exposes API methods to work with the model:
  -- _get_model
  -- _compile
  -- _fit (train)
  -- _predict
  
  @param training_data_df: pandas dataframe
  @param training_data_df: pandas dataframe
  @param is_node_prediction: boolean
  @return: node dataframe, edge dataframe
"""
class CategoryAwareTFGNN():
  def __init__(self, training_data_df, labels, is_node_prediction=True):
    self.data_df = training_data_df
    self.df_category_labels = labels
    # Preprocess data into nodes and edges
    dataPreProcessor = self._DataPreProcessor(outer_self=self)

    # Adapt dataframes for graph
    self.node_df = dataPreProcessor.nodes
    self.edge_df = dataPreProcessor.edges
    self.dataFrameAdaptor = self._DataFrameAdaptor(outer_self=self)

    # Build TF-Graph
    self.tfGraph = self._TFGraph(outer_self=self)
    
    # Set up datasets.  Edge predictions can be requested by @param is_node_prediction
    training_dataset = self.tfGraph.train_node_dataset
    validation_dataset = self.tfGraph.full_node_dataset
    prediction_dataset = self.tfGraph.full_node_dataset
    if not is_node_prediction:
      training_dataset = self.tfGraph.train_edge_dataset
      validation_dataset = self.tfGraph.full_edge_dataset
      prediction_dataset = self.tfGraph.full_edge_dataset

    # Build the graph tensors and keras layers
    self.tfGNN = self._TFGNN(outer_self=self, num_graph_updates=3, training_dataset=training_dataset, validation_dataset=validation_dataset, prediction_dataset=prediction_dataset)

    """
      Compiles, trains, and generates predictions from the model.
      This can be called after instantiating the CategoryAwareTFGNN for the pipeline results.
      @return: predictions from the model generated by the pipeline
    """
  def _process(self):
    self.tfGNN._compile()
    self.tfGNN._fit()
    return self.tfGNN._predict()

  """
    Manages the preprocessing to collect and generate nodes and edges
  """
  class _DataPreProcessor():
    def __init__(self, outer_self):

      self.data_df = outer_self.data_df
      self.df_category_labels = outer_self.df_category_labels
      nodes, edges = self._process_training_data_to_graph_entities()
      self.nodes = nodes
      self.edges = edges

    """
      Builds the node and edge dataframes from the data.
      The node dataframe:
        -- node name
        -- each of it's attributes
      The edge dataframe:
        -- node source
        -- node target
        -- each edge type
      @return: node dataframe, edge dataframe
    """
    def _process_training_data_to_graph_entities(self):
      # Set up matrix variables
      node_cols = ["text_name", "category_num", "node_attr_stub"]
      graph_df_nodes = pd.DataFrame(columns=node_cols)
      edge_cols = ["source","target", "shares_category", "cos_sim_stub"]
      graph_df_edges = pd.DataFrame(columns=edge_cols)
      text_index = 0
      
      for text_index_i in range(0, len(self.data_df)):
        graph_df_nodes.loc[len(graph_df_nodes.index)] = [
            # text name
            "text-" + str(text_index_i),
            # category num
            self.data_df['label'][text_index_i].astype(np.int32),
            # node_attr_stub
            np.int32(4)
        ]
        for text_index_j in range(0, len(self.data_df)):
          text_category_i = self.df_category_labels[self.data_df['label'][text_index_i]]
          label_i = self.data_df['label'][text_index_i]
          text_category_j = self.df_category_labels[self.data_df['label'][text_index_j]]
          label_j = self.data_df['label'][text_index_j]
          shares_category_binary = 0
          if label_i == label_j:
            shares_category_binary = 1
          graph_df_edges.loc[len(graph_df_edges.index)] = [
            # source
            "text-" + str(text_index_i),
            # target
            "text-" + str(text_index_j),
            # shares category
            np.int32(shares_category_binary),
            # cos_sim_stub
            np.float32(0.65)
        ]
      return graph_df_nodes, graph_df_edges

    def _tokenize(self, token):
      word = Sentence(token)
      bert_embedding.embed(word)
      return word[0].embedding.numpy()

    def _squared_sum(self, x):
      return round(sqrt(sum([a*a for a in x])),3)

    def _cos_similarity(self, x,y):
      return round(sum(a*b for a,b in zip(x,y))/float(self._squared_sum(x)*self._squared_sum(y)),3)

  """
    Manages the dataframe mutations needed for the GNN
  """
  class _DataFrameAdaptor():
    def __init__(self, outer_self):

      self.node_df = outer_self.node_df
      self.edge_df = outer_self.edge_df

      node_train, node_test = self._get_node_train_test_split()
      self.node_train = node_train
      self.node_test = node_test

      edge_train, edge_test = self._get_edge_train_test_split()
      self.edge_train = edge_train
      self.edge_test = edge_test

      node_full_adj, edge_full_adj = self._create_adj_id(self.node_df, self._generate_bidirectional_matrices(self.edge_df))
      self.node_full_adj = node_full_adj
      self.edge_full_adj = edge_full_adj

      node_train_adj, edge_train_adj = self._create_adj_id(self.node_train, self._generate_bidirectional_matrices(self.edge_train))
      self.edge_train_adj = edge_train_adj
      self.node_train_adj = node_train_adj

    def _get_node_train_test_split(self):
      return train_test_split(self.node_df,test_size=0.15,random_state=42)

    def _get_edge_train_test_split(self):
      edge_train = self.edge_df.loc[~((self.edge_df['source'].isin(self.node_test.index)) | (self.edge_df['target'].isin(self.node_test.index)))]
      edge_test = self.edge_df.loc[(self.edge_df['source'].isin(self.node_test.index)) | (self.edge_df['target'].isin(self.node_test.index))]
      return edge_train, edge_test
    
    def _generate_bidirectional_matrices(self, directional_df):
      reverse_df = directional_df.rename(columns={'source':'target','target':'source'})
      reverse_df = reverse_df[directional_df.columns]
      reverse_df = pd.concat([directional_df, reverse_df], ignore_index=True, axis=0)
      return reverse_df

    def _create_adj_id(self, node_df, edge_df):
      node_df = node_df.reset_index()
      edge_df = pd.merge(edge_df,node_df[['text_name','index']].rename(columns={"index":"source_id"}),
                        how='left',left_on='source',right_on='text_name').drop(columns=['text_name'])
      edge_df = pd.merge(edge_df,node_df[['text_name','index']].rename(columns={"index":"target_id"}),
                        how='left',left_on='target',right_on='text_name').drop(columns=['text_name'])
      
      edge_df.dropna(inplace=True)
      edge_df = edge_df.astype({col: 'int32' for col in edge_df.select_dtypes('int64').columns})
      return node_df, edge_df

  """
    Builds the graph structure for the GNN from the node and edge dataframes using tfgnn.GraphTensor, tfgnn.NodeSet, and tfgnn.EdgeSet.
    Builds the deep learning layers for the GNN using Keras and makes these available for fitting the TF-GNN model
  """
  class _TFGraph():
    def __init__(self, outer_self):
      full_tensor = self._create_graph_tensor(outer_self.dataFrameAdaptor.node_full_adj, outer_self.dataFrameAdaptor.edge_full_adj)
      train_tensor = self._create_graph_tensor(outer_self.dataFrameAdaptor.node_train_adj, outer_self.dataFrameAdaptor.edge_train_adj)

      self.full_node_dataset = self._generate_dataset_from_graph(full_tensor, self._node_batch_merge)
      self.train_node_dataset = self._generate_dataset_from_graph(train_tensor, self._node_batch_merge)

      self.full_edge_dataset = self._generate_dataset_from_graph(full_tensor, self._edge_batch_merge)
      self.train_edge_dataset = self._generate_dataset_from_graph(train_tensor, self._edge_batch_merge)

      self.set_initial_node_state = self._set_initial_node_state
      self.set_initial_edge_state = self._set_initial_edge_state

      graph_spec = self.train_edge_dataset.element_spec[0]
      self.input_graph = tf.keras.layers.Input(type_spec=graph_spec)
      print("graph spec compatibility: ", graph_spec.is_compatible_with(full_tensor))

      self.dense_layer = self._dense_layer

    def _create_graph_tensor(self, node_df, edge_df):
      graph_tensor = tfgnn.GraphTensor.from_pieces(
        node_sets = {
            "articles": tfgnn.NodeSet.from_fields(
                sizes = [len(node_df)],
                features ={
                    'article_category': np.array(node_df['category_num'],
                                    dtype='int32').reshape(len(node_df),1),
                    'article_attr': np.array(node_df['node_attr_stub'],
                                    dtype='int32').reshape(len(node_df),1)
                })
            },
        edge_sets ={
            "topics": tfgnn.EdgeSet.from_fields(
                sizes = [len(edge_df)],
                features = {
                    'topics_shared': np.array(edge_df['shares_category'],
                                          dtype='int32').reshape(len(edge_df),1),
                    'cosine_similarity': np.array(edge_df['cos_sim_stub'], dtype='float32').reshape(len(edge_df),1)},
                adjacency = tfgnn.Adjacency.from_indices(
                    source = ("articles", np.array(edge_df['source_id'], dtype='int32')),
                    target = ("articles", np.array(edge_df['target_id'], dtype='int32'))))
        }
      )
      return graph_tensor

    def _node_batch_merge(self, graph):
      graph = graph.merge_batch_to_components()
      node_features = graph.node_sets['articles'].get_features_dict()
      edge_features = graph.edge_sets['topics'].get_features_dict()
      
      label = node_features.pop('article_category')
      _ = edge_features.pop('topics_shared')
      
      new_graph = graph.replace_features(
          node_sets={'articles':node_features},
          edge_sets={'topics':edge_features})
      return new_graph, label

    def _edge_batch_merge(self, graph):
      graph = graph.merge_batch_to_components()
      node_features = graph.node_sets['articles'].get_features_dict()
      edge_features = graph.edge_sets['topics'].get_features_dict()
      
      _ = node_features.pop('article_category')
      label = edge_features.pop('topics_shared')
      
      new_graph = graph.replace_features(
          node_sets={'articles':node_features},
          edge_sets={'topics':edge_features})
      return new_graph, label

    def _generate_dataset_from_graph(self, graph, function):
      dataset = tf.data.Dataset.from_tensors(graph)
      dataset = dataset.batch(32)
      return dataset.map(function)

    def _set_initial_node_state(self, node_set, node_set_name):
      features = [
          tf.keras.layers.Dense(32,activation="relu")(node_set['article_attr'])
      ]
      return tf.keras.layers.Concatenate()(features)

    def _set_initial_edge_state(self, edge_set, edge_set_name):
      features = [
          tf.keras.layers.Dense(32,activation="relu")(edge_set['cosine_similarity'])
      ]
      return tf.keras.layers.Concatenate()(features)
    
    def _dense_layer(self, units=64, l2_reg=0.1, dropout=0.25, activation='relu'):
      regularizer = tf.keras.regularizers.l2(l2_reg)
      return tf.keras.Sequential([
          tf.keras.layers.Dense(units,
                                kernel_regularizer=regularizer,
                                bias_regularizer=regularizer),
          tf.keras.layers.Dropout(dropout)])
        
  """
    Exposes API methods to:
    -- Build the model
    -- Compile the model
    -- Fit the model (training)
    -- Predicting from the model

    These methods need to be called in order.  However multiple predictions can be made on the same model.
  """
  class _TFGNN():
    def __init__(self, outer_self, num_graph_updates, training_dataset, validation_dataset, prediction_dataset, steps=10, epochs=100, loss=None, metrics=None):
      # Graph update variables
      self.input_graph = outer_self.tfGraph.input_graph
      self.dense_layer = outer_self.tfGraph.dense_layer
      self.set_initial_node_state = outer_self.tfGraph.set_initial_node_state
      self.set_initial_edge_state = outer_self.tfGraph.set_initial_edge_state


      self.num_graph_updates = num_graph_updates
      self.model = self._get_model()
      
      self.training_dataset = training_dataset
      self.validation_dataset = validation_dataset
      self.prediction_dataset = prediction_dataset

      # Training params
      self.steps_per_epoch = steps
      self.epochs = epochs

    def _get_model(self):
      graph = tfgnn.keras.layers.MapFeatures(
          node_sets_fn=self.set_initial_node_state,
          edge_sets_fn=self.set_initial_edge_state
          )(self.input_graph)

      graph_updates = self.num_graph_updates
      for i in range(graph_updates):
          graph = tfgnn.keras.layers.GraphUpdate(
              node_sets = {
                  'articles': tfgnn.keras.layers.NodeSetUpdate({
                      'topics': tfgnn.keras.layers.SimpleConv(
                          message_fn = self.dense_layer(32),
                          reduce_type="sum",
                          sender_edge_feature = tfgnn.HIDDEN_STATE,
                          receiver_tag=tfgnn.TARGET)},
                      tfgnn.keras.layers.NextStateFromConcat(
                          self.dense_layer(64)))})(graph) #start here
          
          logits = tf.keras.layers.Dense(1,activation='softmax')(graph.node_sets["articles"][tfgnn.HIDDEN_STATE])
      return tf.keras.Model(self.input_graph, logits)

    """
      TODO: The model training breaks here with the loss uncommented out.  The metrics are useless without the loss.
    """
    def _compile(self):
      self.model.compile(
          tf.keras.optimizers.Adam(learning_rate=0.3),
          # loss = 'categorical_crossentropy',
          # metrics = ['categorical_accuracy']
      )

    def _fit(self):
      es = tf.keras.callbacks.EarlyStopping(monitor='val_loss',mode='min',verbose=1,patience=100,restore_best_weights=True)
      self.model.fit(self.training_dataset.repeat(),
            validation_data=self.validation_dataset,
            steps_per_epoch=self.steps_per_epoch,
            epochs=self.epochs,
            callbacks=[es])
      
    def _predict(self):
      return self.model.predict(self.prediction_dataset)

In [8]:
"""
  Loads training data.
  Shuffles the training data.
  Takes a slice of the training data up to the side defined in data_set_size.
  This training set size can be reduced for pipeline experiments.

  @param data_set_size: number
  @return: dataframe, label array
"""
def collect_training_data(data_set_size):
    ag_news_ds = ds.load_dataset('ag_news', save_infos=True)
    ag_news_ds_train = ag_news_ds['train']
    ag_news_ds_train_df = pd.DataFrame(data=ag_news_ds_train)
    ag_news_ds_train_df_shuffled = ag_news_ds_train_df.sample(frac = 1).reset_index(drop=True)
    ag_news_ds_train_df_shuffled = ag_news_ds_train_df_shuffled[:data_set_size]
    labels = ag_news_ds['test'].features['label'].names
    return ag_news_ds_train_df_shuffled, labels

In [9]:
training_data_df, labels = collect_training_data(100)

Downloading builder script:   0%|          | 0.00/4.06k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.65k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading and preparing dataset ag_news/default to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548...


Downloading data:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/751k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Dataset ag_news downloaded and prepared to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
ca_TFGNN = CategoryAwareTFGNN(training_data_df=training_data_df, labels=labels)



graph spec compatibility:  False


### Observe the graph structures in their dataframe form

In [11]:
ca_TFGNN.node_df.head()

Unnamed: 0,text_name,category_num,node_attr_stub
0,text-0,2,4
1,text-1,3,4
2,text-2,0,4
3,text-3,0,4
4,text-4,1,4


In [12]:
ca_TFGNN.edge_df.head()

Unnamed: 0,source,target,shares_category,cos_sim_stub
0,text-0,text-0,1,0.65
1,text-0,text-1,0,0.65
2,text-0,text-2,0,0.65
3,text-0,text-3,0,0.65
4,text-0,text-4,0,0.65


In [13]:
predictions = ca_TFGNN._process()

Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

In [14]:
predictions

array([[0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.99999994],
       [0.999

## Reference:
Malin, Michael. "TensorFlow-GNN: An End-To-End Guide For Graph Neural Networks", Towards Data Science, 1/16/2023, https://towardsdatascience.com/tensorflow-gnn-an-end-to-end-guide-for-graph-neural-networks-a66bfd237c8c.