**Due to computational limitations, this notebook has not been fully tested!**

In [None]:
import re
import h5py
import os.path as osp
import torch
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import torch.nn.functional as F
from torch.nn import Linear, Parameter
from sentence_transformers import SentenceTransformer
from torch_geometric.data import HeteroData, Dataset, Data
from torch_geometric.nn import GCNConv, HeteroConv, SAGEConv, GATConv, MessagePassing
from torch_geometric.utils import to_networkx
from torch_geometric.loader import NeighborLoader
# from torch_geometric.explain import Explainer, GNNExplainer
from tqdm import tqdm
tqdm.pandas()

In [None]:
# stop_words = nltk.corpus.stopwords.words('english')

# Prefer the GPU when one is visible to torch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Sentence encoder, placed on the selected device.
# Alternative checkpoint: 'anferico/bert-for-patents'
model_name = 'distilbert/distilbert-base-uncased'
model = SentenceTransformer(model_name)
model = model.to(device)

In [None]:
class PatentHeteroDataset(Dataset):
    """Single-graph heterogeneous patent/author dataset backed by an HDF5 file.

    The graph has two node types (``'patent'`` with features and labels,
    ``'author'`` with features) and three edge types (patent cites patent,
    author is author of patent, patent has author). The dataset always holds
    exactly one graph, returned by ``dataset[0]``.

    The graph is rebuilt from the raw HDF5 file every time the dataset is
    constructed; the processed ``.pt`` file is written but never read back
    here.

    Args:
        root: Root directory passed to the base ``Dataset``. The raw and
            processed locations are fixed by ``raw_dir`` / ``processed_dir``.
        transform: Optional transform forwarded to the base ``Dataset``.
        pre_transform: Optional transform applied once to the graph inside
            ``process()`` before the split masks are created.
    """

    def __init__(self, root, transform=None, pre_transform=None):
        # Must exist before the base initializer runs: the base class calls
        # ``process()`` itself when the processed file is missing, and that
        # call stores the graph in ``self.data``.
        self.data = None
        super().__init__(root, transform, pre_transform)
        # processed_path = osp.join(self.processed_dir, self.processed_file_names)
        # if osp.exists(processed_path):
        #     self.data = torch.load(processed_path)
        # else:
        # Build the graph only if the base initializer has not already done
        # so; otherwise the HDF5 file would be loaded and processed twice.
        if self.data is None:
            self.process()

    @property
    def num_classes(self):
        """Number of target classes for the patent nodes (binary task)."""
        return 2

    @property
    def raw_dir(self):
        """Directory that contains the raw HDF5 file."""
        return '/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK Identification/raw//'

    @property
    def processed_dir(self):
        """Directory the processed graph is written to."""
        return '/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK Identification/processed/'

    @property
    def raw_file_names(self):
        """Raw files expected inside ``raw_dir``."""
        return [
            'torch_tek_dataset_distilbert.h5' # Adjust to correct model
        ]

    @property
    def processed_file_names(self):
        """File name of the processed graph inside ``processed_dir``."""
        return 'gnn_tek_data_distilbert.pt' # Adjust to correct model

    def download(self):
        """No-op: the raw file is expected to be present already."""
        pass

    @staticmethod
    def _load_edge_index(h5_file, key):
        """Read the edge list stored under ``key`` and return its transpose.

        The stored array is converted to a ``long`` tensor, transposed and
        made contiguous before being used as an ``edge_index``.
        """
        return torch.tensor(h5_file[key][:], dtype=torch.long).t().contiguous()

    @staticmethod
    def _print_summary(data):
        """Print the node feature shapes and edge index shapes of ``data``."""
        print("Data keys after processing:", data.keys())
        print("Node types and their feature shapes:")
        for node_type, node_data in data.node_items():
            print(f"Node type: {node_type}")
            for key, item in node_data.items():
                if key in ('x', 'y'):
                    print(f"Features ({key}) shape:", item.size())

        print("Edge types and their index shapes:")
        for edge_type, edge_data in data.edge_items():
            print(f"Edge type: {edge_type}")
            if 'edge_index' in edge_data:
                print("Edge index shape:", edge_data['edge_index'].size())
            else:
                print(f"{edge_type} has no edge index.")

    def process(self):
        """Build the ``HeteroData`` graph from the raw file and save it.

        Loads node features, labels and edge indices, applies
        ``pre_transform`` if given, adds an 80/10/10 train/val/test split
        over the patent nodes (by node order, no shuffling), prints a short
        summary, stores the graph in ``self.data`` and writes it to the
        processed file.
        """
        data = HeteroData()

        # Take the file name from ``raw_file_names`` so the two cannot drift.
        raw_path = osp.join(self.raw_dir, self.raw_file_names[0])
        with h5py.File(raw_path, 'r') as f:
            # Node features and labels
            data['patent'].x = torch.tensor(f['g_patent/x'][:], dtype=torch.float)
            data['patent'].y = torch.tensor(f['g_patent/y'][:], dtype=torch.long)
            data['author'].x = torch.tensor(f['g_author_nodes/x'][:], dtype=torch.float)

            # Edge indices
            data['patent', 'cites', 'patent'].edge_index = self._load_edge_index(f, 'patent_edge_index')
            data['author', 'author_of', 'patent'].edge_index = self._load_edge_index(f, 'person_patent_edge_index')
            data['patent', 'has_author', 'author'].edge_index = self._load_edge_index(f, 'patent_person_edge_index')

        if self.pre_transform is not None:
            data = self.pre_transform(data)

        # Contiguous 80/10/10 split over the patent nodes in stored order.
        num_patents = data['patent'].num_nodes
        train_end = int(0.8 * num_patents)
        val_end = int(0.9 * num_patents)

        train_mask = torch.zeros(num_patents, dtype=torch.bool)
        val_mask = torch.zeros(num_patents, dtype=torch.bool)
        test_mask = torch.zeros(num_patents, dtype=torch.bool)
        train_mask[:train_end] = True
        val_mask[train_end:val_end] = True
        test_mask[val_end:] = True

        data['patent'].train_mask = train_mask
        data['patent'].val_mask = val_mask
        data['patent'].test_mask = test_mask

        self._print_summary(data)

        self.data = data  # Keep the processed graph in memory
        torch.save(data, osp.join(self.processed_dir, self.processed_file_names))

    def len(self):
        """Return the number of graphs in the dataset (always one)."""
        return 1

    def get(self, idx):
        """Return the single graph; ``idx`` is ignored."""
        return self.data

In [None]:
# Root spelled consistently with the dataset's hard-coded raw/processed
# directories (the original path contained the typo "Idnetification").
dataset = PatentHeteroDataset(root='/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK Identification/')
# The dataset holds a single heterogeneous graph; pull it out for training.
PatentDataset = dataset[0]

In [None]:
class SimplifiedHeteroGCN(torch.nn.Module):
    """Two-layer GraphSAGE classifier that uses only patent citation edges.

    Only the ``('patent', 'cites', 'patent')`` relation is convolved; author
    nodes and author edges present in the input are ignored.

    Args:
        hidden_channels: Width of both convolution layers.
        num_node_features_dict: Mapping from node type to input feature size;
            only the ``'patent'`` entry is read.
        num_classes: Number of output logits per patent node.
    """

    def __init__(self, hidden_channels, num_node_features_dict, num_classes):
        super().__init__()
        torch.manual_seed(42)  # For reproducible results

        self.dropout = torch.nn.Dropout(p=0.2)  # Applied to the input patent features

        # SAGEConv takes no ``add_self_loops`` argument (passing it raises a
        # TypeError); its root weight already feeds each node's own features
        # into the update, so no explicit self-loops are needed.
        self.conv1 = HeteroConv({
            ('patent', 'cites', 'patent'): SAGEConv(num_node_features_dict['patent'], hidden_channels),
        }, aggr='mean')

        self.conv2 = HeteroConv({
            ('patent', 'cites', 'patent'): SAGEConv(hidden_channels, hidden_channels),
        }, aggr='mean')

        # Linear layer for classifying patents
        self.lin = torch.nn.Linear(hidden_channels, num_classes)

    def forward(self, data):
        """Return class logits for every patent node in ``data``.

        Args:
            data: Heterogeneous graph (or mini-batch) exposing ``x_dict`` and
                ``edge_index_dict``.

        Returns:
            Tensor with one row of ``num_classes`` logits per patent node.
        """
        # Work on a private dict so the caller's features are never replaced.
        x_dict = dict(data.x_dict)
        edge_index_dict = data.edge_index_dict

        # Apply dropout to 'patent' node features
        x_dict['patent'] = self.dropout(x_dict['patent'])

        # First convolution layer
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}

        # Second convolution layer
        x_dict = self.conv2(x_dict, edge_index_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}

        # Predictions for 'patent' node embeddings
        return self.lin(x_dict['patent'])


In [None]:
class HeteroGCN(MessagePassing):
    """Two-layer heterogeneous GraphSAGE classifier over patents and authors.

    Each layer runs one ``SAGEConv`` per relation (patent cites patent,
    author is author of patent, patent has author) and averages the results
    that arrive at the same node type. Only the patent embeddings are
    classified.

    Args:
        hidden_channels: Width of both convolution layers.
        num_node_features_dict: Mapping from node type (``'patent'``,
            ``'author'``) to input feature size.
        num_classes: Number of output logits per patent node.
    """

    # NOTE(review): this class defines no message/propagate logic of its own,
    # so a plain torch.nn.Module base would suffice; the MessagePassing base
    # is kept so the class hierarchy seen by callers does not change.
    def __init__(self, hidden_channels, num_node_features_dict, num_classes):
        super().__init__(aggr='mean')
        torch.manual_seed(42) # For reproducible results

        patent_dim = num_node_features_dict['patent']
        author_dim = num_node_features_dict['author']

        # One SAGEConv per edge type. Relations between two different node
        # types get (source, destination) input sizes so that patent and
        # author features may differ in width. SAGEConv takes no
        # ``add_self_loops`` argument (passing it raises a TypeError); its
        # root weight already feeds each node's own features into the update.
        self.conv1 = HeteroConv({
            ('patent', 'cites', 'patent'): SAGEConv(patent_dim, hidden_channels),
            ('author', 'author_of', 'patent'): SAGEConv((author_dim, patent_dim), hidden_channels),
            ('patent', 'has_author', 'author'): SAGEConv((patent_dim, author_dim), hidden_channels),
        }, aggr='mean')

        self.conv2 = HeteroConv({
            ('patent', 'cites', 'patent'): SAGEConv(hidden_channels, hidden_channels),
            ('author', 'author_of', 'patent'): SAGEConv(hidden_channels, hidden_channels),
            ('patent', 'has_author', 'author'): SAGEConv(hidden_channels, hidden_channels),
        }, aggr='mean')

        # Linear layer for classifying patents
        self.lin = torch.nn.Linear(hidden_channels, num_classes)

    def forward(self, data):
        """Return class logits for every patent node in ``data``.

        Args:
            data: Heterogeneous graph (or mini-batch) exposing ``x_dict`` and
                ``edge_index_dict``.

        Returns:
            Tensor with one row of ``num_classes`` logits per patent node.
        """
        # Work on a private dict so the caller's features are never replaced.
        x_dict = dict(data.x_dict)
        edge_index_dict = data.edge_index_dict

        # Dropout on the input patent features for regularization
        x_dict['patent'] = F.dropout(x_dict['patent'], p=0.2, training=self.training)

        # First convolution layer
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: x.relu() for key, x in x_dict.items()}

        # Second convolution layer
        x_dict = self.conv2(x_dict, edge_index_dict)
        x_dict = {key: x.relu() for key, x in x_dict.items()}

        # Only use the 'patent' node embeddings for the final prediction
        return self.lin(x_dict['patent'])

In [None]:
# Input feature size per node type.
# NOTE(review): 768 presumably matches the width of the embeddings stored in
# the HDF5 file - confirm against the data.
num_node_features_dict = {'patent': 768, 'author': 768}
num_classes = 2

# Instantiate the GNN that is trained below. Without this assignment `model`
# would still be the SentenceTransformer created earlier, and the optimizer
# and training loop would operate on the text encoder instead of the GNN.
model = HeteroGCN(hidden_channels=64, num_node_features_dict=num_node_features_dict, num_classes=num_classes)
# Alternative configurations:
# model = HeteroGCN(hidden_channels=512, num_node_features_dict=num_node_features_dict, num_classes=num_classes)
# model = SimplifiedHeteroGCN(hidden_channels=512, num_node_features_dict=num_node_features_dict, num_classes=num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

In [None]:
# Place the network and the full graph on the selected device.
model = model.to(device)
PatentDataset = PatentDataset.to(device)

# Mini-batch loaders seeded on the training / test patent nodes. Both use the
# same sampling settings; only the training loader shuffles its seed nodes.
train_loader = NeighborLoader(
    PatentDataset,
    num_neighbors=[100],
    batch_size=512,
    shuffle=True,
    input_nodes=('patent', PatentDataset['patent'].train_mask),
)
test_loader = NeighborLoader(
    PatentDataset,
    num_neighbors=[100],
    batch_size=512,
    shuffle=False,
    input_nodes=('patent', PatentDataset['patent'].test_mask),
)

In [None]:
def train():
    """Run one training epoch over ``train_loader``.

    Uses the module-level ``model``, ``optimizer``, ``criterion``,
    ``train_loader`` and ``device``.

    Returns:
        Mean loss over the processed batches, or ``0`` if the loader yielded
        no batch.
    """
    model.train()
    total_loss = 0.0
    total_batches = 0

    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()

        out = model(batch)

        # NeighborLoader puts the seed nodes first and reports how many there
        # are in `batch_size`. Only those nodes have a fully sampled
        # neighbourhood, so the loss is restricted to them; masking the whole
        # subgraph with `train_mask` would also score sampled neighbours.
        seed_count = batch['patent'].batch_size
        loss = criterion(out[:seed_count], batch['patent'].y[:seed_count])
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_batches += 1

    return total_loss / total_batches if total_batches else 0

In [None]:
def test():
    """Evaluate ``model`` on the test patents served by ``test_loader``.

    Uses the module-level ``model``, ``test_loader`` and ``device``.

    Returns:
        Accuracy over all seed (test) nodes, or ``0.0`` if the loader yielded
        no nodes.
    """
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in test_loader:
            batch = batch.to(device)
            out = model(batch)

            # Score only the seed nodes (the first `batch_size` patent rows).
            # Test nodes pulled in merely as sampled neighbours have
            # incomplete neighbourhoods and would be counted repeatedly
            # across batches if selected through `test_mask`.
            seed_count = batch['patent'].batch_size
            pred = out[:seed_count].argmax(dim=1)
            labels = batch['patent'].y[:seed_count]

            correct += int((pred == labels).sum())
            total += seed_count

    # Guard against an empty test split instead of dividing by zero.
    return correct / total if total else 0.0


In [None]:
num_epochs = 10

# One optimisation pass followed by one test evaluation per epoch; the
# training call runs first, then the evaluation.
for epoch in range(1, num_epochs + 1):
    loss, test_acc = train(), test()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Test Acc: {test_acc:.4f}')