In [None]:
import re
import h5py
import os.path as osp
import torch
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from torch_geometric.data import HeteroData, Dataset
from torch_geometric.nn import GCNConv, HeteroConv, SAGEConv, GATConv, MessagePassing
from torch_geometric.utils import to_networkx
from torch_geometric.loader import NeighborLoader
from tqdm import tqdm
# Register `progress_apply` on pandas objects so long applies show a progress bar.
tqdm.pandas()

# stop_words = nltk.corpus.stopwords.words('english')
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# model = SentenceTransformer('anferico/bert-for-patents').to(device)

**Sources** (Also for tomorrow)
- https://pytorch-geometric.readthedocs.io/en/latest/tutorial/heterogeneous.html
- https://pytorch-geometric.readthedocs.io/en/latest/tutorial/create_gnn.html
- https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.HeteroConv.html#torch_geometric.nn.conv.HeteroConv
- https://pytorch-geometric.readthedocs.io/en/latest/cheatsheet/gnn_cheatsheet.html

# Data Preprocessing

## Patents

In [None]:
# Load only the columns used downstream from the zipped PatentsView TSV dumps and
# cast every column to `str` (missing values therefore become the string 'nan').
g_patent = pd.read_csv(
    '/mnt/hdd01/patentsview/Raw files/Raw zip files/g_patent.tsv.zip',
    sep='\t',
    compression='zip',
    usecols=['patent_id', 'patent_title', 'patent_abstract'],
    low_memory=False,
).astype(str)
g_cpc = pd.read_csv(
    '/mnt/hdd01/patentsview/Raw files/Raw zip files/g_cpc_current.tsv.zip',
    sep='\t',
    compression='zip',
    usecols=['patent_id', 'cpc_class'],
    low_memory=False,
).astype(str)

In [None]:
# One row per patent: collect all of its CPC classes into a list (`cpc_groups`)
# and keep the first title/abstract seen for that patent.
g_patent_cpc = (
    g_patent.merge(g_cpc, on='patent_id')
    .groupby('patent_id')
    .agg(
        cpc_groups=('cpc_class', list),
        patent_title=('patent_title', 'first'),
        patent_abstract=('patent_abstract', 'first'),
    )
    .reset_index()
)

In [None]:
# A patent counts as cleantech when 'Y02' is one of its CPC classes.
is_cleantech = g_patent_cpc['cpc_groups'].apply(lambda cpc_classes: 'Y02' in cpc_classes)

# `.copy()` makes these independent frames, so the column assignments in later
# cells do not write into (or warn about) a slice of `g_patent_cpc`.
g_patent_cleantech = g_patent_cpc[is_cleantech].copy()

# Draw the negative class ONLY from patents that are not cleantech. Sampling from
# the full table would label some Y02 patents 0 and duplicate their patent_id.
non_cleantech_pool = g_patent_cpc[~is_cleantech]
g_patent_non_cleantech = non_cleantech_pool.sample(
    # Balanced classes; capped so `sample` cannot ask for more rows than exist.
    n=min(len(g_patent_cleantech), len(non_cleantech_pool)),
    random_state=42,
).copy()

### Data Cleaning

In [None]:
def preprocess_text(text):
    """Strip URLs, e-mail-like tokens and non-letter characters from `text`.

    The substitutions run in order: URLs (`http(s)://...` or `www....`),
    tokens containing `@`, every character that is neither an ASCII letter nor
    whitespace, and finally runs of whitespace (collapsed to one space).
    Case is preserved and stop words are kept.

    Returns the cleaned string without leading or trailing whitespace.
    """
    substitutions = (
        (r'https?://\S+|www\.\S+', ''),
        (r'\S+@\S+', ''),
        (r'[^A-Za-z\s]', ''),
        (r'\s+', ' '),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [None]:
# Clean title and abstract in place for both samples (cleantech first, title first).
for frame in (g_patent_cleantech, g_patent_non_cleantech):
    for text_column in ('patent_title', 'patent_abstract'):
        frame.loc[:, text_column] = frame[text_column].progress_apply(preprocess_text)

In [None]:
# Build the encoder input "<title> [SEP] <abstract>" and attach the class label
# (1 = cleantech, 0 = non-cleantech) to each sample.
for frame, label in ((g_patent_cleantech, 1), (g_patent_non_cleantech, 0)):
    frame.loc[:, 'patent_title_abstract'] = frame['patent_title'] + ' [SEP] ' + frame['patent_abstract']
    frame.loc[:, 'label'] = label

# Stack both samples, order them by patent_id and renumber the rows.
g_patent = (
    pd.concat([g_patent_cleantech, g_patent_non_cleantech], ignore_index=True)
    .sort_values(by=['patent_id'])
    .reset_index(drop=True)
)

# The positional row number doubles as the patent node id in the graph.
g_patent['index'] = g_patent.index

In [None]:
# Encode "<title> [SEP] <abstract>" with the patent-specific sentence encoder.
# The encoder is created here under its own name: the global `model` line in the
# setup cell is commented out, and `model` is later rebound to the GNN.
patent_text_encoder = SentenceTransformer('anferico/bert-for-patents', device=str(device))
patent_embeddings = patent_text_encoder.encode(
    g_patent['patent_title_abstract'].tolist(),
    show_progress_bar=True,
    convert_to_numpy=True,
)
# Store one embedding (a plain list of floats) per row; a 2-D tensor cannot be
# assigned to a single DataFrame column.
g_patent['patent_title_abstract_bert_for_patents_embedding'] = patent_embeddings.tolist()

In [None]:
# Alternative to recomputing: load the patent table with precomputed embeddings.
# `patent_id` is read as `str` so it has the same type as the ids in the
# assignee, inventor and citation tables it is joined with in later cells.
g_patent = pd.read_csv(
    '/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/g_patent_embedding.csv',
    dtype={'patent_id': str},
)

## Assignees, Inventors and Authors

In [None]:
# Disambiguated assignee and inventor records, restricted to the id and name
# columns that are used to build the node tables.
g_assignee = pd.read_csv(
    '/mnt/hdd01/patentsview/Raw files/Raw zip files/g_assignee_disambiguated.tsv.zip',
    sep='\t',
    compression='zip',
    usecols=[
        'patent_id',
        'assignee_id',
        'disambig_assignee_individual_name_first',
        'disambig_assignee_individual_name_last',
        'disambig_assignee_organization',
    ],
    low_memory=False,
)
g_inventor = pd.read_csv(
    '/mnt/hdd01/patentsview/Raw files/Raw zip files/g_inventor_disambiguated.tsv.zip',
    sep='\t',
    compression='zip',
    usecols=[
        'patent_id',
        'inventor_id',
        'disambig_inventor_name_first',
        'disambig_inventor_name_last',
    ],
    low_memory=False,
)

In [None]:
# Keep only assignee/inventor records of patents in either sample, ordered by
# their own id so the node numbering derived later is deterministic.
sampled_patent_ids = g_patent_cleantech['patent_id'].tolist() + g_patent_non_cleantech['patent_id'].tolist()

g_assignee = (
    g_assignee[g_assignee['patent_id'].isin(sampled_patent_ids)]
    .sort_values(by=['assignee_id'])
    .reset_index(drop=True)
)
g_inventor = (
    g_inventor[g_inventor['patent_id'].isin(sampled_patent_ids)]
    .sort_values(by=['inventor_id'])
    .reset_index(drop=True)
)

In [None]:
# One row per distinct assignee / inventor record; the positional row number
# stored in 'index' is the node id used by the edge lists.
assignee_node_columns = [
    'assignee_id',
    'disambig_assignee_individual_name_first',
    'disambig_assignee_individual_name_last',
    'disambig_assignee_organization',
]
inventor_node_columns = ['inventor_id', 'disambig_inventor_name_first', 'disambig_inventor_name_last']

g_assignee_nodes = g_assignee[assignee_node_columns].drop_duplicates().reset_index(drop=True)
g_inventor_nodes = g_inventor[inventor_node_columns].drop_duplicates().reset_index(drop=True)
g_assignee_nodes = g_assignee_nodes.assign(index=g_assignee_nodes.index)
g_inventor_nodes = g_inventor_nodes.assign(index=g_inventor_nodes.index)

In [None]:
# Assignees and inventors have no text to encode, so they get random placeholder
# features with 1024 dimensions. A seeded generator keeps them identical across
# kernel restarts, which makes the saved dataset and training runs reproducible.
placeholder_rng = np.random.default_rng(42)
g_assignee_nodes['assignee_embedding'] = placeholder_rng.random((len(g_assignee_nodes), 1024)).tolist()
g_inventor_nodes['inventor_embedding'] = placeholder_rng.random((len(g_inventor_nodes), 1024)).tolist()

## Patent Citations

In [None]:
# US patent-to-patent citations: `patent_id` and `citation_patent_id` columns only.
g_patent_citation = pd.read_csv(
    '/mnt/hdd01/patentsview/Raw files/Raw zip files/g_us_patent_citation.tsv.zip',
    sep='\t',
    compression='zip',
    usecols=['patent_id', 'citation_patent_id'],
    low_memory=False,
)

In [None]:
# Keep a citation only when BOTH of its patent ids belong to the sampled patents.
citation_patent_universe = g_patent_cleantech['patent_id'].tolist() + g_patent_non_cleantech['patent_id'].tolist()
cited_in_sample = g_patent_citation['citation_patent_id'].isin(citation_patent_universe)
citing_in_sample = g_patent_citation['patent_id'].isin(citation_patent_universe)
g_patent_citation = g_patent_citation[cited_in_sample & citing_in_sample].reset_index(drop=True)

In [None]:
# Translate both patent ids of every citation into patent node indices.
patent_index_lookup = g_patent[['patent_id', 'index']]

g_patent_citation = g_patent_citation.merge(
    patent_index_lookup.rename(columns={'index': 'patent_id_index'}),
    on='patent_id',
)
g_patent_citation = g_patent_citation.merge(
    patent_index_lookup.rename(columns={'patent_id': 'citation_patent_id', 'index': 'citation_patent_id_index'}),
    on='citation_patent_id',
)

In [None]:
# Citation edges in both directions, interleaved per citation:
# [patent, cited], [cited, patent], [next patent, next cited], ...
# Built with array operations instead of a per-row Python loop; the resulting
# list has the same order and content.
citing_nodes = g_patent_citation['patent_id_index'].to_numpy()
cited_nodes = g_patent_citation['citation_patent_id_index'].to_numpy()
forward_pairs = np.column_stack([citing_nodes, cited_nodes])
backward_pairs = np.column_stack([cited_nodes, citing_nodes])
# Shape (n, 2, 2) -> (2n, 2) places each backward pair right after its forward pair.
patent_edge_index = np.stack([forward_pairs, backward_pairs], axis=1).reshape(-1, 2).tolist()

## Inventor and Assignee - Patent Relationships

In [None]:
# Attach the patent node index and the assignee/inventor node index to every
# assignee-patent and inventor-patent record.
patent_node_lookup = g_patent[['patent_id', 'index']].rename(columns={'index': 'patent_id_index'})
assignee_node_lookup = g_assignee_nodes[['assignee_id', 'index']].rename(columns={'index': 'assignee_id_index'})
inventor_node_lookup = g_inventor_nodes[['inventor_id', 'index']].rename(columns={'index': 'inventor_id_index'})

g_assignee_patent = (
    g_assignee.merge(patent_node_lookup, on='patent_id')
    .merge(assignee_node_lookup, on='assignee_id')
)
g_inventor_patent = (
    g_inventor.merge(patent_node_lookup, on='patent_id')
    .merge(inventor_node_lookup, on='inventor_id')
)

In [None]:
# [source, target] pairs for each relation and its reverse, one pair per record.
# Selecting the two index columns and converting them once replaces four per-row
# Python loops; the order of the pairs follows the row order of the merged tables.
assignee_patent_edge_index = g_assignee_patent[['assignee_id_index', 'patent_id_index']].to_numpy().tolist()
patent_assignee_edge_index = g_assignee_patent[['patent_id_index', 'assignee_id_index']].to_numpy().tolist()
inventor_patent_edge_index = g_inventor_patent[['inventor_id_index', 'patent_id_index']].to_numpy().tolist()
patent_inventor_edge_index = g_inventor_patent[['patent_id_index', 'inventor_id_index']].to_numpy().tolist()

In [None]:
# Persist the node tables (inventors first, then assignees) without the row index.
for node_table, csv_path in (
    (g_inventor_nodes, '/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/g_inventor_nodes.csv'),
    (g_assignee_nodes, '/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/g_assignee_nodes.csv'),
):
    node_table.to_csv(csv_path, index=False)

In [None]:
# Turn every edge list into a two-column (source, target) frame ...
edge_columns = ['source', 'target']
(
    patent_edge_index,
    assignee_patent_edge_index,
    patent_assignee_edge_index,
    inventor_patent_edge_index,
    patent_inventor_edge_index,
) = (
    pd.DataFrame(edge_pairs, columns=edge_columns)
    for edge_pairs in (
        patent_edge_index,
        assignee_patent_edge_index,
        patent_assignee_edge_index,
        inventor_patent_edge_index,
        patent_inventor_edge_index,
    )
)

# ... and write each of them to its CSV file.
for edge_table, csv_path in (
    (patent_edge_index, '/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/patent_edge_index.csv'),
    (assignee_patent_edge_index, '/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/assignee_patent_edge_index.csv'),
    (patent_assignee_edge_index, '/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/patent_assignee_edge_index.csv'),
    (inventor_patent_edge_index, '/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/inventor_patent_edge_index.csv'),
    (patent_inventor_edge_index, '/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/patent_inventor_edge_index.csv'),
):
    edge_table.to_csv(csv_path, index=False)

In [None]:
# Alternative to recomputing: load the patent table with precomputed embeddings.
# `patent_id` is read as `str` so that it keeps the same type as the ids in the
# other PatentsView tables used in this notebook.
g_patent = pd.read_csv(
    '/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/g_patent_embedding.csv',
    dtype={'patent_id': str},
)

## Save Data

In [None]:
# Cast every column of both node tables to `str`; the embedding columns become
# the bracketed text form that `string_to_array` parses in the save cell.
g_assignee_nodes, g_inventor_nodes = (
    node_table.astype(str) for node_table in (g_assignee_nodes, g_inventor_nodes)
)

In [None]:
# Column name and dtype for every column of g_patent
for column_name, column_dtype in g_patent.dtypes.items():
    print(column_name, column_dtype)

# Column name and dtype for every column of g_assignee_nodes
for column_name, column_dtype in g_assignee_nodes.dtypes.items():
    print(column_name, column_dtype)

In [None]:
def string_to_array(str_repr):
    """Parse a bracketed, comma-separated number string into a 1-D NumPy array.

    Leading and trailing '[' / ']' characters are removed before the remaining
    text is handed to `np.fromstring` with ',' as the separator.
    """
    unbracketed = str_repr.strip('[]')
    values = np.fromstring(unbracketed, sep=',')
    return values

# Write node features, labels and edge lists into a single HDF5 file.
with h5py.File('/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/torch_tek_dataset.h5', 'w') as h5_file:
    # Node data: parse each stringified embedding and stack them into a matrix.
    patent_features = np.stack(g_patent['patent_title_abstract_bert_for_patents_embedding'].apply(string_to_array).values)
    h5_file.create_dataset('g_patent/x', data=patent_features)
    h5_file.create_dataset('g_patent/y', data=g_patent['label'].values.astype(np.int64))

    for dataset_name, node_table, embedding_column in (
        ('g_assignee_nodes/x', g_assignee_nodes, 'assignee_embedding'),
        ('g_inventor_nodes/x', g_inventor_nodes, 'inventor_embedding'),
    ):
        node_features = np.stack(node_table[embedding_column].apply(string_to_array).values)
        h5_file.create_dataset(dataset_name, data=node_features)

    # Edge lists: one (num_edges, 2) int64 dataset per relation.
    for dataset_name, edge_table in (
        ('patent_edge_index', patent_edge_index),
        ('assignee_patent_edge_index', assignee_patent_edge_index),
        ('patent_assignee_edge_index', patent_assignee_edge_index),
        ('inventor_patent_edge_index', inventor_patent_edge_index),
        ('patent_inventor_edge_index', patent_inventor_edge_index),
    ):
        h5_file.create_dataset(dataset_name, data=edge_table.values, dtype=np.int64)

# Instantiate (heterogeneous) data model
- https://pytorch-geometric.readthedocs.io/en/latest/tutorial/create_dataset.html
- https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/datasets/ogb_mag.html

In [None]:
class PatentHeteroDataset(Dataset):
    """Single-graph heterogeneous patent dataset backed by one HDF5 file.

    The graph has three node types ('patent', 'patent_inventor',
    'patent_assignee') and five edge types (citations plus both directions of
    the inventor and assignee relations). Only 'patent' nodes carry labels and
    train/val/test masks.

    Files are resolved relative to `root`, following the PyG convention:
    `<root>/raw/torch_tek_dataset.h5` is the input and
    `<root>/processed/gnn_tek_data.pt` is the cached `HeteroData` object.

    Args:
        root: Directory containing the `raw/` and `processed/` sub-directories.
        transform: Optional transform, forwarded to the PyG `Dataset` base class.
        pre_transform: Optional transform applied once in `process()`, before
            the masks are created and the graph is saved.
    """

    # Fractions of the patent nodes (in stored order) that end the train and the
    # validation ranges; the remainder is the test range.
    TRAIN_FRACTION = 0.8
    VAL_FRACTION = 0.9

    def __init__(self, root, transform=None, pre_transform=None):
        # Set before the base constructor runs: it calls `process()` when the
        # processed file is missing, and `process()` stores the graph here.
        self.data = None
        super().__init__(root, transform, pre_transform)
        if self.data is None:
            # Processed file already existed, so `process()` was skipped.
            self.data = torch.load(self.processed_paths[0])

    @property
    def num_classes(self):
        """Number of target classes (cleantech vs. non-cleantech)."""
        return 2

    @property
    def raw_file_names(self):
        """Files expected in `raw_dir`."""
        return [
            'torch_tek_dataset.h5'
        ]

    @property
    def processed_file_names(self):
        """File written to `processed_dir` by `process()`."""
        return 'gnn_tek_data.pt'

    def download(self):
        """Nothing to download; the raw HDF5 file is produced by the notebook."""
        pass

    @staticmethod
    def _read_edge_index(h5_file, name):
        """Load an (E, 2) edge list from `h5_file` as a contiguous (2, E) long tensor."""
        return torch.tensor(h5_file[name][:], dtype=torch.long).t().contiguous()

    def process(self):
        """Build the `HeteroData` graph from the raw HDF5 file and cache it on disk."""
        data = HeteroData()

        with h5py.File(self.raw_paths[0], 'r') as f:
            # Node features and patent labels
            data['patent'].x = torch.tensor(f['g_patent/x'][:], dtype=torch.float)
            data['patent'].y = torch.tensor(f['g_patent/y'][:], dtype=torch.long)
            data['patent_inventor'].x = torch.tensor(f['g_inventor_nodes/x'][:], dtype=torch.float)
            data['patent_assignee'].x = torch.tensor(f['g_assignee_nodes/x'][:], dtype=torch.float)

            # Edge indices, one per relation
            data['patent', 'cites', 'patent'].edge_index = self._read_edge_index(f, 'patent_edge_index')
            data['patent_inventor', 'inventor_of', 'patent'].edge_index = self._read_edge_index(f, 'inventor_patent_edge_index')
            data['patent_assignee', 'assignee_of', 'patent'].edge_index = self._read_edge_index(f, 'assignee_patent_edge_index')
            data['patent', 'has_assignee', 'patent_assignee'].edge_index = self._read_edge_index(f, 'patent_assignee_edge_index')
            data['patent', 'has_inventor', 'patent_inventor'].edge_index = self._read_edge_index(f, 'patent_inventor_edge_index')

        if self.pre_transform is not None:
            data = self.pre_transform(data)

        # Contiguous 80/10/10 split over the patent nodes in their stored order.
        # NOTE(review): the split is positional, not random — confirm that the
        # stored node order is the intended basis for the split.
        num_patents = data['patent'].num_nodes
        train_end = int(self.TRAIN_FRACTION * num_patents)
        val_end = int(self.VAL_FRACTION * num_patents)
        positions = torch.arange(num_patents)
        data['patent'].train_mask = positions < train_end
        data['patent'].val_mask = (positions >= train_end) & (positions < val_end)
        data['patent'].test_mask = positions >= val_end

        self._print_summary(data)

        self.data = data
        torch.save(data, self.processed_paths[0])

    @staticmethod
    def _print_summary(data):
        """Print the stored keys, node feature/label shapes and edge index shapes."""
        # `keys` is a property in some PyG releases and a method in others.
        keys = data.keys
        print("Data keys after processing:", keys() if callable(keys) else keys)
        print("Node types and their feature shapes:")
        for node_type, node_data in data.node_items():
            print(f"Node type: {node_type}")
            for key, item in node_data.items():
                if key == 'x' or key == 'y':
                    print(f"Features ({key}) shape:", item.size())

        print("Edge types and their index shapes:")
        for edge_type, edge_data in data.edge_items():
            print(f"Edge type: {edge_type}")
            if 'edge_index' in edge_data:
                print("Edge index shape:", edge_data['edge_index'].size())
            else:
                print(f"{edge_type} has no edge index.")

    def len(self):
        """The dataset holds exactly one graph."""
        return 1

    def get(self, idx):
        """Return the single graph, whatever `idx` is."""
        return self.data

In [None]:
# Build the dataset object; this loads the cached processed graph or creates it from the raw HDF5 file.
dataset = PatentHeteroDataset(root='/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/')

In [None]:
# The dataset contains a single graph; despite its name, `PatentDataset` is that graph object, not a dataset.
PatentDataset = dataset[0]

In [None]:
# Feature (and, where present, label) shapes for every node type
for node_type in PatentDataset.node_types:
    node_store = PatentDataset[node_type]
    print(f"Node type: {node_type}")
    print(f"Features (x) shape: {node_store.x.shape}")
    if 'y' in node_store:
        print(f"Labels (y) shape: {node_store.y.shape}")

# Edge index shape for every edge type
for edge_type in PatentDataset.edge_types:
    edge_store = PatentDataset[edge_type]
    print(f"Edge type: {edge_type}")
    print(f"Edge index shape: {edge_store.edge_index.shape}")

# Build PyTorch Geometric Model

In [None]:
## Helper functions for visualization
def visualize_graph(G, color):
    """Draw graph `G` with a seeded spring layout, colouring nodes by `color`.

    Axis ticks and node labels are hidden; colours come from the "Set2" colormap.
    """
    layout = nx.spring_layout(G, seed=42)
    plt.figure(figsize=(7, 7))
    for hide_ticks in (plt.xticks, plt.yticks):
        hide_ticks([])
    nx.draw_networkx(
        G,
        pos=layout,
        with_labels=False,
        node_color=color,
        cmap="Set2",
    )
    plt.show()


def visualize_embedding(h, color, epoch=None, loss=None):
    """Scatter-plot the first two columns of the embedding tensor `h`.

    Points are coloured by `color` using the "Set2" colormap. When both `epoch`
    and `loss` are given, they are shown as the x-axis label.
    """
    points = h.detach().cpu().numpy()
    plt.figure(figsize=(7, 7))
    for hide_ticks in (plt.xticks, plt.yticks):
        hide_ticks([])
    plt.scatter(points[:, 0], points[:, 1], s=140, c=color, cmap="Set2")
    if not (epoch is None or loss is None):
        plt.xlabel(f'Epoch: {epoch}, Loss: {loss.item():.4f}', fontsize=16)
    plt.show()

In [None]:
# Convert the whole graph to a directed NetworkX graph for `visualize_graph`.
# NOTE(review): `G` is not used anywhere else in the visible notebook — confirm it is still needed.
G = to_networkx(PatentDataset, to_undirected=False)

## Define NN architecture

- https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.HeteroConv.html#torch_geometric.nn.conv.HeteroConv
- https://github.com/pyg-team/pytorch_geometric/issues/4657
- https://pytorch-geometric.readthedocs.io/en/latest/modules/loader.html#torch_geometric.loader.NeighborLoader

In [1]:
# TODO (sanity check): compare the node embeddings before and after training to verify that they actually changed

In [None]:
class HeteroGCN(torch.nn.Module):
    """Two-layer heterogeneous GraphSAGE classifier for 'patent' nodes.

    Every edge type gets its own `SAGEConv` in each layer; messages arriving at
    the same node type from different relations are averaged by `HeteroConv`.
    A final linear layer maps the 'patent' embeddings to class logits.

    This is a container model, so it derives from `torch.nn.Module` (like the
    other models in this notebook) rather than from `MessagePassing`.

    Args:
        hidden_channels: Width of both convolution layers.
        num_node_features_dict: Input feature size per node type.
        num_classes: Number of output classes.
    """

    EDGE_TYPES = (
        ('patent', 'cites', 'patent'),
        ('patent_inventor', 'inventor_of', 'patent'),
        ('patent_assignee', 'assignee_of', 'patent'),
        ('patent', 'has_assignee', 'patent_assignee'),
        ('patent', 'has_inventor', 'patent_inventor'),
    )

    def __init__(self, hidden_channels, num_node_features_dict, num_classes):
        super().__init__()
        torch.manual_seed(42)

        # `add_self_loops` is not a SAGEConv option, so it is not passed; the
        # node's own features are already included through SAGEConv's root weight.
        # (source_dim, target_dim) lets node types have different feature sizes.
        self.conv1 = HeteroConv({
            (src, rel, dst): SAGEConv(
                (num_node_features_dict[src], num_node_features_dict[dst]),
                hidden_channels,
            )
            for src, rel, dst in self.EDGE_TYPES
        }, aggr='mean')

        self.conv2 = HeteroConv({
            edge_type: SAGEConv(hidden_channels, hidden_channels)
            for edge_type in self.EDGE_TYPES
        }, aggr='mean')

        # Linear layer for classifying patents
        self.lin = torch.nn.Linear(hidden_channels, num_classes)

    def forward(self, data):
        """Return class logits for every 'patent' node in `data`."""
        x_dict, edge_index_dict = data.x_dict, data.edge_index_dict

        # First convolution layer
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: x.relu() for key, x in x_dict.items()}

        # Second convolution layer
        x_dict = self.conv2(x_dict, edge_index_dict)
        x_dict = {key: x.relu() for key, x in x_dict.items()}

        # Only the 'patent' node embeddings are used for the final prediction
        return self.lin(x_dict['patent'])

In [None]:
class HeteroGAT(torch.nn.Module):
    """Two-layer heterogeneous graph-attention classifier for 'patent' nodes.

    Every edge type gets its own `GATConv` in each layer; `HeteroConv` averages
    the results per target node type, and a linear layer maps the 'patent'
    embeddings to class logits.

    Args:
        hidden_channels: Width of both attention layers.
        num_node_features_dict: Input feature size per node type.
        num_classes: Number of output classes.
    """

    EDGE_TYPES = (
        ('patent', 'cites', 'patent'),
        ('patent_inventor', 'inventor_of', 'patent'),
        ('patent_assignee', 'assignee_of', 'patent'),
        ('patent', 'has_assignee', 'patent_assignee'),
        ('patent', 'has_inventor', 'patent_inventor'),
    )

    def __init__(self, hidden_channels, num_node_features_dict, num_classes):
        super().__init__()
        torch.manual_seed(42)

        # Each edge type is a (source, relation, target) triple and is unpacked
        # as such; the layers are built from the explicit edge-type list, not
        # from the keys of another HeteroConv's module container.
        self.conv1 = HeteroConv({
            (src, rel, dst): self._make_conv(
                num_node_features_dict[src], num_node_features_dict[dst], hidden_channels, src == dst
            )
            for src, rel, dst in self.EDGE_TYPES
        }, aggr='mean')

        self.conv2 = HeteroConv({
            (src, rel, dst): self._make_conv(hidden_channels, hidden_channels, hidden_channels, src == dst)
            for src, rel, dst in self.EDGE_TYPES
        }, aggr='mean')

        self.lin = torch.nn.Linear(hidden_channels, num_classes)

    @staticmethod
    def _make_conv(src_channels, dst_channels, out_channels, same_node_type):
        """Create the GATConv for one relation.

        Self-loops are only meaningful when source and target are the same node
        type; on a bipartite relation they would connect node i of one type to
        node i of another type, so they are disabled there.
        """
        if same_node_type:
            return GATConv(src_channels, out_channels, add_self_loops=True)
        return GATConv((src_channels, dst_channels), out_channels, add_self_loops=False)

    def forward(self, data):
        """Return class logits for every 'patent' node in `data`."""
        x_dict, edge_index_dict = data.x_dict, data.edge_index_dict

        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: x.relu() for key, x in x_dict.items()}

        x_dict = self.conv2(x_dict, edge_index_dict)
        x_dict = {key: x.relu() for key, x in x_dict.items()}

        return self.lin(x_dict['patent'])

In [None]:
class HeteroMetaPathGNN(torch.nn.Module):
    """Two-layer heterogeneous GraphSAGE model on top of a learned embedding table.

    Node features are looked up in one shared embedding table instead of being
    read from the graph; the convolutions then run over the edge types given in
    `metapath_list`.

    Args:
        hidden_channels: Embedding size and width of both convolution layers.
        num_node_features_dict: Accepted for signature parity with the other
            models; not used, because inputs come from the embedding table.
        num_classes: Number of output classes.
        metapath_list: Iterable of edge types, presumably (source, relation,
            target) triples — TODO confirm against the caller.
        num_nodes: Number of rows in the embedding table. Required; it used to
            be read from an undefined global.

    Raises:
        ValueError: If `num_nodes` is not provided.
    """

    def __init__(self, hidden_channels, num_node_features_dict, num_classes, metapath_list, num_nodes=None):
        super().__init__()
        torch.manual_seed(42)

        if num_nodes is None:
            raise ValueError("num_nodes (size of the embedding table) must be provided")

        # Materialise once so both layers see the same edge types even when
        # `metapath_list` is a one-shot iterator.
        edge_types = [tuple(edge_type) for edge_type in metapath_list]

        # Assuming MetaPath2Vec embeddings are precomputed
        self.embedding = torch.nn.Embedding(num_embeddings=num_nodes, embedding_dim=hidden_channels)

        # `add_self_loops` is not a SAGEConv option, so it is not passed.
        self.conv1 = HeteroConv({
            edge_type: SAGEConv(hidden_channels, hidden_channels)
            for edge_type in edge_types
        }, aggr='mean')

        self.conv2 = HeteroConv({
            edge_type: SAGEConv(hidden_channels, hidden_channels)
            for edge_type in edge_types
        }, aggr='mean')

        self.lin = torch.nn.Linear(hidden_channels, num_classes)

    def forward(self, data, metapath_indices):
        """Return class logits for the 'patent' nodes.

        `metapath_indices` maps each node type to the embedding-table indices of
        its nodes; edges are taken from `data.edge_index_dict`.
        """
        x_dict = {node_type: self.embedding(metapath_indices[node_type]) for node_type in metapath_indices}

        x_dict = self.conv1(x_dict, data.edge_index_dict)
        x_dict = {key: x.relu() for key, x in x_dict.items()}

        x_dict = self.conv2(x_dict, data.edge_index_dict)
        x_dict = {key: x.relu() for key, x in x_dict.items()}

        return self.lin(x_dict['patent'])

In [None]:
# Every node type carries 1024-dimensional input features.
num_node_features_dict = {
    node_type: 1024
    for node_type in ('patent', 'patent_inventor', 'patent_assignee')
}
num_classes = 2

# Model, optimiser and loss for the binary patent classification task.
model = HeteroGCN(
    hidden_channels=64,
    num_node_features_dict=num_node_features_dict,
    num_classes=num_classes,
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

In [None]:
model = model.to(device)
PatentDataset = PatentDataset.to(device)

# Mini-batch loaders seeded from the train / test patent nodes.
# NOTE(review): `num_neighbors` has one entry (one sampling hop) while the model
# has two convolution layers — confirm this is intended.
train_seed_nodes = ('patent', PatentDataset['patent'].train_mask)
test_seed_nodes = ('patent', PatentDataset['patent'].test_mask)
train_loader = NeighborLoader(
    PatentDataset,
    input_nodes=train_seed_nodes,
    num_neighbors=[1000],
    batch_size=64,
    shuffle=True,
)
test_loader = NeighborLoader(
    PatentDataset,
    input_nodes=test_seed_nodes,
    num_neighbors=[1000],
    batch_size=64,
    shuffle=False,
)

# Train model

In [None]:
def train():
    """Run one training epoch over `train_loader`.

    Uses the notebook globals `model`, `optimizer`, `criterion`, `train_loader`
    and `device`.

    Returns:
        The mean loss over all mini-batches, or 0 when the loader is empty.
    """
    model.train()
    total_loss = 0.0
    total_batches = 0

    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()

        # NeighborLoader places the seed nodes of the mini-batch first; the
        # remaining 'patent' nodes are sampled neighbours that only provide
        # context. The loss is computed on the seed nodes only, so neighbours
        # that happen to be training nodes are not scored as extra targets.
        num_seed_nodes = batch['patent'].batch_size
        out = model(batch)[:num_seed_nodes]
        target = batch['patent'].y[:num_seed_nodes]

        loss = criterion(out, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_batches += 1

    return total_loss / total_batches if total_batches > 0 else 0

In [None]:
def test():
    """Evaluate classification accuracy on the test seed nodes of `test_loader`.

    Uses the notebook globals `model`, `test_loader` and `device`.

    Returns:
        The fraction of correctly classified test nodes, or 0.0 when the loader
        yields no nodes.
    """
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in test_loader:
            batch = batch.to(device)

            # Score only the seed nodes (the first `batch_size` 'patent' nodes).
            # Sampled neighbours that are also test nodes would otherwise be
            # counted again in every batch they appear in.
            num_seed_nodes = batch['patent'].batch_size
            pred = model(batch)[:num_seed_nodes].argmax(dim=1)
            labels = batch['patent'].y[:num_seed_nodes]

            correct += int((pred == labels).sum())
            total += int(num_seed_nodes)

    # Avoid a ZeroDivisionError when there is nothing to evaluate.
    return correct / total if total > 0 else 0.0


In [None]:
num_epochs = 100

# One training pass followed by one evaluation pass per epoch.
for epoch in range(1, num_epochs + 1):
    loss, test_acc = train(), test()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Test Acc: {test_acc:.4f}')