**Construct new dataset where different labels types are also well connected**

In [19]:
import re
import h5py
import os.path as osp
import torch
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer
from torch_geometric.data import HeteroData, Dataset, Data
from torch_geometric.nn import GCNConv, HeteroConv, SAGEConv, GATConv, MessagePassing
from torch_geometric.utils import to_networkx
from torch_geometric.loader import NeighborLoader
from torch_geometric.explain import Explainer, GNNExplainer
from tqdm import tqdm
tqdm.pandas()

# stop_words = nltk.corpus.stopwords.words('english')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = 'distilbert/distilbert-base-uncased'
model = SentenceTransformer(model_name).to(device)
# model = SentenceTransformer('anferico/bert-for-patents').to(device)

No sentence-transformers model found with name /home/thiesen/.cache/torch/sentence_transformers/distilbert_distilbert-base-uncased. Creating a new one with MEAN pooling.


**Sources** (Also for tomorrow)
- https://pytorch-geometric.readthedocs.io/en/latest/tutorial/heterogeneous.html
- https://pytorch-geometric.readthedocs.io/en/latest/tutorial/create_gnn.html
- https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.HeteroConv.html#torch_geometric.nn.conv.HeteroConv
- https://pytorch-geometric.readthedocs.io/en/latest/cheatsheet/gnn_cheatsheet.html

# Data Preprocessing

## Patents

In [2]:
g_patent = pd.read_csv('/mnt/hdd01/patentsview/Raw files/Raw zip files/g_patent.tsv.zip', sep='\t', compression='zip', usecols=['patent_id', 'patent_title', 'patent_abstract'], low_memory=False)
g_cpc = pd.read_csv('/mnt/hdd01/patentsview/Raw files/Raw zip files/g_cpc_current.tsv.zip', sep='\t', compression='zip', usecols=['patent_id', 'cpc_class'], low_memory=False)
g_patent = g_patent.astype(str)
g_cpc = g_cpc.astype(str)

In [3]:
g_patent_cpc = pd.merge(g_patent, g_cpc, on='patent_id')
g_patent_cpc = g_patent_cpc.groupby('patent_id').agg({
    'cpc_class': list,
    'patent_title': 'first',
    'patent_abstract': 'first'
}).reset_index().rename(columns={'cpc_class': 'cpc_groups'})

In [4]:
g_patent_cleantech = g_patent_cpc[g_patent_cpc['cpc_groups'].apply(lambda x: 'Y02' in x)]
# g_patent_non_cleantech = g_patent_cpc.sample(n=len(g_patent_cleantech), random_state=42)

In [None]:
g_patent_citation = pd.read_csv('/mnt/hdd01/patentsview/Raw files/Raw zip files/g_us_patent_citation.tsv.zip', sep='\t', compression='zip', usecols=['patent_id', 'citation_patent_id'], low_memory=False)

In [7]:
is_patent_id_in_cleantech = g_patent_citation['patent_id'].isin(set(g_patent_cleantech['patent_id']))
is_citation_id_in_cleantech = g_patent_citation['citation_patent_id'].isin(set(g_patent_cleantech['patent_id']))

g_patent_citation_non_cleantech = g_patent_citation[is_patent_id_in_cleantech | is_citation_id_in_cleantech]

not_patent_id_in_cleantech = ~is_patent_id_in_cleantech
not_citation_id_in_cleantech = ~is_citation_id_in_cleantech
patent_id_counts = g_patent_citation_non_cleantech.loc[not_patent_id_in_cleantech, 'patent_id'].value_counts()
citation_patent_id_counts = g_patent_citation_non_cleantech.loc[not_citation_id_in_cleantech, 'citation_patent_id'].value_counts()

In [9]:
n = min(len(g_patent_cleantech) // 2 + 100000, len(patent_id_counts), len(citation_patent_id_counts))
top_patent_id_counts = patent_id_counts.head(n)
top_citation_patent_id_counts = citation_patent_id_counts.head(n)

In [10]:
g_patent_non_cleantech = pd.concat([
    g_patent_cpc.loc[g_patent_cpc['patent_id'].isin(top_patent_id_counts.index)],
    g_patent_cpc.loc[g_patent_cpc['patent_id'].isin(top_citation_patent_id_counts.index)]
]).drop_duplicates(subset='patent_id').sample(n=len(g_patent_cleantech), random_state=42)

print(g_patent_non_cleantech.head())

        patent_id                      cpc_groups  \
5610296   8123546                           [H01]   
2335129   4827887       [F02, F02, F02, F02, F02]   
7320789   9849578  [H02, B25, B23, B23, Y10, B25]   
2797642   5292511            [C12, A23, C12, C12]   
1653626   4145281                           [B01]   

                                              patent_title  \
5610296             Connector for large power transmission   
2335129  Adaptive charge mixture control system for int...   
7320789          Conductive boot for power tool protection   
2797642  Process for manufacturing a health-supplementa...   
1653626                         Water purification process   

                                           patent_abstract  
5610296  A connector includes a male terminal housing w...  
2335129  An adaptive charge mixture control for an inte...  
7320789  Apparatus for high voltage power line maintena...  
2797642   A process for manufacturing a health-suppleme...  
165

### Data Cleaning

In [13]:
def preprocess_text(text):
    # text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text) # remove urls
    text = re.sub(r'\S+@\S+', '', text) # remove emails
    text = re.sub(r'[^A-Za-z\s]', '', text) # remove non-alphabets
    text = re.sub(r'\s+', ' ', text).strip() # remove multiple spaces
    # text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

In [14]:
g_patent_cleantech.loc[:, 'patent_title'] = g_patent_cleantech['patent_title'].progress_apply(preprocess_text)
g_patent_cleantech.loc[:, 'patent_abstract'] = g_patent_cleantech['patent_abstract'].progress_apply(preprocess_text)
g_patent_non_cleantech.loc[:, 'patent_title'] = g_patent_non_cleantech['patent_title'].progress_apply(preprocess_text)
g_patent_non_cleantech.loc[:, 'patent_abstract'] = g_patent_non_cleantech['patent_abstract'].progress_apply(preprocess_text)

100%|██████████| 515783/515783 [00:04<00:00, 124440.51it/s]
100%|██████████| 515783/515783 [00:26<00:00, 19661.59it/s]
100%|██████████| 515783/515783 [00:03<00:00, 130574.08it/s]
100%|██████████| 515783/515783 [00:26<00:00, 19165.96it/s]


In [15]:
g_patent_cleantech.loc[:, 'patent_title_abstract'] = g_patent_cleantech['patent_title'] + ' [SEP] ' + g_patent_cleantech['patent_abstract']
g_patent_non_cleantech.loc[:, 'patent_title_abstract'] = g_patent_non_cleantech['patent_title'] + ' [SEP] ' + g_patent_non_cleantech['patent_abstract']

g_patent_cleantech.loc[:, 'label'] = 1
g_patent_non_cleantech.loc[:, 'label'] = 0

g_patent = pd.concat([g_patent_cleantech, g_patent_non_cleantech], ignore_index=True)

g_patent = g_patent.sort_values(by=['patent_id']).reset_index(drop=True)

g_patent['index'] = g_patent.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  g_patent_cleantech.loc[:, 'patent_title_abstract'] = g_patent_cleantech['patent_title'] + ' [SEP] ' + g_patent_cleantech['patent_abstract']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  g_patent_cleantech.loc[:, 'label'] = 1


In [20]:
model_name_pure = model_name.split('/')[-1]
embeddings = model.encode(g_patent['patent_title_abstract'].tolist(), show_progress_bar=True, convert_to_tensor=True, device=device)
g_patent[f"patent_title_abstract_{model_name_pure}_embedding"] = list(embeddings.cpu().numpy())
# g_patent['patent_title_abstract_bert_for_patents_embedding'] = model.encode(g_patent['patent_title_abstract'].tolist(), show_progress_bar=True, convert_to_tensor=True, device=device)

Batches:   0%|          | 0/32237 [00:00<?, ?it/s]

In [21]:
g_patent.head()

Unnamed: 0,patent_id,cpc_groups,patent_title,patent_abstract,patent_title_abstract,label,index,patent_title_abstract_distilbert-base-uncased_embedding
0,10000011,"[B22, Y02, B22, B29, B29, B29, B22, B29, B29, ...",Supports for sintering additively manufactured...,To reduce distortion in an additively manufact...,Supports for sintering additively manufactured...,1,0,"[-0.30896538, -0.010423301, 0.29560864, 0.1019..."
1,10000017,"[Y02, B29, B29, B29, F05, B29, B29, F16, B29, ...",Method for mounting a vortex generator and mou...,The invention relates to a method for securing...,Method for mounting a vortex generator and mou...,1,1,"[-0.22623001, 0.13069649, 0.36648774, 0.050247..."
2,10000021,"[Y02, B22, B29, B22, B23, B29, B33, B22, B22, ...",Method for manufacturing threedimensional shap...,There is provided a method for manufacturing a...,Method for manufacturing threedimensional shap...,1,2,"[-0.37131295, 0.11316093, 0.28202266, -0.02531..."
3,10000023,"[B29, B29, B29, B29, B33, B29, B33, B33, G05]",Apparatus and method for forming threedimensio...,An apparatus and method for making a threedime...,Apparatus and method for forming threedimensio...,0,3,"[-0.36689574, 0.17876409, 0.2525317, -0.055100..."
4,10000025,"[Y02, B32, B32, B29, B29, Y10, B32, B29, B32, ...",Optimized crossply orientation in composite la...,A composite laminate has a primary axis of loa...,Optimized crossply orientation in composite la...,1,4,"[-0.05470236, 0.021735363, 0.25705713, 0.20773..."


In [None]:
# If I want to load precomputed embeddings
g_patent.to_csv(f"/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/g_patent_embedding_{model_name_pure}.csv", index=False)
# g_patent = pd.read_csv('/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/g_patent_embedding.csv')

## Assignees, Inventors and Authors

In [22]:
g_assignee = pd.read_csv('/mnt/hdd01/patentsview/Raw files/Raw zip files/g_assignee_disambiguated.tsv.zip', sep='\t', compression='zip', usecols=['patent_id', 'assignee_id', 'disambig_assignee_individual_name_first', 'disambig_assignee_individual_name_last', 'disambig_assignee_organization'], low_memory=False)
g_inventor = pd.read_csv('/mnt/hdd01/patentsview/Raw files/Raw zip files/g_inventor_disambiguated.tsv.zip', sep='\t', compression='zip', usecols=['patent_id', 'inventor_id', 'disambig_inventor_name_first', 'disambig_inventor_name_last'], low_memory=False)

In [23]:
g_assignee = g_assignee[g_assignee['patent_id'].isin(g_patent_cleantech['patent_id'].tolist() + g_patent_non_cleantech['patent_id'].tolist())].reset_index(drop=True)
g_assignee = g_assignee.sort_values(by=['assignee_id']).reset_index(drop=True)
g_inventor = g_inventor[g_inventor['patent_id'].isin(g_patent_cleantech['patent_id'].tolist() + g_patent_non_cleantech['patent_id'].tolist())].reset_index(drop=True)
g_inventor = g_inventor.sort_values(by=['inventor_id']).reset_index(drop=True)

In [24]:
g_assignee_nodes = g_assignee[['assignee_id', 'disambig_assignee_individual_name_first', 'disambig_assignee_individual_name_last', 'disambig_assignee_organization']].drop_duplicates().reset_index(drop=True)
g_inventor_nodes = g_inventor[['inventor_id', 'disambig_inventor_name_first', 'disambig_inventor_name_last']].drop_duplicates().reset_index(drop=True)
g_assignee_nodes['index'] = g_assignee_nodes.index
g_inventor_nodes['index'] = g_inventor_nodes.index

In [25]:
# g_assignee_nodes['assignee_embedding'] = np.random.rand(len(g_assignee_nodes), 1024).tolist()
# g_inventor_nodes['inventor_embedding'] = np.random.rand(len(g_inventor_nodes), 1024).tolist()
g_assignee_nodes['assignee_embedding'] = np.random.rand(len(g_assignee_nodes), 768).tolist()
g_inventor_nodes['inventor_embedding'] = np.random.rand(len(g_inventor_nodes), 768).tolist()

## Patent Citations

In [24]:
# g_patent_citation = pd.read_csv('/mnt/hdd01/patentsview/Raw files/Raw zip files/g_us_patent_citation.tsv.zip', sep='\t', compression='zip', usecols=['patent_id', 'citation_patent_id'], low_memory=False)

In [26]:
g_patent_citation = g_patent_citation[g_patent_citation['citation_patent_id'].isin(g_patent_cleantech['patent_id'].tolist() + g_patent_non_cleantech['patent_id'].tolist())].reset_index(drop=True)
g_patent_citation = g_patent_citation[g_patent_citation['patent_id'].isin(g_patent_cleantech['patent_id'].tolist() + g_patent_non_cleantech['patent_id'].tolist())].reset_index(drop=True)

In [27]:
g_patent_citation = pd.merge(g_patent_citation, g_patent[['patent_id', 'index']].rename(columns={'index': 'patent_id_index'}), on='patent_id')
g_patent_citation = pd.merge(g_patent_citation, g_patent[['patent_id', 'index']].rename(columns={'index': 'citation_patent_id_index'}), left_on='citation_patent_id', right_on='patent_id').drop(columns=['patent_id_y']).rename(columns={'patent_id_x': 'patent_id'})

In [28]:
patent_edge_index = []
for i in tqdm(range(len(g_patent_citation))):
    patent_edge_index.append([g_patent_citation['patent_id_index'][i], g_patent_citation['citation_patent_id_index'][i]])
    patent_edge_index.append([g_patent_citation['citation_patent_id_index'][i], g_patent_citation['patent_id_index'][i]])

100%|██████████| 18401447/18401447 [05:38<00:00, 54356.88it/s]


## Inventor and Assignee - Patent Relationships

In [29]:
g_assignee_patent = pd.merge(g_assignee, g_patent[['patent_id', 'index']].rename(columns={'index': 'patent_id_index'}), on='patent_id')
g_assignee_patent = pd.merge(g_assignee_patent, g_assignee_nodes[['assignee_id', 'index']].rename(columns={'index': 'assignee_id_index'}), on='assignee_id')
g_inventor_patent = pd.merge(g_inventor, g_patent[['patent_id', 'index']].rename(columns={'index': 'patent_id_index'}), on='patent_id')
g_inventor_patent = pd.merge(g_inventor_patent, g_inventor_nodes[['inventor_id', 'index']].rename(columns={'index': 'inventor_id_index'}), on='inventor_id')

In [30]:
assignee_patent_edge_index = []
for i in tqdm(range(len(g_assignee_patent))):
    assignee_patent_edge_index.append([g_assignee_patent['assignee_id_index'][i], g_assignee_patent['patent_id_index'][i]])

patent_assignee_edge_index = []
for i in tqdm(range(len(g_assignee_patent))):
    patent_assignee_edge_index.append([g_assignee_patent['patent_id_index'][i], g_assignee_patent['assignee_id_index'][i]])

inventor_patent_edge_index = []
for i in tqdm(range(len(g_inventor_patent))):
    inventor_patent_edge_index.append([g_inventor_patent['inventor_id_index'][i], g_inventor_patent['patent_id_index'][i]])

patent_inventor_edge_index = []
for i in tqdm(range(len(g_inventor_patent))):
    patent_inventor_edge_index.append([g_inventor_patent['patent_id_index'][i], g_inventor_patent['inventor_id_index'][i]])

100%|██████████| 993034/993034 [00:07<00:00, 127129.48it/s]
100%|██████████| 993034/993034 [00:07<00:00, 127618.61it/s]
100%|██████████| 2850236/2850236 [00:31<00:00, 89096.46it/s] 
100%|██████████| 2850236/2850236 [00:22<00:00, 126760.90it/s]


In [None]:
# g_inventor_nodes.to_csv('/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/g_inventor_nodes.csv', index=False)
# g_assignee_nodes.to_csv('/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/g_assignee_nodes.csv', index=False)

In [31]:
patent_edge_index = pd.DataFrame(patent_edge_index, columns=['source', 'target'])
assignee_patent_edge_index = pd.DataFrame(assignee_patent_edge_index, columns=['source', 'target'])
patent_assignee_edge_index = pd.DataFrame(patent_assignee_edge_index, columns=['source', 'target'])
inventor_patent_edge_index = pd.DataFrame(inventor_patent_edge_index, columns=['source', 'target'])
patent_inventor_edge_index = pd.DataFrame(patent_inventor_edge_index, columns=['source', 'target'])

In [None]:
# patent_edge_index.to_csv('/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/patent_edge_index.csv', index=False)
# assignee_patent_edge_index.to_csv('/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/assignee_patent_edge_index.csv', index=False)
# patent_assignee_edge_index.to_csv('/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/patent_assignee_edge_index.csv', index=False)
# inventor_patent_edge_index.to_csv('/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/inventor_patent_edge_index.csv', index=False)
# patent_inventor_edge_index.to_csv('/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/patent_inventor_edge_index.csv', index=False)

In [None]:
# If I want to load precomputed patent embeddings
# g_patent = pd.read_csv('/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/g_patent_embedding.csv')

## Save Data

In [32]:
g_assignee_nodes = g_assignee_nodes.astype(str)
g_inventor_nodes = g_inventor_nodes.astype(str)

In [None]:
# # Print datatypes of g_patent all columns
# for col in g_patent.columns:
#     print(col, g_patent[col].dtype)

# # Print datatypes of g_assignee_nodes all columns
# for col in g_assignee_nodes.columns:
#     print(col, g_assignee_nodes[col].dtype)

In [33]:
def string_to_array(str_repr):
    return np.fromstring(str_repr.strip('[]'), sep=',')

# Open an HDF5 file
with h5py.File('/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/torch_tek_dataset_distilbert.h5', 'w') as f:
    # Save node data
    # f.create_dataset('g_patent/x', data=np.stack(g_patent['patent_title_abstract_bert_for_patents_embedding'].apply(string_to_array).values))
    f.create_dataset('g_patent/x', data=np.stack(g_patent[f"patent_title_abstract_{model_name_pure}_embedding"].values))
    f.create_dataset('g_patent/y', data=g_patent['label'].values.astype(np.int64))
    f.create_dataset('g_assignee_nodes/x', data=np.stack(g_assignee_nodes['assignee_embedding'].apply(string_to_array).values))
    f.create_dataset('g_inventor_nodes/x', data=np.stack(g_inventor_nodes['inventor_embedding'].apply(string_to_array).values))
    
    # Save edge indices
    f.create_dataset('patent_edge_index', data=patent_edge_index.values, dtype=np.int64)
    f.create_dataset('assignee_patent_edge_index', data=assignee_patent_edge_index.values, dtype=np.int64)
    f.create_dataset('patent_assignee_edge_index', data=patent_assignee_edge_index.values, dtype=np.int64)
    f.create_dataset('inventor_patent_edge_index', data=inventor_patent_edge_index.values, dtype=np.int64)
    f.create_dataset('patent_inventor_edge_index', data=patent_inventor_edge_index.values, dtype=np.int64)

# Instantiate (heterogeneous) data model
- https://pytorch-geometric.readthedocs.io/en/latest/tutorial/create_dataset.html
- https://pytorch-geometric.readthedocs.io/en/latest/_modules/torch_geometric/datasets/ogb_mag.html

## Heterogeneous Dataset with Patents, Assignees and Inventors

In [35]:
class PatentHeteroDataset(Dataset):
    def __init__(self, root, transform=None, pre_transform=None):
        super(PatentHeteroDataset, self).__init__(root, transform, pre_transform)
        self.data = None
        # processed_path = osp.join(self.processed_dir, self.processed_file_names)
        # if osp.exists(processed_path):
        #     self.data = torch.load(processed_path)
        # else:
        self.process()

    @property
    def num_classes(self):
        return 2

    @property
    def raw_dir(self):
        return '/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/'
    
    @property
    def processed_dir(self):
        return '/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/processed/'

    # @property
    # def raw_file_names(self):
    #     return [
    #         'g_patent_embedding.csv',
    #         'g_inventor_nodes.csv',
    #         'g_assignee_nodes.csv',
    #         'patent_edge_index.csv',
    #         'assignee_edge_index.csv',
    #         'inventor_edge_index.csv'
    #     ]

    @property
    def raw_file_names(self):
        return [
            'torch_tek_dataset_distilbert.h5' # Adjust to correct model
        ]

    @property
    def processed_file_names(self):
        return 'gnn_tek_data_distilbert.pt' # Adjust to correct model

    def download(self):
        pass

    # def process(self): # Process for reading from csv using pandas
    #     # Initialize HeteroData object
    #     data = HeteroData()

    #     # Load and process node features
    #     # Assuming that the feature columns contain lists or arrays of features
    #     g_patent = pd.read_csv(osp.join(self.raw_dir, 'g_patent_embedding.csv'), usecols=['index', 'patent_title_abstract_bert_for_patents_embedding', 'label'])
    #     g_inventor_nodes = pd.read_csv(osp.join(self.raw_dir, 'g_inventor_nodes.csv'))
    #     g_assignee_nodes = pd.read_csv(osp.join(self.raw_dir, 'g_assignee_nodes.csv'))

    #     data['patent'].x = torch.tensor(np.stack(g_patent['patent_title_abstract_bert_for_patents_embedding'].apply(eval).values), dtype=torch.float)
    #     data['patent_inventor'].x = torch.tensor(np.stack(g_inventor_nodes['inventor_embedding'].apply(eval).values), dtype=torch.float)
    #     data['patent_assignee'].x = torch.tensor(np.stack(g_assignee_nodes['assignee_embedding'].apply(eval).values), dtype=torch.float)

    #     # Load and process edge indices
    #     patent_edge_index = pd.read_csv(osp.join(self.raw_dir, 'patent_edge_index.csv')).values
    #     inventor_edge_index = pd.read_csv(osp.join(self.raw_dir, 'inventor_edge_index.csv')).values
    #     assignee_edge_index = pd.read_csv(osp.join(self.raw_dir, 'assignee_edge_index.csv')).values

    #     data['patent', 'cites', 'patent'].edge_index = torch.tensor(patent_edge_index, dtype=torch.long).t().contiguous()
    #     data['patent_inventor', 'inventor_of', 'patent'].edge_index = torch.tensor(inventor_edge_index, dtype=torch.long).t().contiguous()
    #     data['patent_assignee', 'assignee_of', 'patent'].edge_index = torch.tensor(assignee_edge_index, dtype=torch.long).t().contiguous()

    #     # Load and process labels
    #     data['patent'].y = torch.tensor(g_patent['label'].values, dtype=torch.long)

    #     if self.pre_transform is not None:
    #         data = self.pre_transform(data)

    #     self.data = data  # Save the processed data to self.data
    #     torch.save(data, osp.join(self.processed_dir, self.processed_file_names))

    def process(self):
        # Initialize HeteroData object
        data = HeteroData()
    
        # Open an HDF5 file
        with h5py.File(osp.join(self.raw_dir, 'torch_tek_dataset_distilbert.h5'), 'r') as f:
            # Load and process node features
            data['patent'].x = torch.tensor(f['g_patent/x'][:], dtype=torch.float)
            data['patent'].y = torch.tensor(f['g_patent/y'][:], dtype=torch.long)
            data['patent_inventor'].x = torch.tensor(f['g_inventor_nodes/x'][:], dtype=torch.float)
            data['patent_assignee'].x = torch.tensor(f['g_assignee_nodes/x'][:], dtype=torch.float)
            
            # Load and process edge indices
            data['patent', 'cites', 'patent'].edge_index = torch.tensor(f['patent_edge_index'][:], dtype=torch.long).t().contiguous()
            data['patent_inventor', 'inventor_of', 'patent'].edge_index = torch.tensor(f['inventor_patent_edge_index'][:], dtype=torch.long).t().contiguous()
            data['patent_assignee', 'assignee_of', 'patent'].edge_index = torch.tensor(f['assignee_patent_edge_index'][:], dtype=torch.long).t().contiguous()
            data['patent', 'has_assignee', 'patent_assignee'].edge_index = torch.tensor(f['patent_assignee_edge_index'][:], dtype=torch.long).t().contiguous()
            data['patent', 'has_inventor', 'patent_inventor'].edge_index = torch.tensor(f['patent_inventor_edge_index'][:], dtype=torch.long).t().contiguous()

        if self.pre_transform is not None:
            data = self.pre_transform(data)

        # Create train_mask, val_mask, and test_mask
        data['patent'].train_mask = torch.zeros(data['patent'].num_nodes, dtype=torch.bool)
        data['patent'].val_mask = torch.zeros(data['patent'].num_nodes, dtype=torch.bool)
        data['patent'].test_mask = torch.zeros(data['patent'].num_nodes, dtype=torch.bool)
        data['patent'].train_mask[:int(0.8*data['patent'].num_nodes)] = 1
        data['patent'].val_mask[int(0.8*data['patent'].num_nodes):int(0.9*data['patent'].num_nodes)] = 1
        data['patent'].test_mask[int(0.9*data['patent'].num_nodes):] = 1

        # Diagnostic print statements
        print("Data keys after processing:", data.keys())
        print("Node types and their feature shapes:")
        for node_type, node_data in data.node_items():
            print(f"Node type: {node_type}")
            for key, item in node_data.items():
                if key == 'x' or key == 'y':
                    print(f"Features ({key}) shape:", item.size())

        print("Edge types and their index shapes:")
        for edge_type, edge_data in data.edge_items():
            print(f"Edge type: {edge_type}")
            if 'edge_index' in edge_data:
                print("Edge index shape:", edge_data['edge_index'].size())
            else:
                print(f"{edge_type} has no edge index.")
        # print("Train, validation, and test masks:")
        # print("Train mask:", data['patent'].train_mask)
        # print("Validation mask:", data['patent'].val_mask)
        # print("Test mask:", data['patent'].test_mask)
        

        self.data = data  # Save the processed data to self.data
        torch.save(data, osp.join(self.processed_dir, self.processed_file_names))

    def len(self):
        return 1

    def get(self, idx):
        return self.data

In [36]:
dataset = PatentHeteroDataset(root='/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/')

Data keys after processing: ['val_mask', 'train_mask', 'y', 'test_mask', 'x', 'edge_index']
Node types and their feature shapes:
Node type: patent
Features (x) shape: torch.Size([1031566, 768])
Features (y) shape: torch.Size([1031566])
Node type: patent_inventor
Features (x) shape: torch.Size([957088, 768])
Node type: patent_assignee
Features (x) shape: torch.Size([108717, 768])
Edge types and their index shapes:
Edge type: ('patent', 'cites', 'patent')
Edge index shape: torch.Size([2, 36802894])
Edge type: ('patent_inventor', 'inventor_of', 'patent')
Edge index shape: torch.Size([2, 2850236])
Edge type: ('patent_assignee', 'assignee_of', 'patent')
Edge index shape: torch.Size([2, 993034])
Edge type: ('patent', 'has_assignee', 'patent_assignee')
Edge index shape: torch.Size([2, 993034])
Edge type: ('patent', 'has_inventor', 'patent_inventor')
Edge index shape: torch.Size([2, 2850236])


In [38]:
PatentDataset = dataset[0]

In [37]:
for node_type in PatentDataset.node_types:
    print(f"Node type: {node_type}")
    print(f"Features (x) shape: {PatentDataset[node_type].x.shape}")
    if 'y' in PatentDataset[node_type]:
        print(f"Labels (y) shape: {PatentDataset[node_type].y.shape}")

for edge_type in PatentDataset.edge_types:
    print(f"Edge type: {edge_type}")
    print(f"Edge index shape: {PatentDataset[edge_type].edge_index.shape}")

Node type: patent
Features (x) shape: torch.Size([1031566, 768])
Labels (y) shape: torch.Size([1031566])
Node type: patent_inventor
Features (x) shape: torch.Size([1121503, 768])
Node type: patent_assignee
Features (x) shape: torch.Size([126076, 768])
Edge type: ('patent', 'cites', 'patent')
Edge index shape: torch.Size([2, 7331458])
Edge type: ('patent_inventor', 'inventor_of', 'patent')
Edge index shape: torch.Size([2, 2756942])
Edge type: ('patent_assignee', 'assignee_of', 'patent')
Edge index shape: torch.Size([2, 986154])
Edge type: ('patent', 'has_assignee', 'patent_assignee')
Edge index shape: torch.Size([2, 986154])
Edge type: ('patent', 'has_inventor', 'patent_inventor')
Edge index shape: torch.Size([2, 2756942])


## Homogeneous Dataset with Patent and Patent Citations

In [2]:
class TestPatentHomoDataset(Dataset):
    def __init__(self, root, transform=None, pre_transform=None):
        super(TestPatentHomoDataset, self).__init__(root, transform, pre_transform)
        self.data = None
        self.process()

    @property
    def num_classes(self):
        return 2

    @property
    def raw_dir(self):
        return '/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/raw/'

    @property
    def processed_dir(self):
        return '/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/processed/'

    @property
    def raw_file_names(self):
        return ['torch_tek_dataset_distilbert.h5']

    @property
    def processed_file_names(self):
        return 'gnn_tek_data_distilbert.pt'

    def process(self):
        data = Data()

        with h5py.File(osp.join(self.raw_dir, 'torch_tek_dataset_distilbert.h5'), 'r') as f:
            # Load and process node features and labels for patents
            data.x = torch.tensor(f['g_patent/x'][:], dtype=torch.float)
            data.y = torch.tensor(f['g_patent/y'][:], dtype=torch.long)
            
            # Load and process edge indices for 'patent' 'cites' 'patent'
            data.edge_index = torch.tensor(f['patent_edge_index'][:], dtype=torch.long).t().contiguous()

            num_nodes = data.x.size(0)

            # Create train_mask, val_mask, and test_mask
            data.train_mask = torch.zeros(num_nodes, dtype=torch.bool)
            data.val_mask = torch.zeros(num_nodes, dtype=torch.bool)
            data.test_mask = torch.zeros(num_nodes, dtype=torch.bool)
            
            # Define splits
            train_end = int(0.8 * num_nodes)
            val_end = int(0.9 * num_nodes)

            data.train_mask[:train_end] = 1
            data.val_mask[train_end:val_end] = 1
            data.test_mask[val_end:] = 1

        self.data = data
        torch.save(data, osp.join(self.processed_dir, self.processed_file_names))

    def len(self):
        return 1

    def get(self, idx):
        return self.data

In [3]:
test_dataset = TestPatentHomoDataset(root='/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK/')

In [4]:
test_PatentDataset = test_dataset[0]

# Build PyTorch Geoemtric Model

In [39]:
## Helper functions for visualization
def visualize_graph(G, color):
    plt.figure(figsize=(7,7))
    plt.xticks([])
    plt.yticks([])
    nx.draw_networkx(G, pos=nx.spring_layout(G, seed=42), with_labels=False,
                     node_color=color, cmap="Set2")
    plt.show()


def visualize_embedding(h, color, epoch=None, loss=None):
    plt.figure(figsize=(7,7))
    plt.xticks([])
    plt.yticks([])
    h = h.detach().cpu().numpy()
    plt.scatter(h[:, 0], h[:, 1], s=140, c=color, cmap="Set2")
    if epoch is not None and loss is not None:
        plt.xlabel(f'Epoch: {epoch}, Loss: {loss.item():.4f}', fontsize=16)
    plt.show()

## Define NN architecture

- https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.HeteroConv.html#torch_geometric.nn.conv.HeteroConv
- https://github.com/pyg-team/pytorch_geometric/issues/4657
- https://pytorch-geometric.readthedocs.io/en/latest/modules/loader.html#torch_geometric.loader.NeighborLoader
- https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.nn.conv.MessagePassing.html#torch_geometric.nn.conv.MessagePassing - *if passed to GNN, Message Passing will be performed on the graph*

### NN Architecture for Heteregeneous Data

In [40]:
class SimplifiedHeteroGCN(torch.nn.Module):
    def __init__(self, hidden_channels, num_node_features_dict, num_classes):
        super(SimplifiedHeteroGCN, self).__init__()
        torch.manual_seed(42)  # For reproducible results
        
        self.dropout = torch.nn.Dropout(p=0.2)  # Define dropout layer

        # Define a SAGEConv for essential relations
        self.conv1 = HeteroConv({
            ('patent', 'cites', 'patent'): SAGEConv(num_node_features_dict['patent'], hidden_channels, add_self_loops=True)
        }, aggr='mean')

        self.conv2 = HeteroConv({
            ('patent', 'cites', 'patent'): SAGEConv(hidden_channels, hidden_channels, add_self_loops=True)
        }, aggr='mean')

        # Linear layer for classifying patents
        self.lin = torch.nn.Linear(hidden_channels, num_classes)

    def forward(self, data):
        x_dict, edge_index_dict = data.x_dict, data.edge_index_dict
        
        # Apply dropout to 'patent' node features
        x_dict['patent'] = self.dropout(x_dict['patent'])

        # First convolution layer
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}

        # Second convolution layer
        x_dict = self.conv2(x_dict, edge_index_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}

        # Predictions for 'patent' node embeddings
        out = self.lin(x_dict['patent'])
        return out


In [48]:
class HeteroGCN(MessagePassing):
    def __init__(self, hidden_channels, num_node_features_dict, num_classes):
        super(HeteroGCN, self).__init__(aggr='mean')
        torch.manual_seed(42) # For reproducible results
        
        # Define a separate SAGEConv for each edge type with correct input feature sizes
        self.conv1 = HeteroConv({
            ('patent', 'cites', 'patent'): SAGEConv(num_node_features_dict['patent'], hidden_channels, add_self_loops=True),
            ('patent_inventor', 'inventor_of', 'patent'): SAGEConv(num_node_features_dict['patent_inventor'], hidden_channels, add_self_loops=True),
            ('patent_assignee', 'assignee_of', 'patent'): SAGEConv(num_node_features_dict['patent_assignee'], hidden_channels, add_self_loops=True),
            ('patent', 'has_assignee', 'patent_assignee'): SAGEConv(num_node_features_dict['patent'], hidden_channels, add_self_loops=True),
            ('patent', 'has_inventor', 'patent_inventor'): SAGEConv(num_node_features_dict['patent'], hidden_channels, add_self_loops=True)
        }, aggr='mean')

        self.conv2 = HeteroConv({
            ('patent', 'cites', 'patent'): SAGEConv(hidden_channels, hidden_channels, add_self_loops=True),
            ('patent_inventor', 'inventor_of', 'patent'): SAGEConv(hidden_channels, hidden_channels, add_self_loops=True),
            ('patent_assignee', 'assignee_of', 'patent'): SAGEConv(hidden_channels, hidden_channels, add_self_loops=True),
            ('patent', 'has_assignee', 'patent_assignee'): SAGEConv(hidden_channels, hidden_channels, add_self_loops=True),
            ('patent', 'has_inventor', 'patent_inventor'): SAGEConv(hidden_channels, hidden_channels, add_self_loops=True)
        }, aggr='mean')

        # Linear layer for classifying patents
        self.lin = torch.nn.Linear(hidden_channels, num_classes)

    def forward(self, data):
        x_dict, edge_index_dict = data.x_dict, data.edge_index_dict

        # Include dropout for regularization
        x_dict['patent'] = F.dropout(x_dict['patent'], p=0.2, training=self.training)

        # First convolution layer
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: x.relu() for key, x in x_dict.items()}

        # Second convolution layer
        x_dict = self.conv2(x_dict, edge_index_dict)
        x_dict = {key: x.relu() for key, x in x_dict.items()}

        # Only use the 'patent' node embeddings for the final prediction
        out = self.lin(x_dict['patent'])
        return out

In [54]:
# num_node_features_dict = {'patent': 1024, 'patent_inventor': 1024, 'patent_assignee': 1024}
num_node_features_dict = {'patent': 768, 'patent_inventor': 768, 'patent_assignee': 768}
num_classes = 2

# model = HeteroGCN(hidden_channels=64, num_node_features_dict=num_node_features_dict, num_classes=num_classes)
model = HeteroGCN(hidden_channels=512, num_node_features_dict=num_node_features_dict, num_classes=num_classes)
# model = SimplifiedHeteroGCN(hidden_channels=512, num_node_features_dict=num_node_features_dict, num_classes=num_classes)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

### NN Architecture for Homogeneous Data

In [5]:
class TestGCN(torch.nn.Module):
    def __init__(self, hidden_channels, num_classes):
        super().__init__()
        # torch.manual_seed(42)  # For reproducible results
        self.conv1 = GCNConv(768, num_classes)
        # self.lin2 = torch.nn.Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        # x = F.dropout(x, p=0.5, training=self.training)
        # x = self.lin2(x)
        return x

model = TestGCN(hidden_channels=16, num_classes=2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()
# print(model)

## Neighbor Loader
- https://pytorch-geometric.readthedocs.io/en/latest/modules/loader.html *If data does not fully fit into GPU memory, we can use the NeighborLoader to perform mini-batch training.*

### Neighbor Loader for Heterogeneous Data

In [55]:
model = model.to(device)
PatentDataset = PatentDataset.to(device)
train_loader = NeighborLoader(PatentDataset, num_neighbors=[100], batch_size=512, shuffle=True, input_nodes=('patent', PatentDataset['patent'].train_mask))
test_loader = NeighborLoader(PatentDataset, num_neighbors=[100], batch_size=512, shuffle=False, input_nodes=('patent', PatentDataset['patent'].test_mask))

### Neighbor Loader for Homogeneous Data

In [6]:
model = model.to(device)
test_PatentDataset = test_PatentDataset.to(device)
train_loader = NeighborLoader(test_PatentDataset, num_neighbors=[100], batch_size=512, shuffle=True)
test_loader = NeighborLoader(test_PatentDataset, num_neighbors=[100], batch_size=512, shuffle=False)

# Train model

In [12]:
# Print embeddings of first 5 nodes of node type 'patent' before training
# print(PatentDataset['patent'].x[:5])
# print(test_PatentDataset.x[:5])
# print(test_PatentDataset.y[:25])

tensor([0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
        1], device='cuda:0')


In [56]:
def train():
    model.train()
    total_loss = 0
    total_batches = 0
    
    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()
        
        try:
            out = model(batch) # For Heterogeneous NN
            # out = model(batch.x, batch.edge_index) # For Homogeneous NN
            loss = criterion(out[batch['patent'].train_mask], batch['patent'].y[batch['patent'].train_mask]) # For Hetereogeneous NN 
            # loss = criterion(out[batch.train_mask], batch.y[batch.train_mask]) # For Homogeneous NN
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            total_batches += 1
        except Exception as e:
            print("Error during training:", e)
            raise
            
    return total_loss / total_batches if total_batches else 0

In [57]:
def test():
    model.eval()
    correct = 0
    total = 0

    for batch in test_loader:
        batch = batch.to(device)
        with torch.no_grad():
            out = model(batch)
            # out = model(batch.x, batch.edge_index)
            pred = out.argmax(dim=1)
            
            # Assuming batch['patent'].test_mask is a boolean mask
            test_mask = batch['patent'].test_mask # For Hetereogeneous NN 
            # test_mask = batch.test_mask # For Homogeneous NN
            test_labels = batch['patent'].y # For Hetereogeneous NN 
            # test_labels = batch.y # For Homogeneous NN

            # Update correct and total counts
            correct += int((pred[test_mask] == test_labels[test_mask]).sum())
            total += int(test_mask.sum())

    test_acc = correct / total
    return test_acc


In [58]:
num_epochs = 10

for epoch in range(1, num_epochs + 1):
    loss = train()
    test_acc = test()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Test Acc: {test_acc:.4f}')

Epoch: 001, Loss: 0.5377, Test Acc: 0.6856
Epoch: 002, Loss: 0.5141, Test Acc: 0.7243
Epoch: 003, Loss: 0.5119, Test Acc: 0.7082
Epoch: 004, Loss: 0.5104, Test Acc: 0.7282
Epoch: 005, Loss: 0.5094, Test Acc: 0.7202
Epoch: 006, Loss: 0.5088, Test Acc: 0.6956
Epoch: 007, Loss: 0.5085, Test Acc: 0.7323
Epoch: 008, Loss: 0.5080, Test Acc: 0.7236
Epoch: 009, Loss: 0.5080, Test Acc: 0.7270
Epoch: 010, Loss: 0.5076, Test Acc: 0.7319
