https://medium.com/@pytorch_geometric/link-prediction-on-heterogeneous-graphs-with-pyg-6d5c29677c70
https://towardsdatascience.com/graph-neural-networks-with-pyg-on-node-classification-link-prediction-and-anomaly-detection-14aa38fe1275

In [None]:
import pandas as pd
import re
import numpy as np
import torch
import h5py
# import ast
from sqlalchemy import create_engine, URL, text
from tqdm import tqdm
from tqdm.auto import tqdm
tqdm.pandas()
from rapidfuzz import fuzz, process, distance
from rapidfuzz.distance import Levenshtein
# from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing as mp
from torch_geometric.data import HeteroData, Dataset, Data
from torch_geometric.nn import SAGEConv, GATConv, HeteroConv, MessagePassing
from torch_geometric.loader import NeighborLoader
from sentence_transformers import SentenceTransformer
import os.path as osp
import gcld3

In [None]:
# Sentence-embedding model used further down to encode titles and abstracts.
# Note that the name `model` is rebound to the GNN in the model-instantiation cell.
model = SentenceTransformer('distilbert/distilbert-base-uncased')
# Language identifier; it is not referenced anywhere else in the visible notebook.
detector = gcld3.NNetLanguageIdentifier(min_num_bytes=0, max_num_bytes=1000)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Data Preprocessing

## OpenAlex Works from Reliance on Science

In [None]:
# Raw Reliance on Science inputs: `_pcs_oa.csv` (comma-separated) and
# `_patent_paper_pairs.tsv` (tab-separated).
df_rel_pcs = pd.read_csv("/mnt/hdd01/Reliance on Science/Raw Files/_pcs_oa.csv")
df_rel_ppp = pd.read_csv("/mnt/hdd01/Reliance on Science/Raw Files/_patent_paper_pairs.tsv", sep="\t")

In [None]:
# Normalise the patent-paper-pair table: every column as string, patent keys lower-cased.
df_rel_ppp = df_rel_ppp.astype(str)
df_rel_ppp['patent'] = df_rel_ppp['patent'].str.lower()
# ONLY US patents in the original dataset. Match the "us-" prefix instead of
# any occurrence of "us", so every kept key is guaranteed to contain the "-"
# the id extraction below relies on. `.copy()` detaches the filtered rows so
# the new column is written to an independent frame.
df_rel_ppp = df_rel_ppp[df_rel_ppp['patent'].str.startswith("us-")].copy()
# The patent id is what lies between the first and the last "-" of the key.
df_rel_ppp['patent_id'] = df_rel_ppp['patent'].apply(lambda x: x.split("-", 1)[1].rsplit("-", 1)[0])

In [None]:
# Normalise the patent-citation table the same way as the pair table:
# every column as string, patent keys lower-cased.
df_rel_pcs = df_rel_pcs.astype(str)
df_rel_pcs['patent'] = df_rel_pcs['patent'].str.lower()
# Keep US patents only. Match the "us-" prefix instead of any occurrence of
# "us", so every kept key contains the "-" the id extraction below relies on.
# `.copy()` detaches the filtered rows before a column is added.
df_rel_pcs = df_rel_pcs[df_rel_pcs['patent'].str.startswith("us-")].copy()
# The patent id is what lies between the first and the last "-" of the key.
df_rel_pcs['patent_id'] = df_rel_pcs['patent'].apply(lambda x: x.split("-", 1)[1].rsplit("-", 1)[0])

In [58]:
# Citation rows whose paper never occurs as a patent-paper pair.
df_rel_pcs_filtered = df_rel_pcs.loc[~df_rel_pcs['oaid'].isin(df_rel_ppp['magid'])]
# Keep papers, then patents, that have at least 5 rows left
# (5 is the minimum number of mentions in the dataset).
df_rel_pcs_filtered = df_rel_pcs_filtered[
    df_rel_pcs_filtered.groupby('oaid')['oaid'].transform('size') >= 5
]
df_rel_pcs_filtered = df_rel_pcs_filtered[
    df_rel_pcs_filtered.groupby('patent_id')['patent_id'].transform('size') >= 5
]
# Draw as many rows as there are distinct paired papers (fixed seed), then renumber.
df_rel_pcs_sample = (
    df_rel_pcs_filtered
    .sample(n=df_rel_ppp['magid'].nunique(), random_state=42)
    .reset_index(drop=True)
)

In [59]:
# Sorted, de-duplicated union of the paper ids / patent ids from both tables.
oaid_list = np.union1d(df_rel_ppp['magid'].unique(), df_rel_pcs_sample['oaid'].unique())
patent_list = np.union1d(df_rel_ppp['patent_id'].unique(), df_rel_pcs_sample['patent_id'].unique())

### Extract Works from Postgres OpenAlex

In [None]:
# Connection settings for the Postgres database queried in the cells below
# (password, host, port and database are empty strings in this file).
url_object = URL.create(
    drivername="postgresql+psycopg2",
    username="tie",
    password="",
    host="",
    port="",
    database="",
)
engine = create_engine(url_object)

In [None]:
# Create a temporary table with the work ids of interest and fill it one row
# at a time inside a single transaction (engine.begin()).
# NOTE(review): the later queries go through `engine`, not this `connection`;
# presumably they land on the same pooled session, otherwise the temporary
# table would not be visible to them — confirm.
with engine.begin() as connection:
    connection.execute(text("""
        CREATE TEMPORARY TABLE temp_oaid_ppp (
            oaid VARCHAR PRIMARY KEY
        )
    """))
    # Prefix every id with the OpenAlex work URL, the form the id columns are joined on below.
    oaid_prefixed = ['https://openalex.org/W' + str(oaid) for oaid in oaid_list]
    for oaid in tqdm(oaid_prefixed):
        connection.execute(text("INSERT INTO temp_oaid_ppp (oaid) VALUES (:oaid)"), {'oaid': oaid})

In [None]:
# Title and abstract index of every work listed in temp_oaid_ppp, one row per work id.
df_rel_postgres = pd.read_sql_query("""
    SELECT w.id, w.title, w.abstract_inverted_index 
    FROM openalex.works AS w
    JOIN temp_oaid_ppp AS t ON w.id = t.oaid
""", con=engine).drop_duplicates(subset=['id'])

In [None]:
# Preview the first rows of the extracted works.
df_rel_postgres.head()

In [None]:
# Keep only works that come with a non-empty abstract inverted index.
# `.copy()` makes the result an independent frame, so the 'abstract' column
# can be added afterwards without writing to a slice of the unfiltered table.
# The index is not renumbered here: row labels keep gaps where rows were dropped.
df_rel_postgres = df_rel_postgres[df_rel_postgres['abstract_inverted_index'].apply(lambda x: x is not None and x['InvertedIndex'] != {})].copy()

def reconstruct_abstract(row):
    """Rebuild the plain-text abstract of one work from its inverted index.

    ``row['abstract_inverted_index']['InvertedIndex']`` maps each word to the
    list of positions at which it occurs. The words are laid out in ascending
    position order, joined with single spaces, and whitespace in front of
    ``. , ; ? ! :`` is removed.

    Returns the abstract as a string (empty for an empty index).
    """
    inverted_index = row['abstract_inverted_index']['InvertedIndex']

    # Flip word -> positions into position -> word; if two words claim the
    # same position, the one seen last wins.
    word_at = {
        position: word
        for word, positions in inverted_index.items()
        for position in positions
    }

    # Lay the words out by ascending position.
    text_in_order = " ".join(word_at[position] for position in sorted(word_at))

    # Pull punctuation marks back onto the preceding word.
    return re.sub(r'\s+([.,;?!:])', r'\1', text_in_order)

# Rebuild the abstract text of every work, row by row.
df_rel_postgres['abstract'] = df_rel_postgres.apply(reconstruct_abstract, axis=1)

In [None]:
# Save the works table (without the row index) for later reuse.
df_rel_postgres.to_csv("/mnt/hdd01/Reliance on Science/ppp_oa_works.csv", index=False)

In [None]:
# df_rel_postgres = pd.read_csv("/mnt/hdd01/Reliance on Science/cleantech_oa_works.csv")

### Extract Authors from Postgres OpenAlex

In [None]:
# (work, author) pairs for every work listed in temp_oaid_ppp, each pair once.
df_rel_authors_postgres = pd.read_sql_query("""
    SELECT a.work_id, a.author_id
    FROM openalex.works_authorships as a
    JOIN temp_oaid_ppp AS t ON a.work_id = t.oaid
""", con=engine).drop_duplicates(subset=['author_id', 'work_id'])

In [None]:
# One row per author, with the list of that author's work ids in 'work_id'.
df_rel_authors_postgres_grouped = df_rel_authors_postgres.groupby('author_id')['work_id'].apply(list).reset_index()

In [None]:
# Create a temporary table with the author ids found above and fill it one
# row at a time inside a single transaction.
# NOTE(review): the following query goes through `engine`, not this
# `connection`; presumably it reuses the same pooled session, otherwise the
# temporary table would not be visible — confirm.
with engine.begin() as connection:
    connection.execute(text("""
        CREATE TEMPORARY TABLE temp_author_id_ppp (
            author_id VARCHAR PRIMARY KEY
        )
    """))
    for author_id in tqdm(df_rel_authors_postgres_grouped['author_id']):
        connection.execute(text("INSERT INTO temp_author_id_ppp (author_id) VALUES (:author_id)"), {'author_id': author_id})

In [None]:
# Name information for every author listed in temp_author_id_ppp.
df_rel_authors_info_postgres = pd.read_sql_query("""
    SELECT a.id, a.display_name, a.display_name_alternatives
    FROM openalex.authors as a
    JOIN temp_author_id_ppp AS t ON a.id = t.author_id
""", con=engine)
# De-duplicate on the author id, not on the display name: two different
# authors can share a name, and dropping one of them would remove that author
# from the inner merge on the id that follows.
df_rel_authors_info_postgres = df_rel_authors_info_postgres.drop_duplicates(subset=['id'])

In [None]:
# Attach the name information to each author's work list; authors missing
# from either side are dropped (inner join on the author id).
df_rel_authors_complete = pd.merge(df_rel_authors_postgres_grouped, df_rel_authors_info_postgres, left_on='author_id', right_on='id', how='inner')

In [None]:
# Per author: the work ids with the "https://openalex.org/W" URL prefix removed.
df_rel_authors_complete['oaid'] = df_rel_authors_complete['work_id'].apply(lambda x: [i.replace("https://openalex.org/W", "") for i in x])

In [None]:
# Save the authors table (without the row index) for later reuse.
# NOTE(review): the list-valued columns are written as text and need parsing after a reload.
df_rel_authors_complete.to_csv("/mnt/hdd01/Reliance on Science/ppp_oa_authors.csv", index=False)

In [None]:
# df_rel_authors_complete = pd.read_csv("/mnt/hdd01/Reliance on Science/cleantech_oa_authors.csv")

### Extract Paper Citations from Postgres OpenAlex

In [None]:
# Citations where BOTH the citing and the cited work are listed in
# temp_oaid_ppp, each (citing, cited) pair once.
df_rel_citations_postgres = pd.read_sql_query("""
    SELECT w.work_id, w.referenced_work_id
    FROM openalex.works_referenced_works as w
    JOIN temp_oaid_ppp AS t1 ON w.work_id = t1.oaid
    JOIN temp_oaid_ppp AS t2 ON w.referenced_work_id = t2.oaid
""", con=engine).drop_duplicates(subset=['work_id', 'referenced_work_id'])

In [None]:
# Save the paper-citation table (without the row index) for later reuse.
df_rel_citations_postgres.to_csv("/mnt/hdd01/Reliance on Science/ppp_oa_citations.csv", index=False)

In [None]:
# df_rel_citations_postgres = pd.read_csv("/mnt/hdd01/Reliance on Science/cleantech_oa_citations.csv")

## Extract Patents from PATSTAT Postgres

In [None]:
# Connection settings for the patent database; this rebinds `url_object` and
# `engine`. The fields shown are the same as in the OpenAlex engine cell
# (password, host, port and database are empty strings in this file).
# NOTE(review): presumably the real host/database differ from the OpenAlex one — confirm.
url_object = URL.create(
    drivername="postgresql+psycopg2",
    username="tie",
    password="",
    host="",
    port="",
    database="",
)
engine = create_engine(url_object)

In [None]:
# Create a temporary table with the patent ids of interest and fill it one
# row at a time inside a single transaction.
# NOTE(review): no query against temp_patentid_ppp is visible in this part of
# the notebook; the patent tables used later (df_patent_authors, df_patent_cpc,
# df_patent_citations) are presumably built from it elsewhere — confirm.
with engine.begin() as connection:
    connection.execute(text("""
        CREATE TEMPORARY TABLE temp_patentid_ppp (
            patentid VARCHAR PRIMARY KEY
        )
    """))
    for patent_id in tqdm(patent_list):
        connection.execute(text("INSERT INTO temp_patentid_ppp (patentid) VALUES (:patentid)"), {'patentid': patent_id})

## Fuzzy matching of patent inventors with paper authors (PCS and PPP)

### Preprocessing

In [None]:
# Drop inventors whose full name is the literal text "nan nan"
# (presumably the result of joining two missing name parts — confirm).
df_patent_authors = df_patent_authors[df_patent_authors['full_name'] != "nan nan"]

In [None]:
import ast  # imported here because the top-of-file `import ast` is commented out


def _as_list(value):
    """Parse `value` from its text form; return it unchanged if it is not a string.

    After a CSV reload the list columns arrive as text such as "['a', 'b']";
    in a fresh in-memory table they are already lists (or missing values).
    `ast.literal_eval` only accepts Python literals, unlike `eval`.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


df_rel_authors_complete['display_name_alternatives'] = df_rel_authors_complete['display_name_alternatives'].apply(_as_list)
# Every author gets its display name as one more name variant; authors
# without a list of alternatives get the display name alone.
df_rel_authors_complete['display_name_alternatives'] = df_rel_authors_complete.apply(
    lambda row: row['display_name_alternatives'] + [row['display_name']]
    if isinstance(row['display_name_alternatives'], list)
    else [row['display_name']],
    axis=1,
)
# One row per (author, name variant).
df_rel_authors_complete_exploded = df_rel_authors_complete.explode('display_name_alternatives')
# Work ids with the "https://openalex.org/W" URL prefix removed.
df_rel_authors_complete_exploded['oaid'] = df_rel_authors_complete_exploded['work_id'].apply(
    lambda work_ids: [i.replace("https://openalex.org/W", "") for i in _as_list(work_ids)]
)

In [None]:
# Lower-case both name columns for the comparison below. The `.str` accessor
# leaves missing names as missing instead of raising on them, which calling
# `.lower()` on each value would do.
df_rel_authors_complete_exploded['display_name_alternatives'] = df_rel_authors_complete_exploded['display_name_alternatives'].str.lower()
df_patent_authors['full_name'] = df_patent_authors['full_name'].str.lower()

### Full Matching

In [None]:
# # Define the match_names function
# def match_names(index):
#     row = df_patent_authors.loc[index]
#     full_name = row.get('full_name')
#     if full_name is not None:
#         return process.extractOne(full_name, df_rel_authors_complete_exploded['display_name_alternatives'], scorer=fuzz.token_set_ratio)
#     else:
#         return None

# # Use ProcessPoolExecutor to parallelize the matching process
# with ProcessPoolExecutor(max_workers=mp.cpu_count()) as executor:
#     futures = {executor.submit(match_names, index) for index in df_patent_authors.index}
#     results = []
#     for future in tqdm(as_completed(futures), total=len(futures)):
#         results.append(future.result())
#     df_patent_authors['best_match'] = results

### Matching depending on pcs and ppp relationships

In [None]:
# Expand the list-valued 'oaid' column as well: one row per (author, name variant, work).
df_rel_authors_complete_exploded_exploded = df_rel_authors_complete_exploded.explode('oaid')

In [None]:
# Pair every named inventor with the authors of the papers linked to the same
# patent: inventors -> df_rel (on patent_id) -> author name variants (on oaid).
# `df_rel` is not defined in the visible part of the notebook.
df_patent_authors_filtered = df_patent_authors[df_patent_authors['full_name'].notna()]
df_merged = pd.merge(df_patent_authors_filtered, df_rel, on='patent_id')

df_merged = pd.merge(df_merged, df_rel_authors_complete_exploded_exploded, on='oaid')
# Keep only the identifying columns and the two names to compare.
df_merged = df_merged[['patent_id', 'assignee_id', 'inventor_id', 'full_name', 'oaid', 'author_id', 'display_name', 'display_name_alternatives']]

In [None]:
def match_names(row):
    """Score how closely an inventor name matches one author name variant.

    Compares ``row['full_name']`` with ``row['display_name_alternatives']``
    and returns their normalized Levenshtein similarity.
    """
    return distance.Levenshtein.normalized_similarity(
        row['full_name'],
        row['display_name_alternatives'],
    )

# Score every inventor / author-name pair, row by row with a progress bar.
df_merged['best_match'] = df_merged.progress_apply(match_names, axis=1)
# df_merged_test['best_match'] = df_merged_test.progress_apply(match_names, axis=1)

# df_merged = df_merged.sort_values('best_match', ascending=False)
# Keep pairs scoring at least 0.75 ...
df_merged_filtered = df_merged[df_merged['best_match'] >= 0.75]
# ... and, per (patent, assignee, inventor, paper, author) combination, only
# the row with the highest score.
# NOTE(review): groupby leaves out groups with a missing key by default, so
# rows with e.g. a missing assignee_id would presumably be lost here — confirm.
df_merged_filtered = df_merged_filtered.loc[df_merged_filtered.groupby(['patent_id', 'assignee_id', 'inventor_id', 'oaid', 'author_id'])['best_match'].idxmax()]

### Construct final authors dataframe

In [None]:
# Inventors without an author match: keep the rows whose
# (patent_id, assignee_id, inventor_id) triple does not occur among the
# matches, one row per triple.
df_patent_authors_filtered = df_patent_authors[~df_patent_authors[['patent_id', 'assignee_id', 'inventor_id']].apply(tuple, 1).isin(df_merged_filtered[['patent_id', 'assignee_id', 'inventor_id']].apply(tuple, 1))]
df_patent_authors_filtered = df_patent_authors_filtered.drop_duplicates(subset=['patent_id', 'assignee_id', 'inventor_id'])

In [None]:
# Authors without an inventor match: keep the rows whose (oaid, author_id)
# pair does not occur among the matches, one row per pair, name columns only.
df_rel_authors_complete_filtered = df_rel_authors_complete_exploded_exploded[~df_rel_authors_complete_exploded_exploded[['oaid', 'author_id']].apply(tuple, 1).isin(df_merged_filtered[['oaid', 'author_id']].apply(tuple, 1))]
df_rel_authors_complete_filtered = df_rel_authors_complete_filtered.drop_duplicates(subset=['oaid', 'author_id'])
df_rel_authors_complete_filtered = df_rel_authors_complete_filtered[['oaid', 'author_id', 'display_name', 'display_name_alternatives']]

In [None]:
# Stack matched pairs, unmatched inventors and unmatched authors into one
# table with a fresh 0..n-1 index; columns a part lacks are left empty.
df_authors = pd.concat([df_merged_filtered, df_patent_authors_filtered, df_rel_authors_complete_filtered], ignore_index=True)

In [None]:
# Save the combined author/inventor table (without the row index).
df_authors.to_csv("/mnt/hdd01/Reliance on Science/cleantech_oa_patentsview_authors.csv", index=False)

In [None]:
# df_authors = pd.read_csv("/mnt/hdd01/Reliance on Science/cleantech_oa_patentsview_authors.csv", dtype=str)

In [None]:
# Preview the first rows of the combined author/inventor table.
df_authors.head()

# Graph Preparation - Embedding of all properties; Edge Indices for all properties; Create H5PY files

### Embedding of Node Properties

In [None]:
# Embed "<title> [SEP] <abstract>" for every patent.
# The texts are handed over as a plain list, so the encoder never indexes the
# Series by label, and the returned (n, d) matrix is split into one vector per
# row, because a single column cannot hold a 2-D array.
# NOTE(review): `" ".join(x)` assumes 'patent_abstract' holds token lists; if
# it holds plain strings this spaces out every character — confirm.
df_patent_cpc['embedding'] = list(model.encode(
    (df_patent_cpc['patent_title'] + ' [SEP] ' + df_patent_cpc['patent_abstract'].apply(lambda x: " ".join(x))).tolist(),
    device=device,
))

In [None]:
# Embed "<title> [SEP] <abstract>" for every paper.
# 'abstract' is already a plain string, so it is used as is: joining it with
# " " would put a space between every character. Missing titles become "".
# The texts are handed over as a plain list, so the encoder never indexes the
# Series by label (the row labels have gaps after the abstract filter), and
# the returned (n, d) matrix is split into one vector per row, because a
# single column cannot hold a 2-D array.
df_rel_postgres['embedding'] = list(model.encode(
    (df_rel_postgres['title'].fillna('') + ' [SEP] ' + df_rel_postgres['abstract']).tolist(),
    device=device,
))

In [None]:
# Save both tables, including the new 'embedding' column.
# NOTE(review): CSV stores each embedding as text, so it has to be parsed back
# into an array after a reload — confirm the round trip before relying on it.
df_patent_cpc.to_csv("/mnt/hdd01/Reliance on Science/cleantech_patents_embedding.csv", index=False)
df_rel_postgres.to_csv("/mnt/hdd01/Reliance on Science/cleantech_oa_works_embedding.csv", index=False)

In [None]:
# df_patent_cpc = pd.read_csv("/mnt/hdd01/Reliance on Science/cleantech_patents_embedding.csv", dtype=str)
# df_rel_postgres = pd.read_csv("/mnt/hdd01/Reliance on Science/cleantech_oa_works_embedding.csv", dtype=str)

In [None]:
# Author nodes get a random feature vector with the same dimension as the
# sentence embeddings, scaled to unit Euclidean norm.
# NOTE(review): np.random is not seeded in this cell, so the vectors differ between runs.
d = model.get_sentence_embedding_dimension()  
df_authors['embedding'] = df_authors.apply(lambda _: np.random.rand(d), axis=1)
df_authors['embedding'] = df_authors['embedding'].apply(lambda x: x / np.linalg.norm(x))

### Edge Indices for all relationships

In [None]:
# Node index of a patent = its row POSITION in df_patent_cpc, because the node
# feature matrix is built by stacking the 'embedding' column in row order.
# Row labels only equal positions while the index is an unbroken 0..n-1 range.
# For a repeated patent_id the last row wins.
patent_id_to_index = dict(zip(df_patent_cpc['patent_id'], range(len(df_patent_cpc))))
# Patent -> cited patent edges, both ends translated to node indices;
# ids without a node become NaN.
df_patent_edge_index = df_patent_citations[['patent_id', 'citation_patent_id']].copy()
df_patent_edge_index['patent_id'] = df_patent_edge_index['patent_id'].map(patent_id_to_index)
df_patent_edge_index['citation_patent_id'] = df_patent_edge_index['citation_patent_id'].map(patent_id_to_index)
df_patent_edge_index = df_patent_edge_index.drop_duplicates(subset=['patent_id', 'citation_patent_id'])

In [None]:
# Remove the "https://openalex.org/W" URL prefix from both ends of every citation.
for column in ('work_id', 'referenced_work_id'):
    df_rel_citations_postgres[column] = [
        work_url.replace("https://openalex.org/W", "")
        for work_url in df_rel_citations_postgres[column]
    ]

In [None]:
# Node index of a paper = its row POSITION in df_rel_postgres, because the
# node feature matrix is built by stacking the 'embedding' column in row
# order. The row labels cannot be used: rows without an abstract were
# filtered out without renumbering, so the labels have gaps.
# For a repeated oaid the last row wins.
# NOTE(review): the 'oaid' column of df_rel_postgres is not created in the
# visible part of the notebook — confirm where it comes from.
paper_id_to_index = dict(zip(df_rel_postgres['oaid'], range(len(df_rel_postgres))))
# Paper -> referenced paper edges, both ends translated to node indices;
# ids without a node become NaN.
df_paper_edge_index = df_rel_citations_postgres[['work_id', 'referenced_work_id']].copy()
df_paper_edge_index['work_id'] = df_paper_edge_index['work_id'].map(paper_id_to_index)
df_paper_edge_index['referenced_work_id'] = df_paper_edge_index['referenced_work_id'].map(paper_id_to_index)
df_paper_edge_index = df_paper_edge_index.drop_duplicates(subset=['work_id', 'referenced_work_id'])

In [None]:
# Patent -> paper edges: translate both ids to node indices (ids without a
# node become NaN) and keep each edge once.
df_patent_paper_edge_index = (
    df_rel[['patent_id', 'oaid']]
    .assign(
        patent_id=lambda edges: edges['patent_id'].map(patent_id_to_index),
        oaid=lambda edges: edges['oaid'].map(paper_id_to_index),
    )
    .drop_duplicates(subset=['patent_id', 'oaid'])
)

In [None]:
# Map each author_id to a row label of df_authors, then build author -> patent
# edges and the same rows with the columns swapped (patent -> author).
# NOTE(review): rows of df_authors without an author_id or patent_id give NaN
# after .map — confirm they are removed before the cast to int. An author_id
# occurring on several rows maps to a single label (presumably the last) — confirm.
author_id_to_index = pd.Series(df_authors.index, index=df_authors['author_id']).to_dict()
df_author_patent_edge_index = df_authors.copy()
# df_author_patent_edge_index = df_author_patent_edge_index.astype(str)
df_author_patent_edge_index = df_author_patent_edge_index[['author_id', 'patent_id']]
df_author_patent_edge_index['author_id'] = df_author_patent_edge_index['author_id'].map(author_id_to_index)
df_author_patent_edge_index['patent_id'] = df_author_patent_edge_index['patent_id'].map(patent_id_to_index)
df_patent_author_edge_index = df_author_patent_edge_index[['patent_id', 'author_id']]

In [None]:
# Author -> paper edges and the same rows with the columns swapped
# (paper -> author), both ends translated to node indices.
# NOTE(review): rows of df_authors without an author_id or oaid give NaN after
# .map — confirm they are removed before the cast to int.
df_author_paper_edge_index = df_authors.copy()
# df_author_paper_edge_index = df_author_paper_edge_index.astype(str)
df_author_paper_edge_index = df_author_paper_edge_index[['author_id', 'oaid']]
df_author_paper_edge_index['author_id'] = df_author_paper_edge_index['author_id'].map(author_id_to_index)
df_author_paper_edge_index['oaid'] = df_author_paper_edge_index['oaid'].map(paper_id_to_index)
df_paper_author_edge_index = df_author_paper_edge_index[['oaid', 'author_id']]

In [None]:
# Keep every edge once in each of the four author edge tables.
df_author_patent_edge_index = df_author_patent_edge_index.drop_duplicates(subset=['author_id', 'patent_id'])
df_author_paper_edge_index = df_author_paper_edge_index.drop_duplicates(subset=['author_id', 'oaid'])
df_patent_author_edge_index = df_patent_author_edge_index.drop_duplicates(subset=['patent_id', 'author_id'])
df_paper_author_edge_index = df_paper_author_edge_index.drop_duplicates(subset=['oaid', 'author_id'])

### Create H5PY files

In [None]:
# Delete every edge with a missing endpoint.
# The endpoints were produced with `.map(...)`, which marks an id without a
# node as a float NaN, not as the string "nan", so comparing against "nan"
# alone removes nothing. Rows holding the string "nan" are still removed too,
# in case a table was cast to str beforehand.
def _drop_missing_endpoints(edges):
    """Return `edges` without the rows that hold NaN or the string "nan"."""
    is_missing = edges.isna() | edges.eq("nan")
    return edges[~is_missing.any(axis=1)]


df_author_patent_edge_index = _drop_missing_endpoints(df_author_patent_edge_index)
df_author_paper_edge_index = _drop_missing_endpoints(df_author_paper_edge_index)
df_patent_author_edge_index = _drop_missing_endpoints(df_patent_author_edge_index)
df_paper_author_edge_index = _drop_missing_endpoints(df_paper_author_edge_index)

df_patent_edge_index = _drop_missing_endpoints(df_patent_edge_index)
df_paper_edge_index = _drop_missing_endpoints(df_paper_edge_index)
df_patent_paper_edge_index = _drop_missing_endpoints(df_patent_paper_edge_index)

In [None]:
# Cast every edge table to integer node indices.
# NOTE(review): astype(int) raises if a table still contains NaN — confirm
# that all edges with a missing endpoint have been removed before this cell.
df_patent_edge_index = df_patent_edge_index.astype(int)
df_paper_edge_index = df_paper_edge_index.astype(int)
df_patent_paper_edge_index = df_patent_paper_edge_index.astype(int)
df_author_patent_edge_index = df_author_patent_edge_index.astype(int)
df_author_paper_edge_index = df_author_paper_edge_index.astype(int)
df_patent_author_edge_index = df_patent_author_edge_index.astype(int)
df_paper_author_edge_index = df_paper_author_edge_index.astype(int)

In [None]:
def string_to_array(str_repr):
    """Parse the text form of a 1-D numeric array back into a float array.

    Accepts comma-separated text (``"[1.0, 2.0]"``) as well as the
    whitespace-separated form a NumPy array takes when it is turned into a
    string (``"[1. 2.]"``), including text wrapped over several lines.

    Args:
        str_repr: The array as text, with or without surrounding brackets.

    Returns:
        A 1-D ``float64`` array; empty when the text holds no numbers.

    Raises:
        ValueError: If a token is not a number.
    """
    # Treat commas like whitespace so both separators split the same way.
    tokens = str_repr.strip().strip('[]').replace(',', ' ').split()
    return np.array(tokens, dtype=float)

# df_patent_cpc["embedding"] = df_patent_cpc["embedding"].apply(string_to_array)
# df_rel_postgres["embedding"] = df_rel_postgres["embedding"].apply(string_to_array)
# df_rel_postgres["patent_paper_pair"] = df_rel_postgres["patent_paper_pair"].astype(int)

# Delete all NaN values from edge indices (all seven tables, including the
# patent -> author one).
df_patent_edge_index = df_patent_edge_index.dropna()
df_paper_edge_index = df_paper_edge_index.dropna()
df_patent_paper_edge_index = df_patent_paper_edge_index.dropna()
df_author_patent_edge_index = df_author_patent_edge_index.dropna()
df_patent_author_edge_index = df_patent_author_edge_index.dropna()
df_paper_author_edge_index = df_paper_author_edge_index.dropna()
df_author_paper_edge_index = df_author_paper_edge_index.dropna()

# Write node data and edge indices into one HDF5 file (an existing file is overwritten).
with h5py.File('/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK Emergence/raw/torch_tek_dataset_distilbert_emergence.h5', 'w') as f:
    # Node data: one stacked array per (dataset name, table, column).
    node_columns = (
        ('g_patent/x', df_patent_cpc, "embedding"),
        ('g_paper/x', df_rel_postgres, "embedding"),
        ('g_paper/y', df_rel_postgres, "patent_paper_pair"),
        ('g_author/x', df_authors, "embedding"),
    )
    for dataset_name, frame, column in node_columns:
        f.create_dataset(dataset_name, data=np.stack(frame[column].values))

    # Edge indices: one two-column int64 array per edge table.
    edge_tables = (
        ('patent_edge_index', df_patent_edge_index),
        ('paper_edge_index', df_paper_edge_index),
        ('patent_paper_edge_index', df_patent_paper_edge_index),
        ('author_patent_edge_index', df_author_patent_edge_index),
        ('patent_author_edge_index', df_patent_author_edge_index),
        ('author_paper_edge_index', df_author_paper_edge_index),
        ('paper_author_edge_index', df_paper_author_edge_index),
    )
    for dataset_name, frame in edge_tables:
        f.create_dataset(dataset_name, data=frame.values, dtype=np.int64)

# Construct Heterogeneous Graph Model

In [None]:
class PPPHeteroDataset(Dataset):
    """Patent / paper / author graph stored as a dataset with a single item.

    The graph is assembled from the HDF5 file named in ``raw_file_names``
    inside ``raw_dir``. Paper nodes carry the labels ``y`` and a positional
    80/20 train/test split. ``raw_dir`` and ``processed_dir`` are fixed paths,
    so ``root`` is only forwarded to the base class.

    Args:
        root: Forwarded to the base ``Dataset``.
        transform: Forwarded to the base ``Dataset``.
        pre_transform: Applied once to the graph in ``process()`` if given.
    """

    def __init__(self, root, transform=None, pre_transform=None):
        # Must exist before the base constructor runs: that constructor may
        # already call process(), which stores the graph in self.data.
        self.data = None
        super().__init__(root, transform, pre_transform)
        # Build the graph here only if the base constructor has not done so,
        # so the HDF5 file is read (and the result saved) once per
        # construction instead of twice.
        if self.data is None:
            self.process()

    @property
    def num_classes(self):
        """Number of label classes (binary task)."""
        return 2

    @property
    def raw_dir(self):
        """Directory holding the HDF5 export."""
        return '/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK Emergence/raw/'

    @property
    def processed_dir(self):
        """Directory the processed graph is saved to."""
        return '/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK Emergence/processed/'

    @property
    def raw_file_names(self):
        """Names of the raw files expected in ``raw_dir``."""
        return [
            'torch_tek_dataset_distilbert_emergence.h5'
        ]

    @property
    def processed_file_names(self):
        """Name of the processed file written to ``processed_dir``."""
        return 'gnn_tek_data_distilbert_emergence.pt'

    def download(self):
        """Nothing to download: the raw file is produced locally."""
        pass

    def process(self):
        """Read the HDF5 file, build the HeteroData graph, store and save it."""
        data = HeteroData()

        # Maps each edge type to the HDF5 dataset that holds its (num_edges, 2) index.
        edge_datasets = {
            ('patent', 'cites', 'patent'): 'patent_edge_index',
            ('paper', 'cites', 'paper'): 'paper_edge_index',
            ('patent', 'cites', 'paper'): 'patent_paper_edge_index',
            ('author', 'author_of_patent', 'patent'): 'author_patent_edge_index',
            ('author', 'author_of_paper', 'paper'): 'author_paper_edge_index',
            ('patent', 'has_author_patent', 'author'): 'patent_author_edge_index',
            ('paper', 'has_author_paper', 'author'): 'paper_author_edge_index',
        }

        # Single source for the file name: the one declared in raw_file_names.
        with h5py.File(osp.join(self.raw_dir, self.raw_file_names[0]), 'r') as f:
            # Node features, plus the labels of the paper nodes
            data['patent'].x = torch.tensor(f['g_patent/x'][:], dtype=torch.float)
            data['paper'].x = torch.tensor(f['g_paper/x'][:], dtype=torch.float)
            data['paper'].y = torch.tensor(f['g_paper/y'][:], dtype=torch.long)
            data['author'].x = torch.tensor(f['g_author/x'][:], dtype=torch.float)

            # Edge indices are stored one edge per row; transpose to (2, num_edges).
            for edge_type, dataset_name in edge_datasets.items():
                data[edge_type].edge_index = torch.tensor(
                    f[dataset_name][:], dtype=torch.long
                ).t().contiguous()

        if self.pre_transform is not None:
            data = self.pre_transform(data)

        # Positional split over the paper nodes: first 80% train, remaining
        # 20% test. No validation mask is created.
        # NOTE(review): the split follows the stored node order without
        # shuffling — confirm that order is unrelated to the labels.
        num_papers = data['paper'].num_nodes
        num_train = int(0.8 * num_papers)
        data['paper'].train_mask = torch.zeros(num_papers, dtype=torch.bool)
        data['paper'].test_mask = torch.zeros(num_papers, dtype=torch.bool)
        data['paper'].train_mask[:num_train] = True
        data['paper'].test_mask[num_train:] = True

        # Diagnostic print statements
        print("Data keys after processing:", data.keys())
        print("Node types and their feature shapes:")
        for node_type, node_data in data.node_items():
            print(f"Node type: {node_type}")
            for key, item in node_data.items():
                if key in ('x', 'y'):
                    print(f"Features ({key}) shape:", item.size())

        print("Edge types and their index shapes:")
        for edge_type, edge_data in data.edge_items():
            print(f"Edge type: {edge_type}")
            if 'edge_index' in edge_data:
                print("Edge index shape:", edge_data['edge_index'].size())
            else:
                print(f"{edge_type} has no edge index.")

        self.data = data  # Keep the graph in memory for get()
        torch.save(data, osp.join(self.processed_dir, self.processed_file_names))

    def len(self):
        """The dataset holds exactly one graph."""
        return 1

    def get(self, idx):
        """Return the graph; ``idx`` is ignored because there is only one."""
        return self.data

In [None]:
# Build the dataset; the constructor runs process(), which reads the HDF5 file
# and prints a summary. raw_dir / processed_dir are fixed properties of the
# class, so `root` does not decide where files are read or written.
ppp_dataset = PPPHeteroDataset(root='/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK Emergence/raw/')

In [None]:
# The dataset reports a length of 1; take its only graph.
ppp_dataset_0 = ppp_dataset[0]

In [None]:
# Get the labels of the paper nodes in the train and test set
train_labels = ppp_dataset_0['paper'].y[ppp_dataset_0['paper'].train_mask].cpu().numpy()
test_labels = ppp_dataset_0['paper'].y[ppp_dataset_0['paper'].test_mask].cpu().numpy()

In [None]:
# Print the value counts of the labels
print("Value counts of labels for paper nodes in train set:", np.bincount(train_labels))
print("Value counts of labels for paper nodes in test set:", np.bincount(test_labels))

# Graph Neural Network Model

## Construct Graph Neural Network Model

In [None]:
class HeteroGCN(torch.nn.Module):
    """Two-layer heterogeneous GraphSAGE network that classifies paper nodes.

    Each layer holds one ``SAGEConv`` per edge type, and the results arriving
    at the same node type are averaged. A linear layer maps the final paper
    embeddings to class scores.

    This is a plain container of layers, so it derives from
    ``torch.nn.Module`` rather than from the message-passing layer base class.

    Args:
        hidden_channels: Width of both graph layers.
        num_node_features_dict: Input feature size per node type; needs the
            keys 'patent', 'paper' and 'author'.
        num_classes: Number of output classes.
    """

    # Every relation of the graph as (source type, relation, target type).
    EDGE_TYPES = (
        ('patent', 'cites', 'patent'),
        ('paper', 'cites', 'paper'),
        ('patent', 'cites', 'paper'),
        ('author', 'author_of_patent', 'patent'),
        ('author', 'author_of_paper', 'paper'),
        ('patent', 'has_author_patent', 'author'),
        ('paper', 'has_author_paper', 'author'),
    )

    def __init__(self, hidden_channels, num_node_features_dict, num_classes):
        super().__init__()
        torch.manual_seed(42)  # For reproducible results

        # First layer: one SAGEConv per edge type. The input size is given as
        # (source size, target size) so relations between different node
        # types stay correct when their feature sizes differ. No
        # `add_self_loops` argument: SAGEConv does not define one, and it
        # already combines each node's own features with its neighbours'.
        self.conv1 = HeteroConv({
            (src, rel, dst): SAGEConv(
                (num_node_features_dict[src], num_node_features_dict[dst]),
                hidden_channels,
            )
            for src, rel, dst in self.EDGE_TYPES
        }, aggr='mean')

        # Second layer: every node type now has `hidden_channels` features.
        self.conv2 = HeteroConv({
            edge_type: SAGEConv(hidden_channels, hidden_channels)
            for edge_type in self.EDGE_TYPES
        }, aggr='mean')

        # Linear layer for classifying papers
        self.lin = torch.nn.Linear(hidden_channels, num_classes)

    def forward(self, data):
        """Return class scores for the paper nodes of `data`.

        Args:
            data: Graph (or sampled mini-batch) providing ``x_dict`` and
                ``edge_index_dict``.

        Returns:
            Tensor with one row of ``num_classes`` scores per paper node.
        """
        x_dict, edge_index_dict = data.x_dict, data.edge_index_dict

        # First convolution layer
        x_dict = self.conv1(x_dict, edge_index_dict)
        x_dict = {key: x.relu() for key, x in x_dict.items()}

        # Second convolution layer
        x_dict = self.conv2(x_dict, edge_index_dict)
        x_dict = {key: x.relu() for key, x in x_dict.items()}

        # Only use the 'paper' node embeddings for the final prediction
        return self.lin(x_dict['paper'])

## Model Instantiation

In [None]:
# All three node types are declared with 768 input features.
# NOTE(review): presumably 768 is the sentence-embedding size used for the
# node features — confirm against the stored feature matrices.
num_node_features_dict = {'patent': 768, 'paper': 768, 'author': 768}
# Rebinds `model`: from here on it is the GNN, no longer the SentenceTransformer.
model = HeteroGCN(hidden_channels=64, num_node_features_dict=num_node_features_dict, num_classes=2)

# Adam with learning rate 0.01 and a cross-entropy loss over the two classes.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

## Neighbor Loader

In [None]:
# Move the model and the whole graph to the target device.
model = model.to(device)
ppp_dataset_0 = ppp_dataset[0].to(device)
# Mini-batch loaders seeded on the train / test paper nodes, 512 seed papers
# per batch; only the training loader shuffles.
# NOTE(review): num_neighbors=[100] has a single entry while HeteroGCN applies
# two graph layers — confirm that sampling one hop is intended.
# NOTE(review): the loaders sample from a graph that was moved to `device`;
# confirm neighbour sampling supports that when `device` is a GPU.
train_loader = NeighborLoader(ppp_dataset_0, num_neighbors=[100], batch_size=512, shuffle=True, input_nodes=('paper', ppp_dataset_0['paper'].train_mask))
test_loader = NeighborLoader(ppp_dataset_0, num_neighbors=[100], batch_size=512, shuffle=False, input_nodes=('paper', ppp_dataset_0['paper'].test_mask))

## Train and Test Loop

In [None]:
def train():
    """Run one training epoch over ``train_loader``.

    Uses the module-level ``model``, ``optimizer``, ``criterion`` and
    ``device``.

    Returns:
        The mean loss per batch, or 0 if the loader yielded no batch.
    """
    model.train()
    total_loss = 0
    total_batches = 0

    for batch in train_loader:
        batch = batch.to(device)
        optimizer.zero_grad()

        out = model(batch)
        # The loader places the seed papers of the batch first; the papers
        # after them were only sampled as neighbours. The loss is taken on the
        # seed papers alone, which all come from the training split.
        num_seeds = batch['paper'].batch_size
        loss = criterion(out[:num_seeds], batch['paper'].y[:num_seeds])
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_batches += 1

    return total_loss / total_batches if total_batches else 0

In [None]:
def test():
    """Evaluate the model on ``test_loader``.

    Uses the module-level ``model`` and ``device``.

    Returns:
        The share of correctly classified test papers, or 0.0 if the loader
        yielded no test paper.
    """
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in test_loader:
            batch = batch.to(device)
            # The loader places the seed papers of the batch first. Scoring
            # only those counts every test paper exactly once; test papers
            # that merely show up as sampled neighbours are not counted again.
            num_seeds = batch['paper'].batch_size
            pred = model(batch)[:num_seeds].argmax(dim=1)
            labels = batch['paper'].y[:num_seeds]

            # Update correct and total counts
            correct += int((pred == labels).sum())
            total += num_seeds

    return correct / total if total else 0.0


In [None]:
# Train for num_epochs epochs; after each one, evaluate on the test split and
# print the mean training loss and the test accuracy.
num_epochs = 100

for epoch in range(1, num_epochs + 1):
    loss = train()
    test_acc = test()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Test Acc: {test_acc:.4f}')