In [2]:
import re
import h5py
import os.path as osp
import torch
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
tqdm.pandas()
import nltk
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Fixed seed shared by every sampling / random-initialisation step below.
# Seeding numpy and torch as well keeps their global generators reproducible
# across "Restart Kernel -> Run All".
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Load the English stop-word list; fetch the corpus once if this machine
# does not have it yet (NLTK raises LookupError for a missing resource).
try:
    _english_stop_words = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    _english_stop_words = nltk.corpus.stopwords.words('english')

# A set gives O(1) membership tests; preprocess_text checks every word of
# every abstract against this collection.
stop_words = set(_english_stop_words)

# Encode on the GPU when one is visible, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = 'distilbert/distilbert-base-uncased'
model = SentenceTransformer(model_name).to(device)

No sentence-transformers model found with name distilbert/distilbert-base-uncased. Creating a new one with mean pooling.


In [3]:
# Load Cleantech and Non-Cleantech Text Data
def _prepare_patent_text(df_raw, label):
    """Drop patents without an abstract, cast all columns to str, add the class label.

    The missing-value filter has to run BEFORE the string cast: casting first
    turns NaN/None into the literal strings 'nan'/'None', after which dropna
    finds nothing to remove and the placeholder text would be embedded.

    Parameters:
        df_raw: frame as read from disk; must contain an 'appln_abstract' column.
        label: integer class label stored in the new 'label' column.

    Returns:
        A new DataFrame; df_raw is left unmodified.
    """
    df_prepared = df_raw.dropna(subset=['appln_abstract']).astype(str)
    df_prepared['label'] = label
    return df_prepared


df_cleantech = _prepare_patent_text(
    pd.read_parquet('/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_title_abstract.parquet'),
    label=1,
)

df_non_cleantech = _prepare_patent_text(
    pd.read_csv('/mnt/hdd02/Projekt_EDV_TEK/edv_tek_non_cleantech_all_title_abstract.csv'),
    label=0,
)

In [4]:
# Read the cleantech -> non-cleantech citation pairs first so that the patents
# they reference can be forced into the sample later on.
df_cleantech_non_cleantech_citations = pd.read_csv(
    '/mnt/hdd02/Projekt_EDV_TEK/edv_tek_cleantech_to_non_cleantech_citations.csv'
).astype(str)

# Draw at most 30000 cross-citation pairs (adjust as needed) and give the
# sampled frame the column names used by the other citation tables.
sample_size = min(30000, len(df_cleantech_non_cleantech_citations))
cross_citation_sample = (
    df_cleantech_non_cleantech_citations
    .sample(sample_size, random_state=SEED)
    .rename(columns={"citing_appln_id": "pat_appln_id", "cited_appln_id": "cited_pat_appln_id"})
)

In [5]:
# Distinct application IDs on each side of the sampled cross-citations:
# the citing side is cleantech, the cited side is non-cleantech.
cleantech_patents_to_include = cross_citation_sample['pat_appln_id'].drop_duplicates().tolist()
non_cleantech_patents_to_include = cross_citation_sample['cited_pat_appln_id'].drop_duplicates().tolist()

print(f"Number of cleantech patents from cross-citations: {len(cleantech_patents_to_include)}")
print(f"Number of non-cleantech patents from cross-citations: {len(non_cleantech_patents_to_include)}")

Number of cleantech patents from cross-citations: 27559
Number of non-cleantech patents from cross-citations: 28753


In [6]:
# Keep the patents that appear in the sampled cross-citations; these are
# always part of the final dataset.
_cleantech_in_cross_citations = df_cleantech['appln_id'].isin(cleantech_patents_to_include)
df_cleantech_must_include = df_cleantech.loc[_cleantech_in_cross_citations]

_non_cleantech_in_cross_citations = df_non_cleantech['appln_id'].isin(non_cleantech_patents_to_include)
df_non_cleantech_must_include = df_non_cleantech.loc[_non_cleantech_in_cross_citations]

print(f"Found {len(df_cleantech_must_include)} cleantech patents to include")
print(f"Found {len(df_non_cleantech_must_include)} non-cleantech patents to include")

Found 22200 cleantech patents to include
Found 23575 non-cleantech patents to include


In [7]:
# Calculate remaining patents to sample
# Number of patents wanted per class in the final dataset.
TARGET_PATENTS_PER_CLASS = 100000

# Patents not already forced in through the cross-citation sample.
remaining_cleantech = df_cleantech[~df_cleantech['appln_id'].isin(cleantech_patents_to_include)]
remaining_non_cleantech = df_non_cleantech[~df_non_cleantech['appln_id'].isin(non_cleantech_patents_to_include)]

# How many more patents are needed to reach the target. The value is clamped
# to [0, len(remaining)]: a must-include set larger than the target would
# otherwise give a negative sample size (ValueError), and the printed number
# now always equals what is actually drawn.
n_cleantech_to_sample = min(
    max(TARGET_PATENTS_PER_CLASS - len(df_cleantech_must_include), 0),
    len(remaining_cleantech),
)
n_non_cleantech_to_sample = min(
    max(TARGET_PATENTS_PER_CLASS - len(df_non_cleantech_must_include), 0),
    len(remaining_non_cleantech),
)

print(f"Sampling {n_cleantech_to_sample} additional cleantech patents")
print(f"Sampling {n_non_cleantech_to_sample} additional non-cleantech patents")

# Sample additional patents
df_cleantech_additional = remaining_cleantech.sample(n_cleantech_to_sample, random_state=SEED)
df_non_cleantech_additional = remaining_non_cleantech.sample(n_non_cleantech_to_sample, random_state=SEED)

Sampling 77800 additional cleantech patents
Sampling 76425 additional non-cleantech patents


In [8]:
# Per class: forced-in patents followed by the randomly drawn top-up.
df_cleantech_final = pd.concat([df_cleantech_must_include, df_cleantech_additional])
df_non_cleantech_final = pd.concat([df_non_cleantech_must_include, df_non_cleantech_additional])

# Stack both classes, shuffle the rows with the fixed seed and renumber
# them 0..n-1.
df_text = (
    pd.concat([df_cleantech_final, df_non_cleantech_final], ignore_index=True)
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)

In [9]:
# Text preprocessing function
def preprocess_text(text):
    """Normalise one abstract for embedding.

    The input is converted to a lower-case string, then URLs, e-mail-like
    tokens and every character that is neither an ASCII letter nor
    whitespace are removed, whitespace runs are collapsed to single spaces,
    and words contained in the module-level ``stop_words`` are dropped.

    Returns the remaining words joined by single spaces.
    """
    cleaned = str(text).lower()
    # Applied in this order: URLs, e-mail-like tokens, non-letters, whitespace runs.
    substitutions = (
        (r'https?://\S+|www\.\S+', ''),
        (r'\S+@\S+', ''),
        (r'[^A-Za-z\s]', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    kept_words = [word for word in cleaned.strip().split() if word not in stop_words]
    return ' '.join(kept_words)

# Preprocess text data: overwrite each abstract with its cleaned form from
# preprocess_text. progress_apply is the progress-bar variant of Series.apply
# registered by the tqdm.pandas() call in the import cell.
df_text['appln_abstract'] = df_text['appln_abstract'].progress_apply(preprocess_text)

100%|██████████| 200000/200000 [00:29<00:00, 6685.76it/s]


In [10]:
# Person <-> application links for both classes, every column read as text.
df_cleantech_authors = pd.read_csv(
    '/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_patstat_person_id.csv'
).astype(str)
df_non_cleantech_authors = pd.read_csv(
    '/mnt/hdd02/Projekt_EDV_TEK/edv_tek_non_cleantech_all_patstat_person_id.csv'
).astype(str)

# Keep only the links whose application is part of the sampled patents.
df_cleantech_authors = df_cleantech_authors.loc[
    df_cleantech_authors['appln_id'].isin(df_text['appln_id'])
]
df_non_cleantech_authors = df_non_cleantech_authors.loc[
    df_non_cleantech_authors['appln_id'].isin(df_text['appln_id'])
]

# Edge table: all remaining links. Node table: one row per distinct person.
df_authors_edges = pd.concat([df_cleantech_authors, df_non_cleantech_authors], ignore_index=True)
df_authors_nodes = df_authors_edges[['person_id']].drop_duplicates(ignore_index=True)

In [11]:
# Non-cleantech citations get their own cell because the file is very large.
df_non_cleantech_citations = pd.read_csv(
    '/mnt/hdd02/Projekt_EDV_TEK/edv_tek_non_cleantech_all_patstat_citations.csv'
).astype(str)

# A citation is kept only when both the citing and the cited application
# belong to the sampled patents.
_nc_citing_in_sample = df_non_cleantech_citations['pat_appln_id'].isin(df_text['appln_id'])
_nc_cited_in_sample = df_non_cleantech_citations['cited_pat_appln_id'].isin(df_text['appln_id'])
df_non_cleantech_citations = df_non_cleantech_citations[_nc_citing_in_sample & _nc_cited_in_sample]

In [12]:
# Cleantech -> cleantech citations, every column read as text.
df_cleantech_citations = pd.read_csv(
    '/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_patstat_citations.csv'
).astype(str)

# Restrict to citations whose two endpoints are both in the sampled patents.
_ct_citing_in_sample = df_cleantech_citations['pat_appln_id'].isin(df_text['appln_id'])
_ct_cited_in_sample = df_cleantech_citations['cited_pat_appln_id'].isin(df_text['appln_id'])
df_cleantech_citations = df_cleantech_citations[_ct_citing_in_sample & _ct_cited_in_sample]

# The full cross-citation table still carries its original column names
# (only the sampled copy was renamed earlier); align it with the other tables.
df_cleantech_non_cleantech_citations = df_cleantech_non_cleantech_citations.rename(
    columns={"citing_appln_id": "pat_appln_id", "cited_appln_id": "cited_pat_appln_id"}
)

# Same endpoint restriction for the cross-citations.
_cross_citing_in_sample = df_cleantech_non_cleantech_citations['pat_appln_id'].isin(df_text['appln_id'])
_cross_cited_in_sample = df_cleantech_non_cleantech_citations['cited_pat_appln_id'].isin(df_text['appln_id'])
df_cleantech_non_cleantech_citations = df_cleantech_non_cleantech_citations[
    _cross_citing_in_sample & _cross_cited_in_sample
]

In [13]:
# Stack the three citation tables into one edge list.
df_citations = pd.concat(
    [df_cleantech_citations, df_non_cleantech_citations, df_cleantech_non_cleantech_citations],
    ignore_index=True,
)

# Re-apply the "both endpoints sampled" restriction to the combined table,
# then keep a single row per (citing, cited) pair.
_citing_in_sample = df_citations['pat_appln_id'].isin(df_text['appln_id'])
_cited_in_sample = df_citations['cited_pat_appln_id'].isin(df_text['appln_id'])
df_citations = df_citations[_citing_in_sample & _cited_in_sample].drop_duplicates(
    subset=['pat_appln_id', 'cited_pat_appln_id']
)

In [14]:
### Label Mapping Check ###
# Series indexed by appln_id that yields each sampled patent's class label.
label_mapping = df_text.set_index('appln_id')['label']

# Attach the label of the citing and of the cited patent to every citation.
for _label_column, _id_column in (
    ('pat_label', 'pat_appln_id'),
    ('cited_label', 'cited_pat_appln_id'),
):
    df_citations[_label_column] = df_citations[_id_column].map(label_mapping)

# Label frequencies per side; IDs without a match (NaN) are not counted.
pat_counts = df_citations['pat_label'].value_counts(dropna=True)
cited_counts = df_citations['cited_label'].value_counts(dropna=True)

print("Label counts for pat_appln_id:")
print(pat_counts)
print("\nLabel counts for cited_pat_appln_id:")
print(cited_counts)

# Among citations where both labels were found, count those that connect
# patents of different classes.
df_valid = df_citations.dropna(subset=['pat_label', 'cited_label'])
different_labels = df_valid['pat_label'].ne(df_valid['cited_label']).sum()

print("\nNumber of rows where pat_appln_id and cited_pat_appln_id have different labels:")
print(different_labels)

Label counts for pat_appln_id:
pat_label
1    400123
0    132387
Name: count, dtype: int64

Label counts for cited_pat_appln_id:
cited_label
0    409977
1    122533
Name: count, dtype: int64

Number of rows where pat_appln_id and cited_pat_appln_id have different labels:
288986


In [15]:
# Row counts, in order: sampled patents, distinct authors, all author-patent
# links, cleantech author links, non-cleantech author links, citation pairs.
len(df_text), len(df_authors_nodes), len(df_authors_edges), len(df_cleantech_authors), len(df_non_cleantech_authors), len(df_citations)

(200000, 535234, 756150, 387905, 368245, 532510)

# Create Embeddings for Patents and Authors

In [16]:
# Standardiser that is fitted on (and applied to) the patent text embeddings
# in the embedding cell below.
scaler = StandardScaler()

In [None]:
# Embed Text Data
# Encode every preprocessed abstract on `device`; the resulting tensor is
# moved to the CPU and converted to NumPy so that the scaler can fit and
# transform it in one step.
embeddings_text = scaler.fit_transform(
    model.encode(
        df_text['appln_abstract'].tolist(),
        show_progress_bar=True,
        convert_to_tensor=True,
        device=device,
    )
    .cpu()
    .numpy()
)

# Store one embedding (as a Python list) per patent row and persist the frame.
df_text['embeddings'] = embeddings_text.tolist()
df_text.to_csv(
    f'/mnt/hdd02/Projekt_EDV_TEK/gnn_dataset_identification/edv_tek_all_text_embeddings_distilbert.csv',
    index=False,
)

Batches:   0%|          | 0/6250 [00:00<?, ?it/s]

In [18]:
# Precompute mapping: person_id -> list of appln_ids (from df_authors_edges)
person_to_applns = df_authors_edges.groupby('person_id')['appln_id'].agg(list).to_dict()

# Precompute mapping: appln_id -> row POSITION in df_text. embeddings_text is
# indexed by position, so positions are used explicitly instead of index
# labels (the two only coincide while df_text has a default 0..n-1 index).
appln_to_index = dict(zip(df_text['appln_id'], range(len(df_text))))

# Seeded generator for the fallback below, so that repeated runs produce the
# same author features.
fallback_rng = np.random.default_rng(SEED)

author_features = {}
for person_id in tqdm(df_authors_nodes['person_id']):
    # Get all application IDs for this author using the precomputed mapping
    author_appln_ids = person_to_applns.get(person_id, [])

    # Rows of the author's patents that are present in df_text
    indices = [appln_to_index[appln] for appln in author_appln_ids if appln in appln_to_index]

    if indices:
        # Author feature = mean of the (scaled) embeddings of their patents
        author_emb = np.mean(embeddings_text[indices], axis=0)
    else:
        # Fallback to random initialization if no patents are found
        author_emb = fallback_rng.normal(size=embeddings_text.shape[1])

    author_features[person_id] = author_emb

# Add embeddings to the authors DataFrame
df_authors_nodes['embeddings'] = df_authors_nodes['person_id'].map(author_features).tolist()

100%|██████████| 535234/535234 [00:07<00:00, 67555.17it/s]


# Create Edge Indices

In [19]:
# Create index column: copy each frame's row index into an explicit 'id'
# column. The edge tables built in the next cell map appln_id / person_id
# onto these ids.
df_text['id'] = df_text.index
df_authors_nodes['id'] = df_authors_nodes.index

In [20]:
# Create Edges DataFrames
# Lookup Series (business key -> node id), built once and reused below.
_author_id_by_person = df_authors_nodes.set_index('person_id')['id']
_text_id_by_appln = df_text.set_index('appln_id')['id']

# Author-patent edges: translate both endpoints to node ids and drop links
# for which either endpoint has no node.
df_authors_edges['author_id'] = df_authors_edges['person_id'].map(_author_id_by_person)
df_authors_edges['text_id'] = df_authors_edges['appln_id'].map(_text_id_by_appln)
df_authors_edges = df_authors_edges.dropna(subset=['author_id', 'text_id'])

# Citation edges: same translation, done on a copy of the citation table.
df_citations_edges = df_citations.copy()
df_citations_edges['text_id'] = df_citations_edges['pat_appln_id'].map(_text_id_by_appln)
df_citations_edges['cited_text_id'] = df_citations_edges['cited_pat_appln_id'].map(_text_id_by_appln)
df_citations_edges = df_citations_edges.dropna(subset=['text_id', 'cited_text_id'])

In [21]:
# Cast the node-id columns of both edge tables to integers.
df_authors_edges = df_authors_edges.astype({'author_id': int, 'text_id': int})
df_citations_edges = df_citations_edges.astype({'text_id': int, 'cited_text_id': int})

# Build HDF5 Dataset

In [None]:
def string_to_array(str_repr):
    """Parse a textual list such as '[0.1, 0.2]' into a 1-D float array.

    Leading and trailing square brackets are stripped and the remainder is
    read as comma-separated numbers.
    """
    without_brackets = str_repr.strip('[]')
    return np.fromstring(without_brackets, sep=',')

# Write the graph to a single HDF5 file (mode 'w': create or overwrite).
with h5py.File('/mnt/hdd02/Projekt_EDV_TEK/gnn_dataset_identification/edv_tek_identification_gnn_dataset.h5', 'w') as f:
    # Patent nodes: stacked per-row embeddings and int64 class labels
    patent_features = np.stack(df_text['embeddings'].values)
    f.create_dataset('patent_nodes/x', data=patent_features)
    patent_labels = df_text['label'].values.astype(np.int64)
    f.create_dataset('patent_nodes/y', data=patent_labels)

    # Author nodes: stacked per-row embeddings
    author_features_matrix = np.stack(df_authors_nodes['embeddings'].values)
    f.create_dataset('author_nodes/x', data=author_features_matrix)

    # Edge lists as int64 (source id, target id) pairs, one row per edge
    citation_pairs = df_citations_edges[['text_id', 'cited_text_id']].values
    f.create_dataset('patent_citations', data=citation_pairs, dtype=np.int64)
    author_patent_pairs = df_authors_edges[['author_id', 'text_id']].values
    f.create_dataset('author_patent_edges', data=author_patent_pairs, dtype=np.int64)