In [1]:
import pandas as pd 
import h5py
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
from tqdm import tqdm
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


# Load all required DataFrames

In [2]:
# Node tables: patents come from a parquet file, OpenAlex papers from a CSV.
df_patent = pd.read_parquet(
    path='/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_title_abstract.parquet'
)
df_paper = pd.read_csv(
    filepath_or_buffer='/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_openalex_title_abstract.csv'
)

# Within-type citation tables (PATSTAT for patents, OpenAlex for papers).
df_patent_citations = pd.read_csv(
    filepath_or_buffer='/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_patstat_citations.csv'
)
df_paper_citations = pd.read_csv(
    filepath_or_buffer='/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_openalex_citations.csv'
)

# Preprocess Patent-Paper Citations and Patent-Paper Pairs

In [28]:
# Patent-to-paper citation table from the Reliance_on_Science directory.
df_patent_paper_citations = pd.read_csv(
    filepath_or_buffer="/mnt/hdd02/Projekt_EDV_TEK/Reliance_on_Science/_pcs_oa.csv"
)

# Patent-paper pair table; its paper column is called 'paperid', so rename it
# to 'oaid' to share one paper key name with the other frames in this notebook.
df_patent_paper_pairs = pd.read_csv(
    filepath_or_buffer="/mnt/hdd02/Projekt_EDV_TEK/Reliance_on_Science/_patent_paper_pairs.csv"
).rename(columns={'paperid': 'oaid'})

In [None]:
# Keep only US patents. Per the original author's note, only "US" patents occur
# in the patent-paper link data (loaded here as df_patent_paper_citations and
# df_patent_paper_pairs) — presumably other authorities could never match.
df_patent = df_patent[df_patent['publn_auth'] == 'US']

# Extract publication numbers: the second '-'-separated field of 'patent'.
df_patent_paper_citations['publn_nr'] = df_patent_paper_citations['patent'].str.split('-').str[1]
df_patent_paper_pairs['publn_nr'] = df_patent_paper_pairs['patent'].str.split('-').str[1]

# Add the leading "W" to oaid in df_patent_paper_citations exactly once.
# A plain `'W' + oaid` would turn ids into 'WW...' if this cell is re-run, and
# the isin() filter below would then silently discard every row.
citation_oaid = df_patent_paper_citations['oaid'].astype(str)
df_patent_paper_citations['oaid'] = citation_oaid.where(
    citation_oaid.str.startswith('W'), 'W' + citation_oaid
)

# Restrict both link tables to papers present in df_paper AND patents present
# in df_patent. One combined mask per table gives the same rows as filtering
# twice, without building an intermediate frame.
known_paper_ids = df_paper['oaid']
known_publn_nrs = df_patent['publn_nr']

df_patent_paper_citations = df_patent_paper_citations[
    df_patent_paper_citations['oaid'].isin(known_paper_ids)
    & df_patent_paper_citations['publn_nr'].isin(known_publn_nrs)
]
df_patent_paper_pairs = df_patent_paper_pairs[
    df_patent_paper_pairs['oaid'].isin(known_paper_ids)
    & df_patent_paper_pairs['publn_nr'].isin(known_publn_nrs)
]

In [41]:
# Filter df_patent_citations to citations whose citing AND cited application
# are both in df_patent. Ids are compared as strings on both sides.
df_patent_citations['pat_appln_id'] = df_patent_citations['pat_appln_id'].astype(str)
df_patent_citations['cited_pat_appln_id'] = df_patent_citations['cited_pat_appln_id'].astype(str)
df_patent['appln_id'] = df_patent['appln_id'].astype(str)

# A single combined mask selects the same rows as two successive filters but
# avoids materialising an intermediate copy of this very large table.
known_appln_ids = df_patent['appln_id']
df_patent_citations = df_patent_citations[
    df_patent_citations['pat_appln_id'].isin(known_appln_ids)
    & df_patent_citations['cited_pat_appln_id'].isin(known_appln_ids)
]

# Filter df_paper_citations to citations whose citing AND cited paper are both
# in df_paper. The "W" prefix is added only when missing, so re-running this
# cell cannot produce 'WW...' ids that would match nothing.
for oaid_column in ('oaid', 'cited_oaid'):
    raw_ids = df_paper_citations[oaid_column].astype(str)
    df_paper_citations[oaid_column] = raw_ids.where(
        raw_ids.str.startswith('W'), 'W' + raw_ids
    )

known_paper_ids = df_paper['oaid']
df_paper_citations = df_paper_citations[
    df_paper_citations['oaid'].isin(known_paper_ids)
    & df_paper_citations['cited_oaid'].isin(known_paper_ids)
]

In [42]:
# Row counts, in order: patent->paper citations, patent-paper pairs, patents,
# papers, patent->patent citations, paper->paper citations.
tuple(
    len(frame)
    for frame in (
        df_patent_paper_citations,
        df_patent_paper_pairs,
        df_patent,
        df_paper,
        df_patent_citations,
        df_paper_citations,
    )
)

(5586603, 97928, 1642111, 1137787, 236311564, 9172502)

# Embed Patent and Paper Text Columns

In [None]:
# Build the model input for each node table as "<title> [SEP] <abstract>".
for node_frame, title_column, abstract_column in (
    (df_patent, 'appln_title', 'appln_abstract'),
    (df_paper, 'title', 'abstract'),
):
    node_frame['text'] = node_frame[title_column] + ' [SEP] ' + node_frame[abstract_column]

In [7]:
# Instantiate the sentence-transformer encoder and place it on the GPU when
# torch reports one as available, otherwise on the CPU.
print("Loading a local sentence-transformer model...")
target_device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('paraphrase-MiniLM-L6-v2', use_auth_token=False)
model.to(target_device)
print(f"Model loaded on {model.device}")

Loading a local sentence-transformer model...




Model loaded on cuda:0


In [8]:
# Function to batch encode texts with progress bar
def encode_texts(texts, batch_size=32):
    """Encode texts in batches with a progress bar.

    Uses the notebook-level ``model`` to embed ``texts`` chunk by chunk and
    stacks the per-chunk results row-wise.

    Args:
        texts: Sequence of texts; must support ``len()`` and slicing.
        batch_size: Number of texts passed to the model per step. Must be >= 1.

    Returns:
        A 2-D ``numpy.ndarray`` with one row per input text, in input order.
        For empty input an empty array of shape ``(0, 0)`` is returned, because
        the embedding width is not known without running the model.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    if len(texts) == 0:
        # np.vstack() raises on an empty list, so short-circuit here.
        # float32 is assumed to match the model output dtype — TODO confirm.
        return np.empty((0, 0), dtype=np.float32)

    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size), desc="Encoding texts"):
        batch = texts[i:i+batch_size]
        # Forward batch_size so the encoder processes the whole chunk at once
        # instead of re-splitting it with its own default; silence its inner
        # progress bar since the outer tqdm already reports progress.
        batch_embeddings = model.encode(
            batch,
            batch_size=batch_size,
            convert_to_tensor=True,
            show_progress_bar=False,
        )
        embeddings.append(batch_embeddings.cpu().numpy())
    return np.vstack(embeddings)

In [9]:
# Patents: drop rows without text, then embed the remaining texts.
print("Encoding patent texts...")
patent_has_text = df_patent['text'].notna()
df_patent = df_patent.loc[patent_has_text]
patent_texts = df_patent['text'].to_list()
patent_embeddings = encode_texts(patent_texts)
print(f"Patent embeddings shape: {patent_embeddings.shape}")

# Papers: same procedure.
print("Encoding paper texts...")
paper_has_text = df_paper['text'].notna()
df_paper = df_paper.loc[paper_has_text]
paper_texts = df_paper['text'].to_list()
paper_embeddings = encode_texts(paper_texts)
print(f"Paper embeddings shape: {paper_embeddings.shape}")

# Attach one embedding row per remaining table row (positional, same order
# as the text lists built above).
df_patent['embedding'] = [row_vector for row_vector in patent_embeddings]
df_paper['embedding'] = [row_vector for row_vector in paper_embeddings]

Encoding patent texts...


Encoding texts: 100%|██████████| 51316/51316 [13:30<00:00, 63.29it/s]


Patent embeddings shape: (1642111, 384)
Encoding paper texts...


Encoding texts: 100%|██████████| 35556/35556 [11:27<00:00, 51.71it/s]


Paper embeddings shape: (1137787, 384)


# Create Edge Indices

In [43]:
# Give each node table a fresh 0..n-1 index and copy that index into an 'id'
# column, which the edge tables are mapped onto afterwards.
for node_frame in (df_patent, df_paper):
    node_frame.reset_index(drop=True, inplace=True)
    node_frame['id'] = node_frame.index

In [44]:
# Create all edge indices
def _build_edge_frame(links, source_col, source_lookup, target_col, target_lookup):
    """Translate two key columns of ``links`` into node ids.

    Args:
        links: DataFrame holding one edge per row.
        source_col: Column of ``links`` with the key of the edge's source node.
        source_lookup: Series mapping source keys (its index) to node ids.
        target_col: Column of ``links`` with the key of the edge's target node.
        target_lookup: Series mapping target keys (its index) to node ids.

    Returns:
        DataFrame with columns ``'id'`` and ``'cited_id'`` that keeps the index
        of ``links`` and contains only rows where both keys were found.
        ``links`` itself is not modified.
    """
    edges = pd.DataFrame({
        'id': links[source_col].map(source_lookup),
        'cited_id': links[target_col].map(target_lookup),
    })
    # Keys missing from a lookup map to NaN; such edges are discarded.
    return edges.dropna(subset=['id', 'cited_id'])


# Build each key -> node-id lookup once instead of re-running set_index() for
# every mapped column.
# NOTE(review): Series.map with a Series argument needs a unique lookup index —
# confirm appln_id, publn_nr and oaid are unique in their tables.
patent_id_by_appln = df_patent.set_index('appln_id')['id']
patent_id_by_publn = df_patent.set_index('publn_nr')['id']
paper_id_by_oaid = df_paper.set_index('oaid')['id']

# patent -> patent
df_patent_citations = _build_edge_frame(
    df_patent_citations,
    'pat_appln_id', patent_id_by_appln,
    'cited_pat_appln_id', patent_id_by_appln,
)

# paper -> paper
df_paper_citations = _build_edge_frame(
    df_paper_citations,
    'oaid', paper_id_by_oaid,
    'cited_oaid', paper_id_by_oaid,
)

# patent -> paper (citations)
df_patent_paper_citations = _build_edge_frame(
    df_patent_paper_citations,
    'publn_nr', patent_id_by_publn,
    'oaid', paper_id_by_oaid,
)

# patent -> paper (pairs); repeated (id, cited_id) combinations are collapsed.
df_patent_paper_pairs = _build_edge_frame(
    df_patent_paper_pairs,
    'publn_nr', patent_id_by_publn,
    'oaid', paper_id_by_oaid,
).drop_duplicates(subset=['id', 'cited_id'])

In [45]:
# Row counts, in order: patent->paper citations, patent-paper pairs, patents,
# papers, patent->patent citations, paper->paper citations.
tuple(
    len(frame)
    for frame in (
        df_patent_paper_citations,
        df_patent_paper_pairs,
        df_patent,
        df_paper,
        df_patent_citations,
        df_paper_citations,
    )
)

(5586603, 97928, 1642111, 1137787, 236311564, 9172502)

In [46]:
# Cast both endpoint columns of every edge table to int.
edge_id_dtypes = {'id': int, 'cited_id': int}
df_patent_citations = df_patent_citations.astype(edge_id_dtypes)
df_paper_citations = df_paper_citations.astype(edge_id_dtypes)
df_patent_paper_citations = df_patent_paper_citations.astype(edge_id_dtypes)
df_patent_paper_pairs = df_patent_paper_pairs.astype(edge_id_dtypes)

# Create HDF5 File

In [49]:
# Stack the per-row embedding vectors into 2-D arrays BEFORE opening the file:
# mode 'w' truncates an existing file, so a failure while stacking (e.g. running
# out of memory) must not be able to destroy a previously written dataset.
patent_embeddings_array = np.stack(df_patent['embedding'].values)
paper_embeddings_array = np.stack(df_paper['embedding'].values)

# Edge tables to store, keyed by dataset name; each one is written as an
# (n_edges, 2) array of [id, cited_id].
edge_frames = {
    'patent_citations': df_patent_citations,
    'paper_citations': df_paper_citations,
    'patent_paper_citations': df_patent_paper_citations,
    'patent_paper_pairs': df_patent_paper_pairs,
}

# Open an HDF5 file
with h5py.File('/mnt/hdd02/Projekt_EDV_TEK/gnn_dataset_emergence/edv_tek_emergence_gnn_dataset.h5', 'w') as f:
    # Save node data
    f.create_dataset('patent_embeddings', data=patent_embeddings_array)
    f.create_dataset('paper_embeddings', data=paper_embeddings_array)

    # Save edge indices; each array is converted only when written so that all
    # four are never held in memory at the same time.
    for dataset_name, edge_frame in edge_frames.items():
        f.create_dataset(dataset_name, data=edge_frame[['id', 'cited_id']].to_numpy())