In [1]:
import pandas as pd
import re
import numpy as np
import torch
import h5py
import ast
import torch
import multiprocessing as mp
import os.path as osp
import gcld3
from sqlalchemy import create_engine, URL, text, MetaData, Table
from tqdm import tqdm
from tqdm.auto import tqdm
tqdm.pandas()
from rapidfuzz import fuzz, process, distance
from rapidfuzz.distance import Levenshtein
# from concurrent.futures import ProcessPoolExecutor, as_completed
from torch_geometric.data import HeteroData, Dataset, Data
from torch_geometric.nn import SAGEConv, GATConv, HeteroConv, MessagePassing
from torch_geometric.loader import NeighborLoader
from torch_geometric.utils import add_self_loops, degree
from sentence_transformers import SentenceTransformer

In [2]:
# All encoding in this notebook runs on the CPU; the CUDA auto-detection is kept
# below as a commented-out alternative.
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'

# gcld3 language identifier, configured to read between 0 and 1000 bytes per input.
detector = gcld3.NNetLanguageIdentifier(
    min_num_bytes=0,
    max_num_bytes=1000,
)

# Sentence encoder built on the plain DistilBERT checkpoint. The checkpoint is not a
# packaged sentence-transformers model, so the library adds mean pooling on load
# (see the message printed below this cell).
model = SentenceTransformer('distilbert/distilbert-base-uncased')

No sentence-transformers model found with name /home/thiesen/.cache/torch/sentence_transformers/distilbert_distilbert-base-uncased. Creating a new one with MEAN pooling.


# Initiate Database Connection

In [None]:
# Build the database URL from environment variables so that no credentials are
# stored in the notebook. Expected variables:
#   PATSTAT_DB_DRIVER   (defaults to "postgresql"; the queries use PostgreSQL casts)
#   PATSTAT_DB_USER, PATSTAT_DB_PASSWORD, PATSTAT_DB_HOST, PATSTAT_DB_NAME
#   PATSTAT_DB_PORT     (optional; must be an integer when set)
import os

_db_port = os.environ.get("PATSTAT_DB_PORT")
url_object = URL.create(
    drivername=os.environ.get("PATSTAT_DB_DRIVER", "postgresql"),
    username=os.environ.get("PATSTAT_DB_USER"),
    password=os.environ.get("PATSTAT_DB_PASSWORD"),
    host=os.environ.get("PATSTAT_DB_HOST"),
    # URL.create needs an int or None here; an empty string is not a valid port.
    port=int(_db_port) if _db_port else None,
    database=os.environ.get("PATSTAT_DB_NAME"),
)
engine = create_engine(url_object)

In [8]:
# Re-create the engine from url_object (the same call that ends the previous cell).
# NOTE(review): this cell looks redundant with the one above -- confirm and remove.
engine = create_engine(url_object)

# Extract Cleantech Patents from PATSTAT

## Extract all Cleantech Patents from table TLS224_APPLN_CPC

In [4]:
# SQL query
# Selects every (appln_id, cpc_class_symbol) row of tls224_appln_cpc whose CPC symbol
# starts with 'Y02'. The percent sign is doubled ('%%'), presumably so that it survives
# DBAPI parameter formatting -- TODO confirm against the driver in use.
query_patstat_cleantech_all = """
    SELECT appln_id, cpc_class_symbol
    FROM tls224_appln_cpc
    WHERE cpc_class_symbol LIKE 'Y02%%';
"""

In [None]:
# Pull all Y02 rows from PATSTAT into a DataFrame.
cpc_rows = pd.read_sql_query(query_patstat_cleantech_all, engine)

# Collapse to one row per application, gathering its CPC symbols into a list.
df_patstat_cleantech_all = (
    cpc_rows
    .groupby('appln_id')['cpc_class_symbol']
    .apply(list)
    .reset_index(name='cpc_class_symbol')
)

# Store appln_id as text, keeping only the part in front of any '.'.
appln_id_text = df_patstat_cleantech_all['appln_id'].astype(str)
df_patstat_cleantech_all['appln_id'] = appln_id_text.str.split('.').str[0]

In [None]:
# Print the number of distinct applications that carry at least one Y02 CPC symbol
# (the frame holds one row per appln_id after the aggregation above).
print(f"Count of all Cleantech patents in PATSTAT: {len(df_patstat_cleantech_all)}")

## Filter Cleantech Patents by Granted = Y

In [None]:
# Upload only the join key. The granted-filter query reads nothing but appln_id from
# this temporary table, and the list-valued cpc_class_symbol column may not be
# storable in a plain SQL column.
df_patstat_cleantech_all[['appln_id']].to_sql('temp_patstat_cleantech_all', engine, if_exists='replace', index=False)

In [None]:
# Preview the first rows of the aggregated cleantech frame (appln_id plus CPC symbol list).
df_patstat_cleantech_all.head()

In [None]:
# SQL query to filter by granted patents
# Joins the uploaded temporary table to tls201_appln and keeps the application ids
# whose `granted` flag is 'Y'. tls201_appln.appln_id is cast to text because the
# temporary table stores appln_id as a string.
query_filter_cleantech_granted = """
    SELECT temp_patstat_cleantech_all.appln_id
    FROM temp_patstat_cleantech_all
    INNER JOIN tls201_appln ON temp_patstat_cleantech_all.appln_id = CAST(tls201_appln.appln_id AS text)
    WHERE tls201_appln.granted = 'Y'
"""

In [None]:
# Application ids of the granted cleantech patents, as returned by the filter query.
granted_ids = pd.read_sql_query(query_filter_cleantech_granted, engine)

# Re-attach each application's CPC symbol list with a left join on appln_id.
df_patstat_cleantech_granted = pd.merge(
    granted_ids,
    df_patstat_cleantech_all,
    how='left',
    on='appln_id',
)

In [None]:
# Delete temporary table
# Engine.execute() and raw-string statements are not available in SQLAlchemy 2.x.
# Run the statement on a connection inside a transaction (committed when the block
# exits) and wrap the SQL in text().
with engine.begin() as connection:
    connection.execute(text("DROP TABLE IF EXISTS temp_patstat_cleantech_all"))

## Source required Metadata for Cleantech Patents

In [None]:
# Upload only the join key. The metadata query reads nothing but appln_id from this
# temporary table, and the list-valued cpc_class_symbol column may not be storable
# in a plain SQL column.
df_patstat_cleantech_granted[['appln_id']].to_sql('temp_patstat_cleantech_granted', engine, if_exists='replace', index=False)

In [None]:
# SQL query to select data from multiple tables
# For every application id in temp_patstat_cleantech_granted this fetches:
#   - filing information from tls201_appln (inner join, restricted to granted = 'Y'),
#   - title and title language from tls202_appln_title (left join, may be NULL),
#   - abstract and abstract language from tls203_appln_abstr (left join, may be NULL).
# The PATSTAT appln_id columns are cast to text to match the string ids in the
# temporary table.
query_cleantech_metadata = """
    SELECT 
        temp_patstat_cleantech_granted.appln_id,
        tls201.appln_auth,
        tls201.appln_nr,
        tls201.appln_kind,
        tls201.appln_filing_date,
        tls201.appln_filing_year,
        tls202.appln_title_lg,
        tls202.appln_title,
        tls203.appln_abstract_lg,
        tls203.appln_abstract
    FROM 
        temp_patstat_cleantech_granted
    INNER JOIN 
        tls201_appln AS tls201 ON temp_patstat_cleantech_granted.appln_id = CAST(tls201.appln_id AS text)
    LEFT JOIN 
        tls202_appln_title AS tls202 ON temp_patstat_cleantech_granted.appln_id = CAST(tls202.appln_id AS text)
    LEFT JOIN 
        tls203_appln_abstr AS tls203 ON temp_patstat_cleantech_granted.appln_id = CAST(tls203.appln_id AS text)
    WHERE 
        tls201.granted = 'Y'
"""

In [None]:
# Run the metadata query and load the joined rows into a DataFrame.
df_patstat_cleantech_metadata = pd.read_sql_query(query_cleantech_metadata, engine)
# NOTE(review): the cleanup below is commented out, so temp_patstat_cleantech_granted
# stays in the database after this cell. If it is re-enabled, note that
# Engine.execute() is unavailable in SQLAlchemy 2.x -- use a connection plus text().
# engine.execute("DROP TABLE IF EXISTS temp_patstat_cleantech_granted")

In [None]:
# Collapse the joined rows to one record per appln_id: every metadata column becomes
# the list of values found for that application, and appln_id returns as a column.
metadata_columns = [
    'appln_auth',
    'appln_nr',
    'appln_kind',
    'appln_filing_date',
    'appln_filing_year',
    'appln_title_lg',
    'appln_title',
    'appln_abstract_lg',
    'appln_abstract',
]
df_patstat_cleantech_metadata = (
    df_patstat_cleantech_metadata
    .groupby('appln_id')
    .agg({column: (lambda values: list(values)) for column in metadata_columns})
    .reset_index()
)

In [None]:
# Report the number of rows (one per granted application) in the metadata frame.
n_granted_cleantech = len(df_patstat_cleantech_metadata)
print(f"Number of granted Cleantech patents in PATSTAT: {n_granted_cleantech}")

In [None]:
# Persist the per-application metadata as a JSON list of records.
granted_metadata_path = '/mnt/hdd01/PATSTAT Working Directory/PATSTAT/df_patstat_cleantech_granted_metadata.json'
df_patstat_cleantech_metadata.to_json(granted_metadata_path, orient='records')

# Filter Cleantech Patents for further analysis

In [None]:
# Filter out all patents that have no abstract
# After the groupby/agg step, appln_abstract holds a list per application, so a plain
# .notnull() is True for every row and filters nothing. A row is kept only when its
# list contains at least one non-null abstract.
def _has_abstract(abstracts):
    """Return True when `abstracts` (a list or a scalar) holds at least one non-null value."""
    if isinstance(abstracts, (list, tuple)):
        return any(pd.notna(item) for item in abstracts)
    return bool(pd.notna(abstracts))

df_patstat_cleantech_metadata = df_patstat_cleantech_metadata[df_patstat_cleantech_metadata['appln_abstract'].apply(_has_abstract)]
print(f"Number of granted Cleantech patents in PATSTAT with abstract: {len(df_patstat_cleantech_metadata)}")

In [None]:
# Preview the first rows of the filtered metadata frame.
df_patstat_cleantech_metadata.head()

In [None]:
# Keep only applications whose list of abstract languages includes 'en'.
has_english_abstract = df_patstat_cleantech_metadata['appln_abstract_lg'].apply(
    lambda languages: 'en' in languages
)
df_patstat_cleantech_metadata = df_patstat_cleantech_metadata[has_english_abstract]
n_english_abstract = len(df_patstat_cleantech_metadata)
print(f"Number of granted Cleantech patents in PATSTAT with english abstract; considered for training neural networks: {n_english_abstract}")

In [None]:
# Persist the English-abstract subset as a JSON list of records.
abstract_metadata_path = '/mnt/hdd01/PATSTAT Working Directory/PATSTAT/df_patstat_cleantech_granted_abstract_metadata.json'
df_patstat_cleantech_metadata.to_json(abstract_metadata_path, orient='records')

# Extract further Metadata (Authors, Inventors, Assignees)

In [3]:
# Reload the cleantech metadata (English-abstract subset) saved earlier in this notebook.
cleantech_metadata_path = '/mnt/hdd01/PATSTAT Working Directory/PATSTAT/df_patstat_cleantech_granted_abstract_metadata.json'
df_patstat_cleantech_metadata = pd.read_json(cleantech_metadata_path, orient='records')

In [4]:
# Load the non-cleantech counterpart; this file is not produced in this notebook.
non_cleantech_metadata_path = '/mnt/hdd01/PATSTAT Working Directory/PATSTAT/df_patstat_non_cleantech_granted_abstract_metadata.json'
df_patstat_non_cleantech_metadata = pd.read_json(non_cleantech_metadata_path, orient='records')

In [5]:
# Label both populations in place (1 = cleantech, 0 = non-cleantech), then stack
# them into a single frame. The original row labels of each part are kept.
for labelled_frame, label in (
    (df_patstat_cleantech_metadata, 1),
    (df_patstat_non_cleantech_metadata, 0),
):
    labelled_frame['cleantech'] = label

metadata_parts = [df_patstat_cleantech_metadata, df_patstat_non_cleantech_metadata]
df_patstat_metadata = pd.concat(metadata_parts)

In [9]:
# Upload only the application ids as a scratch table for the SQL joins that follow.
appln_id_frame = df_patstat_metadata[['appln_id']]
appln_id_frame.to_sql(name='temp_patstat_metadata', con=engine, if_exists='replace', index=False)

586

In [10]:
# Fetch the (application, person) links for every uploaded application id.
person_query = """
    SELECT tp.appln_id::TEXT, pa.person_id
    FROM temp_patstat_metadata AS tp
    JOIN tls207_pers_appln AS pa ON tp.appln_id::TEXT = pa.appln_id
"""
person_rows = pd.read_sql_query(person_query, con=engine)

# Keep each (appln_id, person_id) pair only once.
df_patstat_person_id = person_rows.drop_duplicates(subset=['appln_id', 'person_id'])

In [11]:
# Preview the first (appln_id, person_id) pairs.
df_patstat_person_id.head()

Unnamed: 0,appln_id,person_id
0,11040656,22702215
1,11040656,22702214
2,11292232,23272759
3,11292232,23272758
4,11292232,23272757


In [12]:
# Number of distinct (appln_id, person_id) pairs.
len(df_patstat_person_id)

12669908

In [13]:
# Fetch the citations made by publications of the uploaded applications, along with
# the citing publication's application id. All ids are returned as text.
citation_query = """
    SELECT c.pat_publn_id::text, c.cited_pat_publn_id::text, p.appln_id::text
    FROM tls212_citation AS c
    JOIN tls211_pat_publn AS p ON c.pat_publn_id = p.pat_publn_id
    WHERE p.appln_id IN (SELECT appln_id::text FROM temp_patstat_metadata)
"""
citation_rows = pd.read_sql_query(citation_query, con=engine)
citation_rows = citation_rows.drop_duplicates(
    subset=['pat_publn_id', 'cited_pat_publn_id', 'appln_id']
)

# Keep only citations whose cited publication also occurs as a citing publication.
cited_is_known = citation_rows['cited_pat_publn_id'].isin(citation_rows['pat_publn_id'])
df_patstat_citations = citation_rows[cited_is_known]

In [14]:
# Attach the application id of the cited publication to every citation row.
df_patstat_citations = df_patstat_citations.rename(columns={"appln_id": "pat_appln_id"})

# Publication -> application lookup. The citation frame repeats each citing
# publication once per citation it makes, so the lookup is de-duplicated first;
# otherwise the merge would emit every citation once per repeat.
publn_to_appln = (
    df_patstat_citations[['pat_publn_id', 'pat_appln_id']]
    .drop_duplicates()
    .rename(columns={
        'pat_publn_id': 'cited_pat_publn_id',
        'pat_appln_id': 'cited_pat_appln_id',
    })
)
df_patstat_citations = df_patstat_citations.merge(publn_to_appln, on='cited_pat_publn_id', how='inner')
df_patstat_citations = df_patstat_citations[['pat_publn_id', 'cited_pat_publn_id', 'pat_appln_id', 'cited_pat_appln_id']]

# Embedding of Node Properties

In [None]:
def _join_text_parts(value):
    """Join the non-null entries of a list into one space-separated string.

    Returns None when a list holds no usable entry, so the row can be dropped
    afterwards. Non-list values are returned unchanged.
    """
    if isinstance(value, list):
        parts = [str(part) for part in value if pd.notna(part)]
        return ' '.join(parts) if parts else None
    return value


# Flatten the list-valued text columns first. A list cell is never null for dropna,
# and null list entries would otherwise be embedded as the literal text "None".
for text_column in ('appln_title', 'appln_abstract'):
    df_patstat_metadata[text_column] = df_patstat_metadata[text_column].apply(_join_text_parts)

# Drop rows without a title or abstract, then renumber so that the row position
# can serve as the patent node index.
df_patstat_metadata = df_patstat_metadata.dropna(subset=['appln_title', 'appln_abstract'])
df_patstat_metadata = df_patstat_metadata.reset_index(drop=True)

# The encoder receives a plain list of strings; each row's vector is stored as a list.
texts_to_embed = (df_patstat_metadata['appln_title'] + ' [SEP] ' + df_patstat_metadata['appln_abstract']).tolist()
df_patstat_metadata['embedding'] = model.encode(texts_to_embed, device=device, show_progress_bar=True).tolist()

In [None]:
# Author node table: one row per distinct person_id. The row position is the
# author node index and `embedding` is that node's feature vector.
df_authors = (
    df_patstat_person_id[['person_id']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Authors have no text to encode, so each gets a random unit-length vector with the
# same dimensionality as the patent embeddings. The generator is seeded so that a
# re-run reproduces the same features.
d = model.get_sentence_embedding_dimension()
author_rng = np.random.default_rng(42)
author_embeddings = author_rng.random((len(df_authors), d))
author_embeddings /= np.linalg.norm(author_embeddings, axis=1, keepdims=True)
df_authors['embedding'] = list(author_embeddings)

# Build Edge Indices

In [None]:
# Map each id to the row position of its node.
# Patent keys are normalised to str: the ids looked up later come from SQL text
# casts, whereas the JSON-loaded metadata may carry numeric appln_id values.
patent_id_to_index = pd.Series(
    df_patstat_metadata.index,
    index=df_patstat_metadata['appln_id'].astype(str),
).to_dict()
# NOTE(review): requires df_authors to hold one row per person with a person_id column.
person_id_to_index = pd.Series(df_authors.index, index=df_authors['person_id']).to_dict()

In [None]:
# Patent -> cited-patent edges, expressed as row positions in df_patstat_metadata.
# The lookup is keyed by str so that it matches the text ids in the citation frame
# whatever key type patent_id_to_index was built with.
patent_lookup = {str(appln_id): node_index for appln_id, node_index in patent_id_to_index.items()}

edge_columns = ['pat_appln_id', 'cited_pat_appln_id']
df_patent_edge_index = df_patstat_citations[edge_columns].copy()
for endpoint in edge_columns:
    df_patent_edge_index[endpoint] = df_patent_edge_index[endpoint].astype(str).map(patent_lookup)

# Citations touching a patent that is missing from the node table map to NaN. They
# are dropped so that the index columns can be stored as proper integers.
df_patent_edge_index = (
    df_patent_edge_index
    .dropna(subset=edge_columns)
    .astype(np.int64)
    .drop_duplicates(subset=edge_columns)
    .reset_index(drop=True)
)

In [None]:
# Person -> patent edges, built from the (appln_id, person_id) pair table and
# expressed as node row positions. The patent lookup is keyed by str so that it
# matches the text appln_id values whatever key type patent_id_to_index uses.
patent_lookup = {str(appln_id): node_index for appln_id, node_index in patent_id_to_index.items()}

df_person_patent_edge_index = df_patstat_person_id[['person_id', 'appln_id']].copy()
df_person_patent_edge_index['person_id'] = df_person_patent_edge_index['person_id'].map(person_id_to_index)
df_person_patent_edge_index['appln_id'] = df_person_patent_edge_index['appln_id'].astype(str).map(patent_lookup)

# Pairs whose person or patent has no node map to NaN and are dropped, so that both
# columns can be stored as integers.
df_person_patent_edge_index = (
    df_person_patent_edge_index
    .dropna(subset=['person_id', 'appln_id'])
    .astype(np.int64)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Same edges with the direction reversed (patent -> person).
df_patent_person_edge_index = df_person_patent_edge_index[['appln_id', 'person_id']]

## Build GNN Model

In [None]:
def string_to_array(str_repr):
    """Parse a comma-separated string such as '[0.1, 0.2]' into a 1-D NumPy array."""
    return np.fromstring(str_repr.strip('[]'), sep=',')


def _embedding_to_array(value):
    """Return an embedding as an array, parsing it first when it is stored as a string."""
    if isinstance(value, str):
        return string_to_array(value)
    return np.asarray(value)


# Open an HDF5 file ('w' mode replaces any existing file at this path)
with h5py.File('/mnt/hdd01/patentsview/Graph Neural Network for EDV-TEK Identification/raw/torch_tek_dataset_distilbert.h5', 'w') as f:
    # Save node data
    # f.create_dataset('g_patent/x', data=np.stack(g_patent['patent_title_abstract_bert_for_patents_embedding'].apply(string_to_array).values))
    f.create_dataset('g_patent/x', data=np.stack(df_patstat_metadata["embedding"].values))
    f.create_dataset('g_patent/y', data=df_patstat_metadata['cleantech'].values.astype(np.int64))
    # The author embeddings are arrays already; calling string_to_array on them would
    # fail on .strip(), so strings and arrays are both accepted here.
    f.create_dataset('g_author_nodes/x', data=np.stack([_embedding_to_array(vector) for vector in df_authors['embedding']]))
    
    # Save edge indices (one row per edge: source node position, target node position)
    f.create_dataset('patent_edge_index', data=df_patent_edge_index.values, dtype=np.int64)
    f.create_dataset('person_patent_edge_index', data=df_person_patent_edge_index.values, dtype=np.int64)
    f.create_dataset('patent_person_edge_index', data=df_patent_person_edge_index.values, dtype=np.int64)