In [1]:
from pathlib import Path

import pandas as pd
from sqlalchemy import URL, create_engine, text
from tqdm import tqdm

# Initiate Database Connection

In [None]:
# Assemble the SQLAlchemy connection URL for the 'Patstat' PostgreSQL database
# (psycopg2 driver).
# NOTE(review): `username`, `password`, `IP` and `port` are not defined in any
# cell visible in this notebook -- they must already exist in the kernel
# before this cell runs. Confirm where they are meant to come from.
connection_settings = {
    'drivername': 'postgresql+psycopg2',
    'username': username,
    'password': password,
    'host': IP,
    'port': port,
    'database': 'Patstat',
}
url_object = URL.create(**connection_settings)

In [None]:
# Build the SQLAlchemy engine from the connection URL; all reads and writes
# against the database in the cells below go through this object.
engine = create_engine(url_object)

# Extract Cleantech Patents from PATSTAT

## Extract all Cleantech Patents from table TLS224_APPLN_CPC

In [3]:
# SQL query: select every (APPLN_ID, CPC_CLASS_SYMBOL) row of TLS224_APPLN_CPC
# whose CPC symbol contains the substring 'Y02'.
# NOTE(review): the pattern is unanchored ('%Y02%'), so it matches 'Y02'
# anywhere in the symbol rather than only as a prefix -- confirm this is
# intended.
query_patstat_cleantech_all = """
    SELECT APPLN_ID, CPC_CLASS_SYMBOL
    FROM TLS224_APPLN_CPC
    WHERE CPC_CLASS_SYMBOL LIKE '%Y02%'
"""

In [None]:
# Execute the query and write the results into a pandas DataFrame
df_patstat_cleantech_all = pd.read_sql_query(query_patstat_cleantech_all, engine)
# PostgreSQL folds the unquoted identifiers of the query to lower case, so the
# result columns arrive as 'appln_id' / 'cpc_class_symbol'. Normalise them to
# upper case so the upper-case keys used below resolve (this is a no-op when
# the names are already upper case).
df_patstat_cleantech_all.columns = df_patstat_cleantech_all.columns.str.upper()
# Aggregate by APPLN_ID: one row per application, with all of its matching
# CPC symbols collected into a list
df_patstat_cleantech_all = (
    df_patstat_cleantech_all
    .groupby('APPLN_ID')['CPC_CLASS_SYMBOL']
    .apply(list)
    .reset_index(name='CPC_CLASS_SYMBOL')
)
# Cast APPLN_ID to string and delete everything after the dot (guards against
# IDs that were read as floats, e.g. '123.0' -> '123')
df_patstat_cleantech_all['APPLN_ID'] = (
    df_patstat_cleantech_all['APPLN_ID'].astype(str).str.split('.').str[0]
)

In [None]:
# Print the number of rows in the aggregated DataFrame (one row per APPLN_ID)
print(f"Count of all Cleantech patents in PATSTAT: {len(df_patstat_cleantech_all)}")

## Filter Cleantech Patents by Granted = Y

In [None]:
# Stage the application IDs in a temporary table so that the "granted" filter
# can run as a join inside the database.
# - Only the key column is written; the list-valued CPC column is not used by
#   the join.
# - The column is written in lower case: SQLAlchemy quotes names containing
#   upper-case characters when it creates the table, and PostgreSQL folds the
#   unquoted APPLN_ID in the follow-up query to lower case, so an upper-case
#   column would not be found.
# - The key is written as an integer so it can be compared with
#   TLS201_APPLN.APPLN_ID without a text/integer type mismatch.
#   NOTE(review): assumes APPLN_ID is an integer column in TLS201_APPLN, as in
#   the standard PATSTAT schema -- confirm against this database.
(
    df_patstat_cleantech_all[['APPLN_ID']]
    .astype({'APPLN_ID': 'int64'})
    .rename(columns=str.lower)
    .to_sql('temp_patstat_cleantech_all', engine, if_exists='replace', index=False)
)

In [None]:
# SQL query to filter by granted patents: returns the APPLN_IDs of the staged
# table temp_patstat_cleantech_all that have a matching TLS201_appln row with
# GRANTED = 'Y'.
# NOTE(review): the identifiers are unquoted, so PostgreSQL folds them to
# lower case -- the staged table's key column must therefore be reachable as
# `appln_id`, and its type must be comparable with TLS201_appln.APPLN_ID.
query_filter_cleantech_granted = """
    SELECT temp_patstat_cleantech_all.APPLN_ID
    FROM temp_patstat_cleantech_all
    INNER JOIN TLS201_appln ON temp_patstat_cleantech_all.APPLN_ID = TLS201_appln.APPLN_ID
    WHERE TLS201_appln.GRANTED = 'Y'
"""

In [None]:
# Run the "granted" filter and load the surviving application IDs
df_patstat_cleantech_granted = pd.read_sql_query(query_filter_cleantech_granted, engine)
# PostgreSQL returns unquoted column names in lower case; normalise to the
# upper-case names the notebook uses (no-op if they are already upper case)
df_patstat_cleantech_granted.columns = df_patstat_cleantech_granted.columns.str.upper()
# df_patstat_cleantech_all stores APPLN_ID as a string; use the same dtype here,
# otherwise pandas refuses to merge an integer key with a string key
df_patstat_cleantech_granted['APPLN_ID'] = df_patstat_cleantech_granted['APPLN_ID'].astype(str)
# Merge with df_patstat_cleantech_all to get the cpc_class_symbol
df_patstat_cleantech_granted = df_patstat_cleantech_granted.merge(
    df_patstat_cleantech_all, on='APPLN_ID', how='left'
)

In [None]:
# Delete temporary table.
# Engine.execute() and raw-string statements no longer exist in SQLAlchemy 2.0
# (which the top-level `URL` import requires): run the statement on an explicit
# connection, wrapped in text(). engine.begin() commits on successful exit.
with engine.begin() as connection:
    connection.execute(text("DROP TABLE IF EXISTS temp_patstat_cleantech_all"))

## Source required Metadata for Cleantech Patents

In [None]:
# Stage the IDs of the granted applications in a temporary table so the
# metadata can be collected with joins inside the database.
# - Only the key column is written; the metadata query reads nothing else from
#   this table.
# - The column is written in lower case: SQLAlchemy quotes names containing
#   upper-case characters when it creates the table, and PostgreSQL folds the
#   unquoted APPLN_ID references of the metadata query to lower case.
# - The key is written as an integer so it can be joined to the APPLN_ID
#   columns of the PATSTAT tables without a text/integer type mismatch.
#   NOTE(review): assumes APPLN_ID is an integer column in the TLS2xx tables,
#   as in the standard PATSTAT schema -- confirm against this database.
(
    df_patstat_cleantech_granted[['APPLN_ID']]
    .astype({'APPLN_ID': 'int64'})
    .rename(columns=str.lower)
    .to_sql('temp_patstat_cleantech_granted', engine, if_exists='replace', index=False)
)

In [None]:
# SQL query to select data from multiple tables: for every staged granted
# application, fetch the application record (TLS201), and -- where present --
# its title (TLS202) and abstract (TLS203).
# The staged table is aliased as `temp_df`, the name every column reference and
# join condition below uses; without the alias the query cannot be resolved.
query_cleantech_metadata = """
    SELECT
        temp_df.APPLN_ID,
        tls201.APPLN_AUTH,
        tls201.APPLN_NR,
        tls201.APPLN_KIND,
        tls201.APPLN_FILING_DATE,
        tls201.APPLN_FILING_YEAR,
        tls202.APPLN_TITLE_LG,
        tls202.APPLN_TITLE,
        tls203.APPLN_ABSTRACT_LG,
        tls203.APPLN_ABSTRACT
    FROM
        temp_patstat_cleantech_granted AS temp_df
    INNER JOIN
        TLS201_APPLN AS tls201 ON temp_df.APPLN_ID = tls201.APPLN_ID
    LEFT JOIN
        TLS202_APPLN_TITLE AS tls202 ON temp_df.APPLN_ID = tls202.APPLN_ID
    LEFT JOIN
        TLS203_APPLN_ABSTR AS tls203 ON temp_df.APPLN_ID = tls203.APPLN_ID
    WHERE
        tls201.GRANTED = 'Y'
"""

In [None]:
# Fetch the metadata of all staged granted applications
df_patstat_cleantech_metadata = pd.read_sql_query(query_cleantech_metadata, engine)
# PostgreSQL returns unquoted column names in lower case; normalise to the
# upper-case names the following cells use (no-op if already upper case)
df_patstat_cleantech_metadata.columns = df_patstat_cleantech_metadata.columns.str.upper()
# Keep APPLN_ID as a string, consistent with the earlier DataFrames
df_patstat_cleantech_metadata['APPLN_ID'] = df_patstat_cleantech_metadata['APPLN_ID'].astype(str)
# Drop the staging table. Engine.execute() and raw-string statements no longer
# exist in SQLAlchemy 2.0, so use an explicit connection and text();
# engine.begin() commits on successful exit.
with engine.begin() as connection:
    connection.execute(text("DROP TABLE IF EXISTS temp_patstat_cleantech_granted"))

In [None]:
# Collapse the frame to one row per APPLN_ID: every metadata column becomes the
# list of the values found in that application's rows; APPLN_ID is restored as
# a regular column afterwards.
metadata_columns = [
    'APPLN_AUTH',
    'APPLN_NR',
    'APPLN_KIND',
    'APPLN_FILING_DATE',
    'APPLN_FILING_YEAR',
    'APPLN_TITLE_LG',
    'APPLN_TITLE',
    'APPLN_ABSTRACT_LG',
    'APPLN_ABSTRACT',
]
df_patstat_cleantech_metadata = (
    df_patstat_cleantech_metadata
    .groupby('APPLN_ID')
    .agg({column: list for column in metadata_columns})
    .reset_index()
)

In [None]:
# Report the number of rows in the metadata DataFrame
granted_count = len(df_patstat_cleantech_metadata)
print(f"Number of granted Cleantech patents in PATSTAT: {granted_count}")

In [None]:
# Persist the metadata as a JSON array of records.
# to_json does not create missing parent directories; create 'data/' up front
# so the export cannot fail after the expensive database queries have run.
metadata_json_path = Path('data/patstat_cleantech_metadata.json')
metadata_json_path.parent.mkdir(parents=True, exist_ok=True)
df_patstat_cleantech_metadata.to_json(metadata_json_path, orient='records')

# Filter Cleantech Patents for further analysis

In [None]:
# Filter out all patents that have no abstract.
# After the group-by aggregation APPLN_ABSTRACT holds a Python list per row, and
# a list object is never null -- notnull() on the column therefore kept every
# row. Inspect the list entries instead: keep a row when at least one entry is
# an actual (non-null) abstract. Scalar cells are handled too, in case the
# column has not been aggregated.
def _has_abstract(abstracts):
    """Return True if `abstracts` (a list or a single value) holds a non-null entry."""
    entries = abstracts if isinstance(abstracts, list) else [abstracts]
    return any(pd.notna(entry) for entry in entries)

# astype(bool) keeps the mask boolean even when the DataFrame is empty
has_abstract = df_patstat_cleantech_metadata['APPLN_ABSTRACT'].apply(_has_abstract).astype(bool)
df_patstat_cleantech_metadata = df_patstat_cleantech_metadata[has_abstract]
print(f"Number of granted Cleantech patents in PATSTAT with abstract: {len(df_patstat_cleantech_metadata)}")

In [None]:
# Filter out all patents where APPLN_ABSTRACT_LG is not en (english).
# After the group-by aggregation APPLN_ABSTRACT_LG holds a Python list per row.
# The .str accessor returns NaN for non-string elements, so .str.contains('en')
# produced an all-NaN result that cannot be used as a boolean mask. Test list
# membership instead. Scalar cells are handled too, in case the column has not
# been aggregated.
def _has_english_abstract(languages):
    """Return True if `languages` (a list or a single value) contains the code 'en'."""
    codes = languages if isinstance(languages, list) else [languages]
    return 'en' in codes

# astype(bool) keeps the mask boolean even when the DataFrame is empty
is_english = df_patstat_cleantech_metadata['APPLN_ABSTRACT_LG'].apply(_has_english_abstract).astype(bool)
df_patstat_cleantech_metadata = df_patstat_cleantech_metadata[is_english]
print(f"Number of granted Cleantech patents in PATSTAT with english abstract; considered for training neural networks: {len(df_patstat_cleantech_metadata)}")

In [None]:
# Persist the filtered metadata as a JSON array of records.
# to_json does not create missing parent directories; create 'data/' up front
# so the export cannot fail after the expensive steps above have run.
filtered_json_path = Path('data/patstat_cleantech_metadata_filtered.json')
filtered_json_path.parent.mkdir(parents=True, exist_ok=True)
df_patstat_cleantech_metadata.to_json(filtered_json_path, orient='records')