In [None]:
import os
from getpass import getpass

import pandas as pd
from sqlalchemy import create_engine, URL
from tqdm import tqdm
tqdm.pandas()

# Initiate Database Connection

In [None]:
# Build the connection URL for the PATSTAT PostgreSQL database.
# Credentials must not live in the notebook: every setting is read from an
# environment variable (PATSTAT_DB_USER, PATSTAT_DB_PASSWORD, PATSTAT_DB_HOST,
# PATSTAT_DB_PORT, PATSTAT_DB_NAME). Non-secret settings fall back to the
# previous values; the password has no fallback and is prompted for when the
# variable is unset or empty.
# NOTE(review): the password that used to be hard-coded here should be treated
# as exposed and rotated.
url_object = URL.create(
    drivername='postgresql+psycopg2',
    username=os.environ.get('PATSTAT_DB_USER', 'tie'),
    password=os.environ.get('PATSTAT_DB_PASSWORD') or getpass('PATSTAT database password: '),
    host=os.environ.get('PATSTAT_DB_HOST', 'localhost'),
    # URL.create documents the port as an integer, so convert explicitly.
    port=int(os.environ.get('PATSTAT_DB_PORT', '25432')),
    database=os.environ.get('PATSTAT_DB_NAME', 'Patstat'),
)

In [None]:
# Create the SQLAlchemy engine used by every query and upload in this notebook.
engine = create_engine(url=url_object)

# Extract Cleantech Patents from PATSTAT

## Extract all Cleantech Patents from table TLS224_APPLN_CPC

In [None]:
# SQL query: select every (appln_id, cpc_class_symbol) row from tls224_appln_cpc
# whose CPC symbol matches the LIKE pattern below, i.e. symbols starting with 'Y02'.
# One application can appear in several rows (one per matching CPC symbol).
# NOTE(review): the doubled '%%' presumably escapes a literal '%' for the DB
# driver's parameter formatting — confirm; if it reaches the server unescaped,
# LIKE still treats both characters as wildcards, so the match is the same.
query_patstat_cleantech_all = """
    SELECT appln_id, cpc_class_symbol
    FROM tls224_appln_cpc
    WHERE cpc_class_symbol LIKE 'Y02%%';
"""

In [None]:
# Load the Y02 rows, collapse them to one row per application, and normalise the key:
#   1. run the query and read the result into a DataFrame,
#   2. group by appln_id, collecting all of its cpc_class_symbol values in a list,
#   3. turn appln_id into a string and keep only the part before the first dot.
df_patstat_cleantech_all = (
    pd.read_sql_query(query_patstat_cleantech_all, engine)
    .groupby('appln_id')['cpc_class_symbol']
    .apply(list)
    .reset_index(name='cpc_class_symbol')
    .assign(
        appln_id=lambda frame: frame['appln_id'].astype(str).str.split('.').str[0]
    )
)

In [None]:
# Print the number of rows in the aggregated DataFrame, i.e. the number of
# distinct applications carrying at least one Y02 CPC symbol (the frame holds
# one row per appln_id after the group-by).
# The message previously misspelled "Cleantech" as "Clantech".
print(f"Count of all Cleantech patents in PATSTAT: {len(df_patstat_cleantech_all)}")

## Filter Cleantech Patents by Granted = Y

In [None]:
# Upload the application ids to a temporary table so they can be joined against
# tls201_appln inside the database.
# Only the join key is written: the follow-up query reads nothing but appln_id
# from this table, and the list-valued cpc_class_symbol column has no natural
# SQL column type. The CPC symbols are merged back in from the DataFrame after
# the join, and the table is dropped once the join result has been read.
df_patstat_cleantech_all[['appln_id']].to_sql('temp_patstat_cleantech_all', engine, if_exists='replace', index=False)

In [None]:
# Preview the first five aggregated rows.
df_patstat_cleantech_all.head(n=5)

In [None]:
# SQL query to filter by granted patents: inner-join the uploaded temporary
# table with tls201_appln and return the appln_id of every application whose
# `granted` flag equals 'Y'.
# tls201_appln.appln_id is cast to text for the comparison; the temporary
# table's appln_id was written from a DataFrame column that had been converted
# to strings, so the two sides are compared as text.
query_filter_cleantech_granted = """
    SELECT temp_patstat_cleantech_all.appln_id
    FROM temp_patstat_cleantech_all
    INNER JOIN tls201_appln ON temp_patstat_cleantech_all.appln_id = CAST(tls201_appln.appln_id AS text)
    WHERE tls201_appln.granted = 'Y'
"""

In [None]:
# Read the ids of the granted applications and attach their CPC symbol lists.
# The left merge keeps every granted id and pulls cpc_class_symbol from the
# aggregated Cleantech DataFrame via the shared appln_id key.
df_patstat_cleantech_granted = pd.merge(
    pd.read_sql_query(query_filter_cleantech_granted, engine),
    df_patstat_cleantech_all,
    how='left',
    on='appln_id',
)

In [None]:
# Delete temporary table.
# Engine.execute() was removed in SQLAlchemy 2.0, so the statement is run on an
# explicit connection instead. engine.begin() opens a transaction and commits
# it when the block exits without error, so the DROP is persisted.
with engine.begin() as connection:
    connection.exec_driver_sql("DROP TABLE IF EXISTS temp_patstat_cleantech_all")

## Source required Metadata for Cleantech Patents

In [None]:
# Write the granted Cleantech applications to a temporary table (replacing any
# existing table of that name, without the DataFrame index) so the metadata
# query can join against it.
df_patstat_cleantech_granted.to_sql(
    name='temp_patstat_cleantech_granted',
    con=engine,
    index=False,
    if_exists='replace',
)

In [None]:
# SQL query to select data from multiple tables for the granted Cleantech
# applications stored in temp_patstat_cleantech_granted:
#   - tls201_appln       (INNER JOIN): authority, number, kind, filing date/year
#   - tls202_appln_title (LEFT JOIN):  title language and title
#   - tls203_appln_abstr (LEFT JOIN):  abstract language and abstract
# The LEFT JOINs keep applications that have no title or no abstract row; their
# title/abstract columns come back as NULL.
# Each appln_id from the PATSTAT tables is cast to text because the temporary
# table's appln_id is compared as text.
# The WHERE clause restricts the result to rows whose `granted` flag is 'Y'.
query_cleantech_metadata = """
    SELECT 
        temp_patstat_cleantech_granted.appln_id,
        tls201.appln_auth,
        tls201.appln_nr,
        tls201.appln_kind,
        tls201.appln_filing_date,
        tls201.appln_filing_year,
        tls202.appln_title_lg,
        tls202.appln_title,
        tls203.appln_abstract_lg,
        tls203.appln_abstract
    FROM 
        temp_patstat_cleantech_granted
    INNER JOIN 
        tls201_appln AS tls201 ON temp_patstat_cleantech_granted.appln_id = CAST(tls201.appln_id AS text)
    LEFT JOIN 
        tls202_appln_title AS tls202 ON temp_patstat_cleantech_granted.appln_id = CAST(tls202.appln_id AS text)
    LEFT JOIN 
        tls203_appln_abstr AS tls203 ON temp_patstat_cleantech_granted.appln_id = CAST(tls203.appln_id AS text)
    WHERE 
        tls201.granted = 'Y'
"""

In [None]:
# Run the metadata query and load the joined result set into a DataFrame.
df_patstat_cleantech_metadata = pd.read_sql_query(
    sql=query_cleantech_metadata,
    con=engine,
)
# The temporary table is not dropped at this point; the statement below stays disabled.
# engine.execute("DROP TABLE IF EXISTS temp_patstat_cleantech_granted")

In [None]:
# Collapse the metadata to one row per appln_id. Every metadata column is
# aggregated into a list of the values found for that application, and
# appln_id is restored as a regular column afterwards.
metadata_columns = [
    'appln_auth',
    'appln_nr',
    'appln_kind',
    'appln_filing_date',
    'appln_filing_year',
    'appln_title_lg',
    'appln_title',
    'appln_abstract_lg',
    'appln_abstract',
]
df_patstat_cleantech_metadata = (
    df_patstat_cleantech_metadata
    .groupby('appln_id')
    .agg({column: list for column in metadata_columns})
    .reset_index()
)

In [None]:
# Report the number of rows (one per appln_id) in the grouped metadata frame.
print(f"Number of granted Cleantech patents in PATSTAT: {df_patstat_cleantech_metadata.shape[0]}")

In [None]:
# Persist the grouped metadata as a JSON array with one object per application.
df_patstat_cleantech_metadata.to_json(
    path_or_buf='/mnt/hdd01/PATSTAT Working Directory/PATSTAT/df_patstat_cleantech_granted_metadata.json',
    orient='records',
)

# Filter Cleantech Patents for further analysis

In [None]:
# Filter out all patents that have no abstract.
# After the group-by step every 'appln_abstract' value is a Python list, and a
# list object is never null, so calling .notnull() on the column kept every row.
# Inspect the list elements instead: a patent is kept only if at least one of
# its abstracts is present (not None/NaN).
# .astype(bool) keeps the mask boolean even when the frame is empty.
has_abstract = df_patstat_cleantech_metadata['appln_abstract'].apply(
    lambda abstracts: any(pd.notna(abstract) for abstract in abstracts)
).astype(bool)
df_patstat_cleantech_metadata = df_patstat_cleantech_metadata[has_abstract]
print(f"Number of granted Cleantech patents in PATSTAT with abstract: {len(df_patstat_cleantech_metadata)}")

In [None]:
# Preview the first five rows of the filtered metadata.
df_patstat_cleantech_metadata.head(n=5)

In [None]:
# Keep only patents whose list of abstract languages contains 'en';
# everything else is filtered out.
has_english_abstract = df_patstat_cleantech_metadata['appln_abstract_lg'].apply(
    lambda languages: 'en' in languages
)
df_patstat_cleantech_metadata = df_patstat_cleantech_metadata[has_english_abstract]
print(f"Number of granted Cleantech patents in PATSTAT with english abstract; considered for training neural networks: {df_patstat_cleantech_metadata.shape[0]}")

In [None]:
# Persist the filtered metadata (patents with an English abstract) as a JSON
# array with one object per application.
df_patstat_cleantech_metadata.to_json(
    path_or_buf='/mnt/hdd01/PATSTAT Working Directory/PATSTAT/df_patstat_cleantech_granted_abstract_metadata.json',
    orient='records',
)