In [None]:
import os
from getpass import getpass

import pandas as pd
from sqlalchemy import create_engine, URL
from tqdm import tqdm

# Initiate Database Connection

In [None]:
# Build the connection URL for the PATSTAT PostgreSQL database.
# Settings are read from environment variables so that credentials are not stored
# in the notebook:
#   PATSTAT_DB_USER, PATSTAT_DB_PASSWORD, PATSTAT_DB_HOST, PATSTAT_DB_PORT, PATSTAT_DB_NAME
# Non-secret settings fall back to the previous local defaults; if the password
# variable is unset (or empty) it is prompted for interactively instead.
url_object = URL.create(
    drivername='postgresql+psycopg2',
    username=os.environ.get('PATSTAT_DB_USER', 'tie'),
    password=os.environ.get('PATSTAT_DB_PASSWORD') or getpass('PATSTAT database password: '),
    host=os.environ.get('PATSTAT_DB_HOST', 'localhost'),
    port=int(os.environ.get('PATSTAT_DB_PORT', '25432')),  # URL.create expects an integer port
    database=os.environ.get('PATSTAT_DB_NAME', 'Patstat'),
)

In [None]:
# Create the SQLAlchemy engine from the connection URL; this engine is what the
# pandas to_sql / read_sql calls further down use to talk to the database.
engine = create_engine(url_object)

# Extract a random sample of non-cleantech patents from the database for neural network training

In [None]:
# Load the cleantech patent metadata from its JSON file into a DataFrame.
cleantech_metadata_path = '/mnt/hdd01/PATSTAT Working Directory/PATSTAT/df_patstat_cleantech_granted_abstract_metadata.json'
df_patstat_cleantech_metadata = pd.read_json(cleantech_metadata_path)

In [None]:
# Only the application identifier is needed from here on: keep 'appln_id' as a
# one-column DataFrame and discard everything else.
df_patstat_cleantech_metadata = df_patstat_cleantech_metadata.loc[:, ['appln_id']]
# Number of cleantech rows; used further down to size the non-cleantech sample.
cleantech_patent_count = df_patstat_cleantech_metadata.shape[0]

In [None]:
# Write the cleantech appln_id values to the database table 'cleantech_metadata'
# (replacing any existing table of that name, without the DataFrame index) so the
# sampling query can exclude them.
df_patstat_cleantech_metadata.to_sql(
    name='cleantech_metadata',
    con=engine,
    if_exists='replace',
    index=False,
)

In [None]:
# SQL query to randomly sample granted non-cleantech patents together with their
# titles and abstracts.
#
# Sample size: 20 % more rows than there are cleantech patents (presumably to leave
# headroom for the abstract/language filters applied afterwards -- confirm). The
# value is rounded because LIMIT should receive a whole number, not a float such
# as "1200.0".
non_cleantech_sample_size = round(cleantech_patent_count * 1.2)

# Notes on the query:
# * Both filter conditions live in a single WHERE clause joined by AND (a second
#   WHERE keyword is a syntax error).
# * NOT EXISTS is used instead of NOT IN: NOT IN returns no rows at all if the
#   subquery yields a NULL appln_id, NOT EXISTS does not have that pitfall.
# * The selected column is appln_filing_date, the name the later aggregation
#   step looks up.
query_non_cleantech_patents = f"""
    SELECT
        tls201.appln_id,
        tls201.appln_auth,
        tls201.appln_nr,
        tls201.appln_kind,
        tls201.appln_filing_date,
        tls201.appln_filing_year,
        tls202.appln_title_lg,
        tls202.appln_title,
        tls203.appln_abstract_lg,
        tls203.appln_abstract
    FROM
        tls201_appln AS tls201
    LEFT JOIN
        tls202_appln_title AS tls202 ON tls201.appln_id = tls202.appln_id
    LEFT JOIN
        tls203_appln_abstr AS tls203 ON tls201.appln_id = tls203.appln_id
    WHERE
        NOT EXISTS (
            SELECT 1
            FROM cleantech_metadata
            WHERE cleantech_metadata.appln_id = tls201.appln_id
        )
        AND tls201.granted = 'Y'
    ORDER BY
        RANDOM()
    LIMIT {non_cleantech_sample_size}
"""

In [None]:
# Run the sampling query against the database and report how many rows came back.
df_non_cleantech_patents = pd.read_sql(
    sql=query_non_cleantech_patents,
    con=engine,
)
print(f"Number of non-cleantech patents: {len(df_non_cleantech_patents)}")
# Optional clean-up of the helper table, intentionally left disabled:
# engine.execute("DROP TABLE IF EXISTS cleantech_metadata")

In [None]:
# Collapse the query result to one row per appln_id: each of the remaining columns
# becomes a list holding all values found for that application. reset_index()
# turns appln_id back into a regular column.
columns_to_collect = [
    'appln_auth',
    'appln_nr',
    'appln_kind',
    'appln_filing_date',
    'appln_filing_year',
    'appln_title_lg',
    'appln_title',
    'appln_abstract_lg',
    'appln_abstract',
]
df_non_cleantech_patents_metadata = (
    df_non_cleantech_patents
    .groupby('appln_id')
    .agg({column: list for column in columns_to_collect})
    .reset_index()
)

In [None]:
# Filter out all patents that have no abstract.
# After the group-by step 'appln_abstract' holds a Python list per patent, and a
# list object is never null -- a plain .notnull() on the column therefore keeps
# every row. Inspect the list elements instead.
def _has_abstract(abstracts):
    """Return True if at least one abstract value is present (not None/NaN)."""
    if isinstance(abstracts, (list, tuple)):
        return any(pd.notna(abstract) for abstract in abstracts)
    # Fallback for a scalar cell (column not aggregated into lists).
    return bool(pd.notna(abstracts))


# astype(bool) keeps the mask boolean even when the frame is empty.
has_abstract = df_non_cleantech_patents_metadata['appln_abstract'].apply(_has_abstract).astype(bool)
df_non_cleantech_patents_metadata = df_non_cleantech_patents_metadata[has_abstract]
print(f"Number of granted Non-Cleantech patents in PATSTAT with abstract: {len(df_non_cleantech_patents_metadata)}")

In [None]:
# Keep only the patents whose list of abstract languages contains 'en'.
abstract_languages = df_non_cleantech_patents_metadata['appln_abstract_lg']
has_english_abstract = abstract_languages.apply(lambda languages: 'en' in languages)
df_non_cleantech_patents_metadata = df_non_cleantech_patents_metadata[has_english_abstract]
print(f"Number of granted Non-Cleantech patents in PATSTAT with english abstract; considered for training neural networks: {len(df_non_cleantech_patents_metadata)}")

In [None]:
# Save the grouped and filtered metadata frame (one record per appln_id).
# Writing the raw query result `df_non_cleantech_patents` here would discard the
# grouping and the abstract/language filters applied in the preceding cells.
df_non_cleantech_patents_metadata.to_json('/mnt/hdd01/PATSTAT Working Directory/PATSTAT/df_patstat_non_cleantech_granted_abstract_metadata.json', orient='records')