In [None]:
import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.engine import URL
from tqdm import tqdm

# Initiate Database Connection

In [None]:
# Assemble the SQLAlchemy connection URL for the 'Patstat' PostgreSQL database
# (psycopg2 driver).
# NOTE(review): username, password, IP and port are not defined anywhere in the
# visible notebook -- confirm they are set (e.g. read from the environment)
# before this cell runs.
url_object = URL.create(
    drivername="postgresql+psycopg2",
    host=IP,
    port=port,
    database="Patstat",
    username=username,
    password=password,
)

In [None]:
# Engine shared by every database call in this notebook
engine = create_engine(url=url_object)

# Extract random non-cleantech patents from the database for neural network training

In [None]:
# Load the filtered cleantech patent metadata from the local JSON export
df_patstat_cleantech_metadata = pd.read_json(
    path_or_buf="data/patstat_cleantech_metadata_filtered.json",
)

In [None]:
# Keep only the APPLN_ID column.
# Selecting the column explicitly (instead of drop(labels, 1, inplace=True))
# avoids the positional `axis` argument that pandas 2.0 no longer accepts,
# makes the cell safe to re-run, and raises a KeyError if APPLN_ID is missing
# rather than silently producing a frame with no columns.
df_patstat_cleantech_metadata = df_patstat_cleantech_metadata[['APPLN_ID']]

# Number of cleantech patents; used as the LIMIT of the non-cleantech sampling
# query so that both classes end up with the same number of rows.
cleantech_patent_count = len(df_patstat_cleantech_metadata)

In [None]:
# Upload the cleantech application ids as a helper table ('cleantech_metadata')
# so the sampling query can exclude them on the database side. Any existing
# table with that name is replaced; the DataFrame index is not written.
df_patstat_cleantech_metadata.to_sql(
    name='cleantech_metadata',
    con=engine,
    if_exists='replace',
    index=False,
)

In [None]:
# SQL query that randomly samples as many non-cleantech patents as there are
# cleantech patents (LIMIT = cleantech_patent_count), joined with their titles
# and abstracts.
#
# Notes:
# * "APPLN_ID" is double-quoted on purpose: the helper table is written by
#   DataFrame.to_sql from a column named 'APPLN_ID', which is created as a
#   case-sensitive (quoted) identifier. An unquoted APPLN_ID would be folded
#   to lower case by PostgreSQL and would not match that column.
# * NOT EXISTS replaces NOT IN: it gives the same result when the id list has
#   no NULLs, but does not collapse to an empty result if one slips in.
# * The filing-date column is appln_filing_date (the original spelled it
#   appln_filling_date, which does not match appln_filing_year below).
# * ORDER BY RANDOM() sorts every candidate row, so expect this to be slow on
#   the full application table.
query_non_cleantech_patents = f"""
    SELECT
        tls201.appln_id,
        tls201.appln_auth,
        tls201.appln_nr,
        tls201.appln_kind,
        tls201.appln_filing_date,
        tls201.appln_filing_year,
        tls202.appln_title_lg,
        tls202.appln_title,
        tls203.APPLN_ABSTRACT_LG,
        tls203.APPLN_ABSTRACT
    FROM
        TLS201_APPLN AS tls201
    LEFT JOIN
        TLS202_APPLN_TITLE AS tls202 ON tls201.appln_id = tls202.appln_id
    LEFT JOIN
        TLS203_APPLN_ABSTR AS tls203 ON tls201.appln_id = tls203.appln_id
    WHERE
        NOT EXISTS (
            SELECT 1
            FROM cleantech_metadata AS cleantech
            WHERE cleantech."APPLN_ID" = tls201.appln_id
        )
    ORDER BY
        RANDOM()
    LIMIT {int(cleantech_patent_count)}
"""

In [None]:
# Run the sampling query. The helper table is dropped afterwards even if the
# query fails, so a failed run does not leave 'cleantech_metadata' behind.
try:
    df_non_cleantech_patents = pd.read_sql(query_non_cleantech_patents, engine)
finally:
    # Engine.execute() was removed in SQLAlchemy 2.0. An explicit transaction
    # with text() works on both 1.4 and 2.x and commits the DROP on exit.
    with engine.begin() as connection:
        connection.execute(text("DROP TABLE IF EXISTS cleantech_metadata"))

print(f"Number of non-cleantech patents: {len(df_non_cleantech_patents)}")

In [None]:
# Persist the sampled non-cleantech patents as a JSON list of row records
df_non_cleantech_patents.to_json(
    path_or_buf="data/non_cleantech_patents.json",
    orient="records",
)