In [2]:
import os
import re

import pandas as pd
from bs4 import BeautifulSoup
from sqlalchemy import create_engine, URL, text
from tqdm import tqdm
# Register tqdm's pandas integration, which adds the `progress_apply` family of
# methods to pandas objects.
tqdm.pandas()

In [None]:
# Database connection settings.
#
# Each setting is read from an environment variable so that credentials never
# have to be typed into (and saved with) the notebook. When a variable is not
# set, the value falls back to the blank placeholder this cell used before, so
# filling in the defaults by hand still works.
db_port = os.environ.get("PATSTAT_DB_PORT", "")

url_object = URL.create(
    drivername=os.environ.get("PATSTAT_DB_DRIVERNAME", ""),
    username=os.environ.get("PATSTAT_DB_USERNAME", ""),
    password=os.environ.get("PATSTAT_DB_PASSWORD", ""),
    host=os.environ.get("PATSTAT_DB_HOST", ""),
    # URL.create expects an integer port or None; a blank string is neither,
    # so a missing port is passed as None (the driver's default port).
    port=int(db_port) if db_port else None,
    database=os.environ.get("PATSTAT_DB_DATABASE", ""),
)

# Shared engine used by every database cell below.
engine = create_engine(url_object)

In [None]:
# Input CSV; the cells below read its `appln_id` column.
appln_ids_csv_path = '/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_appln_ids.csv'
df_grouped = pd.read_csv(appln_ids_csv_path)

# Create Temp Table with appln_id

In [4]:
# Load the application IDs into a temporary table, then fetch every
# publication (authority + number) recorded for those applications.
appln_ids = df_grouped['appln_id'].tolist()

# Create a temporary table and insert application IDs
temp_table_query = """
    DROP TABLE IF EXISTS temp_appln_ids;
    CREATE TEMP TABLE temp_appln_ids (appln_id TEXT);
"""
insert_query = text("INSERT INTO temp_appln_ids (appln_id) VALUES (:appln_id)")

# One parameter dictionary per application ID. Passing the whole list to a
# single execute() call runs the INSERT in executemany mode instead of issuing
# one execute() per row from Python.
appln_id_params = [{'appln_id': appln_id} for appln_id in appln_ids]

# NOTE(review): temp tables are normally visible only to the database session
# that created them. Later cells query `temp_appln_ids` through `engine`, so
# they rely on the connection pool handing back this same connection - confirm.
with engine.connect() as connection:
    connection.execute(text(temp_table_query))
    # An empty parameter list is treated like "no parameters", which would make
    # the INSERT fail for lack of a bound value, so skip it when there are no IDs.
    if appln_id_params:
        connection.execute(insert_query, appln_id_params)
    connection.commit()

    # Perform the join query to extract all valid publn_nr per appln_id
    join_query = """
        SELECT t1.appln_id, t2.publn_auth, t2.publn_nr
        FROM temp_appln_ids t1
        JOIN (
            SELECT appln_id, publn_auth, publn_nr
            FROM tls211_pat_publn
        ) t2 ON t1.appln_id = t2.appln_id;
    """
    result = connection.execute(text(join_query))
    # One row per (application, publication) pair returned by the join.
    df_temp_publn = pd.DataFrame(result.fetchall(), columns=result.keys())

# Work on publication numbers as strings so their lengths can be compared.
df_temp_publn['publn_nr'] = df_temp_publn['publn_nr'].astype(str)


def _label_of_shortest(publn_numbers):
    """Return the index label of the shortest string in `publn_numbers` (first one on ties)."""
    return publn_numbers.str.len().idxmin()


# For every application, find the row holding its shortest publication number ...
publn_nr_per_appln = df_temp_publn.groupby('appln_id')['publn_nr']
shortest_publn_nr_idx = publn_nr_per_appln.apply(_label_of_shortest)
# ... and keep exactly those rows.
df_shortest_publn = df_temp_publn.loc[shortest_publn_nr_idx]

# Insert the filtered results back into the database
#
# DROP TABLE IF EXISTS makes this cell re-runnable on a connection that already
# holds the table (same pattern as the temp_appln_ids table); without it a second
# run fails because the table already exists.
temp_table_with_publn_query = """
    DROP TABLE IF EXISTS temp_appln_ids_with_publn;
    CREATE TEMP TABLE temp_appln_ids_with_publn (appln_id TEXT, publn_auth TEXT, publn_nr TEXT);
    CREATE INDEX temp_appln_ids_with_publn_index ON temp_appln_ids_with_publn(publn_nr, publn_auth);
"""
insert_filtered_query = text("INSERT INTO temp_appln_ids_with_publn (appln_id, publn_auth, publn_nr) VALUES (:appln_id, :publn_auth, :publn_nr)")

# One parameter dictionary per selected publication, built column-wise instead
# of with iterrows(), so the batch can be sent in a single executemany call.
filtered_publn_params = [
    {'appln_id': appln_id, 'publn_auth': publn_auth, 'publn_nr': publn_nr}
    for appln_id, publn_auth, publn_nr in zip(
        df_shortest_publn['appln_id'],
        df_shortest_publn['publn_auth'],
        df_shortest_publn['publn_nr'],
    )
]

with engine.connect() as connection:
    connection.execute(text(temp_table_with_publn_query))
    # An empty parameter list is treated like "no parameters", which would make
    # the INSERT fail for lack of bound values, so skip it when nothing was selected.
    if filtered_publn_params:
        connection.execute(insert_filtered_query, filtered_publn_params)
    connection.commit()

# Extract Author (Person ID) Information from PATSTAT

In [5]:
# Pair every application ID in temp_appln_ids with the person IDs that
# tls207_pers_appln records for it.
person_id_sql = """
    SELECT tp.appln_id::TEXT, pa.person_id
    FROM temp_appln_ids AS tp
    JOIN tls207_pers_appln AS pa ON tp.appln_id::TEXT = pa.appln_id
"""
person_links_raw = pd.read_sql_query(person_id_sql, con=engine)

# Keep a single row per (application, person) pair.
df_patstat_person_id = person_links_raw.drop_duplicates(subset=['appln_id', 'person_id'])

# Extract Patent Citations from PATSTAT

In [6]:
# Citations made by publications whose application is listed in temp_appln_ids;
# `appln_id` is the application of the citing publication.
citations_sql = """
    SELECT c.pat_publn_id::text, c.cited_pat_publn_id::text, p.appln_id::text
    FROM tls212_citation AS c
    JOIN tls211_pat_publn AS p ON c.pat_publn_id = p.pat_publn_id
    WHERE p.appln_id IN (SELECT appln_id::text FROM temp_appln_ids)
"""
citations_raw = pd.read_sql_query(citations_sql, con=engine)

# Remove exact repeats of the same (citing, cited, application) triple.
citations_unique = citations_raw.drop_duplicates(subset=['pat_publn_id', 'cited_pat_publn_id', 'appln_id'])

# Keep only citations whose cited publication also occurs as a citing
# publication in this result set.
cited_is_also_citing = citations_unique['cited_pat_publn_id'].isin(citations_unique['pat_publn_id'])
df_patstat_citations = citations_unique[cited_is_also_citing]

In [7]:
# Attach the application ID of the *cited* publication to every citation row.
df_patstat_citations = df_patstat_citations.rename(columns={"appln_id": "pat_appln_id"})

# Lookup from publication ID to application ID, taken from the citing side.
# The citing side holds one row per outgoing citation, so it must be
# de-duplicated first: otherwise every citation to a publication that itself
# cites k publications would be emitted k times by the merge.
cited_publn_to_appln = (
    df_patstat_citations[['pat_publn_id', 'pat_appln_id']]
    .drop_duplicates()
    .rename(columns={'pat_publn_id': 'cited_pat_publn_id', 'pat_appln_id': 'cited_pat_appln_id'})
)

# Merging on the shared `cited_pat_publn_id` column avoids the `_x`/`_y`
# suffixes that a left_on/right_on merge would add to `pat_publn_id`.
df_patstat_citations = pd.merge(df_patstat_citations, cited_publn_to_appln, on='cited_pat_publn_id', how='inner')
df_patstat_citations = df_patstat_citations[['pat_publn_id', 'cited_pat_publn_id', 'pat_appln_id', 'cited_pat_appln_id']]

# Save all data to CSV

In [None]:
# Each extracted table and the CSV file it is written to (row index omitted).
csv_exports = (
    (df_patstat_person_id, '/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_patstat_person_id.csv'),
    (df_patstat_citations, '/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_patstat_citations.csv'),
)
for export_frame, export_path in csv_exports:
    export_frame.to_csv(export_path, index=False)