In [1]:
import ast
import re

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sqlalchemy import URL, bindparam, create_engine, text
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Database connection used by every query cell below.
# All connection fields are blank placeholders in this notebook and must be
# filled in before running; prefer reading them from environment variables
# rather than committing real credentials.
url_object = URL.create(
    drivername="",
    username="",
    password="",
    host="",
    port="",
    database=""
)
engine = create_engine(url_object)

# Create Indices on Query Columns (only required once!)

In [None]:
# query_ep_index = """CREATE INDEX ep_index ON ep_fulltext_data(epo_publn_nr, publn_auth);"""
# query_us_brf_summary_index = """CREATE INDEX us_brf_summary_index ON us_brf_summary(patent_id, publn_auth);"""
# query_us_claims_index = """CREATE INDEX us_claims_index ON us_claims(patent_id, publn_auth);"""
# query_us_description_index = """CREATE INDEX us_description_index ON us_description(patent_id, publn_auth);"""

In [None]:
# with engine.connect() as connection:
#     connection.execute(text(query_ep_index))
#     connection.execute(text(query_us_brf_summary_index))
#     connection.execute(text(query_us_claims_index))
#     connection.execute(text(query_us_description_index))

# Extract Cleantech Patents from PATSTAT (only granted and EP or US patents)

## CPC Y02 Classification

In [None]:
# query_cpc_index = """CREATE INDEX idx_cpc_class_symbol ON tls224_appln_cpc(cpc_class_symbol);""" # only required once

In [None]:
# Selects (appln_id, cpc_class_symbol) pairs whose CPC symbol starts with one of the
# Y02 subclasses A, B, C, D, E, P, T or W, restricted to applications with
# appln_auth 'EP' or 'US' and granted = 'Y'.
query_cpc_y02 = """
    SELECT a.appln_id, a.cpc_class_symbol
    FROM tls224_appln_cpc a
    JOIN tls201_appln b ON a.appln_id = b.appln_id
    WHERE (a.cpc_class_symbol LIKE 'Y02A%' OR
           a.cpc_class_symbol LIKE 'Y02B%' OR
           a.cpc_class_symbol LIKE 'Y02C%' OR
           a.cpc_class_symbol LIKE 'Y02D%' OR
           a.cpc_class_symbol LIKE 'Y02E%' OR
           a.cpc_class_symbol LIKE 'Y02P%' OR
           a.cpc_class_symbol LIKE 'Y02T%' OR
           a.cpc_class_symbol LIKE 'Y02W%')
      AND b.appln_auth IN ('EP', 'US')
      AND b.granted = 'Y';
"""

In [None]:
with engine.connect() as connection:
    # connection.execute(text(query_cpc_index)) # Only required once
    df_cpc = pd.read_sql_query(text(query_cpc_y02), connection)

## IPC Green Inventory

### HTML Website IPC Green Inventory - Extract IPC Codes for Cleantech

In [None]:
html_file = "/home/thiesen/Documents/Projekt_EDV-TEK/IPC GREEN INVENTORY_komplett.html"

with open(html_file, "r", encoding="utf-8") as file:
    html_content = file.read()
soup = BeautifulSoup(html_content, "html.parser")
rows = soup.find_all("tr")

In [None]:
ipc_combined_text = []

# Collect the text of the second <td> of every table row; rows with fewer
# than two cells and cells without any text are skipped.
for row in rows:
    cells = row.find_all("td")
    if len(cells) <= 1:
        continue
    ipc_column = cells[1]
    if not ipc_column:
        continue
    combined_text = ipc_column.get_text(strip=True, separator=" ")
    if combined_text:
        ipc_combined_text.append(combined_text)

In [None]:
# Delete all white spaces before and after a hyphen
ipc_combined_text = [re.sub(r"\s*-\s*", "-", text) for text in ipc_combined_text]

In [None]:
def expand_ipc_codes(ipc_list):
    """
    Expand compact IPC notations into one entry per code or range.

    Every entry of ``ipc_list`` is split on commas/whitespace. The first token
    is taken as the initial base code (class/subclass); each following token
    is either a new base code or a group notation that belongs to the most
    recent base code.

    Emitted forms:
    - "C10L 5/00"               for a group token such as ``5/00``
    - "C10L 5/40 - C10L 5/48"   for a full range token such as ``5/40-5/48``
    - "A23K 10/06-10", "F02M 39-71", "F02D 41" for the short forms that
      ``expand_ranges`` resolves afterwards (these are passed through attached
      to their base code)
    - "F03D"                    for a base code that received no group token

    Tokens that match none of the patterns are ignored.

    Parameters:
    - ipc_list (iterable of str): raw IPC strings, hyphens already normalised
      to have no surrounding whitespace.

    Returns:
    - list of str: expanded entries in input order.
    """
    full_range_re = re.compile(r'^\d{1,2}/\d{2,3}-\d{1,2}/\d{2,3}$')   # 5/40-5/48
    single_group_re = re.compile(r'^\d{1,2}/\d{2,3}$')                 # 5/00
    # Short forms handled later by expand_ranges: 10/06-10, 39-71, 41
    deferred_re = re.compile(r'^(?:\d{1,3}/\d{2,3}-\d{1,3}|\d{1,3}-\d{1,3}|\d{1,3})$')
    base_code_re = re.compile(r'^[A-Z]{1,3}\d{2}[A-Z]?')               # A01, H03F

    expanded_codes = []
    for entry in ipc_list:
        parts = re.split(r'\s*,\s*|\s+', entry)  # Split on spaces or commas
        base_code = parts[0]   # Entries always start with a base code
        base_code_count = 0    # Number of group tokens attached to base_code

        for part in parts[1:]:
            if full_range_re.match(part):
                if base_code:
                    start, end = part.split('-')
                    expanded_codes.append(f"{base_code} {start} - {base_code} {end}")
                    base_code_count += 1
            elif single_group_re.match(part) or deferred_re.match(part):
                if base_code:
                    expanded_codes.append(f"{base_code} {part}")
                    base_code_count += 1
            elif base_code_re.match(part):
                # The previous base code was never expanded, so it stands alone
                if base_code_count == 0 and base_code:
                    expanded_codes.append(base_code)
                base_code = part
                base_code_count = 0

        # Flush the final base code if nothing was attached to it. Doing this
        # once per entry (instead of comparing tokens by value with the last
        # token) avoids duplicate emissions and also covers single-token
        # entries and base codes followed only by unrecognised tokens.
        if base_code_count == 0 and base_code:
            expanded_codes.append(base_code)

    return expanded_codes

In [None]:
# Process and expand IPC codes
expanded_ipc_list = expand_ipc_codes(ipc_combined_text)

In [None]:
# Load the list of valid IPC symbols (2024-01-01 edition, judging by the file name).
# Note: this cell rebinds `html_content` and `soup` from the Green Inventory cell above.
# NOTE(review): no explicit encoding is given here (the Green Inventory file is read as
# utf-8) — confirm the platform default is correct for this file.
with open('/home/thiesen/Documents/Projekt_EDV-TEK/ipc_valid_symbols_20240101/ipc_valid_symbols_20240101.html', 'r') as file:
    html_content = file.read()

# Parse the HTML content
soup = BeautifulSoup(html_content, 'html.parser')

# Extract the IPC symbols: one symbol per <p> element, text taken verbatim
ipc_symbols = [p.text for p in soup.find_all('p')]

In [None]:
def expand_ranges(expanded_ipc_list, ipc_symbols):
    """
    Expand range notations in IPC codes into individual valid IPC symbols.

    Recognised entry formats:
    - "A23K 10/06-10": subgroup range inside one main group.
    - "C10L 5/40 - C10L 5/48": fully spelled-out subgroup range inside one
      main group (the form emitted by ``expand_ipc_codes``). If the two ends
      are in different main groups, or no candidate is a valid symbol, the
      entry is kept unchanged.
    - "F02M 39-71": range of main groups; yields every symbol of each main
      group in the range.
    - "F02D 41": single main group; yields every symbol of that main group,
      or the entry itself if none is found.
    Any other entry is kept unchanged.

    Subgroup ranges are enumerated numerically with at least two digits
    ("06", "07", ...) and only candidates present in ``ipc_symbols`` are kept,
    so missing intermediate codes are skipped.

    Parameters:
    - expanded_ipc_list (iterable of str): IPC codes (potentially with ranges).
    - ipc_symbols (list of str): all valid IPC symbols, expected in the form
      "<subclass> <main group>/<subgroup>" (e.g. "A23K 10/06").

    Returns:
    - list of str: IPC codes with all recognised ranges expanded.
    """
    # Set for O(1) membership tests instead of scanning the list per candidate
    symbol_set = set(ipc_symbols)

    # Index symbols by the text before the slash ("<subclass> <main group>"),
    # preserving the order of ipc_symbols within each main group.
    symbols_by_main_group = {}
    for symbol in ipc_symbols:
        head, slash, _ = symbol.partition('/')
        if slash:
            symbols_by_main_group.setdefault(head, []).append(symbol)

    def _valid_subgroups(prefix, first, last):
        """Return the valid symbols prefix+NN for NN in [first, last]."""
        candidates = (f"{prefix}{num:02d}" for num in range(first, last + 1))
        return [code for code in candidates if code in symbol_set]

    exploded_list = []

    for entry in expanded_ipc_list:
        # Pass 1: slash-hyphen-suffix ranges (e.g., "A23K 10/06-10")
        short_range = re.match(r'^([A-Z0-9]+\s+\d+)/(\d+)-(\d+)$', entry)
        if short_range:
            exploded_list.extend(
                _valid_subgroups(
                    short_range.group(1) + '/',      # e.g., "A23K 10/"
                    int(short_range.group(2)),       # e.g., 6
                    int(short_range.group(3)),       # e.g., 10
                )
            )
            continue

        # Pass 2: fully spelled-out ranges (e.g., "C10L 5/40 - C10L 5/48")
        full_range = re.match(
            r'^([A-Z0-9]+)\s+(\d+)/(\d+)\s*-\s*([A-Z0-9]+)\s+(\d+)/(\d+)$', entry
        )
        if full_range:
            subclass, main, first, end_subclass, end_main, last = full_range.groups()
            codes = []
            if subclass == end_subclass and main == end_main:
                codes = _valid_subgroups(f"{subclass} {main}/", int(first), int(last))
            if codes:
                exploded_list.extend(codes)
            else:
                # Cannot be enumerated safely; keep the original entry
                exploded_list.append(entry)
            continue

        # Pass 3: ranges of main groups (e.g., "F02M 39-71")
        group_range = re.match(r'^([A-Z0-9]+)\s+(\d+)-(\d+)$', entry)
        if group_range:
            subclass = group_range.group(1)          # e.g., "F02M"
            start_num = int(group_range.group(2))    # e.g., 39
            end_num = int(group_range.group(3))      # e.g., 71
            for num in range(start_num, end_num + 1):
                main_group = f"{subclass} {num}"
                if main_group in symbol_set:
                    exploded_list.append(main_group)
                # Same expansion as for a single main group (pass 4)
                exploded_list.extend(symbols_by_main_group.get(main_group, []))
            continue

        # Pass 4: a single main group without subgroup (e.g., "F02D 41")
        single_group = re.match(r'^([A-Z0-9]+)\s+(\d+)$', entry)
        if single_group:
            main_group = f"{single_group.group(1)} {single_group.group(2)}"
            matching_codes = symbols_by_main_group.get(main_group)
            if matching_codes:
                exploded_list.extend(matching_codes)
            else:
                # If no expansions found, retain the original
                exploded_list.append(entry)
            continue

        # If none of the patterns match, just append the entry as-is
        exploded_list.append(entry)

    return exploded_list

In [None]:
range_expanded_ipc_list = expand_ranges(expanded_ipc_list, ipc_symbols)

In [None]:
# Deduplicate the list
range_expanded_ipc_list = list(set(range_expanded_ipc_list))

In [None]:
len(range_expanded_ipc_list), len(ipc_combined_text)

In [None]:
def expand_base_code(range_expanded_ipc_list, ipc_symbols):
    """
    Replace bare class/subclass codes by every IPC symbol that starts with them.

    An entry counts as a bare code when it is exactly one capital letter,
    two digits and an optional capital letter (e.g. "B61" or "A61K"). Such an
    entry is replaced by all elements of ``ipc_symbols`` beginning with it
    (possibly none); every other entry is passed through unchanged.

    Parameters:
    - range_expanded_ipc_list (iterable of str): IPC codes to process.
    - ipc_symbols (iterable of str): candidate IPC symbols.

    Returns:
    - list of str: the processed codes, in input order.
    """
    base_code_re = re.compile(r"^[A-Z]\d{2}[A-Z]?$")
    result = []

    for code in range_expanded_ipc_list:
        if base_code_re.match(code) is None:
            # Already a specific symbol (or an unrecognised string): keep it
            result.append(code)
            continue
        result += [candidate for candidate in ipc_symbols if candidate.startswith(code)]

    return result

In [None]:
# Expand base codes in IPC codes
final_expanded_ipc_list = expand_base_code(range_expanded_ipc_list, ipc_symbols)

In [None]:
# Deduplicate the list
final_expanded_ipc_list = list(set(final_expanded_ipc_list))

In [None]:
len(final_expanded_ipc_list), len(range_expanded_ipc_list), len(ipc_combined_text)

### IPC Postgres Operations

In [None]:
def transform_ipc_symbols(ipc_list):
    """
    Pad IPC symbols so that the slash lands on the 9th character.

    A symbol such as "A01B 1/00" is rewritten as "A01B   1/00": the number of
    spaces between the first token and the main group is chosen so that the
    two together occupy eight characters, with a minimum of one space.
    Symbols consisting of a single token, and symbols whose second token has
    no slash, are returned unchanged. Only the first two whitespace-separated
    tokens of a symbol are used.

    Args:
        ipc_list (list): List of IPC symbols.

    Returns:
        list: Formatted IPC symbols.
    """
    formatted = []

    for symbol in ipc_list:
        tokens = symbol.split()
        # Nothing to align for short symbols ('B61') or tokens without a slash
        if len(tokens) == 1 or '/' not in tokens[1]:
            formatted.append(symbol)
            continue

        subclass = tokens[0]
        main_group, subgroup = tokens[1].split('/')

        # Right-align the main group in the space left of column 9, but never
        # let it touch the subclass (at least one space of padding).
        width = max(8 - len(subclass), len(main_group) + 1)
        formatted.append(f"{subclass}{main_group.rjust(width)}/{subgroup}")

    return formatted

# Transform IPC symbols
transformed_ipc_list = transform_ipc_symbols(final_expanded_ipc_list)

In [None]:
# One-off index on the IPC symbol column. IF NOT EXISTS keeps the statement
# idempotent, so re-running the notebook does not fail on an existing index.
query_ipc_index = """CREATE INDEX IF NOT EXISTS idx_ipc_class_symbol ON tls209_appln_ipc(ipc_class_symbol);"""

In [None]:
query_ipc_green_inventory = """
    SELECT a.appln_id, a.ipc_class_symbol
    FROM tls209_appln_ipc a
    JOIN tls201_appln b ON a.appln_id = b.appln_id
    WHERE a.ipc_class_symbol IN :ipc_classes
      AND b.appln_auth IN ('EP', 'US')
      AND b.granted = 'Y';
"""

In [None]:
# Bind the symbol list as an "expanding" parameter: SQLAlchemy then renders one
# placeholder per value for the IN clause itself, instead of relying on the
# DBAPI driver to adapt a Python tuple (driver-specific) and it also copes
# with an empty list.
ipc_green_inventory_stmt = text(query_ipc_green_inventory).bindparams(
    bindparam("ipc_classes", expanding=True)
)

with engine.connect() as connection:
    # connection.execute(text(query_ipc_index))  # Only required once
    df_ipc = pd.read_sql_query(
        ipc_green_inventory_stmt,
        connection,
        params={"ipc_classes": list(transformed_ipc_list)},
    )

In [None]:
len(df_ipc['appln_id'].unique())

## OECD Envtech (IPC and CPC)

In [None]:
df_ipc_envtech = pd.read_csv('/home/thiesen/Documents/Projekt_EDV-TEK/OECD_envtech/expanded_cleaned_classification_oecd_envtech_ipc.csv', header=None, names=['ipc_class_symbol'])
df_cpc_envtech = pd.read_csv('/home/thiesen/Documents/Projekt_EDV-TEK/OECD_envtech/expanded_cleaned_classification_oecd_envtech_cpc.csv', header=None, names=['cpc_class_symbol'])

In [None]:
def transform_symbols(symbols_list):
    """
    Align IPC or CPC symbols so that the slash is the 9th character.

    For a symbol "<first> <main>/<sub>", spaces are inserted between
    ``<first>`` and ``<main>`` so that both together fill eight characters
    (always at least one space). Single-token symbols and symbols whose second
    token contains no slash are returned as they are. Tokens after the second
    one are ignored.

    Args:
        symbols_list (list): List of IPC and/or CPC symbols.

    Returns:
        list: Formatted IPC and/or CPC symbols.
    """
    def _align(symbol):
        pieces = symbol.split()
        if len(pieces) == 1:
            # Short symbol (e.g., 'B61', 'A61K')
            return symbol

        head = pieces[0]
        group = pieces[1]
        if '/' not in group:
            # No main-group/subgroup separator: leave untouched
            return symbol

        left, right = group.split('/')
        gap = 8 - len(head) - len(left)
        if gap < 1:
            gap = 1
        return head + ' ' * gap + left + '/' + right

    return [_align(symbol) for symbol in symbols_list]

# Transform IPC and CPC symbols
transformed_ipc_envtech = transform_symbols(df_ipc_envtech['ipc_class_symbol'].tolist())
transformed_cpc_envtech = transform_symbols(df_cpc_envtech['cpc_class_symbol'].tolist())

In [None]:
# These use the same index names as the index statements defined earlier in the
# notebook; IF NOT EXISTS makes them safe to run when the index is already there.
query_ipc_envtech_index = """CREATE INDEX IF NOT EXISTS idx_ipc_class_symbol ON tls209_appln_ipc(ipc_class_symbol);"""
query_cpc_envtech_index = """CREATE INDEX IF NOT EXISTS idx_cpc_class_symbol ON tls224_appln_cpc(cpc_class_symbol);"""

In [None]:
query_ipc_envtech = """
    SELECT a.appln_id, a.ipc_class_symbol
    FROM tls209_appln_ipc a
    JOIN tls201_appln b ON a.appln_id = b.appln_id
    WHERE a.ipc_class_symbol IN :ipc_classes
      AND b.appln_auth IN ('EP', 'US')
      AND b.granted = 'Y';
"""

query_cpc_envtech = """
    SELECT a.appln_id, a.cpc_class_symbol
    FROM tls224_appln_cpc a
    JOIN tls201_appln b ON a.appln_id = b.appln_id
    WHERE a.cpc_class_symbol IN :cpc_classes
      AND b.appln_auth IN ('EP', 'US')
      AND b.granted = 'Y';
"""

In [None]:
# Expanding bind parameters render one placeholder per symbol for the IN
# clauses, independent of driver-specific tuple adaptation.
ipc_envtech_stmt = text(query_ipc_envtech).bindparams(
    bindparam("ipc_classes", expanding=True)
)
cpc_envtech_stmt = text(query_cpc_envtech).bindparams(
    bindparam("cpc_classes", expanding=True)
)

# Note: df_ipc_envtech / df_cpc_envtech are rebound here from the symbol lists
# read from CSV to the query results.
with engine.connect() as connection:
    # connection.execute(text(query_ipc_envtech_index))
    # connection.execute(text(query_cpc_envtech_index))
    df_ipc_envtech = pd.read_sql_query(
        ipc_envtech_stmt,
        connection,
        params={"ipc_classes": list(transformed_ipc_envtech)},
    )
    df_cpc_envtech = pd.read_sql_query(
        cpc_envtech_stmt,
        connection,
        params={"cpc_classes": list(transformed_cpc_envtech)},
    )

In [None]:
len(df_ipc_envtech['appln_id'].unique()), len(df_cpc_envtech['appln_id'].unique())

## Concatenate CPC and IPC

In [None]:
# df = pd.concat([df_ipc, df_cpc], ignore_index=True)
# df = pd.concat([df_ipc_envtech, df_cpc_envtech], ignore_index=True) # Only for OECD-Envtech
df = pd.concat([df_ipc, df_cpc, df_ipc_envtech, df_cpc_envtech], ignore_index=True)

# Aggregate Classification Symbols per appln_id

In [None]:
# One row per application: all IPC and all CPC symbols of that application
# collected into Python lists (missing values are kept inside the lists).
df_grouped = (
    df.groupby('appln_id')
    .agg({'ipc_class_symbol': list, 'cpc_class_symbol': list})
    .reset_index()
)

In [None]:
df_grouped.to_csv('/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_appln_ids.csv', index=False)

In [3]:
df_grouped = pd.read_csv('/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_appln_ids.csv')

# Extract Patent Fulltext Data for appln_id

## Create Temp Table with appln_id

In [4]:
appln_ids = df_grouped['appln_id'].tolist()

# Create a temporary table and insert application IDs
temp_table_query = """
    DROP TABLE IF EXISTS temp_appln_ids;
    CREATE TEMP TABLE temp_appln_ids (appln_id TEXT);
"""
insert_query = text("INSERT INTO temp_appln_ids (appln_id) VALUES (:appln_id)")

with engine.connect() as connection:
    connection.execute(text(temp_table_query))
    # Passing a list of parameter dicts issues a single executemany call
    # instead of one round trip per application ID. The guard avoids
    # executing the INSERT without parameters when there is nothing to insert.
    if appln_ids:
        connection.execute(insert_query, [{'appln_id': appln_id} for appln_id in appln_ids])
    connection.commit()

    # Perform the join query to extract all valid publn_nr per appln_id
    join_query = """
        SELECT t1.appln_id, t2.publn_auth, t2.publn_nr, t2.publn_date
        FROM temp_appln_ids t1
        JOIN (
            SELECT appln_id, publn_auth, publn_nr, publn_date
            FROM tls211_pat_publn
        ) t2 ON t1.appln_id = t2.appln_id;
    """
    result = connection.execute(text(join_query))
    df_temp_publn = pd.DataFrame(result.fetchall(), columns=result.keys())

# Per application, keep the publication whose publication number is shortest
# (first one on ties).
df_temp_publn['publn_nr'] = df_temp_publn['publn_nr'].astype(str)
shortest_publn_nr_idx = df_temp_publn.groupby('appln_id')['publn_nr'].apply(lambda x: x.str.len().idxmin())
df_shortest_publn = df_temp_publn.loc[shortest_publn_nr_idx]

# Insert the filtered results back into the database
temp_table_with_publn_query = """
    DROP TABLE IF EXISTS temp_appln_ids_with_publn;
    CREATE TEMP TABLE temp_appln_ids_with_publn (appln_id TEXT, publn_auth TEXT, publn_nr TEXT, publn_date DATE);
    CREATE INDEX temp_appln_ids_with_publn_index ON temp_appln_ids_with_publn(publn_nr, publn_auth);
"""
insert_filtered_query = text("INSERT INTO temp_appln_ids_with_publn (appln_id, publn_auth, publn_nr, publn_date) VALUES (:appln_id, :publn_auth, :publn_nr, :publn_date)")

# One parameter dict per row, keyed exactly like the bind names of the INSERT
publn_rows = df_shortest_publn[['appln_id', 'publn_auth', 'publn_nr', 'publn_date']].to_dict('records')

# NOTE(review): the temp tables are created on connections that are returned to
# the pool when the `with` blocks end, while later cells open new connections
# and query temp_appln_ids_with_publn. This presumably works only as long as
# the pool hands back the same underlying database session — confirm.
with engine.connect() as connection:
    connection.execute(text(temp_table_with_publn_query))
    if publn_rows:
        connection.execute(insert_filtered_query, publn_rows)
    connection.commit()

## Extract Title and Abstract

In [5]:
query_title_abstr = """
    SELECT ids.appln_id, ids.publn_nr, ids.publn_auth, ids.publn_date, title.appln_title, abstract.appln_abstract
    FROM temp_appln_ids_with_publn ids
    JOIN tls202_appln_title title ON ids.appln_id = title.appln_id AND title.appln_title_lg = 'en'
    JOIN tls203_appln_abstr abstract ON ids.appln_id = abstract.appln_id AND abstract.appln_abstract_lg = 'en'
"""

In [6]:
with engine.connect() as connection:
    df_title_abstr = pd.read_sql_query(text(query_title_abstr), connection)

In [7]:
df_title_abstr.to_parquet('/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_title_abstract.parquet', index=False)

## Extract US Brf Summary (currently only Title, Abstract, Claims considered)

In [None]:
with engine.connect() as connection:
    query_us_brf_summary = """
        SELECT t1.*, t2.*
        FROM temp_appln_ids_with_publn t1
        JOIN us_brf_summary t2 ON t1.publn_nr = t2.patent_id AND t1.publn_auth = t2.publn_auth;
    """
    df_us_brf_summary = pd.read_sql(query_us_brf_summary, connection)

In [None]:
df_us_brf_summary = df_us_brf_summary.dropna(subset=['summary_text'])
df_us_brf_summary['summary_text'] = df_us_brf_summary['summary_text'].apply(lambda x: re.sub(r'\n', ' ', x))

In [None]:
df_us_brf_summary = df_us_brf_summary.loc[:, ~df_us_brf_summary.columns.duplicated()]
df_us_brf_summary.to_parquet('us_brf_summary.parquet', index=False) # Currently not used!!!

In [None]:
df_us_brf_summary = pd.read_parquet('us_brf_summary.parquet')

## Extract US Claims

In [None]:
with engine.connect() as connection:
    query_us_claims = """
        SELECT t1.*, t2.*
        FROM temp_appln_ids_with_publn t1
        JOIN us_claims t2 ON t1.publn_nr = t2.patent_id AND t1.publn_auth = t2.publn_auth;
    """
    df_us_claims = pd.read_sql(query_us_claims, connection)

### Preprocess Claims

In [None]:
# Drop claims without text and order them by patent and claim sequence so the
# per-application lists below keep the claims in that order.
# NOTE(review): the ordering is only numeric if claim_sequence is stored as a
# number — confirm the column type.
df_us_claims = (
    df_us_claims
    .dropna(subset=['claim_text'])
    .sort_values(by=['patent_id', 'claim_sequence'])
)

# One row per application: first patent_id and the list of its claim texts
df_us_claims_grouped = (
    df_us_claims
    .groupby('appln_id')
    .agg({'patent_id': 'first', 'claim_text': list})
    .reset_index()
)

In [None]:
def process_row(row):
    """
    Build the full claims text of one patent.

    A leading claim number of the form "<digits>. " at the start of each claim
    is replaced by a single space, then all claims are joined with spaces.

    Args:
        row: mapping/Series whose 'claim_text' entry is an iterable of claim strings.

    Returns:
        pd.Series: one-element Series with the key 'claim_fulltext'.
    """
    numbering = re.compile(r'^\d+\.\s')
    stripped_claims = [numbering.sub(' ', claim) for claim in row['claim_text']]
    return pd.Series({'claim_fulltext': ' '.join(stripped_claims)})
df_us_claims_grouped['claim_fulltext'] = df_us_claims_grouped.apply(process_row, axis=1)
df_us_claims_grouped.drop('claim_text', axis=1, inplace=True)

In [None]:
df_us_claims_grouped.to_parquet('/mnt/hdd02/Projekt_EDV_TEK/edv_tek_us_claims.parquet', index=False)

In [8]:
df_us_claims_grouped = pd.read_parquet('/mnt/hdd02/Projekt_EDV_TEK/edv_tek_us_claims.parquet')

## Extract US Description (currently only Title, Abstract, Claims considered)

In [None]:
with engine.connect() as connection:    
    query_us_description = """
        SELECT t1.*, t2.*
        FROM temp_appln_ids_with_publn t1
        JOIN us_description t2 ON t1.publn_nr = t2.patent_id AND t1.publn_auth = t2.publn_auth;
    """
    df_us_description = pd.read_sql(query_us_description, connection)

In [None]:
df_us_description = df_us_description.dropna(subset=['description_text'])
df_us_description['description_text'] = df_us_description['description_text'].apply(lambda x: re.sub(r'\n', ' ', x))

In [None]:
df_us_description.to_csv('/mnt/hdd02/cleantech_patents_ipc_cpc/Extract_all_Cleantech_ipc_cpc/us_description.csv', index=False) # Currently not used!!!

## Extract EP Fulltext (Title, Abstract, Brf Summary, Claims, Description) - (currently only Title, Abstract, Claims considered)

In [None]:
with engine.connect() as connection:    
    ## Extract all fulltext data
    # query_ep_fulltext_data = """ 
    #     SELECT t1.*, t2.*
    #     FROM temp_appln_ids_with_publn t1 
    #     JOIN ep_fulltext_data t2 
    #     ON t1.publn_nr = t2.epo_publn_nr
    #     AND t1.publn_auth = t2.publn_auth
    #     WHERE t2.appln_lng = 'en';
    # """
    ## Only extract CLAIM fulltext data
    query_ep_fulltext_data = """
        SELECT t1.*, t2.*
        FROM temp_appln_ids_with_publn t1 
        JOIN ep_fulltext_data t2 
            ON t1.publn_nr = t2.epo_publn_nr
            AND t1.publn_auth = t2.publn_auth
        WHERE t2.appln_lng = 'en'
          AND t2.appln_comp = 'CLAIM';
    """
    df_ep_fulltext_data = pd.read_sql(query_ep_fulltext_data, connection)

In [None]:
order = {'B9': 11, 'B8': 10, 'B3': 9, 'B2': 8, 'B1': 7, 'A9': 6, 'A8': 5, 'A4': 4, 'A3': 3, 'A2': 2, 'A1': 1}
df_ep_fulltext_data['order'] = df_ep_fulltext_data['appln_kind'].map(order)
df_ep_fulltext_data.drop(['epo_publn_nr', 'appln_auth', 'appln_date', 'appln_lng', 'appln_text_type'], axis=1, inplace=True)

In [None]:
# df_ep_fulltext_data_description = df_ep_fulltext_data[df_ep_fulltext_data['appln_comp'] == 'DESCR']
df_ep_fulltext_data_claims = df_ep_fulltext_data[df_ep_fulltext_data['appln_comp'] == 'CLAIM']
# df_ep_fulltext_data_amend = df_ep_fulltext_data[df_ep_fulltext_data['appln_comp'] == 'AMEND']

In [None]:
# Reduce the EP claims to a single row per publication number.
# NOTE(review): the sort is ascending on `order`, so `.first()` keeps the row with the
# LOWEST rank of the kind-code mapping defined above (A1 = 1), not the highest (B9 = 11).
# If the most recent / granted text is wanted, this needs `.last()` or a descending sort — confirm.
# NOTE(review): GroupBy.first() takes the first non-null value per column, so the
# resulting row can mix values from different source rows when columns contain nulls.
# df_ep_fulltext_data_description = df_ep_fulltext_data_description.sort_values(by=['publn_nr', 'order'])
df_ep_fulltext_data_claims = df_ep_fulltext_data_claims.sort_values(by=['publn_nr', 'order'])

# df_ep_fulltext_data_description = df_ep_fulltext_data_description.groupby('publn_nr').first().reset_index()
df_ep_fulltext_data_claims = df_ep_fulltext_data_claims.groupby('publn_nr').first().reset_index()

### Clean Text

In [None]:
def clean_html(text):
    """
    Return the plain text of an HTML/XML fragment.

    Single-line ``<!-- ... -->`` comments are replaced by a space first; the
    remainder is parsed with BeautifulSoup's 'html.parser' and its text nodes
    are joined with single spaces.
    """
    without_comments = re.sub(r'<!--.*?-->', ' ', text)
    return BeautifulSoup(without_comments, 'html.parser').get_text(separator=' ')
def clean_claim_text(claim_text):
    """
    Extract the plain claim text from an EP claims markup string.

    Single-line ``<!-- ... -->`` comments are replaced by a space, the markup
    is parsed with BeautifulSoup's 'html.parser', and the text of every
    outermost <claim-text> element is collected. Nested <claim-text> elements
    are not collected separately: their text is already part of the enclosing
    element's text, so collecting them again would duplicate it.

    Args:
        claim_text (str): raw claims markup.

    Returns:
        str: the non-empty claim texts joined with single spaces, stripped of
        leading/trailing whitespace ('' if there is no <claim-text> element).
    """
    # Remove all instances of <!--(.*?)-->
    claim_text = re.sub(r'<!--.*?-->', ' ', claim_text)

    soup = BeautifulSoup(claim_text, 'html.parser')

    # Only outermost <claim-text> elements; get_text() already includes the
    # text of any <claim-text> nested inside them.
    outermost = [
        elem for elem in soup.find_all('claim-text')
        if elem.find_parent('claim-text') is None
    ]
    cleaned_texts = [content for content in (elem.get_text() for elem in outermost) if content]

    return ' '.join(cleaned_texts).strip()

In [None]:
# df_ep_fulltext_data_description['appln_text'] = df_ep_fulltext_data_description['appln_text'].apply(clean_html)
df_ep_fulltext_data_claims['appln_text'] = df_ep_fulltext_data_claims['appln_text'].apply(clean_claim_text)

In [None]:
# df_ep_fulltext_data_description.to_csv('oecd_envtech_ipc_cpc_ep_description.csv', index=False)
# df_ep_fulltext_data_claims.to_csv('oecd_envtech_ipc_cpc_ep_claims.csv', index=False)

In [None]:
# df_ep_fulltext_data_description.to_parquet('ep_description.parquet', index=False)
df_ep_fulltext_data_claims = df_ep_fulltext_data_claims.loc[:, ~df_ep_fulltext_data_claims.columns.duplicated()]
df_ep_fulltext_data_claims.to_parquet('/mnt/hdd02/Projekt_EDV_TEK/edv_tek_ep_claims.parquet', index=False)

In [9]:
df_ep_fulltext_data_claims = pd.read_parquet('/mnt/hdd02/Projekt_EDV_TEK/edv_tek_ep_claims.parquet')

# Build one common dataframe

In [None]:
df_title_abstr = pd.read_parquet('/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_title_abstract.parquet')

In [10]:
df_title_abstr.rename(columns={'publn_nr': 'patent_id', 'appln_title': 'title', 'appln_abstract': 'abstract'}, inplace=True)
# Keep publn_date and publn_auth columns as they are
# df_us_brf_summary.rename(columns={'summary_text': 'brf_summary'}, inplace=True)
# df_us_brf_summary.drop(['publn_auth', 'publn_nr'], axis=1, inplace=True)
# df_us_description.rename(columns={'description_text': 'description'}, inplace=True)
# df_us_description.drop(['publn_auth', 'publn_nr', 'description_length'], axis=1, inplace=True)

In [11]:
# Start from the title/abstract table and attach the US claims, keeping one row
# per (appln_id, patent_id). The previous concat with an empty DataFrame was a
# no-op that relies on deprecated pandas behaviour for empty concat inputs.
# df_cleantech_patents = df_cleantech_patents.merge(df_us_brf_summary, on=['appln_id', 'patent_id'], how='outer').drop_duplicates(subset=['appln_id', 'patent_id']).reset_index(drop=True)
# df_cleantech_patents = df_cleantech_patents.merge(df_us_description, on=['appln_id', 'patent_id'], how='outer').drop_duplicates(subset=['appln_id', 'patent_id']).reset_index(drop=True)
df_cleantech_patents = (
    df_title_abstr
    .merge(df_us_claims_grouped, on=['appln_id', 'patent_id'], how='outer')
    .drop_duplicates(subset=['appln_id', 'patent_id'])
    .reset_index(drop=True)
)

In [12]:
df_cleantech_patents.sample(5) ### UNTIL HERE!!!

Unnamed: 0,appln_id,patent_id,publn_auth,publn_date,title,abstract,claim_fulltext
1707773,53485948,6141698,US,2000-10-31,Method and system for injecting new code into ...,A method and system for modifying the behavior...,A method in a computer system for modifying a...
1080270,490949333,10269966,US,2019-04-23,Semiconductor device including a fin structure,A semiconductor device including a Fin FET dev...,"A semiconductor device, comprising: a Fin FET..."
1001223,485618147,10357867,US,2019-07-23,Polishing system,A polishing system includes a wafer support th...,A polishing system comprising: a wafer suppor...
1665081,532169691,10669720,US,2020-06-02,Stackable closure strip,"Closure strip devices, systems, kits, assembli...",A closure strip for a building having a metal...
1525111,52314141,7029957,US,2006-04-18,Method of manufacturing semiconductor device h...,A method of manufacturing a semiconductor devi...,A method of manufacturing a semiconductor dev...


In [13]:
# df_ep_fulltext_data_description.rename(columns={'publn_nr': 'patent_id', 'appln_text': 'description'}, inplace=True)
# df_ep_fulltext_data_description.drop(['publn_auth', 'appln_kind', 'appln_comp', 'order'], axis=1, inplace=True)

df_ep_fulltext_data_claims.rename(columns={'publn_nr': 'patent_id', 'appln_text': 'claim_fulltext'}, inplace=True)
df_ep_fulltext_data_claims.drop(['publn_auth', 'appln_kind', 'appln_comp', 'order'], axis=1, inplace=True)

In [14]:
# df_ep_fulltext_data_description['patent_id'] = df_ep_fulltext_data_description['patent_id'].astype(str)
df_ep_fulltext_data_claims['patent_id'] = df_ep_fulltext_data_claims['patent_id'].astype(str)

In [15]:
# df_cleantech_patents = df_cleantech_patents.merge(df_ep_fulltext_data_description, on=['appln_id', 'patent_id'], how='outer').drop_duplicates(subset=['appln_id', 'patent_id']).reset_index(drop=True)
df_cleantech_patents = df_cleantech_patents.merge(df_ep_fulltext_data_claims, on=['appln_id', 'patent_id'], how='outer').drop_duplicates(subset=['appln_id', 'patent_id']).reset_index(drop=True)

In [16]:
# df_cleantech_patents['description'] = df_cleantech_patents['description_x'].fillna(df_cleantech_patents['description_y'])
df_cleantech_patents['claim_fulltext'] = df_cleantech_patents['claim_fulltext_x'].fillna(df_cleantech_patents['claim_fulltext_y'])
# df_cleantech_patents.drop(['description_x', 'description_y', 'claim_fulltext_x', 'claim_fulltext_y'], axis=1, inplace=True)
df_cleantech_patents.drop(['claim_fulltext_x', 'claim_fulltext_y'], axis=1, inplace=True)

In [17]:
# Drop rows where either title, abstract or claim_fulltext is NaN
df_cleantech_patents = df_cleantech_patents.dropna(subset=['title', 'abstract', 'claim_fulltext'])

In [18]:
df_cleantech_patents = df_cleantech_patents.reset_index(drop=True)

In [19]:
# Save to parquet
df_cleantech_patents.to_parquet('/mnt/hdd02/Projekt_EDV_TEK/edv_tek_cleantech_patents_all.parquet', index=False)

# Merge Dataframe containing Title, Abstract, Claims with Dataframe containing CPC and IPC codes

In [None]:
df_cleantech_patents = pd.read_parquet('/mnt/hdd02/Projekt_EDV_TEK/edv_tek_cleantech_patents_all.parquet')

In [None]:
df_grouped = pd.read_csv('/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_appln_ids.csv')

In [20]:
df_cleantech_patents['appln_id'] = df_cleantech_patents['appln_id'].astype(str)
df_grouped['appln_id'] = df_grouped['appln_id'].astype(str)

In [21]:
df_cleantech_patents_cpc_ipc = pd.merge(df_cleantech_patents, df_grouped, on=['appln_id'], how='left')

In [22]:
def parse_and_clean(sym_str):
    """
    Parse the string form of a symbol list back into a Python list.

    - Non-string input (already a list/array, or a missing value) is returned as-is.
    - In a string like "[nan, 'A01B 1/00', nan]" every bare ``nan`` token is
      replaced by ``None`` so that ``ast.literal_eval`` can parse it; the
      resulting ``None`` values are mapped to ``np.nan``. Occurrences of "nan"
      inside a word or directly next to a quote are left untouched so symbol
      strings are never altered.
    - A string that evaluates to a single literal instead of a list/tuple is
      wrapped into a one-element list.
    - A string that cannot be parsed yields ``[]``.

    Args:
        sym_str: value of an ipc_class_symbol / cpc_class_symbol cell.

    Returns:
        list (for string input) or the unchanged input (for non-string input).
    """
    if not isinstance(sym_str, str):
        return sym_str
    # 1) turn unquoted, stand-alone nan tokens into Python None
    tmp = re.sub(r"(?<![\w'\"])nan(?![\w'\"])", 'None', sym_str)
    # 2) safely parse literal; these are the exceptions literal_eval documents
    try:
        parsed = ast.literal_eval(tmp)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return []  # or return [np.nan]
    # A lone literal (e.g. the string "nan") is treated as a one-element list
    if not isinstance(parsed, (list, tuple)):
        parsed = [parsed]
    # 3) map None→np.nan
    return [x if x is not None else np.nan for x in parsed]

def drop_extra_nan(lst):
    """
    Remove missing values from a list of classification symbols.

    - A list/array has every missing element (np.nan, None, ...) removed.
    - If nothing is left, [np.nan] is returned so the cell is never empty.
    - A scalar cell is handled too: a missing scalar (e.g. the NaN that a
      left merge leaves in unmatched rows) becomes [np.nan], any other
      scalar is wrapped in a one-element list.
    """
    # scalars are not iterable (float NaN) or would iterate per character (str)
    if pd.api.types.is_scalar(lst):
        return [lst] if pd.notna(lst) else [np.nan]
    cleaned = [x for x in lst if pd.notna(x)]
    return cleaned if cleaned else [np.nan]

# Clean both classification columns; df is an alias, so the assignments land in df_cleantech_patents_cpc_ipc
df = df_cleantech_patents_cpc_ipc

# Per column: string -> list first, then strip NaN entries (a list with nothing left becomes [np.nan])
for symbol_col in ('ipc_class_symbol', 'cpc_class_symbol'):
    parsed = df[symbol_col].apply(parse_and_clean)
    df[symbol_col] = parsed.apply(drop_extra_nan)

In [23]:
# Persist the merged frame (patent texts plus the cleaned IPC/CPC symbol lists); index=False keeps the index out of the file
df_cleantech_patents_cpc_ipc.to_parquet('/mnt/hdd02/Projekt_EDV_TEK/edv_tek_cleantech_patents_title_abstract_date_cpc_ipc.parquet', index=False)

In [24]:
# Show 10 random rows as a sanity check; no random_state is passed, so the displayed rows differ between runs
df_cleantech_patents_cpc_ipc.sample(10)

Unnamed: 0,appln_id,patent_id,publn_auth,publn_date,title,abstract,claim_fulltext,ipc_class_symbol,cpc_class_symbol
486846,422594458,3050140,EP,2019-03-20,METHOD FOR PRODUCING AN ELECTRODE FOR A LITHIU...,The invention relates to a method for producin...,A method of manufacturing a secondary battery ...,[nan],[Y02E 60/10]
683269,46198188,7282988,US,2007-10-16,Bandgap reference circuit,A bandgap reference circuit is proposed. To re...,"A bandgap reference circuit, comprising: a fi...",[H01L 23/58],[nan]
127419,315438118,7985830,US,2011-07-26,Method for making nitrogen aromatic oligomers ...,Methods for synthesizing dimeric or polymeric ...,"A method for polymerizing pyridine, comprisin...","[B01J 23/42, B01J 23/44]",[nan]
1614782,53744471,7391292,US,2008-06-24,Inductors having interconnect and inductor por...,An on-chip inductor includes a main inductor p...,An on-chip inductor comprising: a main induct...,"[H01L 21/02, H01L 21/822, H01L 23/522, H01L...",[nan]
1554779,53382566,6684119,US,2004-01-27,Method of providing dynamic production materia...,A method of providing dynamic production mater...,A method of providing dynamic production mate...,"[G06Q 10/00, G06Q 10/08]",[nan]
535928,439819877,10084196,US,2018-09-25,System and method for controlling fuel cell mo...,A fuel cell module has a hydrogen recirculatio...,A process for operating a fuel cell stack com...,[nan],[Y02E 60/50]
1152405,50567331,6946264,US,2005-09-20,Metalloproteinase inhibitor,"A novel metalloproteinase inhibitor, analogs t...",An isolated DNA encoding a polypeptide produc...,"[C12N 1/21, C12N 5/10]",[nan]
1641404,538871987,11453682,US,2022-09-27,Condensed-cyclic compound and organic light-em...,A condensed-cyclic compound represented by For...,A condensed-cyclic compound represented by an...,[H01L 51/50],[nan]
1523353,531788124,11101648,US,2021-08-24,Power supply system,"A power supply system includes a main line, a ...",A power supply system comprising: a main line...,"[H02J 1/10, H02J 7/00]",[Y02T 10/70]
1451542,527004670,11293853,US,2022-04-05,System and method for measuring vibrational sp...,Disclosed are systems and methods for measurin...,A system for measuring vibrational spectra of...,"[G01R 23/02, G01R 23/16]",[nan]


In [None]:
# Load the citation table; the cells below use its pat_appln_id and cited_pat_appln_id columns
df_cleantech_patents_citations = pd.read_csv('/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_patstat_citations.csv')

In [None]:
# Cast both citation id columns to str so they can be matched against the string-typed
# appln_id column of df_cleantech_patents_cpc_ipc in the following cells
for id_col in ('pat_appln_id', 'cited_pat_appln_id'):
    df_cleantech_patents_citations[id_col] = df_cleantech_patents_citations[id_col].astype(str)

In [None]:
# Report how many citation rows have their pat_appln_id / cited_pat_appln_id present in
# df_cleantech_patents_cpc_ipc['appln_id'], and how many do not.
# Each membership mask is computed once and reused for the "in" and the "NOT in" count.
known_appln_ids = df_cleantech_patents_cpc_ipc['appln_id']
citing_known = df_cleantech_patents_citations['pat_appln_id'].isin(known_appln_ids)
cited_known = df_cleantech_patents_citations['cited_pat_appln_id'].isin(known_appln_ids)

print("Number of pat_appln_id in df_cleantech_patents_citations that are in df_cleantech_patents_cpc_ipc column appln_id:")
print(int(citing_known.sum()))
print("Number of pat_appln_id in df_cleantech_patents_citations that are NOT in df_cleantech_patents_cpc_ipc column appln_id:")
print(int((~citing_known).sum()))
print("Number of cited_pat_appln_id in df_cleantech_patents_citations that are in df_cleantech_patents_cpc_ipc column appln_id:")
print(int(cited_known.sum()))
print("Number of cited_pat_appln_id in df_cleantech_patents_citations that are NOT in df_cleantech_patents_cpc_ipc column appln_id:")
print(int((~cited_known).sum()))

In [None]:
# Keep only the citation rows (not columns) whose pat_appln_id AND cited_pat_appln_id are both
# present in df_cleantech_patents_cpc_ipc['appln_id'], then renumber the remaining rows from 0.
known_appln_ids = df_cleantech_patents_cpc_ipc['appln_id']
both_known = (
    df_cleantech_patents_citations['pat_appln_id'].isin(known_appln_ids)
    & df_cleantech_patents_citations['cited_pat_appln_id'].isin(known_appln_ids)
)
df_cleantech_patents_citations = df_cleantech_patents_citations[both_known].reset_index(drop=True)

In [None]:
# NOTE(review): this writes the filtered citations back to the same CSV path they were read from above,
# so the unfiltered input is overwritten and a re-run starts from already-filtered data — confirm this is intended.
df_cleantech_patents_citations.to_csv('/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_patstat_citations.csv', index=False)

In [None]:
# Also store the filtered citations as parquet (same base name as the CSV), without the index
df_cleantech_patents_citations.to_parquet('/mnt/hdd02/Projekt_EDV_TEK/edv_tek_all_cleantech_patstat_citations.parquet', index=False)