In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Read the cleantech EPO text export (CSV) into a DataFrame
cleantech_epo_csv_path = '/mnt/hdd01/PATSTAT Working Directory/PATSTAT/Cleantech Patent Raw Data/cleantech_epo_text_data.csv'
df_cleantech_ep = pd.read_csv(cleantech_epo_csv_path)

In [None]:
# Keep only the rows whose appln_lng is 'en'; rows in any other language are discarded
is_english = df_cleantech_ep['appln_lng'].eq('en')
df_cleantech_ep_en = df_cleantech_ep[is_english]

In [None]:
# Drop the name of the unfiltered frame to save memory; the following cells
# only work with the English subset (df_cleantech_ep_en)
del df_cleantech_ep

# Preprocessing of the DataFrame

In [None]:
# Rank the publication kind codes: a larger number means the kind is preferred
# when several kinds exist for the same text component of a publication
# (B9 > B8 > B3 > B2 > B1 > A9 > A8 > A4 > A3 > A2 > A1).
order = {'B9': 11, 'B8': 10, 'B3': 9, 'B2': 8, 'B1': 7, 'A9': 6, 'A8': 5, 'A4': 4, 'A3': 3, 'A2': 2, 'A1': 1}

# Build the helper column on a NEW frame with assign() instead of writing it
# into df_cleantech_ep_en. That frame is a boolean-filtered slice of the raw
# data, so assigning a column to it triggers pandas' SettingWithCopyWarning and
# silently changes a frame that later cells still read.
# Kind codes missing from `order` map to NaN and therefore sort last.
df_cleantech_ep_en_ranked = df_cleantech_ep_en.assign(
    kind_order=df_cleantech_ep_en['appln_kind'].map(order)
)

# Sort so that, within each (appln_comp, publn_nr) pair, the highest-ranked kind comes first
df_cleantech_ep_en_sorted = df_cleantech_ep_en_ranked.sort_values(
    by=['appln_comp', 'publn_nr', 'kind_order'],
    ascending=[True, True, False],
)

# Collapse every (appln_comp, publn_nr) pair to a single row.
# NOTE(review): groupby(...).first() returns the first NON-NULL value of each
# column, so a null in the top-ranked row is filled from a lower-ranked row of
# the same group - confirm that this mixing is intended.
df_cleantech_ep_en_filtered = (
    df_cleantech_ep_en_sorted
    .groupby(['appln_comp', 'publn_nr'])
    .first()
    .reset_index()
)

# Drop the kind_order column; it was only needed for the ordering above
df_cleantech_ep_en_filtered = df_cleantech_ep_en_filtered.drop(columns='kind_order')

In [None]:
# Report how many distinct publn_nr values the English subset contains
unique_publn_nrs = df_cleantech_ep_en['publn_nr'].unique()
print('Number of unique publication numbers: ', len(unique_publn_nrs))

In [None]:
# Show how often each appln_kind code occurs among the retained rows
kind_codes = df_cleantech_ep_en_filtered['appln_kind']
kind_codes.value_counts()

In [None]:
# Reshape to one row per publn_nr with one column per appln_comp value
# (text components such as ABSTR, TITLE, CLAIM). Each cell holds the list of
# appln_text values for that pair; reset_index() turns publn_nr back into a
# regular column.
df_cleantech_ep_en_pivot = (
    df_cleantech_ep_en_filtered
    .pivot_table(index='publn_nr', columns='appln_comp', values='appln_text', aggfunc=list)
    .reset_index()
)

In [None]:
# Publication-level attributes to carry over onto the pivoted frame
meta_columns = ['publn_nr', 'appln_auth_x', 'appln_lng', 'appln_id', 'granted', 'cpc_class_symbol']

# De-duplicate the attribute rows BEFORE merging. The filtered frame holds one
# row per text component of a publication, so merging it directly multiplies
# every pivoted row by the number of components and leaves a gapped index once
# the duplicates are dropped afterwards. De-duplicating first yields the same
# rows in the same order, without the temporary blow-up and with a contiguous
# 0..n-1 index.
df_cleantech_ep_en_meta = df_cleantech_ep_en_filtered[meta_columns].drop_duplicates()

# Left merge keeps every pivoted publication, even one without attribute rows
df_cleantech_ep_en_pivot = df_cleantech_ep_en_pivot.merge(df_cleantech_ep_en_meta, on='publn_nr', how='left')

In [None]:
# Print number of rows where claim is not empty
# print('Number of rows where claim is not empty: ', len(df_cleantech_ep_en_pivot[df_cleantech_ep_en_pivot['CLAIM'].notnull()]))

In [None]:
# Write the pivoted frame to disk as a JSON array with one record per row
pivot_json_path = '/mnt/hdd01/PATSTAT Working Directory/PATSTAT/cleantech_epo_text_data_pivot.json'
df_cleantech_ep_en_pivot.to_json(pivot_json_path, orient='records')

# Text Preprocessing

In [2]:
# Reload the pivoted frame from its JSON export, if needed
pivot_json_path = '/mnt/hdd01/PATSTAT Working Directory/PATSTAT/cleantech_epo_text_data_pivot.json'
df_cleantech_ep_en_pivot = pd.read_json(pivot_json_path, orient='records')

In [3]:
import re
import xml.etree.ElementTree as ET

def _outermost_claim_texts(elem):
    """Yield the <claim-text> descendants of elem that are not nested inside another <claim-text>."""
    for child in elem:
        if child.tag == 'claim-text':
            yield child
        else:
            yield from _outermost_claim_texts(child)


def _claim_text_fragments(elem):
    """Return the text fragments of one <claim-text> element in document order.

    Text of inline children (e.g. <sub>, <b>) and the text following them is
    glued onto the current fragment; every nested <claim-text> starts a new
    fragment, and text following it starts another one.
    """
    # Strip a leading claim number such as "1. " from the element's own text
    buffer = [re.sub(r'^\d+\.\s*', '', elem.text or '')]
    fragments = []
    for child in elem:
        if child.tag == 'claim-text':
            # Close the running fragment, then descend into the nested claim text
            fragments.append(''.join(buffer))
            fragments.extend(_claim_text_fragments(child))
            buffer = []
        else:
            # Inline markup: keep its text as part of the running fragment
            buffer.append(''.join(child.itertext()))
        # Text that follows the child belongs to this element as well
        buffer.append(child.tail or '')
    fragments.append(''.join(buffer))
    return fragments


def clean_claim_text_xml(claim_text):
    """Extract the plain claim wording from claim XML markup.

    Parameters
    ----------
    claim_text : str, list of str, None or NaN
        Claim markup. A list is joined with single spaces before parsing.

    Returns
    -------
    str or None
        The text of all <claim-text> elements, in document order and joined by
        single spaces, with a leading claim number ("1. ") removed from each
        element. None is returned for missing input (None / NaN). If the
        markup is not well-formed XML, the comment-stripped input text is
        returned unchanged.
    """
    # If the input is a list, join it into a string
    if isinstance(claim_text, list):
        claim_text = ' '.join(claim_text)
    elif claim_text is None or (isinstance(claim_text, float) and claim_text != claim_text):
        # Missing value: None, or NaN (the only float that differs from itself)
        return None

    # Remove XML comments; DOTALL lets a comment span several lines
    claim_text = re.sub(r'<!--.*?-->', ' ', claim_text, flags=re.DOTALL)

    try:
        # A synthetic root is needed because the input is a sequence of sibling elements
        root = ET.fromstring('<root>' + claim_text + '</root>')
    except ET.ParseError:
        # Best effort: hand back the text itself, without the synthetic <root> wrapper
        return claim_text

    # Whitespace-only fragments (e.g. left behind by removed comments) are skipped
    fragments = [
        fragment
        for outer in _outermost_claim_texts(root)
        for fragment in _claim_text_fragments(outer)
        if fragment.strip()
    ]

    return ' '.join(fragments).strip()


# Run the XML cleaner over every entry of the 'CLAIM' column (with a tqdm
# progress bar) and store the result in a new 'cleaned_claims' column
# df_cleantech_ep_en_pivot['cleaned_amend_claims'] = df_cleantech_ep_en_pivot['AMEND'].progress_apply(clean_claim_text_xml)
raw_claims = df_cleantech_ep_en_pivot['CLAIM']
df_cleantech_ep_en_pivot['cleaned_claims'] = raw_claims.progress_apply(clean_claim_text_xml)

100%|██████████| 182369/182369 [00:11<00:00, 15620.86it/s]


In [None]:
# Delete all rows where cleaned_amend_claims == None
# df_cleantech_ep_en_pivot = df_cleantech_ep_en_pivot[df_cleantech_ep_en_pivot['cleaned_amend_claims'].notnull()]
# df_cleantech_ep_en_pivot.reset_index(drop=True, inplace=True)

In [10]:
# Spot-check the cleaned claim text of the row labelled 25000
df_cleantech_ep_en_pivot.loc[25000, 'cleaned_claims']

'Clarification plant primarily for communal waste water, - by means of which the waste water can at least partly be filtered in tanks units (I, II, III) in a clarifying procedure with varying clarification steps when flowing stepwise through one tank after the other, - whose tank volume, predetermined systematically by the clarification procedure, is split up variably into the tanks depending on the clarification steps, while all clarification steps except for one dispose of only one tank and one clarification step - practically the one with the biggest volume demand - is divided into two tanks, - whose tanks (I, II, III/1, III/2) form a plant unit, whose construction is optimized to achieve an increased economic efficiency by a more favorable relation between wall surfaces and tank volume of all the tanks of the plant, - with a partition wall (5) being arranged between the tanks (I, II, III/1, III/2) with a charging and by-pass canal (7) which can be divided into sections by means of 

'Clarification plant primarily for communal waste water, - by means of which the waste water can at least partly be filtered in tanks units (I, II, III) in a clarifying procedure with varying clarification steps when flowing stepwise through one tank after the other, - whose tank volume, predetermined systematically by the clarification procedure, is split up variably into the tanks depending on the clarification steps, while all clarification steps except for one dispose of only one tank and one clarification step - practically the one with the biggest volume demand - is divided into two tanks, - whose tanks (I, II, III/1, III/2) form a plant unit, whose construction is optimized to achieve an increased economic efficiency by a more favorable relation between wall surfaces and tank volume of all the tanks of the plant, - with a partition wall (5) being arranged between the tanks (I, II, III/1, III/2) with a charging and by-pass canal (7) which can be divided into sections by means of shutoff devices (11), - where each tank (I, II, III/1, III/2) can be by-passed via the canal (7) by means of rearranging the shutoff devices (11, 12), with the other tanks continuously remaining operable and maintaining the clarification procedure while being flown through one after the other by using the respective section of the canal, - with each tank (I, II, III/1, III/2) being equipped with in- and outflows (12) which can be shut off and which are being laid out towards the canal as well as towards the partition wall to the respective neighboring tank, - where in case of the by-passing of one tank the in- and outflows (12) of this tank are closed, while the relevant canal section (11) and the in- and outflows (12) of the neighboring tank are open. 
Clarification plant (4) according to claim 1 characterized in that at least the partition walls (5) between tanks that are arranged next to each other are constructed as double sheet version and that the charging and supply lines can be arranged between the sheets (6) of the walls (5). Clarification plant (4) according to claims 1 and 2 characterized in that at least the partition walls (5) between tanks that are arranged next to each other are, as cross section, constructed in cellular form, and that the cells (6, 7, 8, 9, 10) can be used as transport lines to distribute the media into the different tanks and/or as tube canal for the supply lines. Clarification plant (4) according to claim 3 characterized in that the cells (6, 7, 8, 9, 10) are accessible for a camera, for cleaning devices and for redevelopment devices. Clarification plant (4) according to claims 3 and 4 characterized in that the cells (6, 7, 8, 9, 10) are passable at least in kneeling position. Clarification plant (4) according to claims 3 to 5 characterized in that the cells (6, 7, 8, 9, 10) are made of concrete. Clarification plant (4) according to claims 3 to 6 characterized in that openings (12) that can be shut off towards the tanks (I, II, III) and sliding gates (11), by means of which a cell line is dividable into sections for the transport of different media, are arranged in the cells (6, 7, 8, 9, 10). Clarification plant (4) according to claims 1 to 7 characterized in that the mechanical devices can be regrouped within the tanks (I, II, III).'

In [11]:
# Show the raw claim markup of the same row (label 25000) for comparison
df_cleantech_ep_en_pivot.loc[25000, 'CLAIM']

['<claim id="c-en-01-0001" num="0001"><claim-text>Clarification plant primarily for communal waste water,<claim-text>- by means of which the waste water can at least partly be filtered in tanks units (I, II, III) in a clarifying procedure with varying clarification steps when flowing stepwise through one tank after the other,</claim-text><claim-text>- whose tank volume, predetermined systematically by the clarification procedure, is split up variably into the tanks depending on the clarification steps, while all clarification steps except for one dispose of only one tank and one clarification step - practically the one with the biggest volume demand - is divided into two tanks,</claim-text><claim-text>- whose tanks (I, II, III/1, III/2) form a plant unit, whose construction is optimized to achieve an increased economic efficiency by a more favorable relation between wall surfaces and tank volume of all the tanks of the plant,</claim-text><claim-text>- with a partition wall (5) being ar

['<claim id="c-en-01-0001" num="0001"><claim-text>Clarification plant primarily for communal waste water,<claim-text>- by means of which the waste water can at least partly be filtered in tanks units (I, II, III) in a clarifying procedure with varying clarification steps when flowing stepwise through one tank after the other,</claim-text><claim-text>- whose tank volume, predetermined systematically by the clarification procedure, is split up variably into the tanks depending on the clarification steps, while all clarification steps except for one dispose of only one tank and one clarification step - practically the one with the biggest volume demand - is divided into two tanks,</claim-text><claim-text>- whose tanks (I, II, III/1, III/2) form a plant unit, whose construction is optimized to achieve an increased economic efficiency by a more favorable relation between wall surfaces and tank volume of all the tanks of the plant,</claim-text><claim-text>- with a partition wall (5) being arranged between the tanks (I, II, III/1, III/2) with a charging and by-pass canal (7) which can be divided into sections by means of shutoff devices (11),</claim-text><claim-text>- where each tank (I, II, III/1, III/2) can be by-passed via the canal (7) by means of rearranging the shutoff devices (11, 12), with the other tanks continuously remaining operable and maintaining the clarification procedure while being flown through one after the other by using the respective section of the canal,</claim-text><claim-text>- with each tank (I, II, III/1, III/2) being equipped with in- and outflows (12) which can be shut off and which are being laid out towards the canal as well as towards the partition wall to the respective neighboring tank,</claim-text><claim-text>- where in case of the by-passing of one tank the in- and outflows (12) of this tank are closed, while the relevant canal section (11) and the in- and outflows (12) of the neighboring tank are open.</claim-text><!-- EPO <DP n="10"> 
--></claim-text></claim><claim id="c-en-01-0002" num="0002"><claim-text>Clarification plant (4) according to claim 1 characterized in that at least the partition walls (5) between tanks that are arranged next to each other are constructed as double sheet version and that the charging and supply lines can be arranged between the sheets (6) of the walls (5).</claim-text></claim><claim id="c-en-01-0003" num="0003"><claim-text>Clarification plant (4) according to claims 1 and 2 characterized in that at least the partition walls (5) between tanks that are arranged next to each other are, as cross section, constructed in cellular form, and that the cells (6, 7, 8, 9, 10) can be used as transport lines to distribute the media into the different tanks and/or as tube canal for the supply lines.</claim-text></claim><claim id="c-en-01-0004" num="0004"><claim-text>Clarification plant (4) according to claim 3 characterized in that the cells (6, 7, 8, 9, 10) are accessible for a camera, for cleaning devices and for redevelopment devices.</claim-text></claim><claim id="c-en-01-0005" num="0005"><claim-text>Clarification plant (4) according to claims 3 and 4 characterized in that the cells (6, 7, 8, 9, 10) are passable at least in kneeling position.</claim-text></claim><claim id="c-en-01-0006" num="0006"><claim-text>Clarification plant (4) according to claims 3 to 5 characterized in that the cells (6, 7, 8, 9, 10) are made of concrete.</claim-text></claim><claim id="c-en-01-0007" num="0007"><claim-text>Clarification plant (4) according to claims 3 to 6 characterized in that openings (12) that can be shut off towards the tanks (I, II, III) and sliding gates (11), by means of which a cell line is dividable into sections for the transport of different media, are arranged in the cells (6, 7, 8, 9, 10).</claim-text></claim><claim id="c-en-01-0008" num="0008"><claim-text>Clarification plant (4) according to claims 1 to 7 characterized in that the mechanical devices can be regrouped 
within the tanks (I, II, III).</claim-text></claim>']

In [None]:
# Write the frame, including the 'cleaned_claims' column, to disk as JSON records.
# NOTE(review): the file name says "amend_claims", but the cleaning cell derives
# 'cleaned_claims' from the 'CLAIM' column (the 'AMEND' variant is commented out) -
# confirm the file name is still the intended one.
cleaned_json_path = '/mnt/hdd01/PATSTAT Working Directory/PATSTAT/cleantech_epo_text_data_pivot_amend_claims_cleaned.json'
df_cleantech_ep_en_pivot.to_json(cleaned_json_path, orient='records')