In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [2]:
# df_cleantech_ep = pd.read_csv('/mnt/hdd01/PATSTAT Working Directory/PATSTAT/Cleantech Patent Raw Data/cleantech_epo_text_data.csv')
df_cleantech_ep = pd.read_csv('/mnt/hdd01/patentsview/Non Cleantech Patents - Classifier Set/df_epo_non_cleantech_text_data.csv') # Non Cleantech Patents!!!

In [3]:
# Delete all rows where appln_lng != 'en' - only keep english texts
df_cleantech_ep_en = df_cleantech_ep[df_cleantech_ep['appln_lng'] == 'en']

In [4]:
# Delete initial df_cleantech_ep to save memory
del df_cleantech_ep

# Preprocessing of Dataframe

In [5]:
# Create a custom order for appln_kind
order = {'B9': 11, 'B8': 10, 'B3': 9, 'B2': 8, 'B1': 7, 'A9': 6, 'A8': 5, 'A4': 4, 'A3': 3, 'A2': 2, 'A1': 1}
df_cleantech_ep_en['kind_order'] = df_cleantech_ep_en['appln_kind'].map(order)

# Sort and group
df_cleantech_ep_en_sorted = df_cleantech_ep_en.sort_values(by=['appln_comp', 'publn_nr', 'kind_order'], ascending=[True, True, False])
df_cleantech_ep_en_filtered = df_cleantech_ep_en_sorted.groupby(['appln_comp', 'publn_nr']).first().reset_index()

# Drop the kind_order column
df_cleantech_ep_en_filtered = df_cleantech_ep_en_filtered.drop(columns='kind_order')

In [6]:
# Count unique publn_nr 
print('Number of unique publication numbers: ', len(df_cleantech_ep_en['publn_nr'].unique()))

Number of unique publication numbers:  181920


In [7]:
# Count values in appln_kind column
df_cleantech_ep_en_filtered['appln_kind'].value_counts()

appln_kind
B1    343392
A1      8373
B2      5861
B8      2306
B9      1880
A2       382
A3       200
B3       151
A9        16
A8         6
Name: count, dtype: int64

In [8]:
# Pivot dataframe by publn_nr and appln_comp (Text such as ABSTR, TITLE, CLAIM)
df_cleantech_ep_en_pivot = df_cleantech_ep_en_filtered.pivot_table(index='publn_nr', columns='appln_comp', values='appln_text', aggfunc=list)
# Reset index
df_cleantech_ep_en_pivot = df_cleantech_ep_en_pivot.reset_index()

In [10]:
df_cleantech_ep_en_pivot.head()

appln_comp,publn_nr,AMEND,CLAIM,TITLE
0,23,,"[<claim id=""c-en-01-0001"" num=""""><claim-text>1...","[Omega-substituted pentyl ureas, their prepara..."
1,31,,"[<claim id=""c-en-01-0001"" num=""""><claim-text>1...",[Thermal and sound isolating glass unit]
2,84,,"[<claim id=""c-en-01-0001"" num=""""><claim-text>1...",[Use of alpha-polyolefin compositions for extr...
3,159,,"[<claim id=""c-en-01-0001"" num=""""><claim-text>1...",[Method of treating butanediol obtained in the...
4,180,,"[<claim id=""c-en-01-0001"" num=""""><claim-text>1...",[Semiconductor cell structure for a bucket bri...


In [11]:
df_cleantech_ep_en_filtered.head()

Unnamed: 0.1,appln_comp,publn_nr,Unnamed: 0,appln_auth,appln_kind,appln_date,appln_lng,appln_text_type,appln_text,appln_id,publn_date
0,AMEND,14551,1134668,EP,A1,1980-08-20,en,1.0,"<heading id=""h0007"">Amended claims in accordan...",16452126.0,1980-08-20
1,AMEND,22871,1136351,EP,A1,1981-01-28,en,1.0,"<heading id=""h0027"">Amended claims in accordan...",16442612.0,1981-03-24
2,AMEND,22876,1159286,EP,A1,1981-01-28,en,1.0,"<heading id=""h0007"">Amended claims in accordan...",16459762.0,1986-08-27
3,AMEND,26237,1154241,EP,A1,1981-04-08,en,1.0,"<heading id=""h0009"">Amended claims in accordan...",16460252.0,1986-07-16
4,AMEND,48759,1156899,EP,A1,1982-04-07,en,1.0,"<heading id=""h0005"">Amended claims in accordan...",16460153.0,1982-07-12


In [12]:
# Merge dataframes and keep important information
df_cleantech_ep_en_pivot = df_cleantech_ep_en_pivot.merge(df_cleantech_ep_en_filtered[['publn_nr', 'appln_auth', 'appln_lng', 'appln_id']], on='publn_nr', how='left')
# Delete duplicate rows
df_cleantech_ep_en_pivot = df_cleantech_ep_en_pivot.drop_duplicates(subset=['publn_nr', 'appln_lng', 'appln_id'])

In [None]:
# Print number of rows where claim is not empty
# print('Number of rows where claim is not empty: ', len(df_cleantech_ep_en_pivot[df_cleantech_ep_en_pivot['CLAIM'].notnull()]))

In [None]:
# Save dataframe as json
df_cleantech_ep_en_pivot.to_json('/mnt/hdd01/PATSTAT Working Directory/PATSTAT/cleantech_epo_text_data_pivot.json', orient='records')

# Text Preprocessing

In [2]:
# Load data if needed
df_cleantech_ep_en_pivot = pd.read_json('/mnt/hdd01/PATSTAT Working Directory/PATSTAT/cleantech_epo_text_data_pivot.json', orient='records')

In [16]:
import re
import xml.etree.ElementTree as ET

def clean_claim_text_xml(claim_text):
    # If the input is a list, join it into a string
    if isinstance(claim_text, list):
        claim_text = ' '.join(claim_text)
    elif claim_text is None:
        return None

    # Remove all instances of <!--(.*?)-->
    claim_text = re.sub(r'<!--.*?-->', ' ', claim_text)

    # Surround the claim_text with a root tag to make it a valid XML
    claim_text = '<root>' + claim_text + '</root>'

    # Remove HTML tags with three or less characters between < and >
    # claim_text = re.sub(r'<.{0,5}>', ' ', claim_text)
    
    try:
        # Parse the claim_text as XML
        root = ET.fromstring(claim_text)
        
        # Extract all text from <claim-text> tags
        cleaned_texts = [elem.text for elem in root.findall('.//claim-text') if elem.text]

        # Remove claim numbers if they exist
        cleaned_texts = [re.sub(r'^\d+\.\s*', '', text) for text in cleaned_texts]

        # Join the cleaned texts
        cleaned_text = ' '.join(cleaned_texts)

    except ET.ParseError:
        return claim_text  # Return the original text if parsing fails
    
    return cleaned_text.strip()


# Clean claim text
# df_cleantech_ep_en_pivot['cleaned_amend_claims'] = df_cleantech_ep_en_pivot['AMEND'].progress_apply(clean_claim_text_xml)
df_cleantech_ep_en_pivot['cleaned_claims'] = df_cleantech_ep_en_pivot['CLAIM'].progress_apply(clean_claim_text_xml)

100%|██████████| 181920/181920 [00:13<00:00, 13336.79it/s]


In [None]:
from bs4 import BeautifulSoup

def clean_abstr_text_html(abstr_text):
    # If the input is a list, join it into a string
    if isinstance(abstr_text, list):
        abstr_text = ' '.join(abstr_text)
    elif abstr_text is None:
        return None

    # Remove all instances of <!--(.*?)-->
    abstr_text = re.sub(r'<!--.*?-->', ' ', abstr_text)

    try:
        # Parse the claim_text as HTML
        soup = BeautifulSoup(abstr_text, 'html.parser')
        
        # Extract all text from <p> tags
        cleaned_texts = [elem.text for elem in soup.find_all('p') if elem.text]

        # Remove claim numbers if they exist
        cleaned_texts = [re.sub(r'^\d+\.\s*', '', text) for text in cleaned_texts]

        # Join the cleaned texts
        cleaned_text = ' '.join(cleaned_texts)

    except Exception as e:
        return abstr_text  # Return the original text if parsing fails
    
    return cleaned_text.strip()

# Clean claim text
df_epo['cleaned_abstr'] = df_epo['ABSTR'].apply(clean_claim_text_html)

In [None]:
# Delete all rows where cleaned_amend_claims == None
# df_cleantech_ep_en_pivot = df_cleantech_ep_en_pivot[df_cleantech_ep_en_pivot['cleaned_amend_claims'].notnull()]
# df_cleantech_ep_en_pivot.reset_index(drop=True, inplace=True)

In [17]:
df_cleantech_ep_en_pivot['cleaned_claims'][25000]

'Method of manufacturing a construction (7) for sound insulation, in which one foam backs a semi-finished product with plastics material in a mould, the semi-finished product consisting of a decorative covering (1) which is either itself foam-tight or is provided directly at its rear side with a foam-tight foil lining (2), Method of manufacturing a construction for sound insulation according to claim 1, Method of manufacturing a construction for sound insulation according to claim 1 or 2, Method of manufacturing a construction for sound insulation according to any of claims 1 to 3, Method of manufacturing a construction for sound insulation according to any of claims 1 to 4, Method of manufacturing a construction for sound insulation according to any of claims 1 to 5, Method of manufacturing a construction for sound insulation in accordance with any of claims 1 to 5, Method of manufacturing a construction for sound insulation according to any of claims 1 to 5, Method of manufacturing a

'Method of manufacturing a construction (7) for sound insulation, in which one foam backs a semi-finished product with plastics material in a mould, the semi-finished product consisting of a decorative covering (1) which is either itself foam-tight or is provided directly at its rear side with a foam-tight foil lining (2), Method of manufacturing a construction for sound insulation according to claim 1, Method of manufacturing a construction for sound insulation according to claim 1 or 2, Method of manufacturing a construction for sound insulation according to any of claims 1 to 3, Method of manufacturing a construction for sound insulation according to any of claims 1 to 4, Method of manufacturing a construction for sound insulation according to any of claims 1 to 5, Method of manufacturing a construction for sound insulation in accordance with any of claims 1 to 5, Method of manufacturing a construction for sound insulation according to any of claims 1 to 5, Method of manufacturing a construction for sound insulation according to any of claims 1 to 8, Method of manufacturing a construction for sound insulation according to claim 9, Method of manufacturing a construction for sound insulation in accordance with any of claims 1 to 10, Construction for sound insulation, consisting of a decorative covering (1) which is either itself foam-tight or is provided directly at its rear side with a foam-tight foil lining (2), and an adjoining foam backing, Construction for sound insulation according to claim 12, Construction for sound insulation according to claim 12 or 13, Construction for sound insulation according to any of claims 12 to 14, Construction for sound insulation according to either of claims 12 and 14, Construction for sound insulation according to any of claims 12 to 16, Construction for sound insulation according to any of claims 12 to 17, Construction for sound insulation according to any of claims 12 to 17, Construction for sound insulation according to any of claims 12 to 17, Construction for sound insulation according to any of claims 12 to 20, Construction for sound insulation according to any of claims 12 to 21, Construction for sound insulation according to any of claims 12 to 22, Construction for sound insulation according to any of claims 12 to 23, Use of the construction for sound insulation according to any of claims 12 to 24 for the sound insulation of disruptive acoustic sources. Use of the construction for sound insulation in accordance with any of claims 12 to 24 for sound insulation in vehicles, in particular motor vehicles. Use of the construction for sound insulation in accordance with any of claims 12 to 23 as a sound  insulation part laid loose and form-fitting in a motor vehicle. Use of the construction for sound insulation according to any of claims 12 to 23 as a sound insulation part laid glued and form-fitting in a motor vehicle. Use of the construction according to any of claims 12 to 24 as a sound insulation part laid adhesively and form-fittingly in a motor vehicle. Use of the construction according to any of claims 12 to 24 for the sound insulation of motor vehicle passenger compartments. Use of the construction according to any of claims 12 to 24 for the sound insulation of motor vehicle engine compartments.'

In [18]:
df_cleantech_ep_en_pivot['CLAIM'][25000]

'[\'<claim id="c-en-01-0001" num="0001"><claim-text>Method of manufacturing a construction (7) for sound insulation, in which one foam backs a semi-finished product with plastics material in a mould, the semi-finished product consisting of a decorative covering (1) which is either itself foam-tight or is provided directly at its rear side with a foam-tight foil lining (2),<br/>characterised in that,<br/>one provides the decorative covering (1), if it is foam tight, or the foam-tight foil lining (2) of the decorative covering (1) with structured materials on the rear side, and<br/>in that one then foams into a closed mould, in a single working operation, for forming a cell-free or cell-poor consolidated layer (3) of plastics material for the formation of an acoustic heavy layer into the structured materials and for forming a cell-containing elastic soft foam layer (5) of plastics material adjoining thereon, for the formation of an acoustic spring, whereby for the production both of the 

'[\'<claim id="c-en-01-0001" num="0001"><claim-text>Method of manufacturing a construction (7) for sound insulation, in which one foam backs a semi-finished product with plastics material in a mould, the semi-finished product consisting of a decorative covering (1) which is either itself foam-tight or is provided directly at its rear side with a foam-tight foil lining (2),<br/>characterised in that,<br/>one provides the decorative covering (1), if it is foam tight, or the foam-tight foil lining (2) of the decorative covering (1) with structured materials on the rear side, and<br/>in that one then foams into a closed mould, in a single working operation, for forming a cell-free or cell-poor consolidated layer (3) of plastics material for the formation of an acoustic heavy layer into the structured materials and for forming a cell-containing elastic soft foam layer (5) of plastics material adjoining thereon, for the formation of an acoustic spring, whereby for the production both of the consolidated layer (3) and the elastic soft foam layer (5) a plastics material having the same chemical composition is employed,<br/>whereby the formed elastic foam has a loss factor of at least 0.2 and up to 0.6,<br/>whereby a non-woven fabric and/or an open-pored cut foam is employed as structured materials, whereby in the case of a non-woven fabric the mass per unit area lies between 100 and 1000 g/m<sup>2</sup>, and<br/>whereby the arrangement of consolidated layer (3) and decorative covering (1), or of consolidated layer (3), decorative covering (1) and foil lining (2) have masses per unit area of less than 2000 g/m<sup>2</sup> and the consolidated layer (3) alone has a mass per unit area of less than 1100 g/m<sup>2</sup>.</claim-text></claim><claim id="c-en-01-0002" num="0002"><claim-text>Method of manufacturing a construction for sound insulation according to claim 1,<br/><!-- EPO <DP n="19"> -->characterised in that,<br/>one arranges, between the consolidated layer and the soft foam layer, a foam penetrable non-woven covering for retaining loose structured materials.</claim-text></claim><claim id="c-en-01-0003" num="0003"><claim-text>Method of manufacturing a construction for sound insulation according to claim 1 or 2,<br/>characterised in that,<br/>for acoustic decoupling of the decorative covering and the consolidated layer, one arranges a soft intermediate layer between them.</claim-text></claim><claim id="c-en-01-0004" num="0004"><claim-text>Method of manufacturing a construction for sound insulation according to any of claims 1 to 3,<br/>characterised in that,<br/>one provides the elastic soft foam layer with an adhesive coating, in order to achieve a bonded connection between the foam of the construction and the part to be insulated.</claim-text></claim><claim id="c-en-01-0005" num="0005"><claim-text>Method of manufacturing a construction for sound insulation according to any of claims 1 to 4,<br/>characterised in that,<br/>there is employed a non-woven fabric having a mass per unit area of 180 g/m<sup>2</sup>.</claim-text></claim><claim id="c-en-01-0006" num="0006"><claim-text>Method of manufacturing a construction for sound insulation according to any of claims 1 to 5,<br/>characterised in that,<br/>as non-woven fabric there is employed an acrylic fibre non-woven fabric.</claim-text></claim><claim id="c-en-01-0007" num="0007"><claim-text>Method of manufacturing a construction for sound insulation in accordance with any of claims 1 to 5,<br/>characterised in that,<br/>there is employed a non-woven fabric of polyester fibres which is consolidated thermally or by a binder, and if appropriate is silicone treated.<!-- EPO <DP n="20"> --></claim-text></claim><claim id="c-en-01-0008" num="0008"><claim-text>Method of manufacturing a construction for sound insulation according to any of claims 1 to 5,<br/>characterised in that,<br/>there is employed as non-woven fabric a polyethylene non-woven fabric.</claim-text></claim><claim id="c-en-01-0009" num="0009"><claim-text>Method of manufacturing a construction for sound insulation according to any of claims 1 to 8,<br/>characterised in that,<br/>one produces the consolidated layer and the soft elastic foam layer in that for the foaming one employs a physical foaming agent.</claim-text></claim><claim id="c-en-01-0010" num="0010"><claim-text>Method of manufacturing a construction for sound insulation according to claim 9,<br/>characterised in that,<br/>one employs methylene chloride as physical foaming agent.</claim-text></claim><claim id="c-en-01-0011" num="0011"><claim-text>Method of manufacturing a construction for sound insulation in accordance with any of claims 1 to 10,<br/>characterised in that,<br/>one brings the decorative covering, with an acrylic fibre non-woven fabric, a consolidated polyester fibre non-woven fabric or a polyethylene non-woven fabric, if appropriate with a corresponding foam-tight lining towards the decorative covering, under a lining press, effects heating, and introduces the composite into the mould tool, heats it to an object temperature of 160-165°C and then effects foam backing.</claim-text></claim><claim id="c-en-01-0012" num="0012"><claim-text>Construction for sound insulation, consisting of a decorative covering (1) which is either itself foam-tight or is provided directly at its rear side with a foam-tight foil lining (2), and an adjoining foam backing,<br/>in particular manufactured in accordance with a method according to any of claims 1 to 12,<br/>characterised in that,<br/><!-- EPO <DP n="21"> -->the foam backing consists of a cell-free or cell-poor consolidated layer of plastics material (3) - adjoining the decorative covering (1) if it is foam-tight, or adjoining the foam-tight foil lining (2) of the decorative covering (1), foamed into structured materials provided there on the reverse side - and a cell-containing elastic soft foam layer (5) of plastics material adjoining the consolidated layer (3),<br/>whereby both the plastics material of the consolidated layer (3) and the plastics material of the elastic soft foam layer (5) have the same chemical composition, whereby the elastic foam formed has a loss factor of at least 0.2 up to 0.6,<br/>whereby there is employed as structured materials a non-woven fabric and/or and an open-pored cut foam, whereby in the case of a non-woven fabric the mass per unit area lies between 100 and 1000 g/m<sup>2</sup>, and whereby the arrangement of the consolidated layer (3) and decorative covering (1), or of consolidated layer (3), decorative covering (1) and foil lining (2), have masses per unit area of less than 2000g/m<sup>2</sup> and the consolidated layer (3) alone has a mass per unit area of less than 1100 g/m<sup>2</sup>.</claim-text></claim><claim id="c-en-01-0013" num="0013"><claim-text>Construction for sound insulation according to claim 12,<br/>characterised in that,<br/>the soft elastic foam layer (5) and the consolidated layer (3) are of an air-permeable or open-celled light foam on in particular two component polyurethane basis.</claim-text></claim><claim id="c-en-01-0014" num="0014"><claim-text>Construction for sound insulation according to claim 12 or 13,<br/>characterised in that,<br/>the consolidated layer (3) is such that with significantly reduced mass per unit area it has almost the same acoustic quality as a known soft bending heavy layer in the spring-mass-system.<!-- EPO <DP n="22"> --></claim-text></claim><claim id="c-en-01-0015" num="0015"><claim-text>Construction for sound insulation according to any of claims 12 to 14,<br/>characterised in that,<br/>the consolidated layer (3) is resistant to bending.</claim-text></claim><claim id="c-en-01-0016" num="0016"><claim-text>Construction for sound insulation according to either of claims 12 and 14,<br/>characterised in that,<br/>the elastic soft foam layer (5) has a substantially lesser density of the foam than a foam layer in a conventional spring-mass-system.</claim-text></claim><claim id="c-en-01-0017" num="0017"><claim-text>Construction for sound insulation according to any of claims 12 to 16,<br/>characterised in that,<br/>there is employed a non-woven fabric having a mass per unit area of about 180 mg<sup>-2</sup>.</claim-text></claim><claim id="c-en-01-0018" num="0018"><claim-text>Construction for sound insulation according to any of claims 12 to 17,<br/>characterised in that,<br/>as non-woven fabric there is employed an acrylic fibre non-woven fabric.</claim-text></claim><claim id="c-en-01-0019" num="0019"><claim-text>Construction for sound insulation according to any of claims 12 to 17,<br/>characterised in that,<br/>there is employed a non-woven fabric of polyester fibres, which is consolidated thermally or by a binder, and if appropriate is silicone treated.</claim-text></claim><claim id="c-en-01-0020" num="0020"><claim-text>Construction for sound insulation according to any of claims 12 to 17,<br/>characterised in that,<br/>a polyethylene non-woven fabric is employed as non-woven fabric.<!-- EPO <DP n="23"> --></claim-text></claim><claim id="c-en-01-0021" num="0021"><claim-text>Construction for sound insulation according to any of claims 12 to 20,<br/>characterised in that,<br/>for the stability of shape, and the ability to be processed further, of the arrangement having the structured materials, there is arranged between the structured materials and the foam layer (5) a foam-penetrable non-woven fabric covering (4).</claim-text></claim><claim id="c-en-01-0022" num="0022"><claim-text>Construction for sound insulation according to any of claims 12 to 21,<br/>characterised in that,<br/>for acoustic decoupling of the decorative covering (1) and the consolidated layer (3) there is arranged between them an intermediate layer.</claim-text></claim><claim id="c-en-01-0023" num="0023"><claim-text>Construction for sound insulation according to any of claims 12 to 22,<br/>characterised in that,<br/>the soft elastic foam has a loss factor of at least 0.2.</claim-text></claim><claim id="c-en-01-0024" num="0024"><claim-text>Construction for sound insulation according to any of claims 12 to 23,<br/>characterised in that,<br/>the elastic soft foam layer (5) has an adhesive coating (9).</claim-text></claim><claim id="c-en-01-0025" num="0025"><claim-text>Use of the construction for sound insulation according to any of claims 12 to 24 for the sound insulation of disruptive acoustic sources.</claim-text></claim><claim id="c-en-01-0026" num="0026"><claim-text>Use of the construction for sound insulation in accordance with any of claims 12 to 24 for sound insulation in vehicles, in particular motor vehicles.</claim-text></claim><claim id="c-en-01-0027" num="0027"><claim-text>Use of the construction for sound insulation in accordance with any of claims 12 to 23 as a sound<!-- EPO <DP n="24"> --> insulation part laid loose and form-fitting in a motor vehicle.</claim-text></claim><claim id="c-en-01-0028" num="0028"><claim-text>Use of the construction for sound insulation according to any of claims 12 to 23 as a sound insulation part laid glued and form-fitting in a motor vehicle.</claim-text></claim><claim id="c-en-01-0029" num="0029"><claim-text>Use of the construction according to any of claims 12 to 24 as a sound insulation part laid adhesively and form-fittingly in a motor vehicle.</claim-text></claim><claim id="c-en-01-0030" num="0030"><claim-text>Use of the construction according to any of claims 12 to 24 for the sound insulation of motor vehicle passenger compartments.</claim-text></claim><claim id="c-en-01-0031" num="0031"><claim-text>Use of the construction according to any of claims 12 to 24 for the sound insulation of motor vehicle engine compartments.</claim-text></claim>\']'

In [19]:
# df_cleantech_ep_en_pivot.to_json('/mnt/hdd01/PATSTAT Working Directory/PATSTAT/cleantech_epo_text_data_pivot_amend_claims_cleaned.json', orient='records')
df_cleantech_ep_en_pivot.to_json('/mnt/hdd01/patentsview/Non Cleantech Patents - Classifier Set/df_epo_non_cleantech_text_data_pivot_claims_cleaned.json', orient='records')