In [2]:
import requests
import time
import json

#### DOWNLOAD SPECIFIC PATENT LENS.ORG

In [27]:
# API URL for patent search
url = 'https://api.lens.org/patent/search'
usage_url = "https://api.lens.org/subscriptions/patent_api/usage"

In [23]:
# API headers, include your API token
headers = {
    'Authorization': '63D1me3aSbQrHfw3lz1RzFbTnZprrK3Bujsv5OQe8YsUmwLxlh2Z',  # Replace with your actual Lens.org API token
    'Content-Type': 'application/json'
}

In [None]:
def check_usage():
    """Checks the current API usage to monitor monthly and record limits."""
    response = requests.get(usage_url, headers=headers)
    if response.status_code == 200:
        usage_data = response.json()
        print("API Usage:", usage_data)
    else:
        print("Failed to retrieve usage data:", response.status_code, response.text)

# Usage check at the start
check_usage()

In [24]:
# Include fields - adding 'citations' to the valid fields based on the documentation
include = '''[
    "biblio",
    "doc_key",
    "claims",
    "description"
]'''


In [25]:
query = {
    "query": {
        "bool": {
            "must": [
                {"terms": {"doc_number": ["4163491"]}},   # Patent document number
                {"term": {"jurisdiction": "EP"}},        # Jurisdiction (Europe)
                {"match": {"cites_patent": 'true'}}     # Matching patents that have been cited by other patents
            ]
        }
    },
    "size": 10,
    "include": ["lens_id", "biblio", "abstract", "claims", "description"]   # Fields to include in the response
}
# Make the POST request to the Lens API
response = requests.post(url, headers=headers, data=json.dumps(query))

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()
    print("Patent Query Results:")
    print(json.dumps(data, indent=2))
else:
    # Print error message if the request fails
    print(f"Error: Unable to fetch patent data. Status Code: {response.status_code}")
    print(response.text)

Error: Unable to fetch patent data. Status Code: 429
{"reference":"20c175be-b77b-4168-b57d-f758d6512baf","message":"Too many records. Allowed '100000' per 'month'","code":429}


#### IMPORT DATA

In [3]:

# Load data from the JSON file
with open("patents_data_v2_3_final1.json", "r") as json_file:
    data = json.load(json_file)

# Count the number of entries
num_entries = len(data)
print(f"The file contains {num_entries} patent entries.")

# Initialize counters
count_with_description = 0
count_with_citations = 0

# Loop through each entry
for entry in data:
    # Check for description
    has_description = (
        "claims" in entry and len(entry.get("claims", [])) > 0
    )
    
    # Check for citations
    references_cited = entry.get("biblio", {}).get("references_cited", {})
    has_citations = (
        "citations" in references_cited and len(references_cited.get("citations", [])) > 0
    )
    
    # Update counters
    if has_description:
        count_with_description += 1
    if has_citations:
        count_with_citations += 1

# Display counts
print(f"Entries with descriptions: {count_with_description}")
print(f"Entries with citations: {count_with_citations}")

The file contains 53411 patent entries.
Entries with descriptions: 53411
Entries with citations: 53411


In [38]:
# Count the number of entries
num_entries = len(data)
print(f"The file contains {num_entries} patent entries.")

# Print the first entry
first_entry = data[2]
print(first_entry)

The file contains 53411 patent entries.
{'lens_id': '000-374-150-323-590', 'doc_key': 'EP_2458201_A1_20120530', 'biblio': {'publication_reference': {'jurisdiction': 'EP', 'doc_number': '2458201', 'kind': 'A1', 'date': '2012-05-30'}, 'application_reference': {'jurisdiction': 'EP', 'doc_number': '11190476', 'kind': 'A', 'date': '2011-11-24'}, 'priority_claims': {'claims': [{'jurisdiction': 'DK', 'doc_number': 'PA201070508', 'kind': 'A', 'date': '2010-11-26', 'sequence': 1}, {'jurisdiction': 'US', 'doc_number': '41733810', 'kind': 'P', 'date': '2010-11-26', 'sequence': 2}], 'earliest_claim': {'date': '2010-11-26'}}, 'invention_title': [{'text': 'Windturbine mit hydraulischem Blattwinkelverstellsystem', 'lang': 'de'}, {'text': 'Wind turbine with hydraulic blade pitch system', 'lang': 'en'}, {'text': "Éolienne dotée d'un système de pas hydraulique", 'lang': 'fr'}], 'parties': {'applicants': [{'residence': 'DK', 'extracted_name': {'value': 'VESTAS WIND SYS AS'}}], 'inventors': [{'residence':

In [6]:
# Print publication reference details
pub_ref = data[2]['biblio']['publication_reference']
print("Publication Reference:", pub_ref)

Publication Reference: {'jurisdiction': 'EP', 'doc_number': '2458201', 'kind': 'A1', 'date': '2012-05-30'}


In [8]:
invention_titles = data[2]['biblio']['invention_title']
print("Invention Titles:")
for title in invention_titles:
    print(f" - {title['text']} [{title['lang']}]")


invention_titles = data[2]['biblio']['invention_title']
print("Invention Titles:")
for title in invention_titles:
    if title['lang'] == 'en':  # Check if the language is English
        print(f" - {title['text']} [{title['lang']}]")
        

Invention Titles:
 - Windturbine mit hydraulischem Blattwinkelverstellsystem [de]
 - Wind turbine with hydraulic blade pitch system [en]
 - Éolienne dotée d'un système de pas hydraulique [fr]
Invention Titles:
 - Wind turbine with hydraulic blade pitch system [en]


#### CITATIONS

In [33]:
# Print references cited
references_cited = data[2]['biblio']['references_cited']
print("References Cited:")

# Print the type and contents of references_cited to debug
print(f"Type of references_cited: {type(references_cited)}")
print("Contents of references_cited:")
print(references_cited)

# Check if references_cited contains the 'citations' key and if it's a dict
if isinstance(references_cited, dict) and 'citations' in references_cited:
    citations = references_cited['citations']
    
    if isinstance(citations, list):
        for index, citation in enumerate(citations):
            # Extract the details from each citation
            sequence = citation.get('sequence', 'No sequence')
            patent_info = citation.get('patcit', {})
            document_id = patent_info.get('document_id', {})
            jurisdiction = document_id.get('jurisdiction', 'No jurisdiction')
            doc_number = document_id.get('doc_number', 'No doc number')
            kind = document_id.get('kind', 'No kind')
            date = document_id.get('date', 'No date')
            lens_id = patent_info.get('lens_id', 'No lens ID')
            cited_phase = citation.get('cited_phase', 'No cited phase')
            category = citation.get('category', [])

            # Check if category is empty or not and format accordingly
            category_display = ', '.join(category) if category else ''

            # Print the citation details including the category
            print(f"{index + 1}. Sequence: {sequence}, Patent: {jurisdiction}{doc_number}{kind}, {date}, "
                  f"Lens ID: {lens_id}, Phase: {cited_phase}, Category: {category_display}")
    else:
        print("Citations is not a list.")
else:
    print("References cited is not a dictionary or does not contain 'citations'.")


References Cited:
Type of references_cited: <class 'dict'>
Contents of references_cited:
{'citations': [{'sequence': 1, 'patcit': {'document_id': {'jurisdiction': 'DE', 'doc_number': '19948997', 'kind': 'A1', 'date': '2001-04-19'}, 'lens_id': '113-739-326-508-211'}, 'category': ['I'], 'cited_phase': 'SEA'}, {'sequence': 2, 'patcit': {'document_id': {'jurisdiction': 'US', 'doc_number': '2007217912', 'kind': 'A1', 'date': '2007-09-20'}, 'lens_id': '120-349-166-665-84X'}, 'category': ['Y', 'D'], 'cited_phase': 'SEA'}, {'sequence': 3, 'patcit': {'document_id': {'jurisdiction': 'US', 'doc_number': '2008219846', 'kind': 'A1', 'date': '2008-09-11'}, 'lens_id': '095-125-736-742-070'}, 'category': ['Y'], 'cited_phase': 'SEA'}, {'sequence': 4, 'patcit': {'document_id': {'jurisdiction': 'DE', 'doc_number': '3009922', 'kind': 'A1', 'date': '1981-09-24'}, 'lens_id': '097-716-736-663-839'}, 'category': ['A'], 'cited_phase': 'SEA'}, {'sequence': 5, 'patcit': {'document_id': {'jurisdiction': 'DE', 'do

#### SEARCHING

In [55]:
# Example lens_id you are searching for
search_lens_id = '000-041-469-766-603'

# Search for the dictionary where 'lens_id' matches the search value
matching_entry = next((entry for entry in data if entry.get('lens_id') == search_lens_id), None)

# Check if a match was found and print the result
if matching_entry:
    print("Match found:")
    print(matching_entry)
else:
    print("No match found for the specified lens_id.")

Match found:
{'lens_id': '000-041-469-766-603', 'doc_key': 'WO_2010103086_A2_20100916', 'biblio': {'publication_reference': {'jurisdiction': 'WO', 'doc_number': '2010103086', 'kind': 'A2', 'date': '2010-09-16'}, 'application_reference': {'jurisdiction': 'EP', 'doc_number': '2010053133', 'kind': 'W', 'date': '2010-03-11'}, 'priority_claims': {'claims': [{'jurisdiction': 'SE', 'doc_number': '0950152', 'kind': 'A', 'date': '2009-03-13', 'sequence': 1}, {'jurisdiction': 'US', 'doc_number': '18618209', 'kind': 'P', 'date': '2009-06-11', 'sequence': 2}], 'earliest_claim': {'date': '2009-03-13'}}, 'invention_title': [{'text': 'BLADE MOUNTING', 'lang': 'en'}, {'text': 'MONTAGE DE PALE', 'lang': 'fr'}], 'parties': {'applicants': [{'residence': 'NO', 'extracted_name': {'value': 'GE WIND ENERGY NORWAY AS'}}, {'residence': 'SE', 'extracted_name': {'value': 'LINDBERG MIKAEL'}}, {'residence': 'SE', 'extracted_name': {'value': 'HEDBERG JOHAN'}}], 'inventors': [{'residence': 'SE', 'sequence': 1, 'extr

In [19]:
# Define search criteria
search_jurisdiction = 'DE'
search_doc_number = '19948997'

# Search for entries that match the criteria in either publication_reference or application_reference
matching_entries = [
    entry for entry in data
    if (
        # Check publication_reference for match
        (entry.get('biblio', {}).get('publication_reference', {}).get('jurisdiction') == search_jurisdiction
         and entry.get('biblio', {}).get('publication_reference', {}).get('doc_number') == search_doc_number)
        or
        # Check application_reference for match
        (entry.get('biblio', {}).get('application_reference', {}).get('jurisdiction') == search_jurisdiction
         and entry.get('biblio', {}).get('application_reference', {}).get('doc_number') == search_doc_number)
    )
]

# Print results
if matching_entries:
    print("Matches found:")
    for match in matching_entries:
        print(match)
else:
    print("No matches found for the specified criteria.")


No matches found for the specified criteria.


#### CLAIMS

In [4]:
# Access the second entry in the data list
entry = data[1]

# Check if 'claims' field is present and is a list
claims_data = entry.get('claims')

# If claims data exists and is a list, print the claim texts
if isinstance(claims_data, list):
    print("Claims:")
    for claim in claims_data:
        # Ensure 'claims' inside each claim is also a list
        inner_claims = claim.get('claims', [])
        for inner_claim in inner_claims:
            # Extract 'claim_text' if it's a list
            claim_text = inner_claim.get('claim_text', [])
            for text in claim_text:
                print(f" - {text}")
else:
    print("No claims found in data[1].")

Claims:
 - 1 . A speed reducer used in a yaw drive apparatus of a wind power generation apparatus, comprising: a first stage speed reducing portion, a second stage speed reducing portion connected to the first speed reducing portion, and a third stage speed reducing portion, wherein a total reduction gear ratio of a first stage speed reducing portion and a second speed reducing portion is set to 1/6 to 1/60, the third stage speed reducing portion is constructed by an eccentric oscillating type speed reduction mechanism comprising an internal gear member in which internal teeth are formed at the internal periphery thereof, a plurality of external gears which are received in the internal gear member, which have external teeth engaged with the internal teeth and having number of teeth slightly less than that of the internal teeth at the external periphery thereof, and which are disposed in parallel to each other in the axial direction, a plurality of crank shafts which are rotatably inser

#### DESCRIPTION

In [39]:
# Access the second entry in the data list
entry = data[2]

# Check if 'description' field is present and contains 'text'
description_data = entry.get('description', {}).get('text')

# Print the description text if it exists
if description_data:
    print("Description:")
    print(f"{description_data}")
else:
    print("No description found in data[1].")

Description:
The present invention relates to a wind turbine have a rotor with adjustable blades and a hydraulic blade pitch system for controlling the pitch angle of the blade. BACKGROUND Wind turbines having blade pitch systems for adjusting the pitch angle of the blades have been known for many years, typically employing electric pitch drives or hydraulic pitch drives. Early examples of hydraulic pitch drives are disclosed e.g. in  US 4,348,155  and  US 4,352,634 , both of United Technologies. The hydraulic valves used to control the flow of liquid to the linear hydraulic actuators are disclosed as switch valves, i.e. valves operated in an on-off mode and the pitching torque is not adjustable. In  EP 1 835 174 B1  of Robert Bosch GmbH is disclosed a wind turbine having a hydraulic pitch drive where the flow of liquid is controlled by means of a number of switch valves are arranged in parallel and are operable in different combinations in order to provide what is known as digital hyd

#### PANDAS

In [4]:
import pandas as pd

In [34]:
lens_id = entry.get('lens_id')

In [25]:
entry = data[2]

# Check if 'description' field is present and contains 'text'
description_data = entry.get('description', {}).get('text')

In [None]:
claims_data = entry.get('claims')
first_claim = claims_data[0]
inner_claims = first_claim.get('claims', [])
first_inner_claim = inner_claims[0]  # Access the first inner claim
claim_text = first_inner_claim.get('claim_text', [])
claim_text[0]

'1. An arrangement for in situ mounting of rotor blades to a rotor hub of a wind power plant comprising rotor blades, a rotor hub (40), a disc (1), a means for rotation (30) of the disc (1) around its axis, the disc (1) being in connected to rotor hub (40) in such a way that when the disc (1) rotates a certain degree of angle around its axis (3) it will cause the rotor hub (40) to rotate the same degree of angle around the axis of the rotor hub, the disc (1) being provided with a connecting device (2), characterized in that the disc (1) is caused to rotate by the means for rotation (30) and when the means for rotation (30) comes into a desired position, the disc (1) is secured by a securing means (20; 36, 37).'

In [31]:
invention_titles = entry['biblio']['invention_title']
for title in invention_titles:
    if title['lang'] == 'en':  # Check if the language is English
        title_data = title['text']

In [37]:
references_cited = entry['biblio']['references_cited']
citations = references_cited['citations']
for index, citation in enumerate(citations):
    # Extract the details from each citation
    sequence = citation.get('sequence', 'No sequence')
    patent_info = citation.get('patcit', {})
    document_id = patent_info.get('document_id', {})
    jurisdiction = document_id.get('jurisdiction', 'No jurisdiction')
    doc_number = document_id.get('doc_number', 'No doc number')
    kind = document_id.get('kind', 'No kind')
    date = document_id.get('date', 'No date')
    lens_id = patent_info.get('lens_id', 'No lens ID')
    cited_phase = citation.get('cited_phase', 'No cited phase')
    category = citation.get('category', [])

    # Check if category is empty or not and format accordingly
    category_display = ', '.join(category) if category else ''

    # Print the citation details including the category
    print(f"{index + 1}. Sequence: {sequence}, Patent: {jurisdiction}{doc_number}{kind}, {date}, "
            f"Lens ID: {lens_id}, Phase: {cited_phase}, Category: {category_display}")

1. Sequence: 1, Patent: DE19948997A1, 2001-04-19, Lens ID: 113-739-326-508-211, Phase: SEA, Category: I
2. Sequence: 2, Patent: US2007217912A1, 2007-09-20, Lens ID: 120-349-166-665-84X, Phase: SEA, Category: Y, D
3. Sequence: 3, Patent: US2008219846A1, 2008-09-11, Lens ID: 095-125-736-742-070, Phase: SEA, Category: Y
4. Sequence: 4, Patent: DE3009922A1, 1981-09-24, Lens ID: 097-716-736-663-839, Phase: SEA, Category: A
5. Sequence: 5, Patent: DE20317749U1, 2005-03-24, Lens ID: 002-201-899-099-786, Phase: SEA, Category: A
6. Sequence: 1, Patent: US4348155A, 1982-09-07, Lens ID: 116-012-533-052-482, Phase: APP, Category: 
7. Sequence: 2, Patent: US4352634A, 1982-10-05, Lens ID: 073-296-243-883-164, Phase: APP, Category: 
8. Sequence: 3, Patent: EP1835174B1, 2009-06-24, Lens ID: 160-054-295-218-424, Phase: APP, Category: 
9. Sequence: 4, Patent: EP1533520A1, 2005-05-25, Lens ID: 028-088-548-890-500, Phase: APP, Category: 
10. Sequence: 5, Patent: EP2072815A1, 2009-06-24, Lens ID: 030-002-0

In [None]:
entry = data[2]
#LENS ID
lens_id = entry.get('lens_id') #THIS
#DESC
description_data = entry.get('description', {}).get('text') #THIS
#CLAIMS
claims_data = entry.get('claims')
first_claim = claims_data[0]
inner_claims = first_claim.get('claims', [])
first_inner_claim = inner_claims[0]  # Access the first inner claim
claim_text = first_inner_claim.get('claim_text', [])
claim_data = claim_text[0] #THIS
#TITLE
invention_titles = entry['biblio']['invention_title']
for title in invention_titles:
    if title['lang'] == 'en':  # Check if the language is English
        title_data = title['text'] #THIS
#CITATIONS
references_cited = entry['biblio']['references_cited']
citations = references_cited['citations']



for index, citation in enumerate(citations):
    # Extract the details from each citation
    sequence = citation.get('sequence', 'No sequence')
    patent_info = citation.get('patcit', {})
    document_id = patent_info.get('document_id', {})
    jurisdiction = document_id.get('jurisdiction', 'No jurisdiction')
    doc_number = document_id.get('doc_number', 'No doc number')
    kind = document_id.get('kind', 'No kind')
    date = document_id.get('date', 'No date')
    lens_id_citation = patent_info.get('lens_id', 'No lens ID')
    cited_phase = citation.get('cited_phase', 'No cited phase')
    category = citation.get('category', [])
    doc_number_jur = jurisdiction + doc_number

    # Check if category is empty or not and format accordingly
    category_display = ', '.join(category) if category else ''
    df_temp_citations = pd.DataFrame([{
        'lens_id': lens_id,
        'claim': claim_data,
        'title': title_data,
        'description': description_data,
        'citation_doc_number': doc_number_jur, 
        'citation_lens_id': lens_id_citation, 
        'citation_phase': cited_phase, 
        'citation_category':category_display
    }])

    df_temp_all = pd.concat([df_temp_all, df_temp_citations], ignore_index=True)


In [45]:
df_temp_all

Unnamed: 0,lens_id,claim,title,description,citation_doc_number,citation_lens_id,citation_phase,citation_category
0,000-374-150-323-590,A wind turbine (1) having a rotor (4) with at ...,Wind turbine with hydraulic blade pitch system,The present invention relates to a wind turbin...,DE19948997,113-739-326-508-211,SEA,I
1,000-374-150-323-590,A wind turbine (1) having a rotor (4) with at ...,Wind turbine with hydraulic blade pitch system,The present invention relates to a wind turbin...,US2007217912,120-349-166-665-84X,SEA,"Y, D"
2,000-374-150-323-590,A wind turbine (1) having a rotor (4) with at ...,Wind turbine with hydraulic blade pitch system,The present invention relates to a wind turbin...,US2008219846,095-125-736-742-070,SEA,Y
3,000-374-150-323-590,A wind turbine (1) having a rotor (4) with at ...,Wind turbine with hydraulic blade pitch system,The present invention relates to a wind turbin...,DE3009922,097-716-736-663-839,SEA,A
4,000-374-150-323-590,A wind turbine (1) having a rotor (4) with at ...,Wind turbine with hydraulic blade pitch system,The present invention relates to a wind turbin...,DE20317749,002-201-899-099-786,SEA,A
5,000-374-150-323-590,A wind turbine (1) having a rotor (4) with at ...,Wind turbine with hydraulic blade pitch system,The present invention relates to a wind turbin...,US4348155,116-012-533-052-482,APP,
6,000-374-150-323-590,A wind turbine (1) having a rotor (4) with at ...,Wind turbine with hydraulic blade pitch system,The present invention relates to a wind turbin...,US4352634,073-296-243-883-164,APP,
7,000-374-150-323-590,A wind turbine (1) having a rotor (4) with at ...,Wind turbine with hydraulic blade pitch system,The present invention relates to a wind turbin...,EP1835174,160-054-295-218-424,APP,
8,000-374-150-323-590,A wind turbine (1) having a rotor (4) with at ...,Wind turbine with hydraulic blade pitch system,The present invention relates to a wind turbin...,EP1533520,028-088-548-890-500,APP,
9,000-374-150-323-590,A wind turbine (1) having a rotor (4) with at ...,Wind turbine with hydraulic blade pitch system,The present invention relates to a wind turbin...,EP2072815,030-002-027-444-216,APP,


In [None]:
df_temp_all = pd.DataFrame(columns=['lens_id', 'claim', 'title', 'description', 'citation_doc_number', 'citation_lens_id', 'citation_phase', 'citation_category'])
counter = 0  # Initialize a counter
for entry in data:
    #LENS ID
    lens_id = entry.get('lens_id') #THIS
    #DESC
    description_data = entry.get('description', {}).get('text') #THIS
    #CLAIMS
    claims_data = entry.get('claims')
    first_claim = claims_data[0]
    inner_claims = first_claim.get('claims', [])
    first_inner_claim = inner_claims[0]  # Access the first inner claim
    claim_text = first_inner_claim.get('claim_text', [])
    claim_data = claim_text[0] #THIS
    #TITLE
    invention_titles = entry['biblio']['invention_title']
    for title in invention_titles:
        if title['lang'] == 'en':  # Check if the language is English
            title_data = title['text'] #THIS
    #CITATIONS
    references_cited = entry['biblio']['references_cited']
    citations = references_cited['citations']

    for index, citation in enumerate(citations):
        # Extract the details from each citation
        sequence = citation.get('sequence', 'No sequence')
        patent_info = citation.get('patcit', {})
        document_id = patent_info.get('document_id', {})
        jurisdiction = document_id.get('jurisdiction', 'No jurisdiction')
        doc_number = document_id.get('doc_number', 'No doc number')
        kind = document_id.get('kind', 'No kind')
        date = document_id.get('date', 'No date')
        lens_id_citation = patent_info.get('lens_id', 'No lens ID')
        cited_phase = citation.get('cited_phase', 'No cited phase')
        category = citation.get('category', [])
        doc_number_jur = jurisdiction + doc_number

        # Check if category is empty or not and format accordingly
        category_display = ', '.join(category) if category else ''
        df_temp_citations = pd.DataFrame([{
            'lens_id': lens_id,
            'claim': claim_data,
            'title': title_data,
            'description': description_data,
            'citation_doc_number': doc_number_jur, 
            'citation_lens_id': lens_id_citation, 
            'citation_phase': cited_phase, 
            'citation_category':category_display
        }])

        df_temp_all = pd.concat([df_temp_all, df_temp_citations], ignore_index=True)
    counter += 1
    if counter % 20000 == 0:
        print(counter)

In [50]:
df_temp_all

Unnamed: 0,lens_id,claim,title,description,citation_doc_number,citation_lens_id,citation_phase,citation_category
0,000-041-469-766-603,CLAIMS:,BLADE MOUNTING,BLADE MOUNTING The present invention concerns ...,US2006228220,186-267-506-340-976,APP,
1,000-041-469-766-603,CLAIMS:,BLADE MOUNTING,BLADE MOUNTING The present invention concerns ...,US2004253109,025-783-263-687-432,APP,
2,000-316-779-119-014,1 . A speed reducer used in a yaw drive appara...,Speed reducer for use in yaw drive apparatus f...,TECHNICAL FIELD The present invention relates ...,US5651747,146-289-592-821-472,PRS,
3,000-316-779-119-014,1 . A speed reducer used in a yaw drive appara...,Speed reducer for use in yaw drive apparatus f...,TECHNICAL FIELD The present invention relates ...,US2001044356,173-836-091-610-908,PRS,
4,000-316-779-119-014,1 . A speed reducer used in a yaw drive appara...,Speed reducer for use in yaw drive apparatus f...,TECHNICAL FIELD The present invention relates ...,US5695425,179-721-438-743-514,PRS,
...,...,...,...,...,...,...,...,...
627582,165-006-124-054-117,1. A lightning conduction system for wind turb...,Lightning conduction system for wind turbine b...,OBJECT OF THE PATENT The object of the patent ...,US2006280613,046-374-562-151-050,SEA,
627583,165-006-124-054-117,1. A lightning conduction system for wind turb...,Lightning conduction system for wind turbine b...,OBJECT OF THE PATENT The object of the patent ...,US2007074892,046-347-221-138-612,SEA,
627584,165-006-124-054-117,1. A lightning conduction system for wind turb...,Lightning conduction system for wind turbine b...,OBJECT OF THE PATENT The object of the patent ...,US2008073098,173-474-667-313-450,SEA,
627585,175-918-535-824-280,Rotorblatt (110) für eine Windenergieanlage (1...,ROTOR BLADE FOR A WIND ENERGY SYSTEM AND ROTOR...,Die Erfindung betrifft ein Rotorblatt für eine...,No jurisdictionNo doc number,No lens ID,No cited phase,


In [52]:
df_temp_all.to_csv('data.csv.gz', index=False, compression='gzip')

### FIX DATA

#### CLAIMS

In [None]:
# Count rows containing 'claim' or 'claims'
count = df_temp_all['claim'].str.contains(r'\bclaim(s)?\b', case=False, na=False).sum()
print(count)

  count = df_temp_all['claim'].str.contains(r'\bclaim(s)?\b', case=False, na=False)


In [None]:
df_with_claims = df_temp_all[
    df_temp_all['claim'].str.contains(r'\bclaim(s)?\b', case=False, na=False) &  # contains "claim" or "claims"
    (df_temp_all['claim'].str.len() < 20)  # has less than 40 characters
].copy()

# Step 2: Remove these rows from the original DataFrame
df_temp_all = df_temp_all[~(
    df_temp_all['claim'].str.contains(r'\bclaim(s)?\b', case=False, na=False) & 
    (df_temp_all['claim'].str.len() < 20)
)].copy()

  df_temp_all['claim'].str.contains(r'\bclaim(s)?\b', case=False, na=False) &


In [87]:
# Example of the main DataFrame and source DataFrame
# df_temp_all is your main DataFrame
# df_source contains 'lens_id' and 'new_text' columns

# Iterate through unique lens_id values in `df_temp_all`
for lens_id_c in df_with_claims['lens_id'].unique():
    # Retrieve the new_text for the current lens_id from df_source
    #new_text_row = df_source.loc[df_source['lens_id'] == lens_id, 'new_text']
    matching_entry = next((entry for entry in data if entry.get('lens_id') == lens_id_c), None)
    claims_data = matching_entry.get('claims')
    first_claim = claims_data[0]
    inner_claims = first_claim.get('claims', [])
    first_inner_claim = inner_claims[0]  # Access the first inner claim
    claim_text = first_inner_claim.get('claim_text', [])
    new_text = claim_text[1]
    
    # # Check if we found a matching new_text in df_source
    # if not new_text_row.empty:
    #     new_text = new_text_row.iloc[0]  # Get the new_text (assuming one row per lens_id)
        
        # Replace rows in `claim` column with `new_text` where:
        # - lens_id matches
        # - `claim` column contains "claim" or "claims"
    df_with_claims.loc[
            (df_with_claims['lens_id'] == lens_id_c) & 
            (df_with_claims['claim'].str.contains(r'\bclaim(s)?\b', case=False, na=False)), 
            'claim'] = new_text


  (df_with_claims['claim'].str.contains(r'\bclaim(s)?\b', case=False, na=False)),


In [89]:
df_combined = pd.concat([df_temp_all, df_with_claims], ignore_index=True)

#### CITATIONS

In [92]:
# Count rows containing 'claim' or 'claims'
count2 = df_temp_all['citation_lens_id'].str.contains(r'\bno(s)?\b', case=False, na=False).sum()
print(count2)

  count2 = df_temp_all['citation_lens_id'].str.contains(r'\bno(s)?\b', case=False, na=False).sum()


80331


In [93]:
# Count rows containing 'claim' or 'claims'
count3 = df_temp_all['citation_doc_number'].str.contains(r'\bno(s)?\b', case=False, na=False).sum()
print(count3)

76277


  count3 = df_temp_all['citation_doc_number'].str.contains(r'\bno(s)?\b', case=False, na=False).sum()


In [96]:
# Step 1: Create a new DataFrame with the specified conditions
df_with_no = df_combined[
    df_combined['citation_lens_id'].str.contains(r'\bno(s)?\b', case=False, na=False) | 
    df_combined['citation_doc_number'].str.contains(r'\bno(s)?\b', case=False, na=False)
].copy()

# Step 2: Remove these rows from the original DataFrame
df_combined = df_combined[~(
    df_combined['citation_lens_id'].str.contains(r'\bno(s)?\b', case=False, na=False) | 
    df_combined['citation_doc_number'].str.contains(r'\bno(s)?\b', case=False, na=False)
)].copy()

  df_combined['citation_lens_id'].str.contains(r'\bno(s)?\b', case=False, na=False) |
  df_combined['citation_doc_number'].str.contains(r'\bno(s)?\b', case=False, na=False)
  df_combined['citation_lens_id'].str.contains(r'\bno(s)?\b', case=False, na=False) |
  df_combined['citation_doc_number'].str.contains(r'\bno(s)?\b', case=False, na=False)


In [97]:
df_with_no

Unnamed: 0,lens_id,claim,title,description,citation_doc_number,citation_lens_id,citation_phase,citation_category
18,000-664-023-248-319,System mit einem Rotorblatt (10) oder einem Ro...,Rotor blade or rotor blade segment for a wind ...,Die Erfindung betrifft ein Rotorblatt oder ein...,No jurisdictionNo doc number,No lens ID,No cited phase,
19,000-664-023-248-319,System mit einem Rotorblatt (10) oder einem Ro...,Rotor blade or rotor blade segment for a wind ...,Die Erfindung betrifft ein Rotorblatt oder ein...,No jurisdictionNo doc number,No lens ID,No cited phase,
20,000-664-023-248-319,System mit einem Rotorblatt (10) oder einem Ro...,Rotor blade or rotor blade segment for a wind ...,Die Erfindung betrifft ein Rotorblatt oder ein...,No jurisdictionNo doc number,No lens ID,No cited phase,
21,000-664-023-248-319,System mit einem Rotorblatt (10) oder einem Ro...,Rotor blade or rotor blade segment for a wind ...,Die Erfindung betrifft ein Rotorblatt oder ein...,No jurisdictionNo doc number,No lens ID,No cited phase,
37,000-729-551-090-695,1. A wind turbine generator comprising: a nace...,Wind turbine generator,TECHNICAL FIELD The present invention relates ...,No jurisdictionNo doc number,No lens ID,APP,
...,...,...,...,...,...,...,...,...
627371,026-322-188-359-539,1. Method for adjusting a pitch angle (a) of a...,METHOD FOR LOAD REDUCTION ON WIND TURBINE BLAD...,DESCRIPTION Method for load reduction on wind ...,No jurisdictionNo doc number,No lens ID,ISR,X
627383,138-944-199-149-833,1. A method of controlling a pitch of a blade ...,VELOCITY FEEDFOWARD CONTROL OF A HYDRAULIC PIT...,VELOCITY FEEDFOWARD CONTROL OF A HYDRAULIC PIT...,No jurisdictionNo doc number,No lens ID,ISR,X
627384,138-944-199-149-833,1. A method of controlling a pitch of a blade ...,VELOCITY FEEDFOWARD CONTROL OF A HYDRAULIC PIT...,VELOCITY FEEDFOWARD CONTROL OF A HYDRAULIC PIT...,No jurisdictionNo doc number,No lens ID,ISR,I
627478,044-845-029-604-993,1. A wind turbine blade apparatus comprising:,SCISSOR LIFT SYSTEM AND PLUG-IN MOBILITY MECHA...,SCISSOR LIFT SYSTEM AND PLUG-IN MOBILITY MECHA...,No jurisdictionNo doc number,No lens ID,APP,


In [98]:
df_combined 

Unnamed: 0,lens_id,claim,title,description,citation_doc_number,citation_lens_id,citation_phase,citation_category
0,000-316-779-119-014,1 . A speed reducer used in a yaw drive appara...,Speed reducer for use in yaw drive apparatus f...,TECHNICAL FIELD The present invention relates ...,US5651747,146-289-592-821-472,PRS,
1,000-316-779-119-014,1 . A speed reducer used in a yaw drive appara...,Speed reducer for use in yaw drive apparatus f...,TECHNICAL FIELD The present invention relates ...,US2001044356,173-836-091-610-908,PRS,
2,000-316-779-119-014,1 . A speed reducer used in a yaw drive appara...,Speed reducer for use in yaw drive apparatus f...,TECHNICAL FIELD The present invention relates ...,US5695425,179-721-438-743-514,PRS,
3,000-316-779-119-014,1 . A speed reducer used in a yaw drive appara...,Speed reducer for use in yaw drive apparatus f...,TECHNICAL FIELD The present invention relates ...,US2003054912,042-855-809-856-864,PRS,
4,000-374-150-323-590,A wind turbine (1) having a rotor (4) with at ...,Wind turbine with hydraulic blade pitch system,The present invention relates to a wind turbin...,DE19948997,113-739-326-508-211,SEA,I
...,...,...,...,...,...,...,...,...
627582,001-817-748-239-023,1. A pitch controlled wind turbine comprising ...,A PITCH CONTROLLED WIND TURBINE,A Pitch Controlled Wind Turbine FIELD OF THE I...,CN106089573,105-144-823-359-859,ISR,A
627583,075-658-706-946-183,1. An apparatus for pumping viscous fluids com...,DOUBLE PUMP,Double pump The present invention relates to a...,GB2013512,087-316-762-825-701,ISR,"X, I"
627584,075-658-706-946-183,1. An apparatus for pumping viscous fluids com...,DOUBLE PUMP,Double pump The present invention relates to a...,JP2014169672,196-573-180-877-148,ISR,"X, I"
627585,075-658-706-946-183,1. An apparatus for pumping viscous fluids com...,DOUBLE PUMP,Double pump The present invention relates to a...,JPH05231369,040-000-005-188-999,ISR,A


In [99]:
df_combined.to_csv('data_clean.csv.gz', index=False, compression='gzip')