In [1]:
import pandas as pd
import ast
import re
from tqdm import tqdm
tqdm.pandas()

# EPO

In [None]:
# Load Data
# df_yake_claims_ep = pd.read_json('/mnt/hdd01/PATSTAT Working Directory/PATSTAT/cleantech_epo_text_data_pivot_cleaned_yake_noun_chunks.json')
df_yake_claims_ep = pd.read_json('/mnt/hdd01/patentsview/Non Cleantech Patents - Classifier Set/df_epo_non_cleantech_text_data_pivot_claims_cleaned_yake_noun_chunks.json')

In [None]:
def parse_strings(s):
    # Check if the string starts with [' and ends with ']
    if s.startswith("['") and s.endswith("']"):
        # Use a regular expression to find all sequences of characters enclosed in single or double quotes
        return re.findall(r"['\"]([^'\"]*)['\"]", s)
    else:
        # Split the string by commas
        return s.split(', ')
    
# Apply parse_strings function to 'cpc_class_symbol' column
df_yake_claims_ep['cpc_class_symbol'] = df_yake_claims_ep['cpc_class_symbol'].progress_apply(parse_strings)

In [None]:
keywords_list_ep = []
yake_conf_score_list = []
publn_nr_list = []
# cpc_symbol_list = [] - Not considered for Non Cleantech Patents
# min_yake_conf = 0.05 - Currently not used

# Iterate over rows in dataframe
for index, row in tqdm(df_yake_claims_ep.iterrows()):
    # Check if 'keywords_yake_claims' column is not a list
    if not isinstance(row['keywords_yake_claim_noun_chunk'], list):
        continue
    # Check if 'keywords_yake_claims' column is an empty list or contains only empty lists
    if not any(keyword for keyword in row['keywords_yake_claim_noun_chunk']):
        continue
    # Iterate over keywords in 'keywords_yake_claims' column and append to keywords_list_ep, consider only top 10 keywords
    else:
        for keyword in row['keywords_yake_claim_noun_chunk'][:10]:
            # if keyword[1] <= min_yake_conf:
            keywords_list_ep.append(keyword[0].lower())
            yake_conf_score_list.append(keyword[1])
            publn_nr_list.append(row['publn_nr'])
            # cpc_symbol_list.append(row['cpc_class_symbol'])

# Create new dataframe
df_keywords_list_ep = pd.DataFrame({
    'keyword_yake': keywords_list_ep,
    'yake_conf_score': yake_conf_score_list,
    'publn_nr': publn_nr_list,
    # 'cpc_class_symbol': cpc_symbol_list,
    'abs_frequency': 1
})

In [None]:
# Filter out non-alphanumeric keywords
df_keywords_list_ep = df_keywords_list_ep[
    df_keywords_list_ep['keyword_yake'].progress_apply(lambda x: all(word.isalnum() for word in x.split()))
]

# Filter out all keywords shorter than 3 characters
df_keywords_list_ep = df_keywords_list_ep[
    df_keywords_list_ep['keyword_yake'].progress_apply(lambda x: len(x) > 2)
]

# Define a function to check if a string is an abbreviation
def is_abbreviation(keyword):
    # Regular expression to identify abbreviations (typically all uppercase and periods)
    # and check for all-uppercase abbreviations with 3 or fewer characters
    pattern = re.compile(r'\b(?:[A-Z]{1,}\.){2,}\b|\b[A-Z]{1,3}\b')
    return pattern.match(keyword) is not None

# Apply the function to filter out abbreviations
df_keywords_list_ep = df_keywords_list_ep[
    df_keywords_list_ep['keyword_yake'].progress_apply(lambda x: not is_abbreviation(x))
]

In [None]:
# Lemmatize keywords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_keywords(keyword):
    return ' '.join([lemmatizer.lemmatize(word) for word in keyword.split()])

df_keywords_list_ep['keyword_yake_lemma'] = df_keywords_list_ep['keyword_yake'].progress_apply(lemmatize_keywords)

In [None]:
import nltk

stopwords = set(nltk.corpus.stopwords.words('english'))

# Function to remove keywords that are only one stopword or start/end with a stopword
def remove_stopwords(keyword):
    words = keyword.split()
    
    # If the keyword is a single stopword, remove it
    if len(words) == 1 and words[0] in stopwords:
        return ''
    
    # If the keyword starts or ends with a stopword, remove line
    if words[0] in stopwords:
        return ''
    if words and words[-1] in stopwords:
        return ''
    
    return ' '.join(words)

# Apply the function to remove stopwords
df_keywords_list_ep['keyword_yake_lemma'] = df_keywords_list_ep['keyword_yake_lemma'].progress_apply(remove_stopwords)

# Remove empty keywords
df_keywords_list_ep = df_keywords_list_ep[
    df_keywords_list_ep['keyword_yake_lemma'].progress_apply(lambda x: len(x) > 0)
]

In [None]:
# Aggregate df_keywords_list_ep by 'keyword'
df_keywords_list_ep_agg = df_keywords_list_ep.groupby(['keyword_yake_lemma']).agg({
    'yake_conf_score': 'mean',
    'publn_nr': list,
    # 'cpc_class_symbol': list,
    'abs_frequency': 'count'
}).reset_index()

# Flatten nested lists in 'cpc_class_symbol' column
# df_keywords_list_ep_agg['cpc_class_symbol'] = df_keywords_list_ep_agg['cpc_class_symbol'].progress_apply(lambda x: [item for sublist in x for item in sublist])

In [None]:
# df_keywords_list_ep_agg.to_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/epo_yake_keywords_list_noun_chunks.json', orient='records')
df_keywords_list_ep_agg.to_json('/mnt/hdd01/patentsview/Non Cleantech Patents - Classifier Set/epo_yake_keywords_list_non_cleantech_noun_chunks.json', orient='records')

# USPTO

In [None]:
# Load Data
# df_yake_claims_uspto = pd.read_json('/mnt/hdd01/patentsview/Patentsview - Cleantech Patents/g_patent_claims_cleantech_yake_noun_chunks.json')
df_yake_claims_uspto = pd.read_json('/mnt/hdd01/patentsview/Non Cleantech Patents - Classifier Set/g_uspto_non_cleantech_claims_fulltext_yake_noun_chunks.json')

In [None]:
keywords_list_uspto = []
yake_conf_score_list = []
patent_id_list = []
# min_yake_conf = 0.05 - Currently not used

# Iterate over rows in dataframe
for index, row in tqdm(df_yake_claims_uspto.iterrows()):
    # Check if 'keywords_yake' column is not a list
    if not isinstance(row['keywords_yake_claim_noun_chunk'], list):
        continue
    # Check if 'keywords_yake' column is an empty list or contains only empty lists
    if not any(keyword for keyword in row['keywords_yake_claim_noun_chunk']):
        continue
    # Iterate over keywords in 'keywords_yake' column and append to keywords_list_uspto
    else:
        for keyword in row['keywords_yake_claim_noun_chunk'][:10]:
            # if keyword[1] <= min_yake_conf:
            keywords_list_uspto.append(keyword[0].lower())
            yake_conf_score_list.append(keyword[1])
            patent_id_list.append(row['patent_id'])

# Create new dataframe
df_keywords_list_uspto = pd.DataFrame({
    'keyword_yake': keywords_list_uspto,
    'yake_conf_score': yake_conf_score_list,
    'patent_id': patent_id_list,
    'abs_frequency': 1
})

In [None]:
# Filter out non-alphanumeric keywords
df_keywords_list_uspto = df_keywords_list_uspto[
    df_keywords_list_uspto['keyword_yake'].progress_apply(lambda x: all(word.isalnum() for word in x.split()))
]

# Filter out all keywords shorter than 3 characters
df_keywords_list_uspto = df_keywords_list_uspto[
    df_keywords_list_uspto['keyword_yake'].progress_apply(lambda x: len(x) > 2)
]

# Define a function to check if a string is an abbreviation
def is_abbreviation(keyword):
    # Regular expression to identify abbreviations (typically all uppercase and periods)
    # and check for all-uppercase abbreviations with 3 or fewer characters
    pattern = re.compile(r'\b(?:[A-Z]{1,}\.){2,}\b|\b[A-Z]{1,3}\b')
    return pattern.match(keyword) is not None

# Apply the function to filter out abbreviations
df_keywords_list_uspto = df_keywords_list_uspto[
    df_keywords_list_uspto['keyword_yake'].progress_apply(lambda x: not is_abbreviation(x))
]

## Match Patents to CPC Classification

In [None]:
# PatentsView - Merge with CPC Classification
df_cpc_uspto = pd.read_json('/mnt/hdd01/patentsview/Patentsview - Cleantech Patents/df_patentsview_patent_cpc_grouped_cleantech.json')
# Extract 'cpc_group' into a new column
df_cpc_uspto['cpc_group'] = df_cpc_uspto['cpc'].progress_apply(
    lambda x: [entry['cpc_group'] for entry in x.values() if 'cpc_group' in entry]
)

# Remove duplicates from 'cpc_group_list'
df_cpc_uspto['cpc_group'] = df_cpc_uspto['cpc_group'].progress_apply(lambda x: list(set(x)))

In [None]:
# Merge df_keywords_list_uspto with df_cpc_uspto
df_keywords_list_uspto = pd.merge(
    df_keywords_list_uspto,
    df_cpc_uspto[['patent_id', 'cpc_group']],
    how='left',
    on='patent_id'
)

## Continue Postprocessing

In [None]:
# Lemmatize keywords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_keywords(keyword):
    return ' '.join([lemmatizer.lemmatize(word) for word in keyword.split()])

df_keywords_list_uspto['keyword_yake_lemma'] = df_keywords_list_uspto['keyword_yake'].progress_apply(lemmatize_keywords)

In [None]:
import nltk

stopwords = set(nltk.corpus.stopwords.words('english'))

# Function to remove keywords that are only one stopword or start/end with a stopword
def remove_stopwords(keyword):
    words = keyword.split()
    
    # If the keyword is a single stopword, remove it
    if len(words) == 1 and words[0] in stopwords:
        return ''
    
    # If the keyword starts or ends with a stopword, remove line
    if words[0] in stopwords:
        return ''
    if words and words[-1] in stopwords:
        return ''
    
    return ' '.join(words)

# Apply the function to remove stopwords
df_keywords_list_uspto['keyword_yake_lemma'] = df_keywords_list_uspto['keyword_yake_lemma'].progress_apply(remove_stopwords)

# Remove empty keywords
df_keywords_list_uspto = df_keywords_list_uspto[
    df_keywords_list_uspto['keyword_yake_lemma'].progress_apply(lambda x: len(x) > 0)
]

In [None]:
# Aggregate df_keywords_list_ep by 'keyword'
df_keywords_list_uspto_agg = df_keywords_list_uspto.groupby(['keyword_yake_lemma']).agg({
    'yake_conf_score': 'mean',
    'patent_id': list,
    # 'cpc_group': list,
    'abs_frequency': 'count'
}).reset_index()

# Flatten nested lists in 'cpc_group' column
# df_keywords_list_uspto_agg['cpc_group'] = df_keywords_list_uspto_agg['cpc_group'].progress_apply(lambda x: [item for sublist in x for item in sublist])

In [None]:
# df_keywords_list_uspto_agg.to_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/uspto_yake_keywords_list_noun_chunks.json', orient='records')
df_keywords_list_uspto_agg.to_json('/mnt/hdd01/patentsview/Non Cleantech Patents - Classifier Set/uspto_yake_keywords_list_non_cleantech_noun_chunks.json', orient='records')

# Reliance on Science - USPTO and EPO

In [None]:
# Load Data
df_rel_on_science_uspto = pd.read_json('/mnt/hdd01/patentsview/Reliance on Science - Cleantech Patents/df_oaid_Cleantech_y02_individual_works_yake_noun_chunks.json', dtype={'patent_id': str, 'oaid': str})
df_rel_on_science_ep = pd.read_json('/mnt/hdd01/PATSTAT Working Directory/Reliance on Science/cleantech_epo_rel_on_science_abstract_yake_noun_chunks.json', dtype={'publn_nr': str, 'oaid': str})

In [3]:
df_rel_on_science = pd.read_json('/mnt/hdd01/patentsview/Non Cleantech Patents - Classifier Set/df_oaids_non_cleantech_yake_noun_chunks.json', dtype={'oaid': str})

In [None]:
# Concatenate dataframes, reset index and drop duplicates
df_rel_on_science = pd.concat([df_rel_on_science_uspto, df_rel_on_science_ep], ignore_index=True)
df_rel_on_science = df_rel_on_science.drop_duplicates(subset=['oaid'], keep='first').reset_index(drop=True)

In [4]:
keywords_list_rel = []
yake_conf_score_list = []
oaid_list = []
# publn_nr_list = []
# patent_id_list = []
patent_list = []
# min_yake_conf = 0.05 - Currently not used

# Iterate over rows in dataframe
for index, row in tqdm(df_rel_on_science.iterrows()):
    # Check if 'keywords_yake' column is not a list
    if not isinstance(row['keywords_yake_abstract_noun_chunk'], list):
        continue
    # Check if 'keywords_yake' column is an empty list or contains only empty lists
    if not any(keyword for keyword in row['keywords_yake_abstract_noun_chunk']):
        continue
    # Iterate over keywords in 'keywords_yake' column and append to keywords_list_rel
    else:
        for keyword in row['keywords_yake_abstract_noun_chunk'][:10]:
            # if keyword[1] <= min_yake_conf:
            keywords_list_rel.append(keyword[0].lower())
            yake_conf_score_list.append(keyword[1])
            oaid_list.append(row['oaid'])
            patent_list.append(row['patent'])
            # publn_nr_list.append(row['publn_nr'])
            # patent_id_list.append(row['patent_id'])

# Create new dataframe
df_keywords_list_rel = pd.DataFrame({
    'keyword_yake': keywords_list_rel,
    'yake_conf_score': yake_conf_score_list,
    'oaid': oaid_list,
    'abs_frequency': 1,
    # 'publn_nr': publn_nr_list,
    # 'patent_id': patent_id_list
    'patent': patent_list
})

611441it [00:34, 17719.97it/s]


In [5]:
# Filter out non-alphanumeric keywords
df_keywords_list_rel = df_keywords_list_rel[
    df_keywords_list_rel['keyword_yake'].progress_apply(lambda x: all(word.isalnum() for word in x.split()))
]

# Filter out all keywords shorter than 3 characters
df_keywords_list_rel = df_keywords_list_rel[
    df_keywords_list_rel['keyword_yake'].progress_apply(lambda x: len(x) > 2)
]

# Define a function to check if a string is an abbreviation
def is_abbreviation(keyword):
    # Regular expression to identify abbreviations (typically all uppercase and periods)
    # and check for all-uppercase abbreviations with 3 or fewer characters
    pattern = re.compile(r'\b(?:[A-Z]{1,}\.){2,}\b|\b[A-Z]{1,3}\b')
    return pattern.match(keyword) is not None

# Apply the function to filter out abbreviations
df_keywords_list_rel = df_keywords_list_rel[
    df_keywords_list_rel['keyword_yake'].progress_apply(lambda x: not is_abbreviation(x))
]

100%|██████████| 5870433/5870433 [00:04<00:00, 1233843.85it/s]
100%|██████████| 5430269/5430269 [00:02<00:00, 2302048.08it/s]
100%|██████████| 5430269/5430269 [00:04<00:00, 1109039.61it/s]


In [6]:
# Lemmatize keywords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_keywords(keyword):
    return ' '.join([lemmatizer.lemmatize(word) for word in keyword.split()])

df_keywords_list_rel['keyword_yake_lemma'] = df_keywords_list_rel['keyword_yake'].progress_apply(lemmatize_keywords)

100%|██████████| 5430269/5430269 [00:22<00:00, 243118.62it/s]


In [7]:
import nltk

stopwords = set(nltk.corpus.stopwords.words('english'))

# Function to remove keywords that are only one stopword or start/end with a stopword
def remove_stopwords(keyword):
    words = keyword.split()
    
    # If the keyword is a single stopword, remove it
    if len(words) == 1 and words[0] in stopwords:
        return ''
    
    # If the keyword starts or ends with a stopword, remove line
    if words[0] in stopwords:
        return ''
    if words and words[-1] in stopwords:
        return ''
    
    return ' '.join(words)

# Apply the function to remove stopwords
df_keywords_list_rel['keyword_yake_lemma'] = df_keywords_list_rel['keyword_yake_lemma'].progress_apply(remove_stopwords)

# Remove empty keywords
df_keywords_list_rel = df_keywords_list_rel[
    df_keywords_list_rel['keyword_yake_lemma'].progress_apply(lambda x: len(x) > 0)
]

100%|██████████| 5430269/5430269 [00:03<00:00, 1542266.16it/s]
100%|██████████| 5430269/5430269 [00:02<00:00, 2375649.16it/s]


In [None]:
# Cast 'publn_nr' column to string
df_keywords_list_rel['publn_nr'] = df_keywords_list_rel['publn_nr'].progress_apply(str)

In [8]:
# Aggregate df_keywords_list_rel by 'keyword_yake_lemma'
df_keywords_list_rel_agg = df_keywords_list_rel.groupby(['keyword_yake_lemma']).agg({
    'yake_conf_score': 'mean',
    'oaid': list,
    # 'publn_nr': list,
    'patent': list,
    # 'patent_id': list,
    'abs_frequency': 'count'
}).reset_index()

# Delete all nan list entries in columns 'publn_nr' and 'patent_id'
# df_keywords_list_rel_agg['publn_nr'] = df_keywords_list_rel_agg['publn_nr'].progress_apply(lambda x: [item for item in x if str(item) != 'nan'])
# df_keywords_list_rel_agg['patent_id'] = df_keywords_list_rel_agg['patent_id'].progress_apply(lambda x: [item for item in x if str(item) != 'nan'])
df_keywords_list_rel_agg['patent'] = df_keywords_list_rel_agg['patent'].progress_apply(lambda x: [item for item in x if str(item) != 'nan'])

100%|██████████| 1272664/1272664 [00:01<00:00, 703329.79it/s]


In [9]:
# df_keywords_list_rel_agg.to_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/rel_on_science_yake_keywords_list_noun_chunks.json', orient='records')
df_keywords_list_rel_agg.to_json('/mnt/hdd01/patentsview/Non Cleantech Patents - Classifier Set/rel_on_science_yake_keywords_list_non_cleantech_noun_chunks.json', orient='records')

# CPC Classification

In [None]:
df_cpc_classification = pd.read_json('/mnt/hdd01/patentsview/CPC Classification/df_keyword_y02_classification_noun_chunking.json')

In [None]:
keywords_list_cpc = []
yake_conf_score_list = []
cpc_symbol_list = []
# min_yake_conf = 0.05 - Currently not used

# Iterate over rows in dataframe
for index, row in tqdm(df_cpc_classification.iterrows()):
    # Check if 'keywords_yake_claims' column is not a list
    if not isinstance(row['keywords_yake_title_lower_noun_chunk'], list):
        continue
    # Check if 'keywords_yake_claims' column is an empty list or contains only empty lists
    if not any(keyword for keyword in row['keywords_yake_title_lower_noun_chunk']):
        continue
    # Iterate over keywords in 'keywords_yake_claims' column and append to keywords_list_ep, consider only top 10 keywords
    else:
        for keyword in row['keywords_yake_title_lower_noun_chunk'][:10]:
            # if keyword[1] <= min_yake_conf:
            keywords_list_cpc.append(keyword[0].lower())
            yake_conf_score_list.append(keyword[1])
            cpc_symbol_list.append(row['cpc_classification'])

# Create new dataframe
df_keywords_list_cpc = pd.DataFrame({
    'keyword_yake': keywords_list_cpc,
    'yake_conf_score': yake_conf_score_list,
    'cpc_class_symbol': cpc_symbol_list
})

In [None]:
# Filter out non-alphanumeric keywords
df_keywords_list_cpc = df_keywords_list_cpc[
    df_keywords_list_cpc['keyword_yake'].progress_apply(lambda x: all(word.isalnum() for word in x.split()))
]

# Filter out all keywords shorter than 3 characters
df_keywords_list_cpc = df_keywords_list_cpc[
    df_keywords_list_cpc['keyword_yake'].progress_apply(lambda x: len(x) > 2)
]

# Define a function to check if a string is an abbreviation
def is_abbreviation(keyword):
    # Regular expression to identify abbreviations (typically all uppercase and periods)
    # and check for all-uppercase abbreviations with 3 or fewer characters
    pattern = re.compile(r'\b(?:[A-Z]{1,}\.){2,}\b|\b[A-Z]{1,3}\b')
    return pattern.match(keyword) is not None

# Apply the function to filter out abbreviations
df_keywords_list_cpc = df_keywords_list_cpc[
    df_keywords_list_cpc['keyword_yake'].progress_apply(lambda x: not is_abbreviation(x))
]

In [None]:
# Lemmatize keywords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_keywords(keyword):
    return ' '.join([lemmatizer.lemmatize(word) for word in keyword.split()])

df_keywords_list_cpc['keyword_yake_lemma'] = df_keywords_list_cpc['keyword_yake'].progress_apply(lemmatize_keywords)

In [None]:
import nltk

stopwords = set(nltk.corpus.stopwords.words('english'))

# Function to remove keywords that are only one stopword or start/end with a stopword
def remove_stopwords(keyword):
    words = keyword.split()
    
    # If the keyword is a single stopword, remove it
    if len(words) == 1 and words[0] in stopwords:
        return ''
    
    # If the keyword starts or ends with a stopword, remove line
    if words[0] in stopwords:
        return ''
    if words and words[-1] in stopwords:
        return ''
    
    return ' '.join(words)

# Apply the function to remove stopwords
df_keywords_list_cpc['keyword_yake_lemma'] = df_keywords_list_cpc['keyword_yake_lemma'].progress_apply(remove_stopwords)

# Remove empty keywords
df_keywords_list_cpc = df_keywords_list_cpc[
    df_keywords_list_cpc['keyword_yake_lemma'].progress_apply(lambda x: len(x) > 0)
]

In [None]:
# Aggregate df_keywords_list_cpc by 'keyword'
df_keywords_list_cpc_agg = df_keywords_list_cpc.groupby(['keyword_yake_lemma']).agg({
    'yake_conf_score': 'mean',
    'cpc_class_symbol': list,
}).reset_index()

In [None]:
df_keywords_list_cpc_agg.to_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/cpc_yake_keywords_list_noun_chunks.json', orient='records')

In [None]:
from sentence_transformers import SentenceTransformer
import torch

In [None]:
model_climatebert = SentenceTransformer('climatebert/distilroberta-base-climate-f')
model_bertforpatents = SentenceTransformer('anferico/bert-for-patents')
model_patentsberta = SentenceTransformer('AI-Growth-Lab/PatentSBERTa')

In [None]:
# Check if GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU available: {}".format(torch.cuda.get_device_name(0)))
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

In [None]:
# Generate copy of df_claims_keywords_list
df_keywords_list_cpc_embeddings = df_keywords_list_cpc_agg.copy()

# Perform sentence embedding on the 'keyword_yake' (PatentsView) or 'keywords_yake_claims' (EPO) column
df_keywords_list_cpc_embeddings['keyword_yake_patentsberta_embedding'] = df_keywords_list_cpc_embeddings['keyword_yake_lemma'].progress_apply(
    lambda x: model_patentsberta.encode(x)
)

df_keywords_list_cpc_embeddings['keyword_yake_climatebert_embedding'] = df_keywords_list_cpc_embeddings['keyword_yake_lemma'].progress_apply(
    lambda x: model_climatebert.encode(x)
)

df_keywords_list_cpc_embeddings['keyword_yake_bertforpatents_embedding'] = df_keywords_list_cpc_embeddings['keyword_yake_lemma'].progress_apply(
    lambda x: model_bertforpatents.encode(x)
)

In [None]:
df_keywords_list_cpc_embeddings.to_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/cpc_yake_keywords_list_noun_chunks_embeddings.json', orient='records')

# Merge EP, USPTO and Reliance on Science

In [None]:
# Load Data
df_keywords_list_uspto_agg = pd.read_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/uspto_yake_keywords_list_noun_chunks.json')
df_keywords_list_ep_agg = pd.read_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/epo_yake_keywords_list_noun_chunks.json')
df_keywords_list_rel_agg = pd.read_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/rel_on_science_yake_keywords_list_noun_chunks.json')

In [5]:
df_keywords_list_uspto_agg = pd.read_json('/mnt/hdd01/patentsview/Non Cleantech Patents - Classifier Set/uspto_yake_keywords_list_non_cleantech_noun_chunks.json')
df_keywords_list_ep_agg = pd.read_json('/mnt/hdd01/patentsview/Non Cleantech Patents - Classifier Set/epo_yake_keywords_list_non_cleantech_noun_chunks.json')
df_keywords_list_rel_agg = pd.read_json('/mnt/hdd01/patentsview/Non Cleantech Patents - Classifier Set/rel_on_science_yake_keywords_list_non_cleantech_noun_chunks.json')

In [8]:
# Cast publn_nr and patent_id to list of strings
df_keywords_list_ep_agg['publn_nr'] = df_keywords_list_ep_agg['publn_nr'].progress_apply(lambda x: [str(item) for item in x])
df_keywords_list_uspto_agg['patent_id'] = df_keywords_list_uspto_agg['patent_id'].progress_apply(lambda x: [str(item) for item in x])

100%|██████████| 374142/374142 [00:00<00:00, 421216.51it/s]
100%|██████████| 1110103/1110103 [00:07<00:00, 140352.53it/s]


In [9]:
frames = [df_keywords_list_uspto_agg, df_keywords_list_ep_agg, df_keywords_list_rel_agg]
df_keywords_list = pd.concat(frames)
df_keywords_list.reset_index(drop=True, inplace=True)

In [10]:
df_keywords_list.sample(5)

Unnamed: 0,keyword_yake_lemma,yake_conf_score,patent_id,abs_frequency,publn_nr,oaid,patent
959498,substantially rectangular trench,0.087966,[6255190],1,,,
1430965,storing process,0.008902,,1,[2463637],,
1622658,bulk scattering perfect,0.245504,,1,,[2170805020],[{ep-2719328-a1}]
2596601,sufficiently long amount,0.345991,,1,,[2017368948],[{us-9576343-b2}]
1637559,casing packer,0.049772,,1,,[2022625807],"[{us-10016810-b2,us-10030474-b2,us-10053957-b2..."


In [12]:
def concat_lists(series):
    combined_list = []
    for item in series:
        if isinstance(item, list):
            combined_list.extend(item)
    return combined_list

# Group by 'keyword_yake_lemma' and aggregate
df_keywords_list_agg = df_keywords_list.groupby('keyword_yake_lemma').agg({
    'yake_conf_score': 'mean',        # Mean of yake_conf_score
    'abs_frequency': 'sum',           # Sum of abs_frequency
    'patent_id': concat_lists,        # Concatenate lists in patent_id
    'publn_nr': concat_lists,         # Concatenate lists in publn_nr
    'oaid': concat_lists,             # Concatenate lists in oaid
    # 'cpc_group': concat_lists,        # Concatenate lists in cpc_group
    # 'cpc_class_symbol': concat_lists  # Concatenate lists in cpc_class_symbol
}).reset_index()

In [13]:
import ast

def flatten_and_convert(entry):
    # If the entry is NaN (float type in pandas), return an empty list
    if isinstance(entry, float):
        return []

    # Initialize an empty list to store the flattened results
    flattened_list = []

    # Check if the entry is a string and convert it to a list if it represents a list
    if isinstance(entry, str) and entry.startswith("[") and entry.endswith("]"):
        try:
            entry = ast.literal_eval(entry)
        except ValueError:
            # If conversion fails, return an empty list
            return []

    # If the entry is a list, process its items
    if isinstance(entry, list):
        for item in entry:
            # If the item is a string representation of a list, convert it
            if isinstance(item, str) and item.startswith("[") and item.endswith("]"):
                try:
                    item = ast.literal_eval(item)
                except ValueError:
                    continue  # Skip items that can't be converted

            # If the item is a list, extend the flattened list with its elements
            if isinstance(item, list):
                flattened_list.extend(item)
            else:
                # For single string items, append them directly
                flattened_list.append(item)

    return flattened_list

# Apply the function to the 'publn_nr' column
df_keywords_list_agg['publn_nr'] = df_keywords_list_agg['publn_nr'].progress_apply(flatten_and_convert)


100%|██████████| 2473267/2473267 [00:06<00:00, 363519.69it/s] 


In [14]:
def remove_duplicates(lst):
    # Convert list to set to remove duplicates, then back to list
    return list(set(lst))

# Apply the function to each relevant column
df_keywords_list_agg['patent_id'] = df_keywords_list_agg['patent_id'].progress_apply(remove_duplicates)
df_keywords_list_agg['publn_nr'] = df_keywords_list_agg['publn_nr'].progress_apply(remove_duplicates)
df_keywords_list_agg['oaid'] = df_keywords_list_agg['oaid'].progress_apply(remove_duplicates)

100%|██████████| 2473267/2473267 [00:01<00:00, 1377948.02it/s]
100%|██████████| 2473267/2473267 [00:07<00:00, 311523.73it/s] 
100%|██████████| 2473267/2473267 [00:08<00:00, 288483.12it/s] 


In [15]:
def remove_duplicates_from_lists(df, columns):
    for col in columns:
        df[col] = df[col].progress_apply(lambda x: list(set(x)))
    return df

# Apply this function to the 'oaid', 'patent_id', and 'publn_nr' columns
# df_keywords_list_agg = remove_duplicates_from_lists(df_keywords_list_agg, ['oaid', 'patent_id', 'publn_nr', 'cpc_group', 'cpc_class_symbol'])
df_keywords_list_agg = remove_duplicates_from_lists(df_keywords_list_agg, ['oaid', 'patent_id', 'publn_nr'])

100%|██████████| 2473267/2473267 [00:02<00:00, 1153239.23it/s]
100%|██████████| 2473267/2473267 [00:07<00:00, 349830.57it/s] 
100%|██████████| 2473267/2473267 [00:01<00:00, 1549745.26it/s]


## Postprocessing

In [16]:
# Prune dataframe by document frequency and absolute frequency
min_abs_frequency = 5
max_abs_frequency = 1000
# max_doc_frequency = 0.3

df_keywords_list_agg_pruned = df_keywords_list_agg[(df_keywords_list_agg['abs_frequency'] >= min_abs_frequency) & (df_keywords_list_agg['abs_frequency'] <= max_abs_frequency)]

# Reset index
df_keywords_list_agg_pruned.reset_index(drop=True, inplace=True)

# Keyword Embedding

In [21]:
from sentence_transformers import SentenceTransformer
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [22]:
model_climatebert = SentenceTransformer('climatebert/distilroberta-base-climate-f')
model_bertforpatents = SentenceTransformer('anferico/bert-for-patents')
model_patentsberta = SentenceTransformer('AI-Growth-Lab/PatentSBERTa')

No sentence-transformers model found with name /home/thiesen/.cache/torch/sentence_transformers/climatebert_distilroberta-base-climate-f. Creating a new one with MEAN pooling.
Some weights of RobertaModel were not initialized from the model checkpoint at /home/thiesen/.cache/torch/sentence_transformers/climatebert_distilroberta-base-climate-f and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
No sentence-transformers model found with name /home/thiesen/.cache/torch/sentence_transformers/anferico_bert-for-patents. Creating a new one with MEAN pooling.


In [23]:
# Check if GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU available: {}".format(torch.cuda.get_device_name(0)))
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU available: NVIDIA RTX A4500


In [24]:
# Generate copy of df_claims_keywords_list
df_keywords_list_agg_embeddings = df_keywords_list_agg_pruned.copy()

# Perform sentence embedding on the 'keyword_yake' (PatentsView) or 'keywords_yake_claims' (EPO) column
df_keywords_list_agg_embeddings['keyword_yake_patentsberta_embedding'] = df_keywords_list_agg_embeddings['keyword_yake_lemma'].progress_apply(
    lambda x: model_patentsberta.encode(x)
)

df_keywords_list_agg_embeddings['keyword_yake_climatebert_embedding'] = df_keywords_list_agg_embeddings['keyword_yake_lemma'].progress_apply(
    lambda x: model_climatebert.encode(x)
)

df_keywords_list_agg_embeddings['keyword_yake_bertforpatents_embedding'] = df_keywords_list_agg_embeddings['keyword_yake_lemma'].progress_apply(
    lambda x: model_bertforpatents.encode(x)
)

 76%|███████▌  | 118627/155607 [13:38<04:15, 144.96it/s]


KeyboardInterrupt: 

In [None]:
# Save dataframe to json
df_keywords_list_agg_embeddings.to_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/df_keywords_list_agg_uspto_epo_rel_embeddings_noun_chunks.json', orient='records')