In [1]:
import pandas as pd
import ast
import re
from tqdm import tqdm
tqdm.pandas()

# EPO

In [2]:
# Load Data
df_yake_claims_ep = pd.read_json('/mnt/hdd01/PATSTAT Working Directory/PATSTAT/cleantech_epo_text_data_pivot_cleaned_yake_noun_chunks.json')

In [3]:
def parse_strings(s):
    # Check if the string starts with [' and ends with ']
    if s.startswith("['") and s.endswith("']"):
        # Use a regular expression to find all sequences of characters enclosed in single or double quotes
        return re.findall(r"['\"]([^'\"]*)['\"]", s)
    else:
        # Split the string by commas
        return s.split(', ')
    
# Apply parse_strings function to 'cpc_class_symbol' column
df_yake_claims_ep['cpc_class_symbol'] = df_yake_claims_ep['cpc_class_symbol'].progress_apply(parse_strings)

100%|██████████| 182369/182369 [00:00<00:00, 607237.06it/s]


In [5]:
keywords_list_ep = []
yake_conf_score_list = []
publn_nr_list = []
cpc_symbol_list = []
# min_yake_conf = 0.05 - Currently not used

# Iterate over rows in dataframe
for index, row in tqdm(df_yake_claims_ep.iterrows()):
    # Check if 'keywords_yake_claims' column is not a list
    if not isinstance(row['keywords_yake_claim_noun_chunk'], list):
        continue
    # Check if 'keywords_yake_claims' column is an empty list or contains only empty lists
    if not any(keyword for keyword in row['keywords_yake_claim_noun_chunk']):
        continue
    # Iterate over keywords in 'keywords_yake_claims' column and append to keywords_list_ep, consider only top 10 keywords
    else:
        for keyword in row['keywords_yake_claim_noun_chunk'][:10]:
            # if keyword[1] <= min_yake_conf:
            keywords_list_ep.append(keyword[0].lower())
            yake_conf_score_list.append(keyword[1])
            publn_nr_list.append(row['publn_nr'])
            cpc_symbol_list.append(row['cpc_class_symbol'])

# Create new dataframe
df_keywords_list_ep = pd.DataFrame({
    'keyword_yake': keywords_list_ep,
    'yake_conf_score': yake_conf_score_list,
    'publn_nr': publn_nr_list,
    'cpc_class_symbol': cpc_symbol_list,
    'abs_frequency': 1
})

182369it [00:10, 17678.73it/s]


In [6]:
# Filter out non-alphanumeric keywords
df_keywords_list_ep = df_keywords_list_ep[
    df_keywords_list_ep['keyword_yake'].progress_apply(lambda x: all(word.isalnum() for word in x.split()))
]

# Filter out all keywords shorter than 3 characters
df_keywords_list_ep = df_keywords_list_ep[
    df_keywords_list_ep['keyword_yake'].progress_apply(lambda x: len(x) > 2)
]

# Define a function to check if a string is an abbreviation
def is_abbreviation(keyword):
    # Regular expression to identify abbreviations (typically all uppercase and periods)
    # and check for all-uppercase abbreviations with 3 or fewer characters
    pattern = re.compile(r'\b(?:[A-Z]{1,}\.){2,}\b|\b[A-Z]{1,3}\b')
    return pattern.match(keyword) is not None

# Apply the function to filter out abbreviations
df_keywords_list_ep = df_keywords_list_ep[
    df_keywords_list_ep['keyword_yake'].progress_apply(lambda x: not is_abbreviation(x))
]

100%|██████████| 1724465/1724465 [00:01<00:00, 1227954.03it/s]
100%|██████████| 1656469/1656469 [00:00<00:00, 2168903.25it/s]
100%|██████████| 1656469/1656469 [00:01<00:00, 1066813.77it/s]


In [7]:
# Lemmatize keywords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_keywords(keyword):
    return ' '.join([lemmatizer.lemmatize(word) for word in keyword.split()])

df_keywords_list_ep['keyword_yake_lemma'] = df_keywords_list_ep['keyword_yake'].progress_apply(lemmatize_keywords)

100%|██████████| 1656469/1656469 [00:07<00:00, 234884.58it/s]


In [8]:
import nltk

stopwords = set(nltk.corpus.stopwords.words('english'))

# Function to remove keywords that are only one stopword or start/end with a stopword
def remove_stopwords(keyword):
    words = keyword.split()
    
    # If the keyword is a single stopword, remove it
    if len(words) == 1 and words[0] in stopwords:
        return ''
    
    # If the keyword starts or ends with a stopword, remove line
    if words[0] in stopwords:
        return ''
    if words and words[-1] in stopwords:
        return ''
    
    return ' '.join(words)

# Apply the function to remove stopwords
df_keywords_list_ep['keyword_yake_lemma'] = df_keywords_list_ep['keyword_yake_lemma'].progress_apply(remove_stopwords)

# Remove empty keywords
df_keywords_list_ep = df_keywords_list_ep[
    df_keywords_list_ep['keyword_yake_lemma'].progress_apply(lambda x: len(x) > 0)
]

100%|██████████| 1656469/1656469 [00:01<00:00, 1557196.21it/s]
100%|██████████| 1656469/1656469 [00:00<00:00, 2379232.41it/s]


In [9]:
# Aggregate df_keywords_list_ep by 'keyword'
df_keywords_list_ep_agg = df_keywords_list_ep.groupby(['keyword_yake_lemma']).agg({
    'yake_conf_score': 'mean',
    'publn_nr': list,
    'cpc_class_symbol': list,
    'abs_frequency': 'count'
}).reset_index()

# Flatten nested lists in 'cpc_class_symbol' column
df_keywords_list_ep_agg['cpc_class_symbol'] = df_keywords_list_ep_agg['cpc_class_symbol'].progress_apply(lambda x: [item for sublist in x for item in sublist])

100%|██████████| 333010/333010 [00:01<00:00, 197146.79it/s]


In [10]:
df_keywords_list_ep_agg.to_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/epo_yake_keywords_list_noun_chunks.json', orient='records')

# USPTO

In [3]:
# Load Data
df_yake_claims_uspto = pd.read_json('/mnt/hdd01/patentsview/Patentsview - Cleantech Patents/g_patent_claims_cleantech_yake_noun_chunks.json')

In [4]:
keywords_list_uspto = []
yake_conf_score_list = []
patent_id_list = []
# min_yake_conf = 0.05 - Currently not used

# Iterate over rows in dataframe
for index, row in tqdm(df_yake_claims_uspto.iterrows()):
    # Check if 'keywords_yake' column is not a list
    if not isinstance(row['keywords_yake_claim_noun_chunk'], list):
        continue
    # Check if 'keywords_yake' column is an empty list or contains only empty lists
    if not any(keyword for keyword in row['keywords_yake_claim_noun_chunk']):
        continue
    # Iterate over keywords in 'keywords_yake' column and append to keywords_list_uspto
    else:
        for keyword in row['keywords_yake_claim_noun_chunk'][:10]:
            # if keyword[1] <= min_yake_conf:
            keywords_list_uspto.append(keyword[0].lower())
            yake_conf_score_list.append(keyword[1])
            patent_id_list.append(row['patent_id'])

# Create new dataframe
df_keywords_list_uspto = pd.DataFrame({
    'keyword_yake': keywords_list_uspto,
    'yake_conf_score': yake_conf_score_list,
    'patent_id': patent_id_list,
    'abs_frequency': 1
})

515742it [00:23, 21936.79it/s]


In [5]:
# Filter out non-alphanumeric keywords
df_keywords_list_uspto = df_keywords_list_uspto[
    df_keywords_list_uspto['keyword_yake'].progress_apply(lambda x: all(word.isalnum() for word in x.split()))
]

# Filter out all keywords shorter than 3 characters
df_keywords_list_uspto = df_keywords_list_uspto[
    df_keywords_list_uspto['keyword_yake'].progress_apply(lambda x: len(x) > 2)
]

# Define a function to check if a string is an abbreviation
def is_abbreviation(keyword):
    # Regular expression to identify abbreviations (typically all uppercase and periods)
    # and check for all-uppercase abbreviations with 3 or fewer characters
    pattern = re.compile(r'\b(?:[A-Z]{1,}\.){2,}\b|\b[A-Z]{1,3}\b')
    return pattern.match(keyword) is not None

# Apply the function to filter out abbreviations
df_keywords_list_uspto = df_keywords_list_uspto[
    df_keywords_list_uspto['keyword_yake'].progress_apply(lambda x: not is_abbreviation(x))
]

100%|██████████| 5115943/5115943 [00:04<00:00, 1278048.31it/s]
100%|██████████| 4935785/4935785 [00:02<00:00, 2363758.56it/s]
100%|██████████| 4935785/4935785 [00:04<00:00, 1109943.83it/s]


## Match Patents to CPC Classification

In [6]:
# PatentsView - Merge with CPC Classification
df_cpc_uspto = pd.read_json('/mnt/hdd01/patentsview/Patentsview - Cleantech Patents/df_patentsview_patent_cpc_grouped_cleantech.json')
# Extract 'cpc_group' into a new column
df_cpc_uspto['cpc_group'] = df_cpc_uspto['cpc'].progress_apply(
    lambda x: [entry['cpc_group'] for entry in x.values() if 'cpc_group' in entry]
)

# Remove duplicates from 'cpc_group_list'
df_cpc_uspto['cpc_group'] = df_cpc_uspto['cpc_group'].progress_apply(lambda x: list(set(x)))

100%|██████████| 515745/515745 [00:00<00:00, 925507.84it/s]
100%|██████████| 515745/515745 [00:00<00:00, 1190762.37it/s]


In [7]:
# Merge df_keywords_list_uspto with df_cpc_uspto
df_keywords_list_uspto = pd.merge(
    df_keywords_list_uspto,
    df_cpc_uspto[['patent_id', 'cpc_group']],
    how='left',
    on='patent_id'
)

## Continue Postprocessing

In [8]:
# Lemmatize keywords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_keywords(keyword):
    return ' '.join([lemmatizer.lemmatize(word) for word in keyword.split()])

df_keywords_list_uspto['keyword_yake_lemma'] = df_keywords_list_uspto['keyword_yake'].progress_apply(lemmatize_keywords)

100%|██████████| 4935785/4935785 [00:19<00:00, 251958.13it/s]


In [9]:
import nltk

stopwords = set(nltk.corpus.stopwords.words('english'))

# Function to remove keywords that are only one stopword or start/end with a stopword
def remove_stopwords(keyword):
    words = keyword.split()
    
    # If the keyword is a single stopword, remove it
    if len(words) == 1 and words[0] in stopwords:
        return ''
    
    # If the keyword starts or ends with a stopword, remove line
    if words[0] in stopwords:
        return ''
    if words and words[-1] in stopwords:
        return ''
    
    return ' '.join(words)

# Apply the function to remove stopwords
df_keywords_list_uspto['keyword_yake_lemma'] = df_keywords_list_uspto['keyword_yake_lemma'].progress_apply(remove_stopwords)

# Remove empty keywords
df_keywords_list_uspto = df_keywords_list_uspto[
    df_keywords_list_uspto['keyword_yake_lemma'].progress_apply(lambda x: len(x) > 0)
]

100%|██████████| 4935785/4935785 [00:03<00:00, 1570873.43it/s]
100%|██████████| 4935785/4935785 [00:02<00:00, 2349589.39it/s]


In [10]:
# Aggregate df_keywords_list_ep by 'keyword'
df_keywords_list_uspto_agg = df_keywords_list_uspto.groupby(['keyword_yake_lemma']).agg({
    'yake_conf_score': 'mean',
    'patent_id': list,
    'cpc_group': list,
    'abs_frequency': 'count'
}).reset_index()

# Flatten nested lists in 'cpc_group' column
df_keywords_list_uspto_agg['cpc_group'] = df_keywords_list_uspto_agg['cpc_group'].progress_apply(lambda x: [item for sublist in x for item in sublist])

100%|██████████| 724045/724045 [00:07<00:00, 94066.54it/s] 


In [11]:
df_keywords_list_uspto_agg.to_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/uspto_yake_keywords_list_noun_chunks.json', orient='records')

# Reliance on Science - USPTO and EPO

In [27]:
# Load Data
df_rel_on_science_uspto = pd.read_json('/mnt/hdd01/patentsview/Reliance on Science - Cleantech Patents/df_oaid_Cleantech_y02_individual_works_yake_noun_chunks.json', dtype={'patent_id': str, 'oaid': str})
df_rel_on_science_ep = pd.read_json('/mnt/hdd01/PATSTAT Working Directory/Reliance on Science/cleantech_epo_rel_on_science_abstract_yake_noun_chunks.json', dtype={'publn_nr': str, 'oaid': str})

In [28]:
# Concatenate dataframes, reset index and drop duplicates
df_rel_on_science = pd.concat([df_rel_on_science_uspto, df_rel_on_science_ep], ignore_index=True)
df_rel_on_science = df_rel_on_science.drop_duplicates(subset=['oaid'], keep='first').reset_index(drop=True)

In [124]:
keywords_list_rel = []
yake_conf_score_list = []
oaid_list = []
publn_nr_list = []
patent_id_list = []
# min_yake_conf = 0.05 - Currently not used

# Iterate over rows in dataframe
for index, row in tqdm(df_rel_on_science.iterrows()):
    # Check if 'keywords_yake' column is not a list
    if not isinstance(row['keywords_yake_abstract_noun_chunk'], list):
        continue
    # Check if 'keywords_yake' column is an empty list or contains only empty lists
    if not any(keyword for keyword in row['keywords_yake_abstract_noun_chunk']):
        continue
    # Iterate over keywords in 'keywords_yake' column and append to keywords_list_rel
    else:
        for keyword in row['keywords_yake_abstract_noun_chunk'][:10]:
            # if keyword[1] <= min_yake_conf:
            keywords_list_rel.append(keyword[0].lower())
            yake_conf_score_list.append(keyword[1])
            oaid_list.append(row['oaid'])
            publn_nr_list.append(row['publn_nr'])
            patent_id_list.append(row['patent_id'])

# Create new dataframe
df_keywords_list_rel = pd.DataFrame({
    'keyword_yake': keywords_list_rel,
    'yake_conf_score': yake_conf_score_list,
    'oaid': oaid_list,
    'abs_frequency': 1,
    'publn_nr': publn_nr_list,
    'patent_id': patent_id_list
})

0it [00:00, ?it/s]

623364it [00:43, 14454.38it/s]


In [125]:
# Filter out non-alphanumeric keywords
df_keywords_list_rel = df_keywords_list_rel[
    df_keywords_list_rel['keyword_yake'].progress_apply(lambda x: all(word.isalnum() for word in x.split()))
]

# Filter out all keywords shorter than 3 characters
df_keywords_list_rel = df_keywords_list_rel[
    df_keywords_list_rel['keyword_yake'].progress_apply(lambda x: len(x) > 2)
]

# Define a function to check if a string is an abbreviation
def is_abbreviation(keyword):
    # Regular expression to identify abbreviations (typically all uppercase and periods)
    # and check for all-uppercase abbreviations with 3 or fewer characters
    pattern = re.compile(r'\b(?:[A-Z]{1,}\.){2,}\b|\b[A-Z]{1,3}\b')
    return pattern.match(keyword) is not None

# Apply the function to filter out abbreviations
df_keywords_list_rel = df_keywords_list_rel[
    df_keywords_list_rel['keyword_yake'].progress_apply(lambda x: not is_abbreviation(x))
]

100%|██████████| 6002161/6002161 [00:04<00:00, 1290900.60it/s]
100%|██████████| 5526239/5526239 [00:02<00:00, 2536599.00it/s]
100%|██████████| 5526239/5526239 [00:04<00:00, 1114949.60it/s]


In [126]:
# Lemmatize keywords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_keywords(keyword):
    return ' '.join([lemmatizer.lemmatize(word) for word in keyword.split()])

df_keywords_list_rel['keyword_yake_lemma'] = df_keywords_list_rel['keyword_yake'].progress_apply(lemmatize_keywords)

100%|██████████| 5526239/5526239 [00:22<00:00, 244526.28it/s]


In [127]:
import nltk

stopwords = set(nltk.corpus.stopwords.words('english'))

# Function to remove keywords that are only one stopword or start/end with a stopword
def remove_stopwords(keyword):
    words = keyword.split()
    
    # If the keyword is a single stopword, remove it
    if len(words) == 1 and words[0] in stopwords:
        return ''
    
    # If the keyword starts or ends with a stopword, remove line
    if words[0] in stopwords:
        return ''
    if words and words[-1] in stopwords:
        return ''
    
    return ' '.join(words)

# Apply the function to remove stopwords
df_keywords_list_rel['keyword_yake_lemma'] = df_keywords_list_rel['keyword_yake_lemma'].progress_apply(remove_stopwords)

# Remove empty keywords
df_keywords_list_rel = df_keywords_list_rel[
    df_keywords_list_rel['keyword_yake_lemma'].progress_apply(lambda x: len(x) > 0)
]

100%|██████████| 5526239/5526239 [00:03<00:00, 1639468.80it/s]
100%|██████████| 5526239/5526239 [00:02<00:00, 2586973.79it/s]


In [128]:
# Cast 'publn_nr' column to string
df_keywords_list_rel['publn_nr'] = df_keywords_list_rel['publn_nr'].progress_apply(str)

100%|██████████| 5523351/5523351 [00:02<00:00, 2329283.94it/s]


In [129]:
# Aggregate df_keywords_list_rel by 'keyword_yake_lemma'
df_keywords_list_rel_agg = df_keywords_list_rel.groupby(['keyword_yake_lemma']).agg({
    'yake_conf_score': 'mean',
    'oaid': list,
    'publn_nr': list,
    'patent_id': list,
    'abs_frequency': 'count'
}).reset_index()

# Delete all nan list entries in columns 'publn_nr' and 'patent_id'
df_keywords_list_rel_agg['publn_nr'] = df_keywords_list_rel_agg['publn_nr'].progress_apply(lambda x: [item for item in x if str(item) != 'nan'])
df_keywords_list_rel_agg['patent_id'] = df_keywords_list_rel_agg['patent_id'].progress_apply(lambda x: [item for item in x if str(item) != 'nan'])

100%|██████████| 1197878/1197878 [00:01<00:00, 775967.36it/s]
100%|██████████| 1197878/1197878 [00:01<00:00, 803346.20it/s]


In [130]:
df_keywords_list_rel_agg.sample(15)

Unnamed: 0,keyword_yake_lemma,yake_conf_score,oaid,publn_nr,patent_id,abs_frequency
1078296,testing device,0.084503,[1979508045],[],[9643711],1
7789,abstract loading,0.00904,[2161400967],[],[7608557],1
1125436,umfasst,0.151127,[4244532323],[],[8865359],1
901231,recently characterized depsipeptide,0.001241,[2070072813],[],[10494407],1
794140,peptidalkaloide wurde zunächst,0.006159,[2109103725],[],[7582604],1
121456,bond representative carbon,0.000112,[1556512819],[],[10960087],1
917544,renally impaired patient,0.100391,"[2165011987, 1802548550]",[],"[11022593, 7213009]",2
766223,organ selectivity,0.112657,[2066283662],[],[7452538],1
1019378,squid,0.170053,"[2151810181, 2067349649, 2076275236, 949100985...","[['1909061'], ['0496530', '0487130'], ['371839...","[10695062, 9512899, 6472541, 10277208, 9219978...",40
592919,laser gas heating,0.018877,[2021119433],[],[9609732],1


In [131]:
df_keywords_list_rel_agg.to_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/rel_on_science_yake_keywords_list_noun_chunks.json', orient='records')

# CPC Classification

In [2]:
df_cpc_classification = pd.read_json('/mnt/hdd01/patentsview/CPC Classification/df_keyword_y02_classification_noun_chunking.json')

In [4]:
df_cpc_classification.head(1)

Unnamed: 0,cpc_classification,sequence,title,title_lower,full_title,keywords_yake_title_lower,keywords_yake_title_lower_noun_chunk,noun_chunks
0,Y02,-2,Technologies or applications for mitigation or...,technologies or applications for mitigation or...,Technologies or applications for mitigation or...,"[[technologies or applications, 0.0], [climate...","[[climate change, 0.0153808212], [mitigation, ...","[mitigation, climate change, applications, ada..."


In [7]:
keywords_list_cpc = []
yake_conf_score_list = []
cpc_symbol_list = []
# min_yake_conf = 0.05 - Currently not used

# Iterate over rows in dataframe
for index, row in tqdm(df_cpc_classification.iterrows()):
    # Check if 'keywords_yake_claims' column is not a list
    if not isinstance(row['keywords_yake_title_lower_noun_chunk'], list):
        continue
    # Check if 'keywords_yake_claims' column is an empty list or contains only empty lists
    if not any(keyword for keyword in row['keywords_yake_title_lower_noun_chunk']):
        continue
    # Iterate over keywords in 'keywords_yake_claims' column and append to keywords_list_ep, consider only top 10 keywords
    else:
        for keyword in row['keywords_yake_title_lower_noun_chunk'][:10]:
            # if keyword[1] <= min_yake_conf:
            keywords_list_cpc.append(keyword[0].lower())
            yake_conf_score_list.append(keyword[1])
            cpc_symbol_list.append(row['cpc_classification'])

# Create new dataframe
df_keywords_list_cpc = pd.DataFrame({
    'keyword_yake': keywords_list_cpc,
    'yake_conf_score': yake_conf_score_list,
    'cpc_class_symbol': cpc_symbol_list
})

340it [00:00, 11339.11it/s]


In [9]:
# Filter out non-alphanumeric keywords
df_keywords_list_cpc = df_keywords_list_cpc[
    df_keywords_list_cpc['keyword_yake'].progress_apply(lambda x: all(word.isalnum() for word in x.split()))
]

# Filter out all keywords shorter than 3 characters
df_keywords_list_cpc = df_keywords_list_cpc[
    df_keywords_list_cpc['keyword_yake'].progress_apply(lambda x: len(x) > 2)
]

# Define a function to check if a string is an abbreviation
def is_abbreviation(keyword):
    # Regular expression to identify abbreviations (typically all uppercase and periods)
    # and check for all-uppercase abbreviations with 3 or fewer characters
    pattern = re.compile(r'\b(?:[A-Z]{1,}\.){2,}\b|\b[A-Z]{1,3}\b')
    return pattern.match(keyword) is not None

# Apply the function to filter out abbreviations
df_keywords_list_cpc = df_keywords_list_cpc[
    df_keywords_list_cpc['keyword_yake'].progress_apply(lambda x: not is_abbreviation(x))
]

100%|██████████| 707/707 [00:00<00:00, 381300.36it/s]
100%|██████████| 672/672 [00:00<00:00, 706940.63it/s]
100%|██████████| 672/672 [00:00<00:00, 347243.11it/s]


In [28]:
# Lemmatize keywords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_keywords(keyword):
    return ' '.join([lemmatizer.lemmatize(word) for word in keyword.split()])

df_keywords_list_cpc['keyword_yake_lemma'] = df_keywords_list_cpc['keyword_yake'].progress_apply(lemmatize_keywords)

100%|██████████| 672/672 [00:00<00:00, 930.05it/s]


In [29]:
import nltk

stopwords = set(nltk.corpus.stopwords.words('english'))

# Function to remove keywords that are only one stopword or start/end with a stopword
def remove_stopwords(keyword):
    words = keyword.split()
    
    # If the keyword is a single stopword, remove it
    if len(words) == 1 and words[0] in stopwords:
        return ''
    
    # If the keyword starts or ends with a stopword, remove line
    if words[0] in stopwords:
        return ''
    if words and words[-1] in stopwords:
        return ''
    
    return ' '.join(words)

# Apply the function to remove stopwords
df_keywords_list_cpc['keyword_yake_lemma'] = df_keywords_list_cpc['keyword_yake_lemma'].progress_apply(remove_stopwords)

# Remove empty keywords
df_keywords_list_cpc = df_keywords_list_cpc[
    df_keywords_list_cpc['keyword_yake_lemma'].progress_apply(lambda x: len(x) > 0)
]

100%|██████████| 672/672 [00:00<00:00, 780984.29it/s]
100%|██████████| 672/672 [00:00<00:00, 1261106.17it/s]


In [30]:
df_keywords_list_cpc.head(1)

Unnamed: 0,keyword_yake,yake_conf_score,cpc_class_symbol,keyword_yake_lemma
0,climate change,0.015381,Y02,climate change


In [31]:
# Aggregate df_keywords_list_cpc by 'keyword'
df_keywords_list_cpc_agg = df_keywords_list_cpc.groupby(['keyword_yake_lemma']).agg({
    'yake_conf_score': 'mean',
    'cpc_class_symbol': list,
}).reset_index()


In [33]:
df_keywords_list_cpc_agg.to_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/cpc_yake_keywords_list_noun_chunks.json', orient='records')

# Merge EP, USPTO and Reliance on Science

In [15]:
# Load Data
df_keywords_list_uspto_agg = pd.read_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/uspto_yake_keywords_list_noun_chunks.json')
df_keywords_list_ep_agg = pd.read_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/epo_yake_keywords_list_noun_chunks.json')
df_keywords_list_rel_agg = pd.read_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/rel_on_science_yake_keywords_list_noun_chunks.json')

In [33]:
# Cast publn_nr and patent_id to list of strings
df_keywords_list_ep_agg['publn_nr'] = df_keywords_list_ep_agg['publn_nr'].progress_apply(lambda x: [str(item) for item in x])
df_keywords_list_uspto_agg['patent_id'] = df_keywords_list_uspto_agg['patent_id'].progress_apply(lambda x: [str(item) for item in x])

100%|██████████| 333010/333010 [00:03<00:00, 85829.38it/s] 
100%|██████████| 724045/724045 [00:01<00:00, 700259.76it/s]


In [40]:
frames = [df_keywords_list_uspto_agg, df_keywords_list_ep_agg, df_keywords_list_rel_agg]
df_keywords_list = pd.concat(frames)
df_keywords_list.reset_index(drop=True, inplace=True)

In [54]:
df_keywords_list.sample(5)

Unnamed: 0,keyword_yake_lemma,yake_conf_score,patent_id,cpc_group,abs_frequency,publn_nr,cpc_class_symbol,oaid
958791,principal,0.069415,,,37,"[130483, 145557, 211084, 280762, 580790, 65402...","[Y02E 30/30, Y02P 70/50, Y02E 60/50, Y02P ...",
1424744,expression vector family,0.039519,[7323619],,1,[],,[2136646494]
673822,triangular shaped die,0.011013,[4889476],[Y02P40/57],1,,,
706296,voltage variance,0.026912,"[10052968, 7019489]","[Y02T10/70, Y02E60/50, Y02T90/40, Y02T10/70]",2,,,
266155,functional processor,0.037849,[9626220],[Y02D10/00],1,,,


In [75]:
def concat_lists(series):
    combined_list = []
    for item in series:
        if isinstance(item, list):
            combined_list.extend(item)
    return combined_list

# Group by 'keyword_yake_lemma' and aggregate
df_keywords_list_agg = df_keywords_list.groupby('keyword_yake_lemma').agg({
    'yake_conf_score': 'mean',        # Mean of yake_conf_score
    'abs_frequency': 'sum',           # Sum of abs_frequency
    'patent_id': concat_lists,        # Concatenate lists in patent_id
    'publn_nr': concat_lists,         # Concatenate lists in publn_nr
    'oaid': concat_lists,             # Concatenate lists in oaid
    'cpc_group': concat_lists,        # Concatenate lists in cpc_group
    'cpc_class_symbol': concat_lists  # Concatenate lists in cpc_class_symbol
}).reset_index()

In [108]:
import ast

def flatten_and_convert(entry):
    # If the entry is NaN (float type in pandas), return an empty list
    if isinstance(entry, float):
        return []

    # Initialize an empty list to store the flattened results
    flattened_list = []

    # Check if the entry is a string and convert it to a list if it represents a list
    if isinstance(entry, str) and entry.startswith("[") and entry.endswith("]"):
        try:
            entry = ast.literal_eval(entry)
        except ValueError:
            # If conversion fails, return an empty list
            return []

    # If the entry is a list, process its items
    if isinstance(entry, list):
        for item in entry:
            # If the item is a string representation of a list, convert it
            if isinstance(item, str) and item.startswith("[") and item.endswith("]"):
                try:
                    item = ast.literal_eval(item)
                except ValueError:
                    continue  # Skip items that can't be converted

            # If the item is a list, extend the flattened list with its elements
            if isinstance(item, list):
                flattened_list.extend(item)
            else:
                # For single string items, append them directly
                flattened_list.append(item)

    return flattened_list

# Apply the function to the 'publn_nr' column
df_keywords_list_agg['publn_nr'] = df_keywords_list_agg['publn_nr'].progress_apply(flatten_and_convert)


100%|██████████| 1961759/1961759 [00:02<00:00, 783131.25it/s]


In [110]:
def remove_duplicates(lst):
    # Convert list to set to remove duplicates, then back to list
    return list(set(lst))

# Apply the function to each relevant column
df_keywords_list_agg['patent_id'] = df_keywords_list_agg['patent_id'].progress_apply(remove_duplicates)
df_keywords_list_agg['publn_nr'] = df_keywords_list_agg['publn_nr'].progress_apply(remove_duplicates)
df_keywords_list_agg['oaid'] = df_keywords_list_agg['oaid'].progress_apply(remove_duplicates)

100%|██████████| 1961759/1961759 [00:01<00:00, 1008155.66it/s]
100%|██████████| 1961759/1961759 [00:01<00:00, 1612314.25it/s]
100%|██████████| 1961759/1961759 [00:07<00:00, 273363.51it/s] 


In [127]:
def remove_duplicates_from_lists(df, columns):
    for col in columns:
        df[col] = df[col].progress_apply(lambda x: list(set(x)))
    return df

# Apply this function to the 'oaid', 'patent_id', and 'publn_nr' columns
df_keywords_list_agg = remove_duplicates_from_lists(df_keywords_list_agg, ['oaid', 'patent_id', 'publn_nr', 'cpc_group', 'cpc_class_symbol'])

100%|██████████| 1961759/1961759 [00:01<00:00, 1161631.03it/s]
100%|██████████| 1961759/1961759 [00:02<00:00, 927183.99it/s]
100%|██████████| 1961759/1961759 [00:01<00:00, 1511009.37it/s]
100%|██████████| 1961759/1961759 [00:07<00:00, 251310.94it/s] 
100%|██████████| 1961759/1961759 [00:01<00:00, 1594885.80it/s]


## Postprocessing

In [133]:
# Prune dataframe by document frequency and absolute frequency
min_abs_frequency = 5
max_abs_frequency = 1000
# max_doc_frequency = 0.3

df_keywords_list_agg_pruned = df_keywords_list_agg[(df_keywords_list_agg['abs_frequency'] >= min_abs_frequency) & (df_keywords_list_agg['abs_frequency'] <= max_abs_frequency)]

# Reset index
df_keywords_list_agg_pruned.reset_index(drop=True, inplace=True)

# Keyword Embedding

In [135]:
from sentence_transformers import SentenceTransformer
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [136]:
model_climatebert = SentenceTransformer('climatebert/distilroberta-base-climate-f')
model_bertforpatents = SentenceTransformer('anferico/bert-for-patents')
model_patentsberta = SentenceTransformer('AI-Growth-Lab/PatentSBERTa')

No sentence-transformers model found with name /home/thiesen/.cache/torch/sentence_transformers/climatebert_distilroberta-base-climate-f. Creating a new one with MEAN pooling.
Some weights of RobertaModel were not initialized from the model checkpoint at /home/thiesen/.cache/torch/sentence_transformers/climatebert_distilroberta-base-climate-f and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
No sentence-transformers model found with name /home/thiesen/.cache/torch/sentence_transformers/anferico_bert-for-patents. Creating a new one with MEAN pooling.


In [137]:
# Check if GPU is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU available: {}".format(torch.cuda.get_device_name(0)))
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU available: NVIDIA RTX A4500


In [138]:
# Generate copy of df_claims_keywords_list
df_keywords_list_agg_embeddings = df_keywords_list_agg_pruned.copy()

# Perform sentence embedding on the 'keyword_yake' (PatentsView) or 'keywords_yake_claims' (EPO) column
df_keywords_list_agg_embeddings['keyword_yake_patentsberta_embedding'] = df_keywords_list_agg_embeddings['keyword_yake_lemma'].progress_apply(
    lambda x: model_patentsberta.encode(x)
)

df_keywords_list_agg_embeddings['keyword_yake_climatebert_embedding'] = df_keywords_list_agg_embeddings['keyword_yake_lemma'].progress_apply(
    lambda x: model_climatebert.encode(x)
)

df_keywords_list_agg_embeddings['keyword_yake_bertforpatents_embedding'] = df_keywords_list_agg_embeddings['keyword_yake_lemma'].progress_apply(
    lambda x: model_bertforpatents.encode(x)
)

100%|██████████| 151044/151044 [16:15<00:00, 154.81it/s]
100%|██████████| 151044/151044 [09:49<00:00, 256.10it/s]
100%|██████████| 151044/151044 [32:06<00:00, 78.39it/s]


In [149]:
df_keywords_list_agg_embeddings.sample(5)

Unnamed: 0,keyword_yake_lemma,yake_conf_score,abs_frequency,patent_id,publn_nr,oaid,cpc_group,cpc_class_symbol,keyword_yake_patentsberta_embedding,keyword_yake_climatebert_embedding,keyword_yake_bertforpatents_embedding
52423,forming unit,0.116527,10,"[11024867, 11013773, 10953053, 10696572, 10258...",[],"[2314723347, 2127724848, 2000754494, 216455397...","[Y02A50/30, Y02T10/72, Y02P70/50, Y02E60/10, Y...",[],"[-0.2664432, -0.71336854, -0.19560274, 0.12043...","[-0.04168712, -0.01022782, -0.0611551, 0.01770...","[0.45502183, -0.12699415, 0.23816243, -0.44813..."
53910,fuel property,0.04117,17,"[8027781, 6109225, 7050901, 10113208, 6073611,...","[2778378, 2507495, 1517024]","[2170485159, 1969102714, 2089352982, 2094700891]","[Y02T10/12, Y02T10/40, Y02T10/30]",[Y02T 10/30],"[-0.36710405, -0.20286739, -0.29043177, 0.0753...","[-0.06109487, 0.07157007, -0.035822365, -0.114...","[-0.66123885, -0.11224864, -0.5386305, -0.1484..."
58175,gramineae,0.062648,7,"[8551758, 10226502, 8765438, 6331660, 6821782,...",[],"[2054771403, 2077478467, 69996558, 2153702005,...",[Y02A40/146],[],"[-0.071831, -0.38879472, -0.19552678, 0.319098...","[-0.10427654, -0.005826688, -0.026911682, 0.15...","[0.21785995, -0.5709935, 0.3596971, -0.9650602..."
67739,immunological reviewsvolume,0.033162,16,"[7279462, 10190095, 6340459, 5843904, 8772257,...",[],"[1998912267, 2141116067, 2085662422, 212531261...",[],[],"[0.24936381, -0.40906265, -0.2253454, -0.16598...","[-0.04198655, 0.1732413, 0.07903323, -0.016652...","[-0.43490568, -0.5597404, 0.81100875, 0.517311..."
39323,dvd player,0.104078,6,"[6166496, 10228667, 11316056, 8091772, 7453832...",[],"[1681689143, 2046204174, 2011142855, 2023891690]","[Y02B20/30, Y02D30/70]",[],"[-0.0057602962, -0.64290005, -0.27191755, 0.10...","[-0.0021057706, 0.10024575, -0.017373513, 0.01...","[-0.4767, 0.1379464, 0.036986217, 0.11431643, ..."


In [152]:
# Save dataframe to json
df_keywords_list_agg_embeddings.to_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/df_keywords_list_agg_uspto_epo_rel_embeddings_noun_chunks.json', orient='records')