In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

# Load Data for Core Cleantech Dictionary

In [2]:
# Load the core cleantech keyword dictionary from JSON.
# NOTE(review): the "Semantic Search enriched" section further down rebinds
# `df_yake_cleantech_titles` to a different frame, so whichever of the two loading
# paths runs last determines the dictionary used by the rest of the notebook.
df_yake_cleantech_titles = pd.read_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/cpc_yake_keywords_list_noun_chunks_embeddings.json')

# Load Data for Semantic Search enriched Co-Occurrence Analysis

In [2]:
# Load the similarity-search result frame. The following cells read its
# 'keyword_yake_lemma' and 'keywords_bertforpatents' columns to build an enlarged dictionary.
df_yake_cleantech_similarity = pd.read_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/Similarity Search/df_keyword_titles_cosine_similarity_radius_025_noun_chunks.json')

In [3]:
# Unpack the list-like 'keywords_bertforpatents' column: one row per contained entry,
# with every other column repeated alongside it
df_yake_cleantech_similarity = df_yake_cleantech_similarity.explode(column='keywords_bertforpatents')

In [4]:
# Stack the values of 'keyword_yake_lemma' and 'keywords_bertforpatents' into a single
# Series and keep the first occurrence of every distinct value
seed_keywords = df_yake_cleantech_similarity['keyword_yake_lemma']
matched_keywords = df_yake_cleantech_similarity['keywords_bertforpatents']
combined_df = pd.concat([seed_keywords, matched_keywords]).drop_duplicates()
# Expose the result as a one-column frame named like the dictionary column used downstream
df_yake_cleantech_titles = combined_df.to_frame(name='keyword_yake_lemma')

In [5]:
# Sanity check: number of rows in the dictionary frame built above
len(df_yake_cleantech_titles)

25304

# Load Data for Co-Occurrence Analysis

In [3]:
# Load the aggregated keyword frame used on the corpus side of the co-occurrence analysis
df_yake_uspto_epo_rel = pd.read_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/df_keywords_list_agg_uspto_epo_rel_embeddings_noun_chunks.json')
# The three embedding columns are not used by any cell below, so discard them.
# (Only these three are removed; 'keyword_yake_pos' is not dropped here.)
embedding_columns = [
    'keyword_yake_patentsberta_embedding',
    'keyword_yake_climatebert_embedding',
    'keyword_yake_bertforpatents_embedding',
]
df_yake_uspto_epo_rel = df_yake_uspto_epo_rel.drop(columns=embedding_columns)

In [4]:
# Distinct keywords on each side: dictionary keywords become the row labels,
# keywords from df_yake_uspto_epo_rel become the column labels
cleantech_titles = pd.unique(df_yake_cleantech_titles['keyword_yake_lemma'])
uspto_epo_rel_keywords = pd.unique(df_yake_uspto_epo_rel['keyword_yake_lemma'])

# Two zero-filled count matrices sharing the same row and column labels
co_occurrence_matrix_ids = pd.DataFrame(data=0, index=cleantech_titles, columns=uspto_epo_rel_keywords)
co_occurrence_matrix_y02 = pd.DataFrame(data=0, index=cleantech_titles, columns=uspto_epo_rel_keywords)

# Co-Occurrence by CPC Classification - yields too few results; the threshold leaves only 9 unique keywords

In [None]:
# Remove every space from each symbol in the list-valued 'cpc_class_symbol' column.
# Cells that are not lists (e.g. a missing value) and list entries that are not strings
# are passed through unchanged instead of raising; a later cell already filters
# non-string entries after exploding this column.
df_yake_uspto_epo_rel['cpc_class_symbol'] = df_yake_uspto_epo_rel['cpc_class_symbol'].progress_apply(
    lambda symbols: [
        symbol.replace(' ', '') if isinstance(symbol, str) else symbol
        for symbol in symbols
    ] if isinstance(symbols, list) else symbols
)

In [None]:
# Spot-check five random rows (no random_state is set, so the rows differ between runs)
df_yake_uspto_epo_rel.sample(5)

In [None]:
# One row per (keyword, cpc_group entry): the keyword rides along as the index
# while the 'cpc_group' column is exploded
keyword_group_pairs = (
    df_yake_uspto_epo_rel
    .set_index('keyword_yake_lemma')['cpc_group']
    .explode()
    .reset_index()
)
# Keep only rows whose cpc_group is a string (float/NaN and other non-strings are discarded)
is_string_group = keyword_group_pairs['cpc_group'].map(lambda value: isinstance(value, str))
keyword_group_pairs = keyword_group_pairs[is_string_group]
# Invert the mapping: for each cpc_group, the list of keywords attached to it
cpc_group_exploded = (
    keyword_group_pairs
    .groupby('cpc_group')['keyword_yake_lemma']
    .apply(list)
    .reset_index()
)

In [None]:
# Spot-check five random cpc_group -> keyword-list rows (unseeded, varies between runs)
cpc_group_exploded.sample(5)

In [None]:
# One row per (keyword, cpc_class_symbol entry): the keyword rides along as the index
# while the 'cpc_class_symbol' column is exploded
keyword_symbol_pairs = (
    df_yake_uspto_epo_rel
    .set_index('keyword_yake_lemma')['cpc_class_symbol']
    .explode()
    .reset_index()
)
# Keep only rows whose cpc_class_symbol is a string (float/NaN and other non-strings are discarded)
is_string_symbol = keyword_symbol_pairs['cpc_class_symbol'].map(lambda value: isinstance(value, str))
keyword_symbol_pairs = keyword_symbol_pairs[is_string_symbol]
# Invert the mapping: for each cpc_class_symbol, the list of keywords attached to it
cpc_class_symbol_exploded = (
    keyword_symbol_pairs
    .groupby('cpc_class_symbol')['keyword_yake_lemma']
    .apply(list)
    .reset_index()
)

In [None]:
# Spot-check five random cpc_class_symbol -> keyword-list rows (unseeded, varies between runs)
cpc_class_symbol_exploded.sample(5)

In [None]:
# Outer-join the two per-code aggregations so that a code present in only one of them is kept;
# both frames carry a list-valued 'keyword_yake_lemma', which the join suffixes with _x / _y
cpc_group_class_symbol_merged = cpc_group_exploded.merge(
    cpc_class_symbol_exploded,
    how='outer',
    left_on='cpc_group',
    right_on='cpc_class_symbol',
)
# Single code column: take cpc_group where present, otherwise fall back to cpc_class_symbol
cpc_group_class_symbol_merged['cpc'] = cpc_group_class_symbol_merged['cpc_group'].fillna(
    cpc_group_class_symbol_merged['cpc_class_symbol']
)


def _as_list(value):
    """Return `value` unchanged when it is a list, otherwise an empty list (e.g. for NaN)."""
    return value if isinstance(value, list) else []


# Union of the two keyword lists per code, without duplicates
cpc_group_class_symbol_merged['keyword_yake_lemma'] = cpc_group_class_symbol_merged.progress_apply(
    lambda row: list(set(_as_list(row['keyword_yake_lemma_x']) + _as_list(row['keyword_yake_lemma_y']))),
    axis=1,
)
# The helper columns have been folded into 'cpc' and 'keyword_yake_lemma'; remove them
helper_columns = ['cpc_group', 'cpc_class_symbol', 'keyword_yake_lemma_x', 'keyword_yake_lemma_y']
cpc_group_class_symbol_merged = cpc_group_class_symbol_merged.drop(columns=helper_columns)

In [None]:
# Back to long format: one row per (cpc, keyword) pair, renumbered from 0
cpc_group_class_symbol_exploded = cpc_group_class_symbol_merged.explode('keyword_yake_lemma', ignore_index=True)

In [None]:
# Restrict the (cpc, keyword) pairs to dictionary keywords: inner join on 'keyword_yake_lemma'.
# Every matching pair is kept, so a keyword appears once per cpc code it is attached to.
cpc_group_class_symbol_filtered = (
    df_yake_cleantech_titles
    .merge(
        cpc_group_class_symbol_exploded,
        how='inner',
        on='keyword_yake_lemma',
        validate='many_to_many',
    )
    .reset_index(drop=True)
)

In [None]:
# Keep only keywords that occur in MORE than `co_occurrence_threshold` rows
# (a keyword with exactly `co_occurrence_threshold` rows is dropped as well)
co_occurrence_threshold = 100
co_occurrence = cpc_group_class_symbol_filtered['keyword_yake_lemma'].value_counts()
co_occurrence = co_occurrence[co_occurrence > co_occurrence_threshold]
co_occurrence = co_occurrence.reset_index()
# Set the column names explicitly; the names produced by value_counts().reset_index()
# differ between pandas versions
co_occurrence.columns = ['keyword_yake_lemma', 'count']
# Drop a 'count' column left over from a previous run of this cell so that re-running
# does not produce 'count_x' / 'count_y'
cpc_group_class_symbol_filtered = cpc_group_class_symbol_filtered.drop(columns='count', errors='ignore')
# `co_occurrence` holds one row per keyword, so the join is many-to-one; unlike
# 'many_to_many' (which performs no check), this validation can actually fail
cpc_group_class_symbol_filtered = cpc_group_class_symbol_filtered.merge(
    co_occurrence,
    how='inner',
    on='keyword_yake_lemma',
    validate='many_to_one',
)

In [None]:
# Number of distinct dictionary keywords that survive the threshold filter
cpc_group_class_symbol_filtered['keyword_yake_lemma'].nunique()

## Co-Occurrence Matrix by CPC Classification

In [None]:
# NOTE(review): this cell sits under the CPC heading but is a verbatim copy of the cell under
# "Co-Occurence Matrix by IDs" below. It uses `patent_id_dict`, `publn_nr_dict`, `oaid_dict` and
# `df_yake_uspto_epo_rel_titles_filtered`, none of which is defined in any cell above this one,
# so a top-to-bottom run fails here with a NameError. It also reads none of the CPC frames built
# in this section and writes to `co_occurrence_matrix_ids`, not `co_occurrence_matrix_y02`.
# TODO: confirm the intended CPC-based logic, or remove this cell.
# Function to update co-occurrence matrix
def update_co_occurrence(row):
    """Add the keywords recorded for `row`'s document to the row of `row['keyword_yake_lemma']`.

    The document is identified by the first of 'patent_id', 'publn_nr', 'oaid' whose value is a
    string and a key of the matching lookup dict; if none qualifies, nothing is changed.
    Mutates the global `co_occurrence_matrix_ids` in place and returns None.
    """
    if isinstance(row['patent_id'], str) and row['patent_id'] in patent_id_dict:
        keyword_lists = patent_id_dict[row['patent_id']]
    elif isinstance(row['publn_nr'], str) and row['publn_nr'] in publn_nr_dict:
        keyword_lists = publn_nr_dict[row['publn_nr']]
    elif isinstance(row['oaid'], str) and row['oaid'] in oaid_dict:
        keyword_lists = oaid_dict[row['oaid']]
    else:
        return

    # The lookup value is iterated as a list of keyword lists; every keyword adds one count
    for keyword_list in keyword_lists:
        for keyword in keyword_list:
            co_occurrence_matrix_ids.at[row['keyword_yake_lemma'], keyword] += 1

# Apply the function to each row in the DataFrame
for index, row in tqdm(df_yake_uspto_epo_rel_titles_filtered.iterrows(), total=len(df_yake_uspto_epo_rel_titles_filtered)):
    update_co_occurrence(row)

# Co-Occurrence by ID

In [5]:
# One row per (keyword, patent_id): the keyword rides along as the index while the
# 'patent_id' column is exploded. Missing IDs are dropped BEFORE the string cast below:
# None and NaN (NaN is also what exploding an empty list yields) would otherwise turn into
# the strings 'USNone' / 'USnan', and 'USnan' would survive as a fake document that pools
# unrelated keywords.
patent_id_exploded = (
    df_yake_uspto_epo_rel
    .set_index('keyword_yake_lemma')['patent_id']
    .explode()
    .dropna()
    .reset_index()
)
# Put 'US' in front of each patent_id
patent_id_exploded['patent_id'] = 'US' + patent_id_exploded['patent_id'].astype(str)
# Kept from the original cell: also discard IDs whose text contains the literal string 'None'
patent_id_exploded = patent_id_exploded[~patent_id_exploded['patent_id'].str.contains('None', regex=False)]
# Aggregate on 'patent_id': list of keyword_yake_lemma per patent
patent_id_exploded = patent_id_exploded.groupby('patent_id')['keyword_yake_lemma'].apply(list).reset_index()

In [6]:
# Preview: first rows of the patent_id -> keyword-list frame
patent_id_exploded.head()

Unnamed: 0,patent_id,keyword_yake_lemma
0,US10000011,"[additively manufactured part, desired]"
1,US10000017,"[attachment, inlet opening, mounting apparatus..."
2,US10000021,"[annular, predetermined portion, shaped object]"
3,US10000025,[multiple location]
4,US10000033,"[double, elongated member, ethylene vinyl acet..."


In [7]:
# One row per (keyword, publn_nr): the keyword rides along as the index while the
# 'publn_nr' column is exploded. Missing numbers are dropped BEFORE the string cast below:
# None and NaN (NaN is also what exploding an empty list yields) would otherwise turn into
# the strings 'EPNone' / 'EPnan', and 'EPnan' would survive as a fake document that pools
# unrelated keywords.
publn_nr_exploded = (
    df_yake_uspto_epo_rel
    .set_index('keyword_yake_lemma')['publn_nr']
    .explode()
    .dropna()
    .reset_index()
)
# Put 'EP' in front of each publn_nr
publn_nr_exploded['publn_nr'] = 'EP' + publn_nr_exploded['publn_nr'].astype(str)
# Kept from the original cell: also discard numbers whose text contains the literal string 'None'
publn_nr_exploded = publn_nr_exploded[~publn_nr_exploded['publn_nr'].str.contains('None', regex=False)]
# Aggregate on 'publn_nr': list of keyword_yake_lemma per publication
publn_nr_exploded = publn_nr_exploded.groupby('publn_nr')['keyword_yake_lemma'].apply(list).reset_index()

In [8]:
# Preview: first rows of the publn_nr -> keyword-list frame
publn_nr_exploded.head()

Unnamed: 0,publn_nr,keyword_yake_lemma
0,EP1000004,[stoichiometric quantity]
1,EP1000028,"[acid addition salt, carboxylic acid derivativ..."
2,EP100005,"[burr, fleece, increased electrical conductivity]"
3,EP100007,"[charged, porous substance, press]"
4,EP1000085,[enriched]


In [9]:
# One row per (keyword, oaid): the keyword rides along as the index while the
# 'oaid' column is exploded. Missing IDs are dropped BEFORE the string cast below:
# None and NaN (NaN is also what exploding an empty list yields) would otherwise turn into
# the strings 'RELNone' / 'RELnan', and 'RELnan' would survive as a fake document that pools
# unrelated keywords.
oaid_exploded = (
    df_yake_uspto_epo_rel
    .set_index('keyword_yake_lemma')['oaid']
    .explode()
    .dropna()
    .reset_index()
)
# Put 'REL' in front of each oaid
oaid_exploded['oaid'] = 'REL' + oaid_exploded['oaid'].astype(str)
# Kept from the original cell: also discard IDs whose text contains the literal string 'None'
oaid_exploded = oaid_exploded[~oaid_exploded['oaid'].str.contains('None', regex=False)]
# Aggregate on 'oaid': list of keyword_yake_lemma per oaid
oaid_exploded = oaid_exploded.groupby('oaid')['keyword_yake_lemma'].apply(list).reset_index()

In [10]:
# Preview: first rows of the oaid -> keyword-list frame
oaid_exploded.head()

Unnamed: 0,oaid,keyword_yake_lemma
0,REL100000185,"[blind, crossover design]"
1,REL1000054809,"[improve, key technology, random, renewable, s..."
2,REL100007697,"[latency, problematic, rcu, recent work, sched..."
3,REL100014517,"[cap structure, eif, eukaryotic initiation fac..."
4,REL1000192467,"[july, physical examination, rash]"


In [11]:
# Stack the three per-document frames row-wise under a fresh 0..n-1 index. Each row carries
# exactly one of patent_id / publn_nr / oaid; the other two ID columns are NaN for that row.
per_document_frames = [patent_id_exploded, publn_nr_exploded, oaid_exploded]
df_keywords_uspto_epo_rel = pd.concat(per_document_frames, axis=0, ignore_index=True)

In [12]:
# Spot-check five random rows of the stacked frame (unseeded, varies between runs)
df_keywords_uspto_epo_rel.sample(5)

Unnamed: 0,patent_id,keyword_yake_lemma,publn_nr,oaid
127152,US11407316,"[cpu, driving force output, regenerative power...",,
1253719,,"[biochemistry, exploitation]",,REL378873212
393778,US8802271,[initial resistance],,
499187,US9977858,"[electric motor, emi, frequency range, motor d...",,
1023141,,"[peroxide, photoinduced electron transfer, sal...",,REL2088500785


In [13]:
# Long format: one row per (document ID, keyword), renumbered from 0
df_keywords_uspto_epo_rel_exploded = df_keywords_uspto_epo_rel.explode('keyword_yake_lemma', ignore_index=True)

In [14]:
# Number of rows in the stacked per-document frame
# (this is `df_keywords_uspto_epo_rel`, not the exploded `df_keywords_uspto_epo_rel_exploded`)
len(df_keywords_uspto_epo_rel)

1273054

In [15]:
# Restrict the (document ID, keyword) pairs to dictionary keywords: inner join on
# 'keyword_yake_lemma'. Every matching pair is kept, so a keyword appears once per document
# it occurs in, together with that document's patent_id / publn_nr / oaid.
df_yake_uspto_epo_rel_titles_filtered = df_yake_cleantech_titles.merge(
    df_keywords_uspto_epo_rel_exploded,
    how='inner',
    on='keyword_yake_lemma',
    validate='many_to_many',
)

In [16]:
# Keep only keywords that occur in MORE than `co_occurrence_threshold` rows
# (a keyword with exactly `co_occurrence_threshold` rows is dropped as well)
co_occurrence_threshold = 100
co_occurrence = df_yake_uspto_epo_rel_titles_filtered['keyword_yake_lemma'].value_counts()
co_occurrence = co_occurrence[co_occurrence > co_occurrence_threshold]
co_occurrence = co_occurrence.reset_index()
# Set the column names explicitly; the names produced by value_counts().reset_index()
# differ between pandas versions
co_occurrence.columns = ['keyword_yake_lemma', 'count']
# Drop a 'count' column left over from a previous run of this cell so that re-running
# does not produce 'count_x' / 'count_y'
df_yake_uspto_epo_rel_titles_filtered = df_yake_uspto_epo_rel_titles_filtered.drop(columns='count', errors='ignore')
# `co_occurrence` holds one row per keyword, so the join is many-to-one; unlike
# 'many_to_many' (which performs no check), this validation can actually fail
df_yake_uspto_epo_rel_titles_filtered = df_yake_uspto_epo_rel_titles_filtered.merge(
    co_occurrence,
    how='inner',
    on='keyword_yake_lemma',
    validate='many_to_one',
)

In [17]:
# Number of distinct dictionary keywords that survive the threshold filter
df_yake_uspto_epo_rel_titles_filtered['keyword_yake_lemma'].nunique()

80

## Co-Occurrence Matrix by IDs

In [18]:
def _keyword_lookup(id_column):
    """Map each value of `id_column` to the list of 'keyword_yake_lemma' entries grouped under it.

    Each entry of df_keywords_uspto_epo_rel['keyword_yake_lemma'] is itself a keyword list,
    so the dict values are lists of keyword lists.
    """
    grouped = df_keywords_uspto_epo_rel.groupby(id_column)['keyword_yake_lemma']
    return grouped.progress_apply(list).to_dict()


# Dictionaries for constant-time lookup of a document's keywords, one per ID type
patent_id_dict = _keyword_lookup('patent_id')
publn_nr_dict = _keyword_lookup('publn_nr')
oaid_dict = _keyword_lookup('oaid')


def update_co_occurrence(row):
    """Add the keywords recorded for `row`'s document to the row of `row['keyword_yake_lemma']`.

    The document is identified by the first of 'patent_id', 'publn_nr', 'oaid' whose value is a
    string and a key of the matching lookup dict; if none qualifies, nothing is changed.
    Mutates the global `co_occurrence_matrix_ids` in place and returns None.
    """
    id_lookups = (
        ('patent_id', patent_id_dict),
        ('publn_nr', publn_nr_dict),
        ('oaid', oaid_dict),
    )
    for id_column, lookup in id_lookups:
        document_id = row[id_column]
        if isinstance(document_id, str) and document_id in lookup:
            keyword_lists = lookup[document_id]
            break
    else:
        # No usable document ID on this row
        return

    dictionary_keyword = row['keyword_yake_lemma']
    # Every keyword found for the document (the dictionary keyword itself included) counts once
    for keyword in (kw for keyword_list in keyword_lists for kw in keyword_list):
        co_occurrence_matrix_ids.at[dictionary_keyword, keyword] += 1


# Walk over every filtered (dictionary keyword, document) pair and accumulate the counts
filtered_pairs = df_yake_uspto_epo_rel_titles_filtered
for index, row in tqdm(filtered_pairs.iterrows(), total=len(filtered_pairs)):
    update_co_occurrence(row)

100%|██████████| 501272/501272 [00:06<00:00, 82704.82it/s]
100%|██████████| 172338/172338 [00:01<00:00, 96261.75it/s]
100%|██████████| 599444/599444 [00:07<00:00, 83782.45it/s]
100%|██████████| 35456/35456 [00:05<00:00, 6407.50it/s]


In [21]:
# Show the 20 columns with the highest values in the 'wind energy' row, in descending order
co_occurrence_matrix_ids.loc['wind energy'].sort_values(ascending=False).head(20)

wind energy                    136
energy conversion system         7
rotor blade                      5
moment                           3
energy plant                     3
energy conversion apparatus      2
gps receiver                     2
framework                        2
renewable                        2
angular velocity                 2
power system                     2
bearing                          2
output connection                2
adjacent flange                  2
velocity                         2
load side                        2
required                         2
horizontal axis                  2
sweep gas                        2
lockout                          2
Name: wind energy, dtype: int64

In [22]:
# Keep only rows that contain at least one non-zero entry
has_nonzero_entry = co_occurrence_matrix_ids.ne(0).any(axis=1)
co_occurrence_matrix_ids = co_occurrence_matrix_ids.loc[has_nonzero_entry]

In [23]:
# Scale every row by its own largest value, so the largest entry of each row becomes 1.
# Rows consisting only of zeros are removed in the preceding cell, so no row is divided by zero.
row_maxima = co_occurrence_matrix_ids.max(axis=1)
co_occurrence_matrix_ids = co_occurrence_matrix_ids.div(row_maxima, axis=0)

In [25]:
# Write the matrix to CSV, including the row labels (index=True).
# NOTE(review): the file name ends in 'semantic_similarity_02' while the similarity input
# loaded above is named '..._radius_025_...' — confirm the suffix matches the data used.
co_occurrence_matrix_ids.to_csv('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/Co-Occurrence Analysis/co_occurrence_matrix_yake_keywords_cleantech_uspto_epo_rel_ids_semantic_similarity_02.csv', index=True)