In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [2]:
# Load the aggregated YAKE keyword table and, in the same expression, discard
# the POS column and the three embedding columns (PatentSBERTa, ClimateBERT,
# BERT-for-Patents).
df_yake_uspto_epo_rel = pd.read_json(
    '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/df_keywords_list_agg_uspto_epo_rel_embeddings.json'
).drop(
    columns=[
        'keyword_yake_pos',
        'keyword_yake_patentsberta_embedding',
        'keyword_yake_climatebert_embedding',
        'keyword_yake_bertforpatents_embedding',
    ]
)

In [4]:
# Load the processed Y02 classification keyword table; later cells read its
# 'keyword' column and drop its embedding columns after the merge.
df_yake_cleantech_titles = pd.read_json(
    path_or_buf='/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/df_keyword_y02_classification_embeddings_processed.json',
)

In [82]:
# Column labels: every distinct YAKE keyword; row labels: every distinct
# cleantech keyword.
uspto_epo_rel_keywords = pd.unique(df_yake_uspto_epo_rel['keyword_yake'])
cleantech_titles = pd.unique(df_yake_cleantech_titles['keyword'])

# All-zero integer matrix (cleantech keyword x YAKE keyword) that the counting
# loop further down increments in place.
co_occurrence_matrix = pd.DataFrame(
    data=0,
    index=cleantech_titles,
    columns=uspto_epo_rel_keywords,
)

In [41]:
# Build one row per US patent with the list of YAKE keywords attached to it.
#
# 1. Explode the list-valued 'patent_id' column so that each row pairs one
#    keyword with one patent id.
# 2. Drop missing ids BEFORE they are turned into strings. `explode` yields NaN
#    for empty lists, and NaN would otherwise become the bogus id 'USnan',
#    which the 'None' filter below does not catch.
patent_id_exploded = (
    df_yake_uspto_epo_rel.set_index('keyword_yake')['patent_id']
    .explode()
    .dropna()
    .reset_index()
)
# Put 'US' in front of each patent_id
patent_id_exploded['patent_id'] = 'US' + patent_id_exploded['patent_id'].astype(str)
# Kept from the original cell: also discard ids that contain the literal text
# 'None' (plain substring match, no regex needed).
patent_id_exploded = patent_id_exploded[~patent_id_exploded['patent_id'].str.contains('None', regex=False)]
# Aggregate on patent_id: one row per patent, keywords collected into a list
patent_id_exploded = patent_id_exploded.groupby('patent_id')['keyword_yake'].agg(list).reset_index()

In [42]:
# Preview: one row per prefixed patent id with its list of keywords.
patent_id_exploded.head()

Unnamed: 0,patent_id,keyword_yake
0,US10000011,[composite]
1,US10000017,"[attachment, gap, mounting apparatus, pressing..."
2,US10000021,"[machining process, mold, predetermined portio..."
3,US10000025,"[fiber orientation, skin]"
4,US10000033,"[closure element, eva, sidewall]"


In [38]:
# Build one row per EP publication with the list of YAKE keywords attached to it.
#
# 1. Explode the list-valued 'publn_nr' column so that each row pairs one
#    keyword with one publication number.
# 2. Drop missing ids BEFORE they are turned into strings. `explode` yields NaN
#    for empty lists, and NaN would otherwise become the bogus id 'EPnan',
#    which the 'None' filter below does not catch.
publn_nr_exploded = (
    df_yake_uspto_epo_rel.set_index('keyword_yake')['publn_nr']
    .explode()
    .dropna()
    .reset_index()
)
# Put 'EP' in front of each publn_nr
publn_nr_exploded['publn_nr'] = 'EP' + publn_nr_exploded['publn_nr'].astype(str)
# Kept from the original cell: also discard ids that contain the literal text
# 'None' (plain substring match, no regex needed).
publn_nr_exploded = publn_nr_exploded[~publn_nr_exploded['publn_nr'].str.contains('None', regex=False)]
# Aggregate on publn_nr: one row per publication, keywords collected into a
# list (`.agg(list)` for consistency with the patent_id cell).
publn_nr_exploded = publn_nr_exploded.groupby('publn_nr')['keyword_yake'].agg(list).reset_index()

In [35]:
# Build one row per 'oaid' document with the list of YAKE keywords attached to it.
#
# 1. Explode the list-valued 'oaid' column so that each row pairs one keyword
#    with one oaid.
# 2. Drop missing ids BEFORE they are turned into strings. `explode` yields NaN
#    for empty lists, and NaN would otherwise become the bogus id 'RELnan',
#    which the 'None' filter below does not catch.
oaid_exploded = (
    df_yake_uspto_epo_rel.set_index('keyword_yake')['oaid']
    .explode()
    .dropna()
    .reset_index()
)
# Put 'REL' in front of each oaid
oaid_exploded['oaid'] = 'REL' + oaid_exploded['oaid'].astype(str)
# Kept from the original cell: also discard ids that contain the literal text
# 'None' (plain substring match, no regex needed).
oaid_exploded = oaid_exploded[~oaid_exploded['oaid'].str.contains('None', regex=False)]
# Aggregate on 'oaid': one row per document, keywords collected into a list
# (`.agg(list)` for consistency with the patent_id cell).
oaid_exploded = oaid_exploded.groupby('oaid')['keyword_yake'].agg(list).reset_index()

In [43]:
# Stack the three per-source frames row-wise. Each frame carries only its own
# id column, so the other two id columns are NaN for its rows. Row labels are
# not renumbered, so index values repeat across the three sources.
df_keywords_uspto_epo_rel_exploded = pd.concat(
    [
        patent_id_exploded,
        publn_nr_exploded,
        oaid_exploded,
    ],
    axis='index',
)

In [46]:
# Spot-check the stacked frame. No random_state is passed, so the sampled rows
# differ on every run.
df_keywords_uspto_epo_rel_exploded.sample(25)

Unnamed: 0,patent_id,keyword_yake,publn_nr,oaid
225468,,"[alertsnew citation alert, citationpublisher s...",,REL2084349064
51946,,"[flexible structure, structure was synthesized]",,REL1971688656
310117,US7788011,[drive force control],,
306928,US7731626,[transmission control device],,
29749,,"[chronic obstructive pulmonary, cystic fibrosi...",,REL1819830833
231635,US6195271,"[adaptor, electronic apparatus]",,
108279,,[properties of materials],,REL2008330682
145763,,"[flow velocity, heat exchange fluid, solar ene...",EP4468,
296093,,"[biologic activity, sle, systemic lupus erythe...",,REL2141028706
99982,,"[fundamental wave component, power conversion ...",EP2991218,


In [47]:
# Turn the three id columns into plain strings. Cells that are NaN (ids that
# belong to a different source) become the text 'nan', which is what the
# counting loop further down compares against.
for id_column in ('patent_id', 'publn_nr', 'oaid'):
    df_keywords_uspto_epo_rel_exploded[id_column] = df_keywords_uspto_epo_rel_exploded[id_column].astype(str)

# Remove every row in which any of the three id columns contains the text
# 'None'; the filters are applied one column after the other.
for id_column in ('patent_id', 'publn_nr', 'oaid'):
    has_none = df_keywords_uspto_epo_rel_exploded[id_column].str.contains('None')
    df_keywords_uspto_epo_rel_exploded = df_keywords_uspto_epo_rel_exploded[~has_none]

In [49]:
# Spot-check again after the string cast and 'None' filter. No random_state is
# passed, so the sampled rows differ on every run.
df_keywords_uspto_epo_rel_exploded.sample(25)

Unnamed: 0,patent_id,keyword_yake,publn_nr,oaid
468541,US9833500,"[cancer, head and neck, tigit, tumor]",,
335830,US8191371,"[degree of filling, hydrodynamic clutch, outpu...",,
418610,US9273359,"[amplification product, biological material, p...",,
131618,US3960838,[protecting group],,
117866,US11319424,"[fbp, liquid product]",,
130825,,"[electromagnetic force, steel sheet]",EP3456427,
31492,,[electrolyte includes],EP1886375,
339789,,[sodium hexametaphosphate],,REL2275855731
105116,,"[photonic device, trench, upper surface]",EP3058593,
117237,US11312262,"[battery charge amount, real time, setting inf...",,


In [13]:
# Match every cleantech keyword against the keywords of every document and keep
# the document ids (patent_id, publn_nr, oaid), including duplicates.
#
# 'keyword_yake' in df_keywords_uspto_epo_rel_exploded holds a LIST of keywords
# per document (result of the groupby/agg(list) cells), which cannot be matched
# against the scalar 'keyword' column. It is therefore exploded to one row per
# (document, keyword) for the merge only; df_keywords_uspto_epo_rel_exploded
# itself is left untouched because the counting loop needs the lists.
# The index is reset first because it contains repeated labels after the concat.
df_yake_uspto_epo_rel_titles_filtered = pd.merge(
    df_yake_cleantech_titles,
    df_keywords_uspto_epo_rel_exploded.reset_index(drop=True).explode('keyword_yake'),
    how='inner',
    left_on='keyword',
    right_on='keyword_yake',
    validate='many_to_many',
).drop(
    # Embeddings, the duplicated join key and the YAKE score are not needed downstream.
    columns=[
        'keyword_patentsberta_embedding',
        'keyword_climatebert_embedding',
        'keyword_bertforpatents_embedding',
        'keyword_yake',
        'yake_confidence',
    ]
)

In [161]:
# Number of distinct cleantech keywords that survived the inner merge.
df_yake_uspto_epo_rel_titles_filtered['keyword'].nunique()

126

In [95]:
# Spot-check the merged table (unseeded sample; rows differ between runs).
df_yake_uspto_epo_rel_titles_filtered.sample(5)

Unnamed: 0,keyword,cpc_subclass,cpc_classification,patent_id,publn_nr,oaid
11312,greenhouse gases,Y02P,"[Y02C, Y02C20/00, Y02P90/845]",,,REL2086476405
7082,fuel cells,Y02W,"[Y02B90/10, Y02E60/50, Y02T90/40, Y02W30/84]",US4997727,,
4894,management,Y02T,[Y02T10/84],,,REL623746100
5770,solar cells,Y02E,"[Y02E10/542, Y02E10/543, Y02E10/544]",,,REL2320947061
3574,electric vehicles,Y02T,"[Y02T90/10, Y02T90/16]",,,REL2116494211


In [146]:
# Keep only the merged rows for the cleantech keyword under inspection
# ('wind energy') and renumber them from 0.
# To inspect several keywords at once, replace the equality test with e.g.
#   ['keyword'].isin(['renewable energy', 'renewable energies'])
df_yake_uspto_epo_rel_titles_test = (
    df_yake_uspto_epo_rel_titles_filtered
    .loc[df_yake_uspto_epo_rel_titles_filtered['keyword'] == 'wind energy']
    .reset_index(drop=True)
)

In [147]:
# Spot-check the single-keyword subset (unseeded sample; rows differ between runs).
df_yake_uspto_epo_rel_titles_test.sample(5)

Unnamed: 0,keyword,cpc_subclass,cpc_classification,patent_id,publn_nr,oaid
85,wind energy,Y02W,"[Y02E10/70, Y02W10/33]",,,REL1988927699
63,wind energy,Y02W,"[Y02E10/70, Y02W10/33]",,,REL2037965192
17,wind energy,Y02W,"[Y02E10/70, Y02W10/33]",US8109732,,
79,wind energy,Y02W,"[Y02E10/70, Y02W10/33]",,,REL2159437949
34,wind energy,Y02W,"[Y02E10/70, Y02W10/33]",,EP1488433,


In [148]:
# For every (cleantech keyword, document) row, look the document up in
# df_keywords_uspto_epo_rel_exploded and add 1 to
# co_occurrence_matrix[cleantech keyword, k] for each keyword k of that document.
#
# The id columns are tried in the order patent_id -> publn_nr -> oaid and only
# the first non-missing one is used. A missing id is either a real NaN or the
# text 'nan' (what NaN becomes after the astype(str) cell); both are skipped,
# so the loop no longer silently counts nothing when the ids are real NaN.
#
# NOTE: the matrix is incremented in place, so running this cell twice for the
# same keyword doubles its counts. Re-create co_occurrence_matrix first.
for index, row in tqdm(
    df_yake_uspto_epo_rel_titles_test.iterrows(),
    total=len(df_yake_uspto_epo_rel_titles_test),  # lets tqdm show a real progress bar
):
    for id_column in ('patent_id', 'publn_nr', 'oaid'):
        document_id = row[id_column]
        if pd.isna(document_id) or document_id == 'nan':
            continue
        same_document = df_keywords_uspto_epo_rel_exploded[id_column] == document_id
        # Handle list of keywords in 'keyword_yake'
        for keyword_list in df_keywords_uspto_epo_rel_exploded.loc[same_document, 'keyword_yake']:
            for keyword in keyword_list:
                co_occurrence_matrix.loc[row['keyword'], keyword] += 1
        break
    else:
        # None of the three id columns held a usable id for this row.
        print('Error')

98it [00:07, 13.51it/s]


In [149]:
# Show the six keywords with the highest co-occurrence counts for the row 'wind energy'
co_occurrence_matrix.loc['wind energy'].sort_values(ascending=False).head(6)

wind energy                  98
energy conversion system      7
energy conversion systems     4
energy installation           3
wecs                          3
concrete material             2
Name: wind energy, dtype: int64

In [145]:
# Show the six keywords with the highest co-occurrence counts for the row 'renewable energies'
co_occurrence_matrix.loc['renewable energies'].sort_values(ascending=False).head(6)

renewable energies                  14
fossil fuel                          1
major focus                          1
effective utilization                1
output power                         1
building integrated photovoltaic     1
Name: renewable energies, dtype: int64