In [1]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [2]:
# Load the aggregated USPTO / EPO / REL keyword table and discard the three
# embedding columns, which none of the cells shown in this notebook reference.
# NOTE(review): an earlier version of this comment also listed
# 'keyword_yake_pos', but that column has never been dropped here - confirm
# whether it should be.
df_yake_uspto_epo_rel = pd.read_json(
    '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/df_keywords_list_agg_uspto_epo_rel_embeddings_noun_chunks.json'
).drop(
    columns=[
        'keyword_yake_patentsberta_embedding',
        'keyword_yake_climatebert_embedding',
        'keyword_yake_bertforpatents_embedding',
    ]
)

In [3]:
# Load the keyword table whose 'keyword_yake_lemma' values become the rows of
# the co-occurrence matrix.
df_yake_cleantech_titles = pd.read_json(
    '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/cpc_yake_keywords_list_noun_chunks_embeddings.json'
)

In [4]:
# Distinct lemmatised keywords: cleantech keywords label the rows,
# USPTO / EPO / REL keywords label the columns.
cleantech_titles = df_yake_cleantech_titles['keyword_yake_lemma'].unique()
uspto_epo_rel_keywords = df_yake_uspto_epo_rel['keyword_yake_lemma'].unique()

# Dense all-zero matrix (rows x columns) that later cells fill with counts.
co_occurrence_matrix = pd.DataFrame(
    data=0,
    index=cleantech_titles,
    columns=uspto_epo_rel_keywords,
)

In [5]:
# Turn the keyword -> [patent ids] table into a patent id -> [keywords] table:
#   1. explode 'patent_id' so every (keyword_yake_lemma, patent_id) pair is a row,
#   2. prefix each id with 'US',
#   3. drop the rows whose id string contains 'None' (missing ids),
#   4. collect the keywords of each patent id into a list.
# NOTE(review): a NaN id would be stringified to 'USnan' and pass the 'None'
# filter - verify the source column only uses None for missing values.
patent_id_exploded = (
    df_yake_uspto_epo_rel
    .set_index('keyword_yake_lemma')['patent_id']
    .explode()
    .reset_index()
    .assign(patent_id=lambda frame: 'US' + frame['patent_id'].astype(str))
    .loc[lambda frame: ~frame['patent_id'].str.contains('None')]
    .groupby('patent_id')['keyword_yake_lemma']
    .apply(list)
    .reset_index()
)

In [6]:
# Preview the first rows: one row per 'US'-prefixed patent id with its keyword list.
patent_id_exploded.head()

Unnamed: 0,patent_id,keyword_yake_lemma
0,US10000011,[desired]
1,US10000017,"[attachment, inlet opening, mounting apparatus..."
2,US10000021,"[machining process, predetermined portion, sha..."
3,US10000025,"[damage tolerance, fiber orientation]"
4,US10000033,"[closure element, eva, substantially flat]"


In [7]:
# Turn the keyword -> [publication numbers] table into a
# publication number -> [keywords] table:
#   1. explode 'publn_nr' so every (keyword_yake_lemma, publn_nr) pair is a row,
#   2. prefix each number with 'EP',
#   3. drop the rows whose number string contains 'None' (missing numbers),
#   4. collect the keywords of each publication number into a list.
# NOTE(review): a NaN number would be stringified to 'EPnan' and pass the
# 'None' filter - verify the source column only uses None for missing values.
publn_nr_exploded = (
    df_yake_uspto_epo_rel
    .set_index('keyword_yake_lemma')['publn_nr']
    .explode()
    .reset_index()
    .assign(publn_nr=lambda frame: 'EP' + frame['publn_nr'].astype(str))
    .loc[lambda frame: ~frame['publn_nr'].str.contains('None')]
    .groupby('publn_nr')['keyword_yake_lemma']
    .apply(list)
    .reset_index()
)

In [8]:
# Preview the first rows: one row per 'EP'-prefixed publication number with its keyword list.
publn_nr_exploded.head()

Unnamed: 0,publn_nr,keyword_yake_lemma
0,EP0000085,"[benzene, dilute solution, hbr, tributyl]"
1,EP0000338,"[agar gel, bluetongue virus, charlottesville, ..."
2,EP0000401,"[flock, livelihood, main source, negev, rumina..."
3,EP0000715,"[high efficiency solar, large area photovoltai..."
4,EP0000785,[drainage]


In [9]:
# Turn the keyword -> [oaids] table into an oaid -> [keywords] table:
#   1. explode 'oaid' so every (keyword_yake_lemma, oaid) pair is a row,
#   2. prefix each id with 'REL',
#   3. drop the rows whose id string contains 'None' (missing ids),
#   4. collect the keywords of each oaid into a list.
# NOTE(review): a NaN id would be stringified to 'RELnan' and pass the 'None'
# filter - verify the source column only uses None for missing values.
oaid_exploded = (
    df_yake_uspto_epo_rel
    .set_index('keyword_yake_lemma')['oaid']
    .explode()
    .reset_index()
    .assign(oaid=lambda frame: 'REL' + frame['oaid'].astype(str))
    .loc[lambda frame: ~frame['oaid'].str.contains('None')]
    .groupby('oaid')['keyword_yake_lemma']
    .apply(list)
    .reset_index()
)

In [10]:
# Preview the first rows: one row per 'REL'-prefixed oaid with its keyword list.
oaid_exploded.head()

Unnamed: 0,oaid,keyword_yake_lemma
0,REL100000185,"[pain, physiology]"
1,REL1000054809,"[distributed, droop control, electric power sy..."
2,REL100007697,"[latency, millisecond, problematic, rcu, sched..."
3,REL100014517,"[biochemical and genetic, circular form, idea,..."
4,REL1000192467,"[arthralgia, headache, july, oval, pregnant wo..."


In [11]:
# Stack the three per-document tables on top of each other with a fresh
# 0..n-1 index. Each row keeps only its own id column filled; the other two
# id columns are NaN for that row.
df_keywords_uspto_epo_rel = pd.concat(
    [patent_id_exploded, publn_nr_exploded, oaid_exploded],
    ignore_index=True,
)

In [12]:
# Spot-check five random rows of the stacked table (no random_state is set,
# so the rows shown change from run to run).
df_keywords_uspto_epo_rel.sample(5)

Unnamed: 0,patent_id,keyword_yake_lemma,publn_nr,oaid
1233684,,"[eol, ndfeb, rees]",,REL2522817256
449138,US9422915,"[adoptive transfer, annual energy, bone marrow...",,
1071794,,"[artificial neural network, multitude, neuron,...",,REL2107994122
1119661,,"[carrier substrate, mno, superior]",,REL2134591935
591048,,"[conductive oxide film, dividing groove, photo...",EP2752883,


In [13]:
# One row per (document id, keyword): unpack each document's keyword list
# and renumber the rows.
df_keywords_uspto_epo_rel_exploded = (
    df_keywords_uspto_epo_rel
    .explode('keyword_yake_lemma')
    .reset_index(drop=True)
)

In [14]:
# Inner-join the cleantech keywords with the per-document keyword rows on
# 'keyword_yake_lemma'. Every matching (cleantech keyword, document) pair is
# kept, duplicates included, together with its patent_id / publn_nr / oaid.
df_yake_uspto_epo_rel_titles_filtered = df_yake_cleantech_titles.merge(
    df_keywords_uspto_epo_rel_exploded,
    how='inner',
    on='keyword_yake_lemma',
    validate='many_to_many',
)

In [15]:
# Number of distinct keywords that survived the inner join, i.e. keywords
# present in both tables.
df_yake_uspto_epo_rel_titles_filtered['keyword_yake_lemma'].nunique()

205

## Co-Occurrence Matrix Single Core

In [16]:
# Build one lookup per identifier column: document id -> list of the keyword
# lists stored for that id in df_keywords_uspto_epo_rel. Dictionary access is
# much cheaper than filtering the DataFrame for every row.
patent_id_dict = (
    df_keywords_uspto_epo_rel
    .groupby('patent_id')['keyword_yake_lemma']
    .progress_apply(list)
    .to_dict()
)
publn_nr_dict = (
    df_keywords_uspto_epo_rel
    .groupby('publn_nr')['keyword_yake_lemma']
    .progress_apply(list)
    .to_dict()
)
oaid_dict = (
    df_keywords_uspto_epo_rel
    .groupby('oaid')['keyword_yake_lemma']
    .progress_apply(list)
    .to_dict()
)


def update_co_occurrence(row):
    """Add one merged row's contribution to the global ``co_occurrence_matrix``.

    The identifier columns are tried in the order patent_id, publn_nr, oaid;
    the first one that holds a string present in its lookup selects the
    document. For every keyword stored for that document, the cell
    ``[row['keyword_yake_lemma'], keyword]`` is incremented by one. Rows
    without a usable identifier are skipped silently.
    """
    keyword_lists = None
    id_lookups = (
        ('patent_id', patent_id_dict),
        ('publn_nr', publn_nr_dict),
        ('oaid', oaid_dict),
    )
    for id_column, lookup in id_lookups:
        document_id = row[id_column]
        if isinstance(document_id, str) and document_id in lookup:
            keyword_lists = lookup[document_id]
            break

    if keyword_lists is None:
        return

    # Each lookup value is a list of keyword lists, so flatten one level.
    for keyword in (kw for kw_list in keyword_lists for kw in kw_list):
        co_occurrence_matrix.at[row['keyword_yake_lemma'], keyword] += 1


# Walk over every (cleantech keyword, document) pair and update the matrix.
n_filtered_rows = len(df_yake_uspto_epo_rel_titles_filtered)
for _, row in tqdm(df_yake_uspto_epo_rel_titles_filtered.iterrows(), total=n_filtered_rows):
    update_co_occurrence(row)

100%|██████████| 502876/502876 [00:06<00:00, 81298.72it/s]
100%|██████████| 176155/176155 [00:01<00:00, 92090.05it/s]
100%|██████████| 608091/608091 [00:07<00:00, 79807.15it/s]
100%|██████████| 59245/59245 [01:22<00:00, 719.20it/s] 


In [27]:
# Show the 20 column keywords with the highest values in the
# 'wastewater treatment' row, largest first.
# NOTE(review): the execution counts (In [27] here, In [26] for the
# normalisation cell below) and the fractional values in the saved output
# indicate this cell was last run after normalisation; on a top-to-bottom
# run it would display raw counts instead.
co_occurrence_matrix.loc['wastewater treatment'].sort_values(ascending=False).head(20)

wastewater treatment          1.000000
cod                           0.123746
wastewater                    0.100334
anaerobic                     0.070234
environmental                 0.066890
aerobic                       0.063545
wastewater treatment plant    0.053512
microbial fuel cell           0.053512
organic matter                0.050167
nutrient                      0.046823
hrt                           0.046823
phosphorus                    0.046823
activated sludge              0.046823
mbr                           0.046823
treatment plant               0.043478
fouling                       0.040134
sbr                           0.040134
pilot                         0.040134
treatment system              0.040134
biogas                        0.040134
Name: wastewater treatment, dtype: float64

In [26]:
# Scale every row by its own largest value so the strongest co-occurrence in
# each row becomes 1.0.
# A row that never received a count has a maximum of 0; dividing it by 0
# would turn the entire row into NaN, so such rows are divided by 1 instead
# and stay at 0.
co_occurrence_matrix = co_occurrence_matrix.div(
    co_occurrence_matrix.max(axis=1).replace(0, 1),
    axis=0,
)

In [30]:
# Persist the matrix as a JSON list with one {column keyword: value} object
# per row.
# NOTE(review): orient='records' does not write the index, so the row labels
# (the cleantech keywords) are not stored in the file; a reader can only
# recover them from the row order. Confirm downstream readers expect this
# before switching to an orient that keeps the index (e.g. 'index' or 'split').
co_occurrence_matrix.to_json(
    '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/co_occurrence_matrix_yake_keywords_cleantech_uspto_epo_rel.json',
    orient='records',
)

## Co-Occurrence Matrix Multiprocessing

In [None]:
from multiprocessing import Pool

In [None]:
def process_chunk(chunk, df_keywords_uspto_epo_rel, co_occurrence_matrix):
    """Add the co-occurrence counts contributed by one chunk of rows.

    For every row of ``chunk`` the first of the columns ``patent_id``,
    ``publn_nr`` and ``oaid`` that holds a string identifies a document.
    Each keyword stored for that document in ``df_keywords_uspto_epo_rel``
    increments the cell ``[row['keyword_yake_lemma'], keyword]`` by one.
    Rows in which none of the three columns holds a string print ``'Error'``.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Rows to process; needs ``keyword_yake_lemma`` and the id columns.
    df_keywords_uspto_epo_rel : pandas.DataFrame
        One row per document: an id column plus a ``keyword_yake_lemma``
        column holding that document's list of keywords.
    co_occurrence_matrix : pandas.DataFrame
        Matrix to update; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``co_occurrence_matrix`` object that was passed in.
    """
    from collections import Counter  # local import keeps this cell self-contained

    id_columns = ('patent_id', 'publn_nr', 'oaid')

    # Build the document id -> [keyword lists] lookups once per chunk instead
    # of re-filtering the whole keyword table for every single row.
    lookups = {}
    for column in id_columns:
        lookups[column] = {}
        if column not in chunk.columns:
            continue
        wanted_ids = {value for value in chunk[column] if isinstance(value, str)}
        if not wanted_ids:
            continue
        id_mask = df_keywords_uspto_epo_rel[column].isin(list(wanted_ids))
        matching = df_keywords_uspto_epo_rel[id_mask]
        for document_id, keyword_list in zip(matching[column], matching['keyword_yake_lemma']):
            lookups[column].setdefault(document_id, []).append(keyword_list)

    # Collect all increments first so each matrix cell is written only once.
    increments = Counter()
    for _, row in chunk.iterrows():
        for column in id_columns:
            document_id = row[column]
            if isinstance(document_id, str):
                for keyword_list in lookups[column].get(document_id, ()):
                    for keyword in keyword_list:
                        increments[(row['keyword_yake_lemma'], keyword)] += 1
                # The first string-valued id column decides; later ones are ignored.
                break
        else:
            print('Error')

    for (row_keyword, keyword), count in increments.items():
        co_occurrence_matrix.loc[row_keyword, keyword] += count
    return co_occurrence_matrix

def main(df_yake_uspto_epo_rel, df_keywords_uspto_epo_rel, co_occurrence_matrix):
    """Fill the co-occurrence matrix in parallel, one row chunk per task.

    Parameters
    ----------
    df_yake_uspto_epo_rel : pandas.DataFrame
        Rows to process. They are split into contiguous chunks and each
        chunk is handed to ``process_chunk`` in a worker process.
    df_keywords_uspto_epo_rel : pandas.DataFrame
        Passed unchanged to every ``process_chunk`` call.
    co_occurrence_matrix : pandas.DataFrame
        Matrix the new counts are added to; updated in place.

    Returns
    -------
    pandas.DataFrame
        ``co_occurrence_matrix`` with the counts of all chunks added.
    """
    num_cores = 6
    n_rows = len(df_yake_uspto_epo_rel)
    if n_rows == 0:
        # Nothing to count; also avoids a zero step in range() below.
        return co_occurrence_matrix

    # Ceiling division: at most num_cores chunks, and never a chunk size of 0
    # (which range() rejects) when there are fewer rows than cores.
    chunk_size = max(1, -(-n_rows // num_cores))

    # Create a list of DataFrame chunks
    chunks = [df_yake_uspto_epo_rel.iloc[start:start + chunk_size] for start in range(0, n_rows, chunk_size)]

    # Workers start from an all-zero matrix with the same labels, so each
    # result holds only that chunk's counts. Handing them copies of the
    # incoming matrix would add its existing values again once per chunk.
    zero_matrix = pd.DataFrame(0, index=co_occurrence_matrix.index, columns=co_occurrence_matrix.columns)
    tasks = [(chunk, df_keywords_uspto_epo_rel, zero_matrix) for chunk in chunks]

    # Set up a multiprocessing Pool; starmap blocks until every chunk is done.
    with Pool(num_cores) as pool:
        results = pool.starmap(process_chunk, tasks)

    # Combine the results (the progress bar tracks this merge step only).
    for matrix in tqdm(results, total=len(results)):
        co_occurrence_matrix += matrix

    return co_occurrence_matrix

# Run the parallel version over the merged (cleantech keyword, document id)
# rows - the same table the single-core loop iterates over. process_chunk
# only counts rows whose 'patent_id' / 'publn_nr' / 'oaid' holds a single
# string; df_yake_uspto_epo_rel still carries the un-exploded id columns, so
# passing it here would send every row to the 'Error' branch.
# NOTE(review): main() adds onto the matrix it receives. If the single-core
# cell and the normalisation above have already run, re-create the zero
# matrix first so raw counts are not mixed with normalised values.
co_occurrence_matrix = main(df_yake_uspto_epo_rel_titles_filtered, df_keywords_uspto_epo_rel, co_occurrence_matrix)