In [None]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Load the aggregated USPTO/EPO/REL keyword table and discard its three
# embedding columns (PatentSBERTa, ClimateBERT, BERT-for-Patents).
# Note: 'keyword_yake_pos' is NOT dropped here; only the columns listed below are.
df_yake_uspto_epo_rel = pd.read_json(
    '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/df_keywords_list_agg_uspto_epo_rel_embeddings_noun_chunks.json'
).drop(
    columns=[
        'keyword_yake_patentsberta_embedding',
        'keyword_yake_climatebert_embedding',
        'keyword_yake_bertforpatents_embedding',
    ]
)

In [None]:
# Load the second keyword table; judging by the file name it holds YAKE keywords
# extracted from CPC titles (TODO confirm). Only its 'keyword_yake_lemma' column
# is used further down.
df_yake_cleantech_titles = pd.read_json(
    '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/cpc_yake_keywords_list_noun_chunks_embeddings.json'
)

In [None]:
# Distinct keyword lemmas of each table, in order of first appearance
cleantech_titles = pd.unique(df_yake_cleantech_titles['keyword_yake_lemma'])
uspto_epo_rel_keywords = pd.unique(df_yake_uspto_epo_rel['keyword_yake_lemma'])

# Zero-filled count matrix: one row per cleantech keyword,
# one column per USPTO/EPO/REL keyword
co_occurrence_matrix = pd.DataFrame(
    data=0,
    index=cleantech_titles,
    columns=uspto_epo_rel_keywords,
)

In [None]:
# Explode the 'patent_id' column (one row per keyword/patent pair) while keeping
# the 'keyword_yake_lemma' value of each row.
patent_id_exploded = (
    df_yake_uspto_epo_rel
    .set_index('keyword_yake_lemma')['patent_id']
    .explode()
    # Drop missing ids BEFORE they are stringified: explode() turns an empty list
    # into NaN, which astype(str) renders as 'nan'. That value is not caught by the
    # 'None' filter below and would otherwise survive as a bogus 'USnan' document.
    .dropna()
    .reset_index()
)
# Put 'US' in front of each patent_id
patent_id_exploded['patent_id'] = 'US' + patent_id_exploded['patent_id'].astype(str)
# Still remove ids that contain the literal text 'None' (e.g. stored as a string in the source file)
patent_id_exploded = patent_id_exploded[~patent_id_exploded['patent_id'].str.contains('None')]
# Aggregate on 'patent_id': one row per patent holding the list of its keyword_yake_lemma values
patent_id_exploded = patent_id_exploded.groupby('patent_id')['keyword_yake_lemma'].apply(list).reset_index()

In [None]:
# Preview the first five patents with their keyword lists
patent_id_exploded.head(n=5)

In [None]:
# Explode the 'publn_nr' column (one row per keyword/publication pair) while keeping
# the 'keyword_yake_lemma' value of each row.
publn_nr_exploded = (
    df_yake_uspto_epo_rel
    .set_index('keyword_yake_lemma')['publn_nr']
    .explode()
    # Drop missing ids BEFORE they are stringified: explode() turns an empty list
    # into NaN, which astype(str) renders as 'nan'. That value is not caught by the
    # 'None' filter below and would otherwise survive as a bogus 'EPnan' document.
    .dropna()
    .reset_index()
)
# Put 'EP' in front of each publn_nr
publn_nr_exploded['publn_nr'] = 'EP' + publn_nr_exploded['publn_nr'].astype(str)
# Still remove ids that contain the literal text 'None' (e.g. stored as a string in the source file)
publn_nr_exploded = publn_nr_exploded[~publn_nr_exploded['publn_nr'].str.contains('None')]
# Aggregate on 'publn_nr': one row per publication holding the list of its keyword_yake_lemma values
publn_nr_exploded = publn_nr_exploded.groupby('publn_nr')['keyword_yake_lemma'].apply(list).reset_index()

In [None]:
# Preview the first five EP publications with their keyword lists
publn_nr_exploded.head(n=5)

In [None]:
# Explode the 'oaid' column (one row per keyword/oaid pair) while keeping
# the 'keyword_yake_lemma' value of each row.
oaid_exploded = (
    df_yake_uspto_epo_rel
    .set_index('keyword_yake_lemma')['oaid']
    .explode()
    # Drop missing ids BEFORE they are stringified: explode() turns an empty list
    # into NaN, which astype(str) renders as 'nan'. That value is not caught by the
    # 'None' filter below and would otherwise survive as a bogus 'RELnan' document.
    .dropna()
    .reset_index()
)
# Put 'REL' in front of each oaid
oaid_exploded['oaid'] = 'REL' + oaid_exploded['oaid'].astype(str)
# Still remove ids that contain the literal text 'None' (e.g. stored as a string in the source file)
oaid_exploded = oaid_exploded[~oaid_exploded['oaid'].str.contains('None')]
# Aggregate on 'oaid': one row per oaid holding the list of its keyword_yake_lemma values
oaid_exploded = oaid_exploded.groupby('oaid')['keyword_yake_lemma'].apply(list).reset_index()

In [None]:
# Preview the first five 'REL' documents with their keyword lists
oaid_exploded.head(n=5)

In [None]:
# Stack the three per-document tables on top of each other. Every resulting row
# carries exactly one of 'patent_id' / 'publn_nr' / 'oaid' (the other two are NaN)
# plus that document's keyword list; the index is renumbered from 0.
df_keywords_uspto_epo_rel = pd.concat(
    [patent_id_exploded, publn_nr_exploded, oaid_exploded],
    axis=0,
    ignore_index=True,
)

In [None]:
# Show five randomly drawn documents as a sanity check
df_keywords_uspto_epo_rel.sample(n=5)

In [None]:
# Turn each document's keyword list into one row per (document, keyword) pair
df_keywords_uspto_epo_rel_exploded = (
    df_keywords_uspto_epo_rel
    .explode('keyword_yake_lemma')
    .reset_index(drop=True)
)

In [None]:
# Inner-join the cleantech keywords with the (document, keyword) pairs on
# 'keyword_yake_lemma'. The document id columns (patent_id, publn_nr, oaid) come
# along, and a keyword yields one output row per matching pair (duplicates kept).
df_yake_uspto_epo_rel_titles_filtered = df_yake_cleantech_titles.merge(
    df_keywords_uspto_epo_rel_exploded,
    on='keyword_yake_lemma',
    how='inner',
    validate='many_to_many',
)

In [None]:
# Keep only the rows whose 'keyword_yake_lemma' occurs strictly more than
# `co_occurrence_threshold` times in the merged table (a keyword occurring exactly
# `co_occurrence_threshold` times is removed as well).
co_occurrence_threshold = 100
co_occurrence = (
    df_yake_uspto_epo_rel_titles_filtered['keyword_yake_lemma']
    .value_counts()
    .loc[lambda counts: counts > co_occurrence_threshold]
    .reset_index()
)
co_occurrence.columns = ['keyword_yake_lemma', 'count']
# The inner join drops the infrequent keywords and also attaches the 'count' column
df_yake_uspto_epo_rel_titles_filtered = pd.merge(
    df_yake_uspto_epo_rel_titles_filtered,
    co_occurrence,
    on='keyword_yake_lemma',
    how='inner',
    validate='many_to_many',
)

In [None]:
# Number of distinct keywords that survived the frequency filter
df_yake_uspto_epo_rel_titles_filtered['keyword_yake_lemma'].nunique(dropna=True)

## Co-Occurrence Matrix Single Core

In [None]:
# Build id -> keyword lookups so the counting loop does not have to scan the frame.
# Each value is a list of the 'keyword_yake_lemma' cells found for that id; since
# those cells are themselves lists, a value is a list of keyword lists.
patent_id_dict = (
    df_keywords_uspto_epo_rel
    .groupby('patent_id')['keyword_yake_lemma']
    .progress_apply(list)
    .to_dict()
)
publn_nr_dict = (
    df_keywords_uspto_epo_rel
    .groupby('publn_nr')['keyword_yake_lemma']
    .progress_apply(list)
    .to_dict()
)
oaid_dict = (
    df_keywords_uspto_epo_rel
    .groupby('oaid')['keyword_yake_lemma']
    .progress_apply(list)
    .to_dict()
)

# Counting step of the single-core co-occurrence computation
def update_co_occurrence(row):
    """Increment the global ``co_occurrence_matrix`` for one merged row.

    The row's document is resolved through the first id column (``patent_id``,
    then ``publn_nr``, then ``oaid``) whose value is a string that is present in
    the matching lookup dict. For every keyword stored for that document, the
    cell ``[row['keyword_yake_lemma'], keyword]`` is increased by one. Rows that
    cannot be resolved are skipped silently.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            ``keyword_yake_lemma`` and the three id fields.

    Returns:
        None. The module-level ``co_occurrence_matrix`` is modified in place.
    """
    id_sources = (
        ('patent_id', patent_id_dict),
        ('publn_nr', publn_nr_dict),
        ('oaid', oaid_dict),
    )
    for id_column, keywords_by_id in id_sources:
        document_id = row[id_column]
        if isinstance(document_id, str) and document_id in keywords_by_id:
            keyword_lists = keywords_by_id[document_id]
            break
    else:
        # No id column holds a known string id: nothing to count
        return

    # Walk the nested lists as one flat stream of keywords
    for keyword in (kw for keyword_list in keyword_lists for kw in keyword_list):
        co_occurrence_matrix.at[row['keyword_yake_lemma'], keyword] += 1

# Feed every row of the filtered table through update_co_occurrence, with a progress bar
n_filtered_rows = len(df_yake_uspto_epo_rel_titles_filtered)
for _row_label, row in tqdm(df_yake_uspto_epo_rel_titles_filtered.iterrows(), total=n_filtered_rows):
    update_co_occurrence(row)

In [None]:
# Show the 20 column keywords with the highest counts for the row 'wastewater treatment'
(
    co_occurrence_matrix
    .loc['wastewater treatment']
    .sort_values(ascending=False)
    .head(20)
)

In [None]:
# Divide each row of the co-occurrence matrix by the largest value in that row.
# A row whose maximum is 0 (a keyword that never co-occurred) would be 0/0 = NaN,
# so such rows are divided by 1 instead and stay at 0.
co_occurrence_matrix = co_occurrence_matrix.div(
    co_occurrence_matrix.max(axis=1).replace(0, 1),
    axis=0,
)

In [None]:
# Persist the matrix as CSV; the row labels (index) are written as the first column
co_occurrence_matrix.to_csv(
    '/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/Co-Occurence Analysis/co_occurrence_matrix_yake_keywords_cleantech_uspto_epo_rel.csv'
)

## Co-Occurrence Matrix Multiprocessing

In [None]:
from multiprocessing import Pool

In [None]:
def _index_keyword_lists(df_keywords, id_column):
    """Map every string id in ``id_column`` to the list of its 'keyword_yake_lemma' cells."""
    keyword_lists_by_id = {}
    for document_id, keyword_list in zip(df_keywords[id_column], df_keywords['keyword_yake_lemma']):
        # Non-string entries (None / NaN) mark rows that belong to another id column
        if isinstance(document_id, str):
            keyword_lists_by_id.setdefault(document_id, []).append(keyword_list)
    return keyword_lists_by_id


def process_chunk(chunk, df_keywords_uspto_epo_rel, co_occurrence_matrix):
    """Add the co-occurrence counts contributed by one chunk of rows.

    Each row of ``chunk`` is resolved to a document through the first of
    ``patent_id``, ``publn_nr`` and ``oaid`` that holds a string. For every
    keyword stored for that document in ``df_keywords_uspto_epo_rel``, the cell
    ``[row['keyword_yake_lemma'], keyword]`` of ``co_occurrence_matrix`` is
    increased by one. An id without a matching document contributes nothing;
    a row without any string id prints ``'Error'`` and is skipped.

    Args:
        chunk: DataFrame with a ``keyword_yake_lemma`` column and the id columns
            ``patent_id``, ``publn_nr`` and ``oaid``.
        df_keywords_uspto_epo_rel: DataFrame with the same id columns and a
            ``keyword_yake_lemma`` column whose cells are iterables of keywords.
        co_occurrence_matrix: DataFrame of counts, modified in place.

    Returns:
        The ``co_occurrence_matrix`` object that was passed in.
    """
    id_columns = ('patent_id', 'publn_nr', 'oaid')
    # id column -> {document id -> [keyword lists]}. Built once per column on first
    # use, instead of boolean-filtering the whole keyword frame for every row.
    lookups = {}

    for _, row in chunk.iterrows():
        for id_column in id_columns:
            document_id = row[id_column]
            if isinstance(document_id, str):
                if id_column not in lookups:
                    lookups[id_column] = _index_keyword_lists(df_keywords_uspto_epo_rel, id_column)
                keyword_lists = lookups[id_column].get(document_id, ())
                break
        else:
            print('Error')
            continue

        for keyword_list in keyword_lists:
            for keyword in keyword_list:
                # .at is the scalar accessor, matching the single-core implementation
                co_occurrence_matrix.at[row['keyword_yake_lemma'], keyword] += 1
    return co_occurrence_matrix

def main(df_yake_uspto_epo_rel, df_keywords_uspto_epo_rel, co_occurrence_matrix):
    """Count co-occurrences in parallel and add them to ``co_occurrence_matrix``.

    The rows of ``df_yake_uspto_epo_rel`` are split into at most ``num_cores``
    contiguous chunks, each chunk is counted by ``process_chunk`` in a worker
    process, and the per-chunk counts are added to ``co_occurrence_matrix``.

    Args:
        df_yake_uspto_epo_rel: DataFrame whose rows are handed to
            ``process_chunk`` (see that function for the columns it reads).
        df_keywords_uspto_epo_rel: Keyword table forwarded unchanged to every
            ``process_chunk`` call.
        co_occurrence_matrix: DataFrame the new counts are added to (in place).

    Returns:
        ``co_occurrence_matrix`` with the counts of all chunks added exactly once.
    """
    num_cores = 6
    n_rows = len(df_yake_uspto_epo_rel)
    if n_rows == 0:
        # Nothing to count; also avoids range() with a step of 0 below
        return co_occurrence_matrix

    # Ceiling division: the step is never 0 (fewer rows than cores) and the
    # remainder rows do not spill into an extra (num_cores + 1)-th chunk.
    chunk_size = -(-n_rows // num_cores)

    # Create a list of DataFrame chunks
    chunks = [df_yake_uspto_epo_rel.iloc[start:start + chunk_size] for start in range(0, n_rows, chunk_size)]

    # Workers start from an all-zero matrix so each result holds only that chunk's
    # counts. Handing them a copy of `co_occurrence_matrix` would add its current
    # contents back once per chunk when the results are combined.
    zero_counts = pd.DataFrame(0, index=co_occurrence_matrix.index, columns=co_occurrence_matrix.columns)
    tasks = [(chunk, df_keywords_uspto_epo_rel, zero_counts) for chunk in chunks]

    # Set up a multiprocessing Pool; starmap blocks until every chunk is done
    with Pool(num_cores) as pool:
        results = pool.starmap(process_chunk, tasks)

    # Combine the results
    for matrix in tqdm(results, total=len(results)):
        co_occurrence_matrix += matrix

    return co_occurrence_matrix

# Run the parallel computation on the same input as the single-core section.
# process_chunk needs rows with scalar, prefixed ids ('US…' / 'EP…' / 'REL…'), which
# only the merged and filtered table provides. The raw df_yake_uspto_epo_rel holds
# unprefixed ids that still have to be exploded, so every row would end up in the
# 'Error' branch.
# A fresh all-zero matrix is passed so the counts are not added on top of the
# matrix already filled and normalised by the single-core section.
co_occurrence_matrix = main(
    df_yake_uspto_epo_rel_titles_filtered,
    df_keywords_uspto_epo_rel,
    pd.DataFrame(0, index=cleantech_titles, columns=uspto_epo_rel_keywords),
)