In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

In [3]:
df_keywords_titles = pd.read_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/df_keyword_y02_classification_embeddings_processed.json')

# Convert columns 'keyword_patentsberta_embedding', 'keyword_climatebert_embedding', 'keyword_bertforpatents_embedding' to numpy arrays
df_keywords_titles['keyword_patentsberta_embedding'] = df_keywords_titles['keyword_patentsberta_embedding'].progress_apply(lambda x: np.array(x, dtype=np.float32))
df_keywords_titles['keyword_climatebert_embedding'] = df_keywords_titles['keyword_climatebert_embedding'].progress_apply(lambda x: np.array(x, dtype=np.float32))
df_keywords_titles['keyword_bertforpatents_embedding'] = df_keywords_titles['keyword_bertforpatents_embedding'].progress_apply(lambda x: np.array(x, dtype=np.float32))

100%|██████████| 506/506 [00:00<00:00, 58579.02it/s]
100%|██████████| 506/506 [00:00<00:00, 61146.04it/s]
100%|██████████| 506/506 [00:00<00:00, 47068.48it/s]


In [4]:
df_keywords_claims = pd.read_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/df_keywords_list_agg_uspto_epo_rel_embeddings.json')

# Convert columns 'keyword_yake_patentsberta_embedding', 'keyword_yake_climatebert_embedding', 'keyword_yake_bertforpatents_embedding' to numpy arrays
df_keywords_claims['keyword_yake_patentsberta_embedding'] = df_keywords_claims['keyword_yake_patentsberta_embedding'].progress_apply(lambda x: np.array(x, dtype=np.float32))
df_keywords_claims['keyword_yake_climatebert_embedding'] = df_keywords_claims['keyword_yake_climatebert_embedding'].progress_apply(lambda x: np.array(x, dtype=np.float32))
df_keywords_claims['keyword_yake_bertforpatents_embedding'] = df_keywords_claims['keyword_yake_bertforpatents_embedding'].progress_apply(lambda x: np.array(x, dtype=np.float32))

100%|██████████| 119212/119212 [00:02<00:00, 56894.35it/s]
100%|██████████| 119212/119212 [00:02<00:00, 56850.77it/s]
100%|██████████| 119212/119212 [00:02<00:00, 43091.00it/s]


# Testing

In [5]:
# Print out all 'keyword' that contain 'xxx'
df_keywords_titles[df_keywords_titles['keyword'].str.contains('renewable energies')]

Unnamed: 0,keyword,cpc_subclass,yake_confidence,cpc_classification,keyword_patentsberta_embedding,keyword_climatebert_embedding,keyword_bertforpatents_embedding
408,integrating renewable energies,Y02B,0.016559,[Y02B10/70],"[0.0085369125, -0.27431762, -0.38026205, 0.115...","[-0.0068912036, 0.083790846, 0.029009134, 0.04...","[-0.64233226, -0.06896591, 0.97839415, -0.4392..."
417,renewable energies,Y02W,0.049404,"[Y02A40/58, Y02A40/924, Y02P60/12, Y02P60/52, ...","[0.19664274, -0.14419907, -0.42156896, -0.0085...","[0.0091875475, 0.18294825, 0.012616519, -0.066...","[-0.15973344, -0.056596946, 1.0088742, -0.2141..."


In [32]:
# Set search test to row with keyword = solar cells
df_search_test_titles = df_keywords_titles[df_keywords_titles['keyword'] == 'wind energy']

In [33]:
# Reset index of df_search_test_titles
df_search_test_titles.reset_index(drop=True, inplace=True)

In [34]:
df_search_test_titles

Unnamed: 0,keyword,cpc_subclass,yake_confidence,cpc_classification,keyword_patentsberta_embedding,keyword_climatebert_embedding,keyword_bertforpatents_embedding
0,wind energy,Y02W,0.025709,"[Y02E10/70, Y02W10/33]","[-0.13935463, -0.22561637, -0.41005242, 0.1530...","[-0.024680587, 0.121622354, -0.023151487, -0.1...","[0.027174294, -0.26510686, 0.42312163, -0.7440..."


# Similarity Search

In [9]:
import faiss
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors

# Euclidean Distance - kNN

In [None]:
# Eucledian distance
index_bertforpatents = faiss.IndexFlatL2(1024)   # build the index
index_climatebert = faiss.IndexFlatL2(768)   # build the index
index_patentsberta = faiss.IndexFlatL2(768)   # build the index

# Add df_keywords_claims column 'keyword_yake_bertforpatents_embedding' to index
index_bertforpatents.add(np.array(df_keywords_claims['keyword_yake_bertforpatents_embedding'].tolist()))
index_climatebert.add(np.array(df_keywords_claims['keyword_yake_climatebert_embedding'].tolist()))
index_patentsberta.add(np.array(df_keywords_claims['keyword_yake_patentsberta_embedding'].tolist()))

query_vector_bertforpatents = np.array(df_search_test_titles['keyword_bertforpatents_embedding'][0]).reshape(1, -1)
query_vector_climatebert = np.array(df_search_test_titles['keyword_climatebert_embedding'][0]).reshape(1, -1)
query_vector_patentsberta = np.array(df_search_test_titles['keyword_patentsberta_embedding'][0]).reshape(1, -1)

In [None]:
k = 25
D_bertforpatents, I_bertforpatents = index_bertforpatents.search(query_vector_bertforpatents, k)
D_climatebert, I_climatebert = index_climatebert.search(query_vector_climatebert, k)
D_patentsberta, I_patentsberta = index_patentsberta.search(query_vector_patentsberta, k)

In [None]:
# Print row of df_keywords_claims that match the index
df_keywords_claims.iloc[I_patentsberta[0]]

# Cosine Similarity - kNN

In [None]:
# Cosine similarity
index_bertforpatents = faiss.index_factory(1024, "Flat", faiss.METRIC_INNER_PRODUCT)
index_climatebert = faiss.index_factory(768, "Flat", faiss.METRIC_INNER_PRODUCT)
index_patentsberta = faiss.index_factory(768, "Flat", faiss.METRIC_INNER_PRODUCT)

faiss.normalize_L2(np.array(df_keywords_claims['keyword_yake_bertforpatents_embedding'].tolist()))
faiss.normalize_L2(np.array(df_keywords_claims['keyword_yake_climatebert_embedding'].tolist()))
faiss.normalize_L2(np.array(df_keywords_claims['keyword_yake_patentsberta_embedding'].tolist()))

index_bertforpatents.add(np.array(df_keywords_claims['keyword_yake_bertforpatents_embedding'].tolist()))
index_climatebert.add(np.array(df_keywords_claims['keyword_yake_climatebert_embedding'].tolist()))
index_patentsberta.add(np.array(df_keywords_claims['keyword_yake_patentsberta_embedding'].tolist()))

query_vector_bertforpatents = np.array(df_search_test_titles['keyword_bertforpatents_embedding'][0]).reshape(1, -1)
query_vector_climatebert = np.array(df_search_test_titles['keyword_climatebert_embedding'][0]).reshape(1, -1)
query_vector_patentsberta = np.array(df_search_test_titles['keyword_patentsberta_embedding'][0]).reshape(1, -1)

faiss.normalize_L2(query_vector_bertforpatents)
faiss.normalize_L2(query_vector_climatebert)
faiss.normalize_L2(query_vector_patentsberta)

In [None]:
k = 6
D_bertforpatents, I_bertforpatents = index_bertforpatents.search(query_vector_bertforpatents, k)
D_climatebert, I_climatebert = index_climatebert.search(query_vector_climatebert, k)
D_patentsberta, I_patentsberta = index_patentsberta.search(query_vector_patentsberta, k)

In [None]:
# Print row of df_keywords_claims that match the index
df_keywords_claims.iloc[I_bertforpatents[0]]

# Cosine Similarity - Margin

In [35]:
# Convert lists of embeddings to a 2D array
bertforpatents_embeddings = np.vstack(df_keywords_claims['keyword_yake_bertforpatents_embedding'].apply(np.array))
climatebert_embeddings = np.vstack(df_keywords_claims['keyword_yake_climatebert_embedding'].apply(np.array))
patentsberta_embeddings = np.vstack(df_keywords_claims['keyword_yake_patentsberta_embedding'].apply(np.array))

# Normalize embeddings
bertforpatents_embeddings_normalized = normalize(bertforpatents_embeddings)
climatebert_embeddings_normalized = normalize(climatebert_embeddings)
patentsberta_embeddings_normalized = normalize(patentsberta_embeddings)

# Initialize NearestNeighbors with radius
radius = 0.25  # Adjust based on your requirements
nn_bertforpatents = NearestNeighbors(radius=radius, metric='cosine')
nn_climatebert = NearestNeighbors(radius=radius, metric='cosine')
nn_patentsberta = NearestNeighbors(radius=radius, metric='cosine')

# Fit the models with normalized embeddings
nn_bertforpatents.fit(bertforpatents_embeddings_normalized)
nn_climatebert.fit(climatebert_embeddings_normalized)
nn_patentsberta.fit(patentsberta_embeddings_normalized)

In [50]:
# Helper function to get keywords by indices
def get_keywords_by_indices(indices, df_source):
    return df_source.iloc[indices]['keyword_yake'].tolist() if indices.size else []

# Extend df_keywords_titles with columns for keywords corresponding to the neighbor indices
df_keywords_titles['keywords_bertforpatents'] = None
df_keywords_titles['keywords_climatebert'] = None
df_keywords_titles['keywords_patentsberta'] = None

from joblib import parallel_backend

with parallel_backend('threading', n_jobs=12):
    # Your scikit-learn code here
    # Loop over each row in the DataFrame to use each set of embeddings as query vectors
    for index, row in tqdm(df_keywords_titles.iterrows()):
        # Extract and normalize the query vectors
        query_vector_bertforpatents = normalize(np.array([row['keyword_bertforpatents_embedding']]).reshape(1, -1))
        query_vector_climatebert = normalize(np.array([row['keyword_climatebert_embedding']]).reshape(1, -1))
        query_vector_patentsberta = normalize(np.array([row['keyword_patentsberta_embedding']]).reshape(1, -1))
        
        # Find neighbors within the radius for each model
        indices_bertforpatents = nn_bertforpatents.radius_neighbors(query_vector_bertforpatents, return_distance=False)
        indices_climatebert = nn_climatebert.radius_neighbors(query_vector_climatebert, return_distance=False)
        indices_patentsberta = nn_patentsberta.radius_neighbors(query_vector_patentsberta, return_distance=False)
        
        # Get the neighbors, or an empty array if none are found
        neighbors_bertforpatents = indices_bertforpatents[0] if indices_bertforpatents[0].size else np.array([])
        neighbors_climatebert = indices_climatebert[0] if indices_climatebert[0].size else np.array([])
        neighbors_patentsberta = indices_patentsberta[0] if indices_patentsberta[0].size else np.array([])
        
        # Get the keywords from df_keywords_claims corresponding to the indices
        keywords_bertforpatents = get_keywords_by_indices(neighbors_bertforpatents, df_keywords_claims)
        keywords_climatebert = get_keywords_by_indices(neighbors_climatebert, df_keywords_claims)
        keywords_patentsberta = get_keywords_by_indices(neighbors_patentsberta, df_keywords_claims)
        
        # Assign the neighbors and keywords back to the DataFrame
        df_keywords_titles.at[index, 'neighbors_bertforpatents'] = neighbors_bertforpatents.tolist()
        df_keywords_titles.at[index, 'neighbors_climatebert'] = neighbors_climatebert.tolist()
        df_keywords_titles.at[index, 'neighbors_patentsberta'] = neighbors_patentsberta.tolist()
        df_keywords_titles.at[index, 'keywords_bertforpatents'] = keywords_bertforpatents
        df_keywords_titles.at[index, 'keywords_climatebert'] = keywords_climatebert
        df_keywords_titles.at[index, 'keywords_patentsberta'] = keywords_patentsberta

506it [04:50,  1.74it/s]


In [52]:
df_keywords_titles.to_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/df_keyword_titles_cosine_similarity_radius_025.json')