In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

In [2]:
df_keywords_titles = pd.read_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/cpc_yake_keywords_list_noun_chunks_embeddings.json')

# Convert columns 'keyword_patentsberta_embedding', 'keyword_climatebert_embedding', 'keyword_bertforpatents_embedding' to numpy arrays
df_keywords_titles['keyword_yake_patentsberta_embedding'] = df_keywords_titles['keyword_yake_patentsberta_embedding'].progress_apply(lambda x: np.array(x, dtype=np.float32))
df_keywords_titles['keyword_yake_climatebert_embedding'] = df_keywords_titles['keyword_yake_climatebert_embedding'].progress_apply(lambda x: np.array(x, dtype=np.float32))
df_keywords_titles['keyword_yake_bertforpatents_embedding'] = df_keywords_titles['keyword_yake_bertforpatents_embedding'].progress_apply(lambda x: np.array(x, dtype=np.float32))

100%|██████████| 467/467 [00:00<00:00, 58927.20it/s]
100%|██████████| 467/467 [00:00<00:00, 59723.14it/s]
100%|██████████| 467/467 [00:00<00:00, 46044.66it/s]


In [3]:
df_keywords_claims = pd.read_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/df_keywords_list_agg_uspto_epo_rel_embeddings_noun_chunks.json')

# Convert columns 'keyword_yake_patentsberta_embedding', 'keyword_yake_climatebert_embedding', 'keyword_yake_bertforpatents_embedding' to numpy arrays
df_keywords_claims['keyword_yake_patentsberta_embedding'] = df_keywords_claims['keyword_yake_patentsberta_embedding'].progress_apply(lambda x: np.array(x, dtype=np.float32))
df_keywords_claims['keyword_yake_climatebert_embedding'] = df_keywords_claims['keyword_yake_climatebert_embedding'].progress_apply(lambda x: np.array(x, dtype=np.float32))
df_keywords_claims['keyword_yake_bertforpatents_embedding'] = df_keywords_claims['keyword_yake_bertforpatents_embedding'].progress_apply(lambda x: np.array(x, dtype=np.float32))

100%|██████████| 151044/151044 [00:02<00:00, 54816.50it/s]
100%|██████████| 151044/151044 [00:02<00:00, 55158.06it/s]
100%|██████████| 151044/151044 [00:03<00:00, 41854.43it/s]


# Testing

In [27]:
# Print out all keyword_yake_lemma for cpc_class_symbol = Y02W
print(df_keywords_titles[df_keywords_titles['cpc_class_symbol'].progress_apply(lambda x: 'Y02W10/33' in x)][['keyword_yake_lemma', 'cpc_class_symbol']])

100%|██████████| 467/467 [00:00<00:00, 637195.83it/s]

    keyword_yake_lemma        cpc_class_symbol
460        wind energy  [Y02E10/70, Y02W10/33]





In [45]:
# Set search test to row with keyword = solar cells
df_search_test_titles = df_keywords_titles[df_keywords_titles['keyword_yake_lemma'] == 'wind energy']

In [46]:
# Reset index of df_search_test_titles
df_search_test_titles.reset_index(drop=True, inplace=True)

In [47]:
df_search_test_titles

Unnamed: 0,keyword_yake_lemma,yake_conf_score,cpc_class_symbol,keyword_yake_patentsberta_embedding,keyword_yake_climatebert_embedding,keyword_yake_bertforpatents_embedding
0,wind energy,0.037556,"[Y02E10/70, Y02W10/33]","[-0.13935463, -0.22561637, -0.41005242, 0.1530...","[-0.024680587, 0.121622354, -0.023151487, -0.1...","[0.027174294, -0.26510686, 0.42312163, -0.7440..."


# Similarity Search

In [4]:
import faiss
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors

# Euclidean Distance - kNN

In [None]:
# Eucledian distance
index_bertforpatents = faiss.IndexFlatL2(1024)   # build the index
index_climatebert = faiss.IndexFlatL2(768)   # build the index
index_patentsberta = faiss.IndexFlatL2(768)   # build the index

# Add df_keywords_claims column 'keyword_yake_bertforpatents_embedding' to index
index_bertforpatents.add(np.array(df_keywords_claims['keyword_yake_bertforpatents_embedding'].tolist()))
index_climatebert.add(np.array(df_keywords_claims['keyword_yake_climatebert_embedding'].tolist()))
index_patentsberta.add(np.array(df_keywords_claims['keyword_yake_patentsberta_embedding'].tolist()))

query_vector_bertforpatents = np.array(df_search_test_titles['keyword_bertforpatents_embedding'][0]).reshape(1, -1)
query_vector_climatebert = np.array(df_search_test_titles['keyword_climatebert_embedding'][0]).reshape(1, -1)
query_vector_patentsberta = np.array(df_search_test_titles['keyword_patentsberta_embedding'][0]).reshape(1, -1)

In [None]:
k = 25
D_bertforpatents, I_bertforpatents = index_bertforpatents.search(query_vector_bertforpatents, k)
D_climatebert, I_climatebert = index_climatebert.search(query_vector_climatebert, k)
D_patentsberta, I_patentsberta = index_patentsberta.search(query_vector_patentsberta, k)

In [None]:
# Print row of df_keywords_claims that match the index
df_keywords_claims.iloc[I_patentsberta[0]]

# Cosine Similarity - kNN

In [48]:
# Cosine similarity
index_bertforpatents = faiss.index_factory(1024, "Flat", faiss.METRIC_INNER_PRODUCT)
index_climatebert = faiss.index_factory(768, "Flat", faiss.METRIC_INNER_PRODUCT)
index_patentsberta = faiss.index_factory(768, "Flat", faiss.METRIC_INNER_PRODUCT)

faiss.normalize_L2(np.array(df_keywords_claims['keyword_yake_bertforpatents_embedding'].tolist()))
faiss.normalize_L2(np.array(df_keywords_claims['keyword_yake_climatebert_embedding'].tolist()))
faiss.normalize_L2(np.array(df_keywords_claims['keyword_yake_patentsberta_embedding'].tolist()))

index_bertforpatents.add(np.array(df_keywords_claims['keyword_yake_bertforpatents_embedding'].tolist()))
index_climatebert.add(np.array(df_keywords_claims['keyword_yake_climatebert_embedding'].tolist()))
index_patentsberta.add(np.array(df_keywords_claims['keyword_yake_patentsberta_embedding'].tolist()))

query_vector_bertforpatents = np.array(df_search_test_titles['keyword_yake_bertforpatents_embedding'][0]).reshape(1, -1)
query_vector_climatebert = np.array(df_search_test_titles['keyword_yake_climatebert_embedding'][0]).reshape(1, -1)
query_vector_patentsberta = np.array(df_search_test_titles['keyword_yake_patentsberta_embedding'][0]).reshape(1, -1)

faiss.normalize_L2(query_vector_bertforpatents)
faiss.normalize_L2(query_vector_climatebert)
faiss.normalize_L2(query_vector_patentsberta)

In [49]:
k = 6
D_bertforpatents, I_bertforpatents = index_bertforpatents.search(query_vector_bertforpatents, k)
D_climatebert, I_climatebert = index_climatebert.search(query_vector_climatebert, k)
D_patentsberta, I_patentsberta = index_patentsberta.search(query_vector_patentsberta, k)

In [50]:
# Print row of df_keywords_claims that match the index
df_keywords_claims.iloc[I_bertforpatents[0]]

Unnamed: 0,keyword_yake_lemma,yake_conf_score,abs_frequency,patent_id,publn_nr,oaid,cpc_group,cpc_class_symbol,keyword_yake_patentsberta_embedding,keyword_yake_climatebert_embedding,keyword_yake_bertforpatents_embedding
149171,wind energy,0.042034,134,"[8368239, 6677683, 7694473, 11128231, 7932620,...","[2400619, 2811159, 1588049, 1726088, 2495434, ...","[2009362773, 2040229032, 2134988099, 208502141...","[Y02E40/30, Y02E60/50, Y02E10/50, Y02E10/76, Y...","[Y02P 80/10, Y02E 10/46, Y02E 10/70, Y02B ...","[-0.13935463, -0.22561637, -0.41005242, 0.1530...","[-0.024680587, 0.121622354, -0.023151487, -0.1...","[0.027174294, -0.26510686, 0.42312163, -0.7440..."
149175,wind energy generation,0.023001,5,"[7606638, 8138619, 8774949, 9366232]",[3501080],"[2011363340, 2081553571, 2358301737, 199932612...",[],[],"[-0.16996582, -0.24660143, -0.4050405, 0.21332...","[0.05130585, 0.037141915, -0.011692343, -0.140...","[0.10602985, -0.09561676, 0.34691587, -0.92305..."
149172,wind energy conversion,0.011451,54,"[7116006, 6641367, 10408189, 8851828, 8937399,...","[33318, 2840258, 2035698, 1222388, 3054153, 21...","[2088567219, 2105830722, 2059219910, 210121622...","[Y02B10/70, Y02E60/16, Y02P90/50, Y02T70/5236,...","[Y02E 10/728, Y02E 10/72, Y02E 60/36]","[-0.023859166, -0.2339391, -0.36079568, 0.2357...","[0.06923427, 0.13018538, -0.019503046, -0.1241...","[0.28009745, -0.087188385, 0.3997029, -0.94693..."
149203,wind power,0.035548,249,"[8598731, 10509868, 10521525, 7569944, 1133483...","[2599175, 2478610, 2304231, 2454803, 3054153, ...","[1989879055, 2020802965, 2138128951, 214698100...","[Y02B10/70, Y02C20/20, Y02E60/32, Y02B20/72, Y...","[Y02E 60/10, Y02E 40/50, Y02T 50/60, Y02P ...","[-0.15514976, -0.33485973, -0.38647285, 0.1456...","[-0.04475324, 0.12447504, 0.0034881362, -0.145...","[0.5202647, -0.16505587, 0.02494584, -0.914383..."
149174,wind energy generating,0.00732,6,"[9447774, 7547984, 7525210, 8197208, 7741727, ...",[],[],"[Y02B10/70, Y02E10/56, Y02E10/72, Y02E10/50, Y...",[],"[-0.18674615, -0.20287327, -0.37713137, 0.2543...","[0.07267913, 0.16291437, -0.013103838, -0.1352...","[0.50781286, 0.13417198, 0.3842438, -0.5118474..."
149183,wind energy turbine,0.013775,16,"[7168251, 11065840, 10927811, 7470114, 7762771]","[1754886, 1798414, 1616066, 1337755, 1754887, ...",[2078810650],"[Y02B10/30, Y02P70/50, Y02E10/72]","[Y02P 70/50, Y02E 10/74, Y02E 10/76, Y02E ...","[-0.09484156, -0.4393699, -0.3285693, 0.120928...","[-0.0023944834, 0.14874709, -0.014920734, -0.1...","[0.29088497, -0.03869297, 0.8150261, -0.576195..."


# Cosine Similarity - Margin

In [22]:
# Convert lists of embeddings to a 2D array
bertforpatents_embeddings = np.vstack(df_keywords_claims['keyword_yake_bertforpatents_embedding'].apply(np.array))
climatebert_embeddings = np.vstack(df_keywords_claims['keyword_yake_climatebert_embedding'].apply(np.array))
patentsberta_embeddings = np.vstack(df_keywords_claims['keyword_yake_patentsberta_embedding'].apply(np.array))

# Normalize embeddings
bertforpatents_embeddings_normalized = normalize(bertforpatents_embeddings)
climatebert_embeddings_normalized = normalize(climatebert_embeddings)
patentsberta_embeddings_normalized = normalize(patentsberta_embeddings)

# Initialize NearestNeighbors with radius
radius = 0.5 # 0.1 # 0.5 # 0.25  # Adjust based on your requirements
nn_bertforpatents = NearestNeighbors(radius=radius, metric='cosine')
nn_climatebert = NearestNeighbors(radius=radius, metric='cosine')
nn_patentsberta = NearestNeighbors(radius=radius, metric='cosine')

# Fit the models with normalized embeddings
nn_bertforpatents.fit(bertforpatents_embeddings_normalized)
nn_climatebert.fit(climatebert_embeddings_normalized)
nn_patentsberta.fit(patentsberta_embeddings_normalized)

In [23]:
df_keywords_titles.head()

Unnamed: 0,keyword_yake_lemma,yake_conf_score,cpc_class_symbol,keyword_yake_patentsberta_embedding,keyword_yake_climatebert_embedding,keyword_yake_bertforpatents_embedding,keywords_bertforpatents,keywords_climatebert,keywords_patentsberta,neighbors_bertforpatents,neighbors_climatebert,neighbors_patentsberta
0,abiotic stress,0.025709,[Y02A40/13],"[-0.39456344, -0.7677266, -0.381351, 0.1383062...","[-0.014673094, 0.11892946, 0.0026377938, -0.01...","[0.10681355, -0.69053596, 0.47798485, 0.076534...","[abiotic stress, abiotic stress condition, abi...","[aa, aa concrete, aaa, aaa server, aab, aabb, ...","[abiotic stress, abiotic stress condition, abi...","[111, 112, 113, 114]","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[111, 112, 113]"
1,absorption based system,0.008321,[Y02B30/62],"[-0.11752392, -0.7287776, -0.18765274, 0.01301...","[0.010742969, 0.083290644, -0.02593848, -0.048...","[-0.3856159, 0.8286994, 0.39117292, -0.2979813...",[],"[aa, aa concrete, aaa, aaa server, aab, aabb, ...",[absorption system],[],"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[361]
2,acid,0.297366,[Y02P20/30],"[0.15585586, -0.46968186, -0.384583, 0.1502907...","[-0.039344836, 0.11001184, 0.0187124, 0.004480...","[-0.4963131, -0.19806552, 0.24239738, -0.06992...",[],"[aa, aa concrete, aaa, aaa server, aab, aabb, ...",[],[],"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[]
3,acid or caprolactam,0.097004,[Y02P20/30],"[0.06250752, -0.7046676, -0.32794353, 0.009134...","[0.0009763143, 0.08154529, 0.042667452, -0.020...","[-0.40428108, 0.24819246, 0.43703476, -0.73723...",[caprolactam],"[aa, aa concrete, aaa, aaa server, aab, aabb, ...",[],[17430],"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[]
4,active power filtering,0.008321,[Y02E40/20],"[-0.20968668, -0.887124, -0.2824073, -0.101121...","[0.022758853, 0.12130108, -0.0364266, 0.024433...","[-0.75558263, 0.27234003, -0.737954, -1.259309...",[],"[aa, aa concrete, aaa, aaa server, aab, aabb, ...",[active power filter],[],"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",[2023]


In [24]:
# Helper function to get keywords by indices
def get_keywords_by_indices(indices, df_source):
    return df_source.iloc[indices]['keyword_yake_lemma'].tolist() if indices.size else []

# Extend df_keywords_titles with columns for keywords corresponding to the neighbor indices with empty lists
df_keywords_titles['neighbors_bertforpatents'] = df_keywords_titles['keyword_yake_bertforpatents_embedding'].progress_apply(lambda x: [])
df_keywords_titles['neighbors_climatebert'] = df_keywords_titles['keyword_yake_climatebert_embedding'].progress_apply(lambda x: [])
df_keywords_titles['neighbors_patentsberta'] = df_keywords_titles['keyword_yake_patentsberta_embedding'].progress_apply(lambda x: [])

df_keywords_titles['keywords_bertforpatents'] = df_keywords_titles['keyword_yake_bertforpatents_embedding'].progress_apply(lambda x: [])
df_keywords_titles['keywords_climatebert'] = df_keywords_titles['keyword_yake_climatebert_embedding'].progress_apply(lambda x: [])
df_keywords_titles['keywords_patentsberta'] = df_keywords_titles['keyword_yake_patentsberta_embedding'].progress_apply(lambda x: [])

# Loop over each row in the DataFrame to use each set of embeddings as query vectors
for index, row in tqdm(df_keywords_titles.iterrows()):
    # Extract and normalize the query vectors
    query_vector_bertforpatents = normalize(np.array([row['keyword_yake_bertforpatents_embedding']]).reshape(1, -1))
    query_vector_climatebert = normalize(np.array([row['keyword_yake_climatebert_embedding']]).reshape(1, -1))
    query_vector_patentsberta = normalize(np.array([row['keyword_yake_patentsberta_embedding']]).reshape(1, -1))
    
    # Find neighbors within the radius for each model
    indices_bertforpatents = nn_bertforpatents.radius_neighbors(query_vector_bertforpatents, return_distance=False)
    indices_climatebert = nn_climatebert.radius_neighbors(query_vector_climatebert, return_distance=False)
    indices_patentsberta = nn_patentsberta.radius_neighbors(query_vector_patentsberta, return_distance=False)
    
    # Get the neighbors, or an empty array if none are found
    neighbors_bertforpatents = indices_bertforpatents[0] if indices_bertforpatents[0].size else np.array([])
    neighbors_climatebert = indices_climatebert[0] if indices_climatebert[0].size else np.array([])
    neighbors_patentsberta = indices_patentsberta[0] if indices_patentsberta[0].size else np.array([])
    
    # Get the keywords from df_keywords_claims corresponding to the indices
    keywords_bertforpatents = get_keywords_by_indices(neighbors_bertforpatents, df_keywords_claims)
    keywords_climatebert = get_keywords_by_indices(neighbors_climatebert, df_keywords_claims)
    keywords_patentsberta = get_keywords_by_indices(neighbors_patentsberta, df_keywords_claims)

    # Assign the neighbors and keywords back to the DataFrame
    df_keywords_titles.at[index, 'neighbors_bertforpatents'] = neighbors_bertforpatents.tolist()
    df_keywords_titles.at[index, 'neighbors_climatebert'] = neighbors_climatebert.tolist()
    df_keywords_titles.at[index, 'neighbors_patentsberta'] = neighbors_patentsberta.tolist()
    df_keywords_titles.at[index, 'keywords_bertforpatents'] = keywords_bertforpatents
    df_keywords_titles.at[index, 'keywords_climatebert'] = keywords_climatebert
    df_keywords_titles.at[index, 'keywords_patentsberta'] = keywords_patentsberta

100%|██████████| 467/467 [00:00<00:00, 703317.76it/s]
100%|██████████| 467/467 [00:00<00:00, 879147.20it/s]
100%|██████████| 467/467 [00:00<00:00, 979369.98it/s]
100%|██████████| 467/467 [00:00<00:00, 986770.76it/s]
100%|██████████| 467/467 [00:00<00:00, 907664.49it/s]
100%|██████████| 467/467 [00:00<00:00, 992269.49it/s]
467it [07:17,  1.07it/s]


In [25]:
df_keywords_titles.to_json('/mnt/hdd01/patentsview/Similarity Search - CPC Classification and Claims/df_keyword_titles_cosine_similarity_radius_05_noun_chunks.json')