In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import faiss
import numpy as np
import string

In [None]:
# Function to create SimCSE embeddings
def get_simcse_embeddings(text_list, batch_size=16):
    model.eval()  # Set model to evaluation mode
    embeddings = []

    with torch.no_grad():  # Disable gradient computation
        for i in range(0, len(text_list), batch_size):
            batch = text_list[i:i+batch_size]
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt")
            
            # Move tensors to GPU if available
            if torch.cuda.is_available():
                inputs = {key: val.cuda() for key, val in inputs.items()}
                model.cuda()
            
            # Get the embeddings (SimCSE uses the `pooler_output`)
            outputs = model(**inputs, return_dict=True)
            batch_embeddings = outputs.pooler_output.cpu()  # Detach from GPU
            
            embeddings.append(batch_embeddings)

    # Combine all embeddings into a single tensor
    return torch.cat(embeddings)

In [2]:
# Load the pretrained SimCSE model
model_name = "princeton-nlp/sup-simcse-bert-base-uncased"  # Change this for different variants
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

## Main Part

In [3]:
examples_dataframe = pd.read_excel(r"C:\Users\John\Desktop\Patent Datasets\final_data\patent_database_final_F03D.xlsx")

In [4]:
title_list = examples_dataframe["title"].tolist()
claims_list = examples_dataframe["claims"].tolist()
description_list = examples_dataframe["description"].tolist()
id_list = examples_dataframe["lens_id"].tolist()

In [5]:
examples_x = []
for i in range(len(title_list)):
    example = str(title_list[i]) + " " + str(claims_list[i]) + " " + str(description_list[i])
    examples_x.append(example)

print(examples_x[0])

BLADE MOUNTING CLAIMS: 1. An arrangement for in situ mounting of rotor blades to a rotor hub of a wind power plant comprising rotor blades, a rotor hub (40), a disc (1), a means for rotation (30) of the disc (1) around its axis, the disc (1) being in connected to rotor hub (40) in such a way that when the disc (1) rotates a certain degree of angle around its axis (3) it will cause the rotor hub (40) to rotate the same degree of angle around the axis of the rotor hub, the disc (1) being provided with a connecting device (2), characterized in that the disc (1) is caused to rotate by the means for rotation (30) and when the means for rotation (30) comes into a desired position, the disc (1) is secured by a securing means (20; 36, 37). 2. The arrangement according to claim 1, characterized in that the securing means comprises a male portion (23; 37) and a at least one female portion (2; 220; 36) respectively, and that one of the portions has a fixed position in relation to the disc (1) dur

In [47]:
chunk_size = 500
examples_x_new = []
id_list_new = []
title_list_new = []

for doc, doc_id, title in zip(examples_x, id_list, title_list):
    words = doc.split()  # Split document into words
    chunks = [words[i:i + chunk_size] for i in range(0, len(words), chunk_size)]  # Create chunks
    for chunk in chunks:
        examples_x_new.append(" ".join(chunk))  # Join words back into a string for each chunk
        id_list_new.append(doc_id)  # Append the same ID for each chunk
        title_list_new.append(title)  # Append the same title for each chunk

In [48]:
examples_x = examples_x_new
id_list= id_list_new
title_list = title_list_new

In [51]:
# Calculate the average number of words for all documents in examples_x
def average_word_count(documents):
    total_words = sum(len(doc.split()) for doc in documents)
    return total_words / len(documents)

# Assuming examples_x is already defined
average_words = average_word_count(examples_x)
print(f"The average number of words for all documents {len(examples_x)} in examples_x is {average_words:.2f}")

The average number of words for all documents 576369 in examples_x is 477.79


576369

In [52]:
# Example usage
#title_list = ["This is a test sentence.", "Another sentence for testing.", "Final example sentence."]
simcse_embeddings = get_simcse_embeddings(examples_x[:2000], batch_size=16)

In [53]:
test_doc = examples_x[11601]

In [56]:
# Create embedding for the test title
test_doc_embedding = get_simcse_embeddings([test_doc])

# Compute cosine similarities
cosine_scores = cosine_similarity(test_doc_embedding, simcse_embeddings)[0]

# Get the top 10 similar titles
top_indices = cosine_scores.argsort()[-20:][::-1]

# Print the index, rank, score, lens_id, and title of each document
# print("Top 10 similar docs to '{}':".format(test_doc))
for rank, idx in enumerate(top_indices, start=1):
    score = cosine_scores[idx]
    lens_id = examples_dataframe.iloc[idx]['lens_id']
    title = examples_dataframe.iloc[idx]['title']
    print(f"Index: {idx}, Rank: {rank}, Score: {score:.4f}, Lens ID: {lens_id}, Title: {title}")

Index: 1961, Rank: 1, Score: 0.8644, Lens ID: 183-136-394-043-221, Title: Method and apparatus to remotely control a wind turbine park
Index: 1936, Rank: 2, Score: 0.8570, Lens ID: 171-622-816-293-625, Title: Element for lifting the blade and method followed
Index: 1476, Rank: 3, Score: 0.8493, Lens ID: 133-858-820-170-882, Title: Energy generation plant driven by wind or water currents
Index: 1482, Rank: 4, Score: 0.8488, Lens ID: 134-734-952-929-354, Title: Wind turbine, yaw system controller and yaw system for a wind turbine and method of reducing the loads acting on such a yaw system
Index: 895, Rank: 5, Score: 0.8468, Lens ID: 093-613-372-316-053, Title: Wind turbine blade having a flow guiding device with optimised height
Index: 1886, Rank: 6, Score: 0.8461, Lens ID: 161-787-086-720-957, Title: Method of start up at least a part of a wind power plant, wind power plant and use of the wind power plant
Index: 1480, Rank: 7, Score: 0.8458, Lens ID: 134-214-724-617-317, Title: Compres

## FAISS ANN

In [57]:
# Convert to numpy for ANN or cosine similarity
simcse_embeddings_np = simcse_embeddings.numpy()

# Dimensionality of embeddings
embedding_dim = simcse_embeddings_np.shape[1]

# Convert the test title embedding to numpy
test_doc_embedding_np = test_doc_embedding.numpy()

In [58]:
# Create FAISS index for Euclidean distance
index = faiss.IndexFlatL2(embedding_dim)  # L2 for Euclidean distance
index.add(simcse_embeddings_np)

# Search for top 10 most similar titles using Euclidean distance
k = 10
distances, indices = index.search(test_doc_embedding_np, k)

# Retrieve and display similar titles
similar_titles = [id_list[idx] for idx in indices[0]]

for i, id in enumerate(similar_titles):
    print(f"Rank {i+1}: {id} (Distance: {distances[0][i]}) Title: {title_list[indices[0][i]]}")

Rank 1: 013-371-305-262-568 (Distance: 6.295607089996338) Title: RETRACTABLE BLADE STRUCTURE WITH A SPLIT TRAILING EDGE
Rank 2: 012-903-836-155-903 (Distance: 6.662032127380371) Title: Self governing fluid energy turbine
Rank 3: 009-604-420-618-187 (Distance: 7.053715229034424) Title: Vertical axis wind turbine with articulating rotor
Rank 4: 009-604-420-618-187 (Distance: 7.1538405418396) Title: Vertical axis wind turbine with articulating rotor
Rank 5: 009-604-420-618-187 (Distance: 7.249152183532715) Title: Vertical axis wind turbine with articulating rotor
Rank 6: 013-003-079-530-86X (Distance: 7.332096576690674) Title: WIND TURBINE WITH PRESTRESSABLE SUPPORTING ARMS
Rank 7: 003-607-558-948-547 (Distance: 7.429407596588135) Title: EXHAUST POWER GENERATION DEVICE
Rank 8: 009-604-420-618-187 (Distance: 7.580665588378906) Title: Vertical axis wind turbine with articulating rotor
Rank 9: 013-976-231-755-440 (Distance: 7.638299942016602) Title: Linear motion wind driven power plant
Rank

In [59]:
# Create a FAISS index (L2 similarity by default; for cosine, normalize embeddings)
index = faiss.IndexFlatL2(embedding_dim)

# Add your precomputed embeddings to the index
index.add(simcse_embeddings_np)

# Normalize embeddings to unit length (for cosine similarity)
faiss.normalize_L2(simcse_embeddings_np)
index = faiss.IndexFlatIP(embedding_dim)  # Inner product for cosine similarity
index.add(simcse_embeddings_np)

# Normalize test title embedding if using cosine similarity
faiss.normalize_L2(test_doc_embedding_np)

# Search for the top 10 most similar titles
k = 10  # Number of nearest neighbors
distances, indices = index.search(test_doc_embedding_np, k)

# `indices` contains the indices of the top 10 most similar titles
# `distances` contains the similarity scores

# Get the top 10 most similar titles
similar_docs_id = [id_list[idx] for idx in indices[0]]


# Print results
for i, id in enumerate(similar_docs_id):
    print(f"Rank {i+1}: {id} (Score: {distances[0][i]}) Title: {title_list[indices[0][i]]}")

Rank 1: 013-371-305-262-568 (Score: 0.8643819093704224) Title: RETRACTABLE BLADE STRUCTURE WITH A SPLIT TRAILING EDGE
Rank 2: 012-903-836-155-903 (Score: 0.856980562210083) Title: Self governing fluid energy turbine
Rank 3: 009-604-420-618-187 (Score: 0.8492686152458191) Title: Vertical axis wind turbine with articulating rotor
Rank 4: 009-604-420-618-187 (Score: 0.8487626314163208) Title: Vertical axis wind turbine with articulating rotor
Rank 5: 013-003-079-530-86X (Score: 0.8467873334884644) Title: WIND TURBINE WITH PRESTRESSABLE SUPPORTING ARMS
Rank 6: 010-949-078-926-035 (Score: 0.8460617065429688) Title: Wind powered electricity generation system
Rank 7: 009-604-420-618-187 (Score: 0.8457615375518799) Title: Vertical axis wind turbine with articulating rotor
Rank 8: 009-604-420-618-187 (Score: 0.8434953689575195) Title: Vertical axis wind turbine with articulating rotor
Rank 9: 003-607-558-948-547 (Score: 0.8424891233444214) Title: EXHAUST POWER GENERATION DEVICE
Rank 10: 013-976