In [40]:
import os
import pickle
import string
from pathlib import Path

import faiss
import numpy as np
import pandas as pd
import stanza
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the previously saved Word2Vec model; the path is relative to the
# current working directory.
word2vec_model_path = "word2vec_F03D_model.model"
model = Word2Vec.load(word2vec_model_path)

In [6]:
# Folder holding the patent spreadsheet. It defaults to the original local
# folder; set the PATENT_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIR = Path(
    os.environ.get(
        "PATENT_DATA_DIR",
        r"C:\Users\John\Desktop\Patent Datasets\final_data",
    )
)

# Patent records; the "title", "claims", "description" and "lens_id" columns
# are read from this frame in the following cell.
examples_dataframe = pd.read_excel(DATA_DIR / "patent_database_final_F03D.xlsx")

In [7]:
# Pull the four columns used later out of the frame as plain Python lists,
# in the same order as the names on the left-hand side.
title_list, claims_list, description_list, id_list = (
    examples_dataframe[column_name].tolist()
    for column_name in ("title", "claims", "description", "lens_id")
)

In [4]:
def preprocess_document(doc):
    """Lower-case ``doc``, split it on whitespace and drop unknown words.

    Relies on the notebook-level ``model`` for the vocabulary check.

    Args:
        doc: Raw document text.

    Returns:
        list[str]: The lower-cased tokens that are present in ``model.wv``,
        in their original order.
    """
    vocabulary = model.wv
    # Whitespace splitting is a placeholder for a more robust tokenizer.
    return [word for word in doc.lower().split() if word in vocabulary]

def document_vector(doc, model):
    """Embed a raw text document as the mean of its in-vocabulary word vectors.

    NOTE: a later cell defines another ``document_vector`` that takes an
    already tokenized document; once that cell runs it shadows this one.

    Args:
        doc: Raw document text; tokenized with ``preprocess_document``.
        model: Object exposing ``wv`` (word -> vector lookup) and
            ``vector_size``.

    Returns:
        numpy.ndarray: 1-D mean of the word vectors, or an all-zero float32
        vector of length ``model.vector_size`` when no token survives
        preprocessing.
    """
    tokens = preprocess_document(doc)
    if not tokens:
        # gensim stores word vectors as float32; returning float32 zeros keeps
        # the dtype identical for empty and non-empty documents, so stacking
        # the results does not silently upcast the whole matrix to float64.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean([model.wv[word] for word in tokens], axis=0)

In [13]:
# Shared text-processing objects used by clean_and_process_example:
# a Porter stemmer, the NLTK English stop-word set, and a Stanza pipeline
# running the tokenize, mwt, pos and lemma processors.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
nlp = stanza.Pipeline(
    'en',
    processors='tokenize,mwt,pos,lemma',
    use_gpu=True,
    batch_size=32,
)

2024-11-27 00:10:38 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

2024-11-27 00:10:38 INFO: Downloaded file to C:\Users\John\stanza_resources\resources.json
2024-11-27 00:10:38 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

2024-11-27 00:10:38 INFO: Using device: cuda
2024-11-27 00:10:38 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-27 00:10:40 INFO: Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-27 00:10:40 INFO: Loading: pos
  checkpoint = torch.load(filename, lambda storage, loc: storage)
  data = torch.load(self.filename, lambda storage, loc: storage)
  state = torch.load(filename, lambda storage, loc: storage)
2024-11-27 00:10:40 INFO: Loading: lemma
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-27 00:10:40 INFO: Done 

In [12]:
def clean_and_process_example(example):
    """Normalise a raw text into a space-separated string of lemmas.

    Steps: lower-case, strip ASCII punctuation and digits, drop stop words,
    Porter-stem each remaining word, then lemmatize the stems with Stanza.
    Uses the notebook-level ``stop_words``, ``stemmer`` and ``nlp`` objects.

    Args:
        example: Raw text.

    Returns:
        str: The lemmas of the processed words joined by single spaces.
    """
    strip_table = str.maketrans('', '', string.punctuation + string.digits)
    stripped = example.lower().translate(strip_table)

    # str.split() with no argument already collapses runs of whitespace, so
    # no separate double-space clean-up is needed before tokenising.
    stems = [
        stemmer.stem(token)
        for token in stripped.split()
        if token not in stop_words
    ]

    # Stanza receives the stems as one space-joined string.
    parsed = nlp(" ".join(stems))
    lemmas = []
    for sentence in parsed.sentences:
        lemmas.extend(word.lemma for word in sentence.words)
    return ' '.join(lemmas)

In [9]:
# Load the pickled corpus from disk.
# NOTE(review): pickle.load can execute arbitrary code; only open pickle
# files that come from a trusted source.
# Presumably this holds one token list per row of examples_dataframe, in the
# same order (later cells index both by the same position); confirm.
with open('examples_x_lemmatized.pkl', 'rb') as pickle_file:
    documents = pickle.load(pickle_file)

In [28]:
# Function to compute the document vector
def document_vector(tokens, model):
    """Average the word vectors of the in-vocabulary tokens of a document.

    Args:
        tokens: Iterable of token strings (an already tokenized document).
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookups)
            and ``vector_size``.

    Returns:
        numpy.ndarray: 1-D mean of the vectors of the tokens found in
        ``model.wv``, or an all-zero float32 vector of length
        ``model.vector_size`` when none of the tokens is in the vocabulary.
    """
    # Single pass: keep only tokens in the Word2Vec vocab and fetch their vectors.
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if not vectors:
        # gensim stores word vectors as float32; float32 zeros keep the return
        # dtype the same for empty and non-empty documents, so stacking the
        # results never upcasts the matrix to float64 (FAISS expects float32).
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

In [29]:
# Stack one embedding per document into a 2-D matrix (one row per document).
# The dtype is pinned to float32: a single float64 row (e.g. a zero vector for
# a document with no in-vocabulary token) would otherwise upcast the whole
# matrix, and the FAISS index built later expects float32 input.
document_vectors = np.asarray(
    [document_vector(doc, model) for doc in documents],
    dtype=np.float32,
)

#### Query

In [34]:
# Free-text query: a set of wind-turbine tower claims. It is cleaned, embedded
# and compared against the corpus in the cells that follow.
test_text = """
A wind turbine comprising

a wind turbine tower comprising at least two annular tower rings placed vertically on top of each other,

characterized in that

a first tower ring of said at least two tower rings overlaps at least a further tower ring of said at least two tower rings.

2. The wind turbine according to claim 1, wherein said at least two tower rings overlaps in a substantially horizontal overlap region consisting of a bottom section of a tower ring and a top section of another tower rings.

3. The wind turbine according to claim 2, wherein said substantially horizontal overlap region extends in said tower rings longitudinal direction.

4. The wind turbine according to claim 2, wherein said bottom section and/or said top section are angled in an angle α, β, respectively, in relation to a middle section of said tower rings.

5. The wind turbine according to claim 4, wherein said angles α, β are between 0.5° and 15°, preferably 1° and 10° and most preferred between 2° and 7°.

6. The wind turbine according to claim 1, wherein said at least two tower rings overlaps downwards, making an upper tower ring overlap a lower tower ring placed immediately beneath said upper tower ring and so on.

7. The wind turbine according to claim 1, wherein said at least two tower rings are connected through said substantially horizontal overlap region.

8. The wind turbine according to claim 7, wherein said at least two tower rings are connected by bolts.

9. The wind turbine according to claim 1, wherein said at least two tower rings are of substantially constant height.

10. The wind turbine according to claim 1, wherein said at least two tower rings are made of steel.
"""


In [35]:
# Clean the query text, then split the resulting string into a token list in
# one step so query_document is never bound to two different types.
query_document = clean_and_process_example(test_text).split()

In [36]:
# Embed the query tokens, then view the 1-D embedding as a single-row 2-D
# array, the layout the similarity call in the next cell is given.
query_embedding = document_vector(query_document, model)
query_vector = query_embedding.reshape(1, -1)

In [37]:
# Compare the single query row against every document row and keep that one
# row of similarities as a flat score vector (one score per document).
similarity_matrix = cosine_similarity(query_vector, document_vectors)
cosine_scores = similarity_matrix[0]


In [47]:
# Number of best-matching documents to report.
top_k = 20

# Indices of the top_k highest cosine scores, best match first
# (argsort is ascending, so take the tail and reverse it).
top_indices = cosine_scores.argsort()[-top_k:][::-1]

# Print the ranked matches. The header reports how many rows are actually
# listed, which is fewer than top_k only when the corpus is smaller than top_k.
print(f"Top {len(top_indices)} similar documents to the query document:")
for rank, idx in enumerate(top_indices, start=1):
    score = cosine_scores[idx]
    # Positional lookup, done once per match; assumes the rows of
    # examples_dataframe are in the same order as document_vectors.
    row = examples_dataframe.iloc[idx]
    lens_id = row['lens_id']
    title = row['title']
    print(f"Index: {idx}, Rank: {rank}, Score: {score:.4f}, Lens ID: {lens_id}, Title: {title}")

Top 20 similar documents to the query document:
Index: 43256, Rank: 1, Score: 0.7534, Lens ID: 150-304-091-843-216, Title: A WIND TURBINE TOWER, A WIND TURBINE, A WIND TURBINE TOWER ELEVATOR AND A METHOD FOR ASSEMBLING A WIND TURBINE TOWER
Index: 21239, Rank: 2, Score: 0.7501, Lens ID: 171-982-519-628-942, Title: TOWER OF A WIND TURBINE
Index: 3651, Rank: 3, Score: 0.7499, Lens ID: 083-736-749-295-537, Title: WIND TURBINE TOWER, A WIND TURBINE, A WIND TURBINE TOWER ELEVATOR AND A METHOD FOR ASSEMBLING A WIND TURBINE TOWER
Index: 20462, Rank: 4, Score: 0.7498, Lens ID: 108-619-738-940-02X, Title: TOWER OF A WIND TURBINE
Index: 3650, Rank: 5, Score: 0.7393, Lens ID: 083-383-932-429-416, Title: A WIND TURBINE TOWER, A WIND TURBINE, A WIND TURBINE TOWER ELEVATOR AND A METHOD FOR ASSEMBLING A WIND TURBINE TOWER
Index: 34289, Rank: 6, Score: 0.7327, Lens ID: 099-368-030-381-164, Title: Wind turbine with tensile-type structure
Index: 565, Rank: 7, Score: 0.7074, Lens ID: 055-697-218-048-292, 

#### FAISS nearest-neighbour search (exact, `IndexFlatL2`)

In [41]:
# Create a FAISS index
# IndexFlatL2 compares vectors by L2 distance, so the ranking it produces can
# differ from the cosine-similarity ranking computed earlier in the notebook.
dimension = model.vector_size
index = faiss.IndexFlatL2(dimension)

# Add document vectors to the FAISS index
# NOTE(review): FAISS expects float32 input; confirm document_vectors' dtype.
index.add(document_vectors)

In [42]:
# Same expression as in the cosine-similarity section: embed the query tokens
# and reshape the result into a single-row 2-D array for index.search.
query_vector = document_vector(query_document, model).reshape(1, -1)

In [45]:
# Search the index for the k nearest documents to the query (k = 20 here).
# Both results are indexed with [0] in the next cell because there is a
# single query row.
k = 20
distances, indices = index.search(query_vector, k)

In [46]:
# Print results in the order returned by the search. The "Score" shown is the
# distance reported by the L2 index, so smaller values mean closer documents.
for rank, (idx, score) in enumerate(zip(indices[0], distances[0]), start=1):
    if idx < 0:
        # FAISS pads the result with -1 when fewer than k neighbours exist;
        # without this guard title_list[-1] would silently print the last title.
        continue
    print(f"Rank {rank}: {idx} (Score: {score}) Title: {title_list[idx]}")


Rank 1: 43256 (Score: 47.77791976928711) Title: A WIND TURBINE TOWER, A WIND TURBINE, A WIND TURBINE TOWER ELEVATOR AND A METHOD FOR ASSEMBLING A WIND TURBINE TOWER
Rank 2: 3651 (Score: 48.489601135253906) Title: WIND TURBINE TOWER, A WIND TURBINE, A WIND TURBINE TOWER ELEVATOR AND A METHOD FOR ASSEMBLING A WIND TURBINE TOWER
Rank 3: 3650 (Score: 49.92790222167969) Title: A WIND TURBINE TOWER, A WIND TURBINE, A WIND TURBINE TOWER ELEVATOR AND A METHOD FOR ASSEMBLING A WIND TURBINE TOWER
Rank 4: 21239 (Score: 50.09141159057617) Title: TOWER OF A WIND TURBINE
Rank 5: 20462 (Score: 50.288291931152344) Title: TOWER OF A WIND TURBINE
Rank 6: 34289 (Score: 50.9012336730957) Title: Wind turbine with tensile-type structure
Rank 7: 565 (Score: 54.86802673339844) Title: Wind Turbine Tower, A Wind Turbine, A Wind Turbine Tower Elevator And A Method For Assembling A Wind Turbine Tower
Rank 8: 20225 (Score: 54.91025924682617) Title: Wind turbine tower and method of assembling
Rank 9: 1455 (Score: 5