In [1]:
! pip install sentence_transformers rank_bm25 cohere

Collecting sentence_transformers
  Using cached sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Using cached sentence_transformers-4.1.0-py3-none-any.whl (345 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25, sentence_transformers
Successfully installed rank_bm25-0.2.2 sentence_transformers-4.1.0



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import os

import cohere
import numpy as np
import torch
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from sentence_transformers import CrossEncoder
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertForSequenceClassification


In [None]:
# Small in-memory corpus that every retrieval / reranking step below
# searches over.
docunents = [
    "This is a list which containing sample documents.",
    "Keywords are important for keyword-based search.",
    "Document analysis involves extracting keywords.",
    "Keyword-based search relies on sparse embeddings.",
    "Understanding document structure aids in keyword extraction.",
    "Efficient keyword extraction enhances search accuracy.",
    "Semantic similarity improves document retrieval performance.",
    "Machine learning algorithms can optimize keyword extraction methods.",
]

In [None]:
# Identifier of the pre-trained sentence-transformer checkpoint that is
# loaded to embed the documents and the query.
model_name: str = "sentence-transformers/paraphrase-xlm-r-multilingual-v1"

In [None]:
# Instantiate the sentence-transformer identified by `model_name`.
model = SentenceTransformer(model_name)

In [None]:
# Encode every entry of `docunents` with the sentence-transformer `model`.
docunents_embeddings = model.encode(docunents)

In [None]:
# Print the embedding computed for each document, numbered from 1.
for doc_number, vector in enumerate(docunents_embeddings, start=1):
    print(f"Documents {doc_number} embedding: {vector}")

In [None]:
# Search query that the documents are ranked against.
query = (
    "Natural Language Processing techniques "
    "enhance keyword extraction efficiency"
)

In [None]:
# Encode the query with the same `model` that encoded the documents.
query_embedding = model.encode(query)

In [None]:
print("Query embedding: ", query_embedding)

In [None]:
# Compare the query (wrapped as a single-row 2-D array) against every
# document embedding; the result has one row holding one score per document.
similarities = cosine_similarity(
    np.expand_dims(query_embedding, axis=0),
    docunents_embeddings,
)

In [None]:
similarities

In [None]:
# Position of the highest score in the single row of `similarities`.
most_similar_index = np.argmax(similarities[0])

In [None]:
most_similar_index

In [None]:
# Look up the document text at the best-scoring position.
most_similar_document = docunents[most_similar_index]

In [None]:
most_similar_document

In [None]:
# Score of the best-matching document (row 0 is the only query row).
similarity_score = similarities[0, most_similar_index]

In [None]:
similarity_score

In [None]:
# argsort yields ascending order; flip it so the best match comes first.
sorted_indices = np.flip(np.argsort(similarities[0]))

In [None]:
sorted_indices

In [None]:
# Pair each document with its score, ordered from best to worst match.
ranked_documents = [
    (docunents[idx], similarities[0, idx]) for idx in sorted_indices
]

In [None]:
ranked_documents

In [None]:
# List the full ranking, numbering positions from 1.
print("Ranked Documents: ")
for position, (text, score) in enumerate(ranked_documents, 1):
    print(f"Rank: {position}, Document: {text}, Similarity: {score}")

In [None]:
# Show only the four best matches from the embedding-based ranking.
print("Top 4 Documents: ")

for position, (text, score) in enumerate(ranked_documents[:4], 1):
    print(f"Rank: {position}, Documents: {text}, Similarity_score: {score}")

In [None]:
# Keep just the text of the four best matches; these are the candidates
# handed to the rerankers.
top_4_documents = [text for text, _score in ranked_documents[:4]]

In [None]:
# Whitespace-tokenise each candidate and trim sentence punctuation from the
# token edges. Without the trim, the final word of every sentence keeps its
# period ("extraction."), so BM25 never matches it against the query term
# "extraction". Tokens that consist only of punctuation are dropped.
# Matching stays case-sensitive, as before.
tokenized_top_4_documents = [
    [token.strip(".,;:!?") for token in doc.split() if token.strip(".,;:!?")]
    for doc in top_4_documents
]

In [None]:
tokenized_top_4_documents

In [None]:
# Split the query on whitespace to obtain the query terms for BM25.
tokenized_query = query.split()

In [None]:
tokenized_query

In [None]:
# Build the BM25 index over the tokenised candidates. The class is spelt
# BM25Okapi (capital letter O), matching the import at the top of the
# notebook; the previous spelling with a digit zero raised NameError.
bm25 = BM25Okapi(tokenized_top_4_documents)

In [None]:
# Score each indexed candidate against the tokenised query.
bm25_scores = bm25.get_scores(tokenized_query)

In [None]:
bm25_scores

In [None]:
# Candidate positions ordered from highest to lowest BM25 score.
sorted_indices_2 = np.flip(np.argsort(bm25_scores))

In [None]:
sorted_indices_2

In [None]:
# Pair each candidate with its BM25 score, best first.
reranked_documents = [
    (top_4_documents[idx], bm25_scores[idx]) for idx in sorted_indices_2
]

In [None]:
reranked_documents

In [None]:
# Show the candidates in their BM25 order, numbering positions from 1.
print("Top 4 Documents: ")

for position, (text, score) in enumerate(reranked_documents, 1):
    print(f"Rank: {position}, Documents: {text}, Similarity_score: {score}")

In [None]:
reranked_documents[:4]

Cross-encoder reranking

In [None]:
# Load the MS MARCO MiniLM cross-encoder used to rerank the candidates.
# NOTE(review): the previous identifier ended in "L-5-v2"; the published
# checkpoints in this family appear to be L-2 / L-4 / L-6 / L-12, so the
# 6-layer one is used here — confirm against the model hub.
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

In [None]:
# One [query, document] pair per candidate, in candidate order, as input
# for the cross-encoder.
pairs = [[query, doc] for doc in top_4_documents]

In [None]:
pairs

In [None]:
# Score every [query, document] pair with the cross-encoder, then display
# the scores.
scores = cross_encoder.predict(pairs)
scores

In [None]:
# Pair each cross-encoder score with its document. The pairs are
# materialised into a list because a bare zip object is a one-shot iterator:
# it displays as an opaque "<zip object>" and is empty after its first use,
# so re-running a later cell that consumes it would silently see no data.
scored_docs = list(zip(scores, top_4_documents))
scored_docs

In [None]:
# Order the (score, document) pairs from highest to lowest.
reranked_document_cross_encoder = list(scored_docs)
reranked_document_cross_encoder.sort(reverse=True)

In [None]:
reranked_document_cross_encoder

BM25 reranking results (shown again for comparison)

In [None]:
reranked_documents

In [None]:
# Read the Cohere API key from the environment instead of passing a literal
# string, so no credential is stored in the notebook. Fail early with a
# clear message when the variable is missing or empty.
cohere_api_key = os.environ.get("COHERE_API_KEY")
if not cohere_api_key:
    raise RuntimeError(
        "Set the COHERE_API_KEY environment variable before running this cell."
    )
co = cohere.Client(cohere_api_key)

In [None]:
# Rerank the same candidates with Cohere. The shared `query` variable is
# reused (instead of a retyped copy with different casing and a trailing
# space) so every reranker in this notebook is given identical input.
# return_documents=True makes each result carry its document text.
response = co.rerank(
    model="rerank-english-v3.0",
    query=query,
    documents=top_4_documents,
    return_documents=True,
)

In [None]:
print(response)

In [None]:
response.results[0].document.text

In [None]:
response.results[0].relevance_score

In [None]:
# Print every reranked result. Iterating over the results themselves (rather
# than a fixed range(4)) avoids an IndexError if fewer than four entries
# come back; at most len(top_4_documents) documents were submitted.
for result in response.results:
    print(f"Text: {result.document.text}, Score: {result.relevance_score}")

In [None]:
# Load the tokenizer and the sequence-classification model from the same
# checkpoint.
# NOTE: this rebinds `model`, which earlier cells used for the
# SentenceTransformer; re-running those cells after this one will pick up
# the BERT classifier instead.
# NOTE(review): "bert-base-uncased" is presumably a base checkpoint without
# a fine-tuned classification head — confirm the resulting scores are
# meaningful.
bert_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertForSequenceClassification.from_pretrained(bert_checkpoint)

In [None]:
# Sentence pair fed to the BERT classifier below.
sentence_a, sentence_b = (
    "The movie was fantastic",
    "I really enjoyed the film",
)

In [None]:
# Tokenise the two sentences together as one paired input and return
# PyTorch tensors.
inputs = tokenizer(
    sentence_a,
    sentence_b,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

In [None]:
# Run the forward pass. This is inference only, so gradient tracking is
# switched off: no autograd graph is built and less memory is used. The
# logits themselves are unchanged.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

In [None]:
# Normalise the logits into probabilities across dimension 1.
probs = logits.softmax(dim=1)

In [None]:
# Treat the task as binary classification and report the probability at
# index 1 of the first (only) input pair as the similarity score.
similarity_score_1 = probs[0, 1].item()
print(f"Similarity Score: {similarity_score_1}")