In [1]:
from sentence_transformers import util
import sys
from pathlib import Path
import torch

# Add the project root (two levels above the notebook's working directory) to
# the Python path so the `week_2` package imports below resolve.
project_root = str(Path.cwd().parent.parent)
# Guard the append so re-running this cell does not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

from week_2.data_preparation.data_prep import load_triplets_from_json
from week_2.data_preparation.data_prep import load_passages_from_file
from model import DualTowerWithFC, TripletLoss

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Build the dual-tower architecture, then restore the trained weights.
model = DualTowerWithFC()
# map_location remaps the checkpoint tensors to CPU so the file loads on
# machines without a GPU. weights_only=True restricts unpickling to tensors and
# plain containers, so a tampered checkpoint cannot run arbitrary code on load.
model.load_state_dict(
    torch.load(
        "dual_tower_model_base_384D.pt",
        map_location=torch.device('cpu'),
        weights_only=True,
    )
)
# Switch to inference mode (disables the Dropout layers); the returned module
# is the cell's displayed output.
model.eval()

DualTowerWithFC(
  (embedding_model): SentenceTransformer(
    (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
    (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
    (2): Normalize()
  )
  (fc_query): Sequential(
    (0): Linear(in_features=384, out_features=384, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=384, out_features=384, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=384, out_features=384, bias=True)
    (7): ReLU()
  )
  (fc_doc): Sequential(
    (0): Linear(in_features=384, out_features=384, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=384

In [None]:
# Sanity check: the query should score higher against the passage that answers
# it than against a related passage that does not.
query = "What did Einstein discover?"
positive_doc = "Einstein developed the theory of relativity."
negative_doc = "Einstein was a physicist that won the nobel prize in 1921."

# Use the GPU when one is present, otherwise stay on CPU; a hard-coded "cuda"
# raises on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Inference only: skip autograd bookkeeping for the forward passes.
with torch.no_grad():
    query_embedding = model(query, tower_type="query").to(device)
    pos_embedding = model(positive_doc, tower_type="doc").to(device)
    neg_embedding = model(negative_doc, tower_type="doc").to(device)

    pos_score = util.cos_sim(query_embedding, pos_embedding)
    neg_score = util.cos_sim(query_embedding, neg_embedding)

print("Positive similarity:", pos_score.item())
print("Negative similarity:", neg_score.item())


Positive similarity: 0.9823693037033081
Negative similarity: 0.9634515047073364


In [10]:
# Locate the passage dump inside the sibling `data_preparation` directory.
# NOTE: this rebinds `project_root`, which the first cell set to the repository
# root as a str; from here on it is a Path to the data_preparation folder.
project_root = Path.cwd().parent.joinpath("data_preparation")
all_docs_path = project_root.joinpath("all_docs.json")

# Read the passages and keep only the first 5,000,000 of them.
all_passages = load_passages_from_file(str(all_docs_path))[:5_000_000]


Loaded 9078734 passages from /root/TwoTowerSearch/week_2/data_preparation/all_docs.json


In [26]:
def filter_passages_by_keyword(passages, keyword):
    """Return the passages that contain ``keyword``, ignoring case.

    Args:
        passages: Iterable of passage strings to scan.
        keyword: Substring to look for; matching is case-insensitive.

    Returns:
        A new list with the matching passages, in their original order.
        An empty ``keyword`` matches every passage.
    """
    # Lower-case the keyword once instead of once per passage.
    needle = keyword.lower()
    return [passage for passage in passages if needle in passage.lower()]


keyword = "einstein"
filtered_passages = filter_passages_by_keyword(all_passages, keyword)

print(f"Found {len(filtered_passages)} passages containing the keyword '{keyword}'.")
# Preview the first 5 matches.
for i, passage in enumerate(filtered_passages[:5]):
    print(f"Passage {i+1}: {passage}")

# Embed every matching passage with the document tower so queries can later be
# scored against them by cosine similarity.
# Use the GPU when one is present, otherwise stay on CPU; a hard-coded "cuda"
# raises on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"

filtered_embeddings = []
# Inference only: without no_grad every stored embedding would keep its
# autograd graph alive.
with torch.no_grad():
    for passage in filtered_passages:
        passage_embedding = model(passage, tower_type="doc").to(device)
        filtered_embeddings.append(passage_embedding)


Found 1643 passages containing the keyword 'einstein'.
Passage 1: It is said that Einstein had the IQ of 160 and that is not the most gifted or Immeasurable genius. The higher IQ rating is a natural and innate ability that can be improved. But it can not be claimed to be a definite measure of success. In other words a higher IQ rating can’t guarantee a success in your life.
Passage 2: Einstein was considered to only have an IQ of about 160. Mensa is a society for people with high IQ, in the top 2% (1 in 50). In 1926, psychologist Catherine Morris Cox published a study of the most eminent men and women who had lived between 1450 and 1850 to estimate what their IQs might have been.
Passage 3: 12 Answers. I have looked at all of the answers and some of those in other sources and none of the answers seem to capture the emotion of watershed moment . The aha or epiphany moment, the eureka moment... a discovery or the moment of change. ex. The moment Einstein envisioned the theory of relativi

In [33]:
query = "what IQ did albert einstein have?"

# Use the GPU when one is present, otherwise stay on CPU; a hard-coded "cuda"
# raises on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Inference only: skip autograd bookkeeping while scoring.
with torch.no_grad():
    query_embedding = model(query, tower_type="query").to(device)

    # Cosine similarity between the query and each pre-computed passage
    # embedding. Each passage embedding is moved to the query's device first so
    # the two operands always live on the same device.
    similarities = [
        util.cos_sim(
            query_embedding, passage_embedding.to(query_embedding.device)
        ).item()
        for passage_embedding in filtered_embeddings
    ]

# Rank passages from most to least similar; sorted() is stable, so ties keep
# their original order.
sorted_passages = sorted(
    zip(filtered_passages, similarities),
    key=lambda scored: scored[1],
    reverse=True,
)

# Show the 5 best matches.
print("Top 5 passages:")
for i, (passage, similarity) in enumerate(sorted_passages[:5]):
    print(f"Passage {i+1}: {passage} (similarity: {similarity})")



Top 5 passages:
Passage 1: Albert Einstein had an IQ level in the range of 160-190. But there are others having more Iq than him: Garry Kasparov (IQ Level – 190) This man needs no intro....the master of chess. Philip Emeagwali (IQ Level- 190) This man is an engineer, mathematician and geologist who used the Connection Machine supercomputer to analyze petroleum fields. (similarity: 0.95863938331604)
Passage 2: According to NDTV, Albert Einstein had an IQ of 160. Most people score between 70 and 130, or nowhere near Einstein’s suggested IQ. The young genius has since joined Mensa, a high IQ society, and looks forward to meeting other people with interests similar to his own. (similarity: 0.9548270106315613)
Passage 3: It appears that both Albert Einstein and Stephen Hawking share the IQ of 160. To my knowledge the maximum score is 162, at least on most reputable tests. Although he does not have the maximum score, it is most certainly a more than respectable IQ. (similarity: 0.95065635442