In [1]:
!pip install InstructorEmbedding

Collecting InstructorEmbedding
  Downloading InstructorEmbedding-1.0.1-py2.py3-none-any.whl.metadata (20 kB)
Downloading InstructorEmbedding-1.0.1-py2.py3-none-any.whl (19 kB)
Installing collected packages: InstructorEmbedding
Successfully installed InstructorEmbedding-1.0.1


In [2]:
import pandas as pd
import numpy as np

In [4]:
doc = pd.read_csv('rag-mini-wikipedia_document.csv')
q = pd.read_csv('rag-mini-wikipedia_q_and_a.csv')

In [5]:
doc

Unnamed: 0,passage,id
0,"Uruguay (official full name in ; pron. , Eas...",0
1,"It is bordered by Brazil to the north, by Arge...",1
2,Montevideo was founded by the Spanish in the e...,2
3,The economy is largely based in agriculture (m...,3
4,"According to Transparency International, Urugu...",4
...,...,...
3195,"*In 2007, a duck in Tallahassee, Florida survi...",3196
3196,*A rare genetic mutation sees some ducks born ...,3197
3197,*The Moche people of ancient Peru worshipped n...,3198
3198,*Angel Wing - A disease common in ducks.,3199


In [6]:
import torch, gc

gc.collect()
torch.cuda.empty_cache()


In [None]:
torch.cuda.memory_summary(device=None, abbreviated=True)




In [None]:
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# bert
bert_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_name)
model = AutoModel.from_pretrained(bert_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

# batch
batch_size = 8
texts = doc["passage"].tolist()
embeddings_list = []

for i in tqdm(range(0, len(texts), batch_size)):
    batch_texts = texts[i:i+batch_size]
    inputs = tokenizer(
        batch_texts,
        padding=True,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        # mean pooling over sequence length
        batch_emb = outputs.last_hidden_state.mean(dim=1)
        embeddings_list.append(batch_emb.cpu().numpy())

    torch.cuda.empty_cache()
# concat
bert_embeddings = np.vstack(embeddings_list)
print("BERT embeddings shape:", bert_embeddings.shape)


100%|██████████| 400/400 [00:34<00:00, 11.45it/s]

BERT embeddings shape: (3200, 768)





In [None]:
gc.collect()
torch.cuda.empty_cache()

In [None]:
from sentence_transformers import SentenceTransformer

mpnet_model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")

# encode all passages
mpnet_embeddings = mpnet_model.encode(
    doc["passage"].tolist(),
    batch_size=8,
    show_progress_bar=True,
    convert_to_numpy=True
)

print("MPNet embeddings shape:", mpnet_embeddings.shape)

from sklearn.preprocessing import normalize
mpnet_embeddings = normalize(mpnet_embeddings)


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/400 [00:00<?, ?it/s]

MPNet embeddings shape: (3200, 768)


In [None]:
gc.collect()
torch.cuda.empty_cache()

In [None]:
# load
model_name = "intfloat/e5-small-v2"
e5_model = SentenceTransformer(model_name)

# prefix as required
texts = ["passage: " + t for t in doc["passage"].tolist()]

e5_embeddings = e5_model.encode(
    texts,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True
)

# normalize for cosine similarity search
e5_embeddings = normalize(e5_embeddings)

print("E5-small-v2 embeddings shape:", e5_embeddings.shape)


Batches:   0%|          | 0/100 [00:00<?, ?it/s]

E5-small-v2 embeddings shape: (3200, 384)


In [None]:
gc.collect()
torch.cuda.empty_cache()

In [None]:
from InstructorEmbedding import INSTRUCTOR

instructor_model = INSTRUCTOR("hkunlp/instructor-large")

# add task instruction for retrieval
sentences = [["Represent the passage for retrieval:", p] for p in doc["passage"].tolist()]

instructor_embeddings = instructor_model.encode(
    sentences,
    batch_size=8,
    show_progress_bar=True
)

print("Instructor-Large embeddings shape:", instructor_embeddings.shape)




pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]



Batches:   0%|          | 0/400 [00:00<?, ?it/s]

Instructor-Large embeddings shape: (3200, 1024)


In [None]:
# to df
df_emb = doc[["id", "passage"]].copy()

df_emb["bert-base-uncased"] = [vec.tolist() for vec in bert_embeddings]
df_emb["multi-qa-mpnet-base-dot-v1"] = [vec.tolist() for vec in mpnet_embeddings]
df_emb["hkunlp-instructor-large"] = [vec.tolist() for vec in instructor_embeddings]
df_emb["intfloat-e5-small-v2"] = [vec.tolist() for vec in e5_embeddings]

df_emb.head()


Unnamed: 0,id,passage,bert-base-uncased,multi-qa-mpnet-base-dot-v1,hkunlp-instructor-large,intfloat-e5-small-v2
0,0,"Uruguay (official full name in ; pron. , Eas...","[-0.5065356492996216, -0.061035141348838806, 0...","[-0.0341777466237545, 0.002857460640370846, -0...","[-0.01721850410103798, -0.005168597679585218, ...","[-0.040801145136356354, 0.06251285970211029, 0..."
1,1,"It is bordered by Brazil to the north, by Arge...","[-0.694625198841095, -0.003389554563909769, -0...","[-0.011842395178973675, -0.02865975722670555, ...","[-0.00820520706474781, -0.011139578185975552, ...","[-0.030295174568891525, 0.021842412650585175, ..."
2,2,Montevideo was founded by the Spanish in the e...,"[-0.41299816966056824, 0.002561133122071624, 0...","[-0.03803553432226181, 0.02742711640894413, -0...","[-0.020763935521245003, -0.01978014037013054, ...","[-0.09860740602016449, 0.07213453203439713, 0...."
3,3,The economy is largely based in agriculture (m...,"[-0.587813675403595, -0.14051802456378937, 0.1...","[-0.06013483181595802, -0.011563033796846867, ...","[-0.020554738119244576, -0.024487322196364403,...","[-0.035195767879486084, 0.04417723789811134, 0..."
4,4,"According to Transparency International, Urugu...","[-0.05339909717440605, -0.2507496774196625, 0....","[-0.03831447288393974, 0.0452677421271801, -0....","[-0.02443518675863743, 0.007906915619969368, -...","[-0.06756877154111862, 0.07074808329343796, 0...."


In [None]:
df_emb.to_parquet("rag-mini-wikipedia_document_embedding.parquet", index=False)

In [7]:
q

Unnamed: 0,question,answer,id
0,Was Abraham Lincoln the sixteenth President of...,yes,0
1,Did Lincoln sign the National Banking Act of 1...,yes,2
2,Did his mother die of pneumonia?,no,4
3,How many long was Lincoln's formal education?,18 months,6
4,When did Lincoln begin his political career?,1832,8
...,...,...,...
913,Was Wilson president of the American Political...,Yes,1710
914,Did he not cast his ballot for John M. Palmer ...,Yes,1711
915,Did Wilson not spend 1914 through the beginnin...,Yes,1712
916,"Was Wilson , a staunch opponent of antisemitis...",Yes,1713


In [None]:
gc.collect()
torch.cuda.empty_cache()

In [8]:
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# bert
bert_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_name)
model = AutoModel.from_pretrained(bert_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()

# batch
batch_size = 8
texts = q["question"].tolist()
embeddings_list = []

for i in tqdm(range(0, len(texts), batch_size)):
    batch_texts = texts[i:i+batch_size]
    inputs = tokenizer(
        batch_texts,
        padding=True,
        truncation=True,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        # mean pooling over sequence length
        batch_emb = outputs.last_hidden_state.mean(dim=1)
        embeddings_list.append(batch_emb.cpu().numpy())

    torch.cuda.empty_cache()
# concat
bert_embeddings = np.vstack(embeddings_list)
print("BERT embeddings shape:", bert_embeddings.shape)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

100%|██████████| 115/115 [00:02<00:00, 57.28it/s]

BERT embeddings shape: (918, 768)





In [9]:
gc.collect()
torch.cuda.empty_cache()

from sentence_transformers import SentenceTransformer

mpnet_model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")

# encode all passages
mpnet_embeddings = mpnet_model.encode(
    q["question"].tolist(),
    batch_size=8,
    show_progress_bar=True,
    convert_to_numpy=True
)

print("MPNet embeddings shape:", mpnet_embeddings.shape)

from sklearn.preprocessing import normalize
mpnet_embeddings = normalize(mpnet_embeddings)


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/115 [00:00<?, ?it/s]

MPNet embeddings shape: (918, 768)


In [10]:
gc.collect()
torch.cuda.empty_cache()

# load
model_name = "intfloat/e5-small-v2"
e5_model = SentenceTransformer(model_name)

# prefix as required
texts = ["question: " + t for t in q["question"].tolist()]

e5_embeddings = e5_model.encode(
    texts,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True
)

# normalize for cosine similarity search
e5_embeddings = normalize(e5_embeddings)

print("E5-small-v2 embeddings shape:", e5_embeddings.shape)


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

Batches:   0%|          | 0/29 [00:00<?, ?it/s]

E5-small-v2 embeddings shape: (918, 384)


In [11]:
gc.collect()
torch.cuda.empty_cache()

from InstructorEmbedding import INSTRUCTOR

instructor_model = INSTRUCTOR("hkunlp/instructor-large")

# add task instruction for query
sentences = [["Represent the question for query:", p] for p in q["question"].tolist()]

instructor_embeddings = instructor_model.encode(
    sentences,
    batch_size=8,
    show_progress_bar=True
)

print("Instructor-Large embeddings shape:", instructor_embeddings.shape)


modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]



config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]



Batches:   0%|          | 0/115 [00:00<?, ?it/s]

Instructor-Large embeddings shape: (918, 1024)


In [13]:
# to df
df_emb = q.copy()

df_emb["bert-base-uncased"] = [vec.tolist() for vec in bert_embeddings]
df_emb["multi-qa-mpnet-base-dot-v1"] = [vec.tolist() for vec in mpnet_embeddings]
df_emb["hkunlp-instructor-large"] = [vec.tolist() for vec in instructor_embeddings]
df_emb["intfloat-e5-small-v2"] = [vec.tolist() for vec in e5_embeddings]

df_emb.head()

Unnamed: 0,question,answer,id,bert-base-uncased,multi-qa-mpnet-base-dot-v1,hkunlp-instructor-large,intfloat-e5-small-v2
0,Was Abraham Lincoln the sixteenth President of...,yes,0,"[-0.19312725961208344, -0.13748712837696075, -...","[-0.05237957090139389, -0.01714211329817772, -...","[-0.05883704498410225, 0.006434370297938585, -...","[-0.08810286968946457, 0.08150017261505127, -0..."
1,Did Lincoln sign the National Banking Act of 1...,yes,2,"[0.14002685248851776, -0.05434509739279747, -0...","[-0.030207067728042603, 0.0009371377527713776,...","[-0.05082063749432564, 0.006711824331432581, 0...","[-0.09225689619779587, 0.08364736288785934, 0...."
2,Did his mother die of pneumonia?,no,4,"[0.42659080028533936, -0.20329923927783966, -0...","[0.007733686827123165, -0.019836006686091423, ...","[-0.041972026228904724, 0.0011429721489548683,...","[-0.06938131153583527, 0.06415015459060669, 0...."
3,How many long was Lincoln's formal education?,18 months,6,"[0.13052700459957123, 0.04617954418063164, -0....","[-0.055185094475746155, 0.011871792376041412, ...","[-0.05959087982773781, 0.07057556509971619, -0...","[-0.07752346992492676, 0.07538802921772003, 0...."
4,When did Lincoln begin his political career?,1832,8,"[-0.05585627630352974, 0.025420386344194412, -...","[-0.06685220450162888, 0.008905233815312386, -...","[-0.06925185769796371, 0.04178489372134209, -0...","[-0.09624198079109192, 0.11157051473855972, 0...."


In [14]:
df_emb.to_parquet("rag-mini-wikipedia_question_embedding.parquet", index=False)