In [1]:
import chromadb
from chromadb.config import Settings

In [2]:
client = chromadb.Client()

In [3]:
collection = client.create_collection("my_collection")

In [4]:
import pandas as pd
df = pd.read_csv("embedded_chunks_instructor_base.csv")

In [5]:
df

Unnamed: 0,id,chunks,chunks_embedded
0,<6501090.1075843349674.JavaMail.evans@thyme>,FYI. Commentary calls George W. Bush task gett...,"[0.03689831867814064, 0.02045171894133091, 0.0..."
1,<6501090.1075843349674.JavaMail.evans@thyme>,"inauguration, President Bush's electricity pol...","[0.04325525462627411, 0.04659399017691612, 0.0..."
2,<6501090.1075843349674.JavaMail.evans@thyme>,Energy Regulatory Commission (FERC) signals FE...,"[0.035252027213573456, 0.03817130625247955, 0...."
3,<6501090.1075843349674.JavaMail.evans@thyme>,California. Skyrocketing retail electric price...,"[0.040909528732299805, 0.0485917329788208, 0.0..."
4,<6501090.1075843349674.JavaMail.evans@thyme>,fuses bombs lit passage Energy Policy Act (EPA...,"[0.05127160623669624, 0.0412898063659668, 0.01..."
...,...,...,...
5765,<22527048.1075843384871.JavaMail.evans@thyme>,(E-mail)'; 'Cody Carter (E-mail)'; 'Curt Hatto...,"[0.053517237305641174, -0.007062846794724464, ..."
5766,<22527048.1075843384871.JavaMail.evans@thyme>,Lednicky (E-mail)'; 'Marty McFadden (E-mail)';...,"[0.060209449380636215, 0.024425501003861427, -..."
5767,<22527048.1075843384871.JavaMail.evans@thyme>,committee soon possible meet discuss direction...,"[0.032075170427560806, 0.01716688834130764, 0...."
5768,<22527048.1075843384871.JavaMail.evans@thyme>,also set agenda Thursday person meeting. Thurs...,"[0.03593168780207634, 0.006306346971541643, 0...."


In [6]:
document = []
metaData = []

for i in range(len(df)) :
    document.append(df.iloc[i]["chunks"])
    metaData.append({"id" : df.iloc[i]["id"]})

In [7]:
import ast

In [8]:
batch_size = 500

# Parcourir le dataframe par lots
for batch_start in range(0, len(df), batch_size):
    # Créer un batch de données
    batch = df.iloc[batch_start:batch_start + batch_size]
    
    # Convertir les embeddings sous forme de chaîne en liste (utilisation de eval pour convertir de string à list)
    embeddings = batch['chunks_embedded'].apply(ast.literal_eval).tolist()

    # Extraire les documents du batch
    documents = batch['chunks'].tolist()  # Liste des chunks

    # Convertir les metadatas en dictionnaires (avec l'ID du mail comme clé)
    metadatas = [{'id': str(mail_id)} for mail_id in batch['id'].tolist()]

    # Créer des ids uniques pour chaque chunk
    chunk_ids = [f"{batch_start + idx}" for idx in range(len(batch))]  # Créer un id unique pour chaque chunk

    # Ajouter les documents et leurs métadonnées à la collection
    collection.add(
        metadatas=metadatas,  # Métadonnées sous forme de dictionnaire
        ids=chunk_ids,  # ID uniques pour chaque chunk (numérotés)
        documents=documents,  # Les chunks du document
        embeddings=embeddings  # Embeddings associés
    )

In [9]:
# Compter le nombre de documents dans la collection
num_documents = collection.count()
print(f"Il y a {num_documents} documents dans la collection.")

Il y a 5770 documents dans la collection.


In [10]:
result = collection.query(
    query_embeddings=embeddings[0],  # Exemple d'embedding pour la recherche
    n_results=5  # Limiter le nombre de résultats à 5
)

# Afficher les résultats
print(result)

{'ids': [['5500', '3082', '2197', '5239', '4112']], 'embeddings': None, 'documents': [['Steve= n=20 Schuman (schuman@haas.berkeley.edu) Thursday (March 15): Evening - Haas Technology Club Firm Night - Airport Marriot Hotel,=20 Burlingame, CA Friday (March 16): 3:30-4:45:? John Williams, SVP Marketing Intelligence Alliances, VI= SA=20 International - Wells Fargo Room - John Williams, Senior Vice President Market Intelligence Alliance= s=20 VISA International, speak experiences original member = of=20 Palm team, co-founder Razorfish graduate Haas an= d=20 Boalt schools. 5:00-??:?', '<Nancy.Sellers@RobertMondavi.com> 08/31/2000 09:53:11 To: "\'Prentice @ Berkeley\'" <PSellers@haas.berkeley.edu>, "\'Jeff Dasovich\'" <Jeff_Dasovich@enron.com> cc: Subject: know Prentice get e-mail, Jeff - could try help make sure happens.? Sunday Amber\'s birthday.? Since Cubs game would great guys could send card. come get stuff, take tomatoes want you!? going take Annie? Nancy (707) 251-4870 (phone) (707) 

In [11]:
# Récupérer un échantillon de documents depuis la collection
results = collection.query(
    query_embeddings=[0] * len(embeddings[0]),  # Utilisation de vecteurs nuls pour récupérer des documents
    n_results=1  # Limiter à 1 résultat pour récupérer la première ligne
)

# Afficher les résultats, en particulier le premier document
print("Premier document:")
print(f"IDs: {results['ids']}")
print(f"Documents: {results['documents']}")
print(f"Metadatas: {results['metadatas']}")



Premier document:
IDs: [['2959']]
Documents: [["informed hold whole week Wednesday, > September 6, therefore, important narrow options > soon can. > > Best wishes, > Heather > > > > 06:44 8/30/00 -0400, gramlr@pjm.com wrote: > >Shall try talk Monday? think talk Borenstein see > >what Haas folks mind. > > > >I tried capture everyone's comments. Allen, might want explain > > >about panel suggestions since justice. see > >took liberty offering new characterization panels > >didn't > >bring call. Everything offered strawman > >criticized changed."]]
Metadatas: [[{'id': '<943139.1075842957062.JavaMail.evans@thyme>'}]]


In [12]:
print(df['chunks_embedded'].iloc[0])  # Affiche les embeddings du premier chunk

[0.03689831867814064, 0.02045171894133091, 0.014735488221049309, -0.00098425114993006, -0.023907167837023735, -0.008060101419687271, -0.03032178059220314, -0.00013816554564982653, -0.007914449088275433, -0.016663897782564163, 0.008383555337786674, 0.012192963622510433, -0.035215139389038086, 0.0051359208300709724, -0.014724835753440857, 0.007907538674771786, -0.02225903794169426, 0.03610175848007202, -0.015431255102157593, 0.026789922267198563, -0.018149975687265396, 0.0164593905210495, 0.01756126806139946, 0.02240891195833683, -0.004460047464817762, 0.010683909058570862, 0.003620243165642023, 0.012042084708809853, -0.04743048921227455, 0.012937258929014206, 0.04579886049032211, 0.055435799062252045, 0.006289324257522821, 0.004206673242151737, 0.001196561031974852, 0.024629969149827957, -0.010899149812757969, -0.003418616484850645, 0.036424145102500916, 0.0031379947904497385, -0.02115854062139988, -0.0002919462858699262, -0.011390739120543003, -0.016907405108213425, -0.0098114972934126

In [13]:
collection.get(include=['embeddings']).keys()

dict_keys(['ids', 'embeddings', 'documents', 'uris', 'data', 'metadatas', 'included'])

In [14]:
dc = collection.get(include=['embeddings', "documents", "metadatas"])

In [15]:
for k in ['ids', 'uris', 'data', 'included'] :
    del dc[k]

In [16]:
dc.keys()

dict_keys(['embeddings', 'documents', 'metadatas'])

In [17]:
dc["embeddings"] = list(dc["embeddings"])

In [18]:
type(dc["metadatas"][0])

dict

In [19]:
dc["metadatas"][0]

{'id': '<6501090.1075843349674.JavaMail.evans@thyme>'}

In [20]:
dc["metadatas"] = list(map(lambda x : x["id"], dc["metadatas"]))

In [21]:
len(dc["documents"])

5770

In [22]:
len(dc["metadatas"])

5770

In [23]:
res = pd.DataFrame.from_dict(dc)
res

Unnamed: 0,embeddings,documents,metadatas
0,"[0.03689831867814064, 0.02045171894133091, 0.0...",FYI. Commentary calls George W. Bush task gett...,<6501090.1075843349674.JavaMail.evans@thyme>
1,"[0.04325525462627411, 0.04659399017691612, 0.0...","inauguration, President Bush's electricity pol...",<6501090.1075843349674.JavaMail.evans@thyme>
2,"[0.035252027213573456, 0.03817130625247955, 0....",Energy Regulatory Commission (FERC) signals FE...,<6501090.1075843349674.JavaMail.evans@thyme>
3,"[0.040909528732299805, 0.0485917329788208, 0.0...",California. Skyrocketing retail electric price...,<6501090.1075843349674.JavaMail.evans@thyme>
4,"[0.05127160623669624, 0.0412898063659668, 0.01...",fuses bombs lit passage Energy Policy Act (EPA...,<6501090.1075843349674.JavaMail.evans@thyme>
...,...,...,...
5765,"[0.053517237305641174, -0.007062846794724464, ...",(E-mail)'; 'Cody Carter (E-mail)'; 'Curt Hatto...,<22527048.1075843384871.JavaMail.evans@thyme>
5766,"[0.060209449380636215, 0.024425501003861427, -...",Lednicky (E-mail)'; 'Marty McFadden (E-mail)';...,<22527048.1075843384871.JavaMail.evans@thyme>
5767,"[0.032075170427560806, 0.01716688834130764, 0....",committee soon possible meet discuss direction...,<22527048.1075843384871.JavaMail.evans@thyme>
5768,"[0.03593168780207634, 0.006306346971541643, 0....",also set agenda Thursday person meeting. Thurs...,<22527048.1075843384871.JavaMail.evans@thyme>


In [None]:
res.to_csv("instructor-Embedding-v0.csv",  index=False)

In [24]:
for i in range(100) :
    print('//\n')
    print(res.iloc[i]["documents"])
    print('//\n')

//

FYI. Commentary calls George W. Bush task getting competition electricity markets. Jim ----- Forwarded James Steffes/NA/Enron 02/06/2001 08:06 ----- "PennFuture" <pennfuture@pennfuture.org> 02/05/2001 03:33 PM Please respond "PennFuture" To: <Undisclosed-Recipient:@mailman.enron.com;> cc: Subject: PennFuture's E-cubed - Breaking Box PennFuture's E-cubed commentary biweekly email publication concerning current themes trends energy market. February 5, 2001 Vol. 3, No. 3 Breaking Box Two weeks
//

//

inauguration, President Bush's electricity policy already emerging crucible California characterized obedience free market ideology federalism confines federal government limited, backseat role. told California's crisis isolated event, result California's unique mistakes. view, federal government neither contributed crisis major role play solving it. Wholesale market caps level declared inappropriate intrusion free markets make things worse. Moreover, appointment Curt Hebert new Chairman

In [25]:
print(res.iloc[1]["documents"])

inauguration, President Bush's electricity policy already emerging crucible California characterized obedience free market ideology federalism confines federal government limited, backseat role. told California's crisis isolated event, result California's unique mistakes. view, federal government neither contributed crisis major role play solving it. Wholesale market caps level declared inappropriate intrusion free markets make things worse. Moreover, appointment Curt Hebert new Chairman Federal


In [26]:
import torch
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F
from tqdm import tqdm

# Charger le modèle et le tokenizer pour 'Instructor-Base'
model_name = 'hkunlp/instructor-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).encoder.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# Fonction pour appliquer une instruction spécifique et générer des embeddings
def compute_embeddings_batch(batch_texts, instruction="Represent the text for clustering"):
    # Ajouter l'instruction au début de chaque texte
    instructed_texts = [[instruction, text] for text in batch_texts]
    
    # Encoder les textes avec l'instruction
    encoded_input = tokenizer(instructed_texts, padding=True, truncation=True, max_length=500, return_tensors='pt').to(model.device)
    
    # Calculer les sorties du modèle
    with torch.no_grad():
        model_output = model(**encoded_input)
    
    # Extraire les embeddings du token [CLS] (premier token)
    sentence_embeddings = model_output.last_hidden_state[:, 0, :]
    
    # Normaliser les embeddings
    return F.normalize(sentence_embeddings, p=2, dim=1).cpu().tolist()

# Fonction pour traiter le DataFrame par lots et calculer les embeddings
def process_in_batches(df, batch_size=200, instruction="Represent the text for clustering"):
    all_embeddings = []
    for i in tqdm(range(0, len(df), batch_size)):
        batch_texts = df['chunks'][i:i+batch_size].tolist()  # Extraire les textes (chunks) pour le batch
        batch_embeddings = compute_embeddings_batch(batch_texts, instruction)  # Passer l'instruction à la fonction
        all_embeddings.extend(batch_embeddings)  # Ajouter les embeddings calculés
    return all_embeddings

# Exemple d'utilisation pour une seule question avec instruction
question = ["What was the impact of President Bush's electricity policy on California's crisis?"]

# Calculer l'embedding pour la question donnée
question_embedding = compute_embeddings_batch(question, instruction="Represent the text for clustering")

# Afficher l'embedding de la question
print(question_embedding)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of T5Model were not initialized from the model checkpoint at hkunlp/instructor-base and are newly initialized: ['decoder.block.0.layer.0.SelfAttention.k.weight', 'decoder.block.0.layer.0.SelfAttention.o.weight', 'decoder.block.0.layer.0.SelfAttention.q.weight', 'decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight', 'decoder.block.0.layer.0.SelfAttention.v.weight', 'decoder.block.0.layer.0.layer_norm.weight', 'decoder.block.0.layer.1.EncDecAttention.k.weight', 'decoder.block.0.layer.1.EncDecAttention.o.weight', 'decoder.block.0.layer.1.EncDecAttention.q.weight', 'decoder.block.0.layer.1.EncDecAttention.v.weight', 'decoder.block.0.layer.1.layer_norm.weight', 'decoder.block.0.layer.2.DenseReluDense.wi.weight', 'decoder.block.0.layer.2.DenseReluDense.wo.weight', 'decoder.block.0.layer.2.layer_norm.weight', 'decoder.block.1.layer.0.SelfAttention.k.weight', 'decoder.block.1.layer.0.SelfAttention.o.weight', 'decod

[[0.018463067710399628, 0.0442759208381176, -0.02457207813858986, 0.024228503927588463, 0.02771761454641819, 0.02665036730468273, 0.008203443139791489, 0.02613377943634987, -0.010203095152974129, -0.016640396788716316, -0.009070590138435364, 0.02047143317759037, -0.05917111784219742, 0.01768452674150467, -0.030639750882983208, 0.051705408841371536, 0.0055555435828864574, -0.008591403253376484, -0.01771545596420765, 0.03511461988091469, 0.025261102244257927, 0.016839267686009407, 0.02302825078368187, 0.02676311321556568, -0.013401038944721222, -0.04005355015397072, 0.03826333209872246, -0.03321031853556633, -0.0634833350777626, 0.02926575392484665, 0.08376418799161911, 0.026441892609000206, 0.02372833341360092, -0.02027185820043087, -0.014240972697734833, 0.048967842012643814, -0.028662923723459244, -0.03514677286148071, 0.054921332746744156, -0.013024800457060337, -0.024573130533099174, -0.016352269798517227, 0.023493103682994843, 0.0030178995802998543, -0.0075017851777374744, 0.033266

In [27]:
# Effectuer la requête en utilisant l'embedding de la question
result = collection.query(
    query_embeddings=question_embedding,  # L'embedding de la question
    n_results=5  # Limiter le nombre de résultats à 5
)

# Afficher les résultats
print(result)


{'ids': [['3833', '4732', '4111', '3560', '147']], 'embeddings': None, 'documents': [["RE: tx buyout announced Friday. best info available story today's WSJ. help? Best, Jeff", 'access generation daily? wondering guy wrote story...', 'Washington, Rivera Brooks Los Angeles. Times staff writers Tim Reiterman San Francisco Nancy Vogel Sacramento contributed story. Copyright 2001 Los Angeles Times ? ?? ?', 'Journal ? Ron Statler, KMJ News', 'proposal? Jim']], 'uris': None, 'data': None, 'metadatas': [[{'id': '<20828423.1075843372506.JavaMail.evans@thyme>'}, {'id': '<26512509.1075842957925.JavaMail.evans@thyme>'}, {'id': '<3507255.1075843374130.JavaMail.evans@thyme>'}, {'id': '<26414653.1075843370269.JavaMail.evans@thyme>'}, {'id': '<2194981.1075843350330.JavaMail.evans@thyme>'}]], 'distances': [[0.22638805210590363, 0.273346483707428, 0.2763492465019226, 0.28824013471603394, 0.29058775305747986]], 'included': [<IncludeEnum.distances: 'distances'>, <IncludeEnum.documents: 'documents'>, <Inc

In [28]:
for i in range (5):
    print ('Mail ID:', result['ids'][0][i])
    print ('Document:', result['documents'][0][i])


Mail ID: 3833
Document: RE: tx buyout announced Friday. best info available story today's WSJ. help? Best, Jeff
Mail ID: 4732
Document: access generation daily? wondering guy wrote story...
Mail ID: 4111
Document: Washington, Rivera Brooks Los Angeles. Times staff writers Tim Reiterman San Francisco Nancy Vogel Sacramento contributed story. Copyright 2001 Los Angeles Times ? ?? ?
Mail ID: 3560
Document: Journal ? Ron Statler, KMJ News
Mail ID: 147
Document: proposal? Jim
