<a href="https://colab.research.google.com/github/keylacampusano/Python-Practice/blob/main/Embeddings_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the vector stores (ChromaDB, FAISS), the sentence-embedding library,
# and the data tooling used by this notebook.
!pip install chromadb
!pip install sentence-transformers
!pip install faiss-cpu
!pip install datasets
!pip install pandas
!pip install spacy
# Download spaCy's small English pipeline.
!python -m spacy download en_core_web_sm


In [None]:
!pip install nltk
!pip isntall gensim

In [3]:
# Import ChromaDB, FAISS, and the other libraries used throughout this notebook.

import chromadb
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
from datasets import load_dataset
from nltk.tokenize import sent_tokenize, word_tokenize
import gensim
from gensim.models import Word2Vec
import spacy
from collections import defaultdict
from time import time
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from chromadb.config import Settings
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import os


In [271]:
# Load the "embedding-data/sentence-compression" dataset from the Hugging Face Hub.
dataset = load_dataset("embedding-data/sentence-compression")

In [288]:
# Keep only the first 10 training rows. Slicing the split *before* building the
# DataFrame avoids converting the entire training set just to discard most of it.
df_train = pd.DataFrame(dataset["train"][:10])

In [289]:
texts = df_train["set"]

In [290]:
print (texts)

0    [The USHL completed an expansion draft on Mond...
1    [Major League Baseball Commissioner Bud Selig ...
2    [It's fresh cherry time in Michigan and the be...
3    [An Evesham man is facing charges in Pennsylva...
4    [NRT LLC, one of the nation's largest resident...
5    [THE JSE kept toying with an all time high by ...
6    [The government is defending the latest police...
7    [The renovated Marappalam bridge, which had be...
8    [A new survey shows 30 percent of Californians...
9    [Brightpoint ,a provider of logistic services ...
Name: set, dtype: object


In [368]:
new_texts = texts.tolist()

In [369]:
new_texts

[["The USHL completed an expansion draft on Monday as 10 players who were on the rosters of USHL teams during the 2009-10 season were selected by the League's two newest entries, the Muskegon Lumberjacks and Dubuque Fighting Saints.",
  'USHL completes expansion draft'],
 ['Major League Baseball Commissioner Bud Selig will be speaking at St. Norbert College next month.',
  'Bud Selig to speak at St. Norbert College'],
 ["It's fresh cherry time in Michigan and the best time to enjoy this delicious and nutritious fruit.",
  "It's cherry time"],
 ['An Evesham man is facing charges in Pennsylvania after he allegedly dragged his girlfriend from the side of his pickup truck on the campus of Kutztown University in the early morning hours of Dec. 5, police said.',
  'Evesham man faces charges for Pa.'],
 ["NRT LLC, one of the nation's largest residential real estate brokerage companies, announced several executive appointments within its Coldwell Banker Residential Brokerage operations in Sout

In [8]:
df_train.shape

(10, 1)

In [None]:
# Load the sentence-embedding model by name.
# NOTE(review): AutoTokenizer and AutoModel are not referenced in any visible cell — confirm before removing.
from transformers import AutoTokenizer, AutoModel

# SentenceTransformer instance used by later cells to embed the texts and the FAISS query.
model_st = SentenceTransformer('paraphrase-distilroberta-base-v1')



In [12]:
import concurrent.futures

In [193]:
embeddings = model_st.encode(texts)

In [196]:
def encode_texts(model):
    """Encode the notebook-level ``texts`` with the given model.

    Args:
        model: Any object exposing ``encode(texts)`` (e.g. a SentenceTransformer).

    Returns:
        Whatever ``model.encode(texts)`` returns for the current ``texts``.
    """
    # Use the model that was passed in; the previous version ignored the
    # argument and always encoded with the global ``model_st``.
    return model.encode(texts)


# Models to embed with. Add further SentenceTransformer instances here to encode
# the same texts with several models concurrently. (Iterating over ``model_st``
# itself walks the model's internal sub-modules, not a collection of models.)
models_to_encode = [model_st]

with concurrent.futures.ThreadPoolExecutor() as executor:
    # Launch one encoding job per model.
    futures = [executor.submit(encode_texts, model) for model in models_to_encode]

    # Collect results in submission order so texts_embeddings[i] always belongs
    # to models_to_encode[i]; as_completed() would yield them in finish order.
    texts_embeddings = [future.result() for future in futures]

In [195]:
print(embeddings)

[[-0.01476244 -0.11661881 -0.0585124  ... -0.07039721 -0.00150099
   0.3003174 ]
 [ 0.01026955  0.24575628 -0.01019975 ...  0.08323041  0.2253961
   0.2159326 ]
 [-0.17097703  0.24742264  0.3673481  ...  0.76167136  0.04267222
   0.10653625]
 ...
 [-0.2833038  -0.23699163 -0.0029812  ...  0.42410824 -0.07033559
  -0.10299055]
 [-0.2131704  -0.02782431 -0.09215224 ...  0.24908543  0.41691625
   0.24698234]
 [ 0.11760301  0.36261845 -0.04503371 ... -0.02317489  0.2086976
   0.10080846]]


In [197]:
embeddings.shape

(10, 768)

In [198]:
# Similarity search by Euclidean distance: given a query vector and a matrix of
# embeddings, return the k embeddings closest to the query.
def euclidean_distances(X, Y=None, *, Y_norm_squared=None, squared=False, X_norm_squared=None):
    """Compute pairwise Euclidean distances between the rows of X and Y.

    Args:
        X: Array-like of shape (n_x, d); a single 1-D vector is treated as one row.
        Y: Array-like of shape (n_y, d). When None, distances are computed
            between the rows of X itself.
        Y_norm_squared: Optional precomputed squared norms of the rows of Y.
        squared: When True, return squared distances (skips the square root).
        X_norm_squared: Optional precomputed squared norms of the rows of X.

    Returns:
        numpy.ndarray of shape (n_x, n_y) holding the pairwise distances.
    """
    X = np.atleast_2d(np.asarray(X, dtype=float))
    Y = X if Y is None else np.atleast_2d(np.asarray(Y, dtype=float))

    if X_norm_squared is None:
        x_sq = np.einsum("ij,ij->i", X, X)
    else:
        x_sq = np.asarray(X_norm_squared, dtype=float).reshape(-1)

    if Y_norm_squared is None:
        y_sq = np.einsum("ij,ij->i", Y, Y)
    else:
        y_sq = np.asarray(Y_norm_squared, dtype=float).reshape(-1)

    # ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2
    sq_dists = x_sq[:, None] - 2.0 * (X @ Y.T) + y_sq[None, :]
    # Floating-point cancellation can leave tiny negative values; clamp them.
    np.maximum(sq_dists, 0.0, out=sq_dists)
    if Y is X:
        # Distance from a row to itself is exactly zero.
        np.fill_diagonal(sq_dists, 0.0)

    return sq_dists if squared else np.sqrt(sq_dists)


def search_similarity(query_vector, embeddings, distance_metric="euclidean", k=5):
    """Return the k embeddings closest to ``query_vector``.

    Args:
        query_vector: 1-D array-like of length d.
        embeddings: Array-like of shape (n, d).
        distance_metric: Only "euclidean" is supported.
        k: Maximum number of neighbours to return.

    Returns:
        Tuple ``(indices, distances)`` of 1-D arrays with at most k entries,
        ordered from nearest to farthest.

    Raises:
        ValueError: If ``distance_metric`` is not "euclidean".
    """
    if distance_metric != "euclidean":
        raise ValueError("Unsupported distance metric")

    query = np.asarray(query_vector).reshape(1, -1)
    # The distance matrix has a single row (one query); flatten it so that
    # sorting and slicing act on the neighbours rather than on the query axis.
    distances = euclidean_distances(query, embeddings)[0]
    nearest = np.argsort(distances, kind="stable")[:k]
    return nearest, distances[nearest]



In [201]:
# Build the FAISS index over the sentence embeddings; an index speeds up
# similarity-search queries.

# normalize_L2 is called without reassignment: it rescales `embeddings` in place,
# so the array stays modified for every later cell.
faiss.normalize_L2(embeddings)

vector_dimensions = embeddings.shape[1]
index = faiss.IndexFlatL2(vector_dimensions)
index.add(embeddings)

In [217]:
import numpy as np

# Embed the free-text query with the same model that produced the indexed vectors.
search_text = 'its fresh'
search_vector = model_st.encode(search_text)

# Copy the query into a single-row 2-D array, then L2-normalise that copy in place.
new_vector = np.array(search_vector[np.newaxis, :])
faiss.normalize_L2(new_vector)

In [218]:
# Retrieve the 4 nearest neighbours of the query vector from the FAISS index.
distances, ann = index.search(new_vector, k=4)

# One result row per neighbour (row 0 holds the results for the single query),
# then attach the source rows by joining `ann` against df_train's index.
results = pd.DataFrame({'distances': distances[0], 'ann': ann[0]})
df_merged = results.merge(df_train, left_on='ann', right_index=True)

In [245]:
df_merged.head()

Unnamed: 0,distances,ann,set
0,1.467611,2,[It's fresh cherry time in Michigan and the be...
1,1.807476,4,"[NRT LLC, one of the nation's largest resident..."
2,1.835946,7,"[The renovated Marappalam bridge, which had be..."
3,1.839991,0,[The USHL completed an expansion draft on Mond...


In [None]:
#from sklearn.metrics.pairwise import euclidean_distances

In [253]:
# Shape of the encoded query. `query_vector` is not assigned in any visible cell
# (it only exists as a parameter name), so inspect `search_vector` instead.
print(search_vector.shape)


(768,)


In [280]:
def chromadb_index(embeddings, collection_name="embeddings_index"):
    """Store precomputed embeddings in an in-memory Chroma collection.

    The previous version called ``Chroma(dim)`` and ``.populate(...)`` on the
    LangChain wrapper, neither of which that class provides. This version uses
    the chromadb client directly.

    Args:
        embeddings: 2-D array-like of shape (n, d), one vector per row.
        collection_name: Name of the collection to create or reuse.

    Returns:
        The chromadb collection holding the vectors under ids "id0".."id{n-1}".

    Raises:
        ValueError: If ``embeddings`` is not 2-D.
    """
    import chromadb

    vectors = np.asarray(embeddings)
    if vectors.ndim != 2:
        raise ValueError("embeddings must be a 2-D array of shape (n, d)")

    client = chromadb.Client()
    # get_or_create + upsert keep the function safe to call more than once.
    chroma_index = client.get_or_create_collection(name=collection_name)
    if len(vectors):
        chroma_index.upsert(
            ids=[f"id{i}" for i in range(len(vectors))],
            # Plain Python floats; chromadb takes a list of float lists.
            embeddings=vectors.tolist(),
        )
    return chroma_index

In [320]:
from sentence_transformers import SentenceTransformer
# Second embedding model (all-MiniLM-L6-v2); the shape check in a later cell reports (10, 384).
model = SentenceTransformer("all-MiniLM-L6-v2")
# NOTE(review): texts_db is not referenced in any visible cell — confirm whether it is still needed.
texts_db = [texts]
# Embeddings of `texts` produced by the MiniLM model.
text_embeddings_db = model.encode(texts)


In [321]:
type(text_embeddings_db)

numpy.ndarray

In [322]:
text_embeddings_db.shape

(10, 384)

In [None]:
!python -m spacy download en_core_web_md



In [None]:
!python -m pip install spacy

In [None]:
!pip install chromadb

In [326]:
pip install cosine-similarity



In [373]:
import chromadb
from chromadb.utils import embedding_functions
# Directory handed to the persistent Chroma client.
CHROMA_DATA_PATH = "chroma_data/"
# Model name passed to the SentenceTransformer embedding function.
EMBED_MODEL = "all-MiniLM-L6-v2"
# Name of the Chroma collection used for this demo.
COLLECTION_NAME = "demo_docs"

# Client backed by on-disk storage at CHROMA_DATA_PATH.
client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)

In [None]:
# Embed documents and queries with the configured sentence-transformers model.
embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBED_MODEL
)

# get_or_create_collection keeps this cell re-runnable: the client persists to
# disk, and create_collection raises once a collection with this name exists.
collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_func,
    metadata={"hnsw:space": "cosine"},  # use cosine distance for the HNSW index
)

In [376]:
# Plain Python list of the dataset rows to store in the Chroma collection.
# NOTE(review): each row looks like a [sentence, compressed sentence] pair rather
# than a single string — confirm this is what the collection should receive.
documents = texts.tolist()


In [377]:
# One genre label per document; attached as {"genre": ...} metadata when the
# documents are added to the collection.
# NOTE(review): labels appear to be placeholders, not derived from the texts — confirm.
genres = [
    "technology",
    "travel",
    "science",
    "food",
    "history",
    "fitness",
    "art",
    "climate change",
    "business",
    "music",
]

In [None]:
# Deterministic ids ("id0", "id1", ...) paired positionally with documents and genres.
document_ids = [f"id{i}" for i in range(len(documents))]

# upsert instead of add: the collection lives in a persistent store, so re-running
# this cell overwrites the same ids rather than hitting the duplicate-id path.
collection.upsert(
    documents=documents,
    ids=document_ids,
    metadatas=[{"genre": g} for g in genres],
)


In [379]:
# Sanity check: documents, ids and genres must all have the same length.
doc_count = len(documents)
id_count = len([f'id{i}' for i in range(doc_count)])

print(f"Number of documents: {doc_count}")
print(f"Number of ids: {id_count}")
print(f"Number of genres: {len(genres)}")

Number of documents: 10
Number of ids: 10
Number of genres: 10


In [382]:
# Single-query lookup: ask the collection for the one closest entry to the query text.
query_results = collection.query(
    query_texts=["albert einstein"],
    n_results=1,
)

# Inspect the result fields. Only the value of the last expression is shown as
# the cell output; the preceding bare expressions are evaluated and discarded.
query_results.keys()
query_results["documents"]
query_results["ids"]
query_results["distances"]
query_results["metadatas"]

[[{'genre': 'travel'}]]

In [383]:
# Two query texts in one call, requesting 2 results each and only the
# documents and distances fields.
query_results = collection.query(
    query_texts=["Teach me about history",
                 "What's going on in the world?"],
    include=["documents", "distances"],
    n_results=2
)

# [0] / [1] presumably select the results for the first / second query text —
# confirm against the Chroma query documentation.
query_results["documents"][0]

query_results["distances"][0]

query_results["documents"][1]

# Last expression in the cell, so this is the value displayed as output.
query_results["distances"][1]

[0.7700092399209757, 0.7864988653289766]