In [None]:
import google.cloud.aiplatform as aip
from google.cloud import storage
import nltk
from langchain.document_loaders import GCSFileLoader, GCSDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# --- Notebook configuration -------------------------------------------------
# Every assignment below is intentionally left blank: fill each one in before
# running, otherwise this cell does not parse.
# Google Cloud project and region handed to aip.init() / vertexai.init().
PROJECT_ID = 
LOCATION = 
# Bucket read by the GCS directory loader and the storage client in later cells.
BUCKET_NAME = 
# NOTE(review): BUCKET_URI and SERVICE_ACCOUNT are not referenced by any cell
# visible in this notebook - confirm whether they are still needed.
BUCKET_URI = 
SERVICE_ACCOUNT=
# Object-name prefix selecting which objects inside the bucket are processed.
prefix=
# Initialise the Vertex AI SDK and create a Cloud Storage client for the project.
aip.init(project=PROJECT_ID, location=LOCATION)
gcs = storage.Client(project=PROJECT_ID)

## Create chunks of text data using LangChain

In [None]:
# Load every object stored under `prefix` in the bucket as LangChain documents.
loader = GCSDirectoryLoader(project_name=PROJECT_ID, bucket=BUCKET_NAME, prefix=prefix)
documents = loader.load()

# The recursive splitter tries the separators in the order given, so they must
# run from coarsest to finest: paragraphs, then lines, then words. With "\n"
# listed first the text was always cut on single newlines and the "\n\n"
# (paragraph) separator could never match. The trailing "" is the last-resort
# character split that keeps an unbroken run of text within chunk_size.
r_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    separators=["\n\n", "\n", " ", ""],
)

r_docs = r_text_splitter.split_documents(documents)

In [None]:
# Flatten every split document into a plain dict carrying its source path
# (under "metadata") and its text (under "content").
chunked = [
    {"metadata": doc.metadata['source'], "content": doc.page_content}
    for doc in r_docs
]

## Embedding function

In [None]:
from typing import List, Optional
import vertexai
import pandas as pd

from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
# Point the Vertex AI SDK at the configured project and region before any model is used.
vertexai.init(project=PROJECT_ID, location=LOCATION)

def embed_text(
    texts: Optional[List[str]] = None,
    task: str = "RETRIEVAL_DOCUMENT",
    model_name: str = "text-embedding-004",
    dimensionality: Optional[int] = 256,
) -> List[List[float]]:
    """Embed texts with a pre-trained, foundational model.

    Args:
        texts: Strings to embed. Any iterable of strings is accepted (for
            example a pandas Series). When omitted, two sample strings are
            embedded, as before.
        task: Task type attached to every ``TextEmbeddingInput``.
        model_name: Name passed to ``TextEmbeddingModel.from_pretrained``.
        dimensionality: Forwarded as ``output_dimensionality``; a falsy value
            (``None`` or ``0``) omits the argument entirely.

    Returns:
        One embedding vector (``embedding.values``) per input text, in input
        order. An empty input returns ``[]`` without loading the model or
        sending a request.
    """
    if texts is None:
        # Built per call so no mutable default object is shared between calls.
        texts = ["Test 1 ", "test 2"]
    else:
        # Materialise first: truth-testing e.g. a pandas Series would raise.
        texts = list(texts)

    if not texts:
        return []

    model = TextEmbeddingModel.from_pretrained(model_name)

    inputs = [TextEmbeddingInput(text, task) for text in texts]
    kwargs = {"output_dimensionality": dimensionality} if dimensionality else {}
    embeddings = model.get_embeddings(inputs, **kwargs)
    return [embedding.values for embedding in embeddings]

## Generating summaries of table data

In [None]:
# Open the bucket and enumerate every object whose name starts with `prefix`;
# both the blob objects and their bare names are kept.
storage_client = storage.Client()
bucket = storage_client.get_bucket(BUCKET_NAME)
blobs_specific = [*bucket.list_blobs(prefix=prefix)]
blob_names = [entry.name for entry in blobs_specific]

In [None]:
import vertexai

from vertexai.generative_models import GenerativeModel, GenerationConfig


# Initialise the Vertex AI SDK for this cell's generative model (same project and
# location arguments as the vertexai.init call in the embedding cell).
vertexai.init(project=PROJECT_ID, location=LOCATION)

def llm_prediction(prompt:str):
    """Ask the generative model for a concise summary of a table or text chunk.

    Args:
        prompt: The table or text to summarise. Raw bytes (for example a
            downloaded blob) are decoded as UTF-8 first, so the model sees the
            content itself rather than its ``b'...'`` repr.

    Returns:
        The text of the model response, or the sentinel string ``"NA"`` when
        generating or reading the response raises. The failure is printed
        rather than discarded so it is visible in the cell output.
    """
    if isinstance(prompt, (bytes, bytearray)):
        # Undecodable bytes are replaced instead of aborting the whole run.
        prompt = bytes(prompt).decode("utf-8", errors="replace")

    model = GenerativeModel(
        model_name="gemini-1.0-pro-002",
        generation_config=GenerationConfig(
            temperature=0.0,
            top_p=1,
            top_k=10,
            candidate_count=1,
            max_output_tokens=200,
            stop_sequences=["STOP!"],
        ),
    )

    prompt_text = f"""You are an assistant tasked with summarizing tables and text. \
    Give a concise summary of the table or text. Table or text chunk: {prompt} """

    try:
        return model.generate_content(prompt_text).text
    except Exception as exc:
        # Best-effort by design: callers treat "NA" as "no summary available".
        print(f"llm_prediction failed: {type(exc).__name__}: {exc}")
        return "NA"

In [None]:
# Summarise every object under the prefix with the LLM and collect the results
# column-wise: object name, decoded content and generated summary.
table_summary = {
    "metadata": [],
    "content": [],
    "summary": [],
}
for files in blob_names:
    # Names ending in "/" are folder placeholders rather than real objects;
    # endswith() is also safe for an empty name, unlike indexing [-1].
    if files.endswith("/"):
        continue

    print("\n *** Generating summary of filename: ",files)
    blob = bucket.get_blob(files)
    if blob is None:
        # get_blob returns None when the object no longer exists.
        print(f"Skipping {files}: object not found in the bucket.")
        continue

    # Decode to text before use: formatting raw bytes into a string embeds the
    # Python repr (b'...' with escaped newlines) instead of the real content,
    # both in the LLM prompt and in the stored "content" column.
    downloaded_blob = blob.download_as_bytes().decode("utf-8", errors="replace")
    result = llm_prediction(downloaded_blob)
    if result == "NA":
        print("No text found...")
        continue

    table_summary["metadata"].append(files)
    table_summary["content"].append(f"{downloaded_blob}")
    table_summary["summary"].append(f"{result}")
    print(f"\n *** Summary of the above table in {files} is done..")

In [None]:
# `table_summary_embedding` was never created anywhere in the notebook, so this
# cell failed with a NameError on a fresh kernel. Build it from the collected
# summaries (one row per summarised object) before persisting it; the
# table-summary embedding cell further down reuses this frame.
table_summary_embedding = pd.DataFrame(table_summary)
table_summary_embedding.to_csv("table_embedding.csv", encoding='utf-8', index=False)

## Generate embeddings of the chunks created by LangChain

In [None]:
batch_size = 5
temp_chunked = chunked  # alias: both names refer to the same list of dicts

# Walk the chunks in windows of `batch_size`, embed each window in one call and
# write every vector back onto its chunk dict under "embedding".
for start in range(0, len(temp_chunked), batch_size):
    window = temp_chunked[start : start + batch_size]
    vectors = embed_text([item["content"] for item in window])
    for item, vector in zip(window, vectors):
        item["embedding"] = vector

# Store the generated embeddings in a pandas dataframe.
conf_embeddings = pd.DataFrame(temp_chunked)

In [None]:
# Persist the chunk embeddings (without the index column) for the later Chroma load.
conf_embeddings.to_csv("research_embedding.csv", index=False, encoding='utf-8')

## Generate embeddings of the table summaries

In [None]:
# Work on a copy so re-running this cell never mutates the summary frame itself.
temp_table_embedding = table_summary_embedding.copy()

# Embed the summaries in small batches, the same request size the
# chunk-embedding cell uses, instead of sending the whole column at once.
# NOTE(review): confirm the per-request input limit for the configured region.
summary_batch_size = 5
summaries = temp_table_embedding["summary"].tolist()
response = []
for i in range(0, len(summaries), summary_batch_size):
    request = summaries[i : i + summary_batch_size]
    response.extend(embed_text(request))

temp_table_embedding["embedding"] = response

In [None]:
# Persist the table summaries together with their embeddings (index column omitted).
temp_table_embedding.to_csv("table_summary_embedding.csv", index=False, encoding='utf-8')

## Storing the embeddings in ChromaDB

In [None]:
from langchain.document_loaders import CSVLoader
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from chromadb.config import Settings
import chromadb
import numpy as np
from pprint import pprint
from IPython.display import Markdown
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import pandas as pd

In [None]:
# Open (or create) the on-disk Chroma database in ./my_db and fetch the
# collection that will hold the embeddings, creating it on first use.
chroma_client = chromadb.PersistentClient(path="my_db")
collection = chroma_client.get_or_create_collection(
    name="research_paper_embeddings",
)

In [None]:
# Merge the chunk embeddings and the table-summary embeddings into one CSV.
columns_ = ["metadata", "content", "embedding"]
df_1 = pd.read_csv("research_embedding.csv", usecols=columns_)

# Read the file the table-summary cell actually writes; the previous name
# ("research_table_summary_embedding.csv") is never produced by this notebook.
df_2 = pd.read_csv("table_summary_embedding.csv", usecols=columns_)

# Pin the column order explicitly: the loading cell reads the consolidated
# file positionally (metadata, content, embedding).
df_3 = pd.concat([df_1, df_2], ignore_index=True)[columns_]
df_3.to_csv("consolidated_embeddings.csv", encoding='utf-8', index=False)

In [None]:
import csv
import json

# Load the consolidated rows (source path, text, embedding vector) and split
# them into the parallel lists Chroma expects.
# - newline='' lets the csv module handle newlines embedded in quoted text.
# - encoding='utf-8' matches the encoding the file was written with, instead
#   of depending on the platform default.
with open('consolidated_embeddings.csv', newline='', encoding='utf-8') as file:
    rows = csv.DictReader(file)  # consumes the header row; columns read by name

    # In Chroma, a "document" is the stored string, here the chunk/table text.
    documents = []

    # One metadata dict per document; "item_id" holds the source path.
    metadatas = []
    embeddings = []

    # Each document needs a unique string ID: "1", "2", ... in file order.
    ids = []

    for row_number, row in enumerate(rows, start=1):
        documents.append(row["content"])
        metadatas.append({"item_id": row["metadata"]})
        # The vector is stored as the text of a Python list ("[0.1, 0.2, ...]");
        # parse it directly rather than through the legacy np.matrix parser.
        embeddings.append([float(value) for value in json.loads(row["embedding"])])
        ids.append(str(row_number))

In [None]:
# Insert all rows into the collection in a single call; the four lists are
# parallel (entry i of each describes the same document).
collection.add(
    ids=ids,
    embeddings=embeddings,
    metadatas=metadatas,
    documents=documents,
)