## Notebook to play around with different vectorstores

### Chromadb

In [15]:
import os
import glob
from typing import List
from dotenv import load_dotenv
from multiprocessing import Pool
from tqdm import tqdm

from langchain.document_loaders import (
    CSVLoader,
    JSONLoader,
    DataFrameLoader
)

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
#from constants import CHROMA_SETTINGS

from pandas import DataFrame, to_datetime, read_parquet
load_dotenv()

import openai

import requests

from azure.data.tables import TableServiceClient, TableEntity
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from io import BytesIO
from datetime import date
import ast


In [2]:
# Load environment variables
persist_directory = os.environ.get('PERSIST_DIRECTORY')
source_directory = os.environ.get('SOURCE_DIRECTORY', 'source_documents')
embeddings_model_name = os.environ.get('EMBEDDINGS_MODEL_NAME')
chunk_size = 500
chunk_overlap = 50

In [3]:
import os
from dotenv import load_dotenv
from chromadb.config import Settings

load_dotenv()

# Define the folder for storing database
PERSIST_DIRECTORY = os.environ.get('PERSIST_DIRECTORY')

# Define the Chroma settings
CHROMA_SETTINGS = Settings(
        chroma_db_impl='duckdb+parquet',
        persist_directory=PERSIST_DIRECTORY,
        anonymized_telemetry=False
)


In [6]:
OUTLOOK_CONTENT_CONNECTION_STRING = os.environ.get('OUTLOOK_CONTENT_CONNECTION_STRING')

In [7]:
#get data from azure blob storage
def get_data(file_name):
    try:
        # Create the BlobServiceClient object which will be used
        blob_service_client = BlobServiceClient.from_connection_string(OUTLOOK_CONTENT_CONNECTION_STRING)

        container_name = 'outlookcontent'
        #get today's date
        # Create a blob client using the local file name as the name for the blob
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=file_name)
        
        #download blob
        blob = blob_client.download_blob()
        #convert blob to dataframe
        df = read_parquet(BytesIO(blob.readall()))
        
        #convert blob to dataframe
        #df =read_csv(BytesIO(blob.readall()), sep=',', encoding='utf-8')
        
    except Exception as e:
        return e.message, e.args

    else:
        return df

In [12]:
df = get_data('2023-07-04test_modindask_outlook_data.parquet')
df.shape

(1587, 13)

In [13]:
df["finish_reason"] = df['content_processed'].apply(lambda x: x["choices"][0]["finish_reason"])

In [14]:
#drop rows with finish_reason is not Error
df = df[df['finish_reason'] == 'stop']
df.shape


(506, 14)

In [18]:
#create list of dictionaries from json in content_processed column
def create_list(value):
    try:
        string_to_list = ast.literal_eval(value["choices"][0]["message"]["content"])
    except:
        string_to_list = []
    return string_to_list

In [19]:
#create new column with list of dictionaries
df['content_processed_list'] = df['content_processed'].apply(create_list)

In [20]:
failed_rows = df[df['content_processed_list'].apply(len) == 0]
failed_rows.shape

(14, 15)

In [21]:
#drop rows with empty content_processed_list
df = df[df['content_processed_list'].apply(len) > 0]
df.shape

In [22]:
#create a text from the Nachrichtkeys in the json
def create_text(data):
    output_string = ''
    for d in data:
        for key, value in d.items():
            output_string += str(key) + ': ' + str(value) + '\n'                
        output_string += '\n'  
    return output_string

In [23]:
df['text'] = df['content_processed_list'].apply(create_text)
df.shape

(492, 16)

In [30]:
#create txt for each row in dataframe column text
def create_txt(row):
    try:
        file_name = "/home/bender/GIT/CAS_AML_final/emails_processed/" + row['PartitionKey'] + '.txt'
        with open(file_name, 'w', encoding='utf-8') as f:
            f.write(row['text'])
    except Exception as e:
        return e.message, e.args
   

In [31]:
df.apply(create_txt, axis=1)

0       None
1       None
2       None
16      None
17      None
        ... 
1572    None
1573    None
1574    None
1575    None
1583    None
Length: 492, dtype: object

In [None]:
#function to create a list of dictionaries from the json in the content_processed column
def create_list(value):
    try:
        string_to_list = ast.literal_eval(value["choices"][0]["message"]["content"])
    except:
        string_to_list = []
    return string_to_list

In [8]:
# Custom document loaders
class MyElmLoader(UnstructuredEmailLoader):
    """Wrapper to fallback to text/plain when default does not work"""

    def load(self) -> List[Document]:
        """Wrapper adding fallback for elm without html"""
        try:
            try:
                doc = UnstructuredEmailLoader.load(self)
            except ValueError as e:
                if 'text/html content not found in email' in str(e):
                    # Try plain text
                    self.unstructured_kwargs["content_source"]="text/plain"
                    doc = UnstructuredEmailLoader.load(self)
                else:
                    raise
        except Exception as e:
            # Add file_path to exception message
            raise type(e)(f"{self.file_path}: {e}") from e

        return doc


# Map file extensions to document loaders and their arguments
LOADER_MAPPING = {
    ".csv": (CSVLoader, {}),
    # ".docx": (Docx2txtLoader, {}),
    ".doc": (UnstructuredWordDocumentLoader, {}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
    ".enex": (EverNoteLoader, {}),
    ".eml": (MyElmLoader, {}),
    ".epub": (UnstructuredEPubLoader, {}),
    ".html": (UnstructuredHTMLLoader, {}),
    ".md": (UnstructuredMarkdownLoader, {}),
    ".odt": (UnstructuredODTLoader, {}),
    ".pdf": (PDFMinerLoader, {}),
    ".ppt": (UnstructuredPowerPointLoader, {}),
    ".pptx": (UnstructuredPowerPointLoader, {}),
    ".txt": (TextLoader, {"encoding": "utf8"}),
    # Add more mappings for other file extensions and loaders as needed
}


In [9]:
def load_single_document(file_path: str) -> Document:
    ext = "." + file_path.rsplit(".", 1)[-1]
    if ext in LOADER_MAPPING:
        loader_class, loader_args = LOADER_MAPPING[ext]
        loader = loader_class(file_path, **loader_args)
        return loader.load()[0]

    raise ValueError(f"Unsupported file extension '{ext}'")


In [10]:
def load_documents(source_dir: str, ignored_files: List[str] = []) -> List[Document]:
    """
    Loads all documents from the source documents directory, ignoring specified files
    """
    all_files = []
    for ext in LOADER_MAPPING:
        all_files.extend(
            glob.glob(os.path.join(source_dir, f"**/*{ext}"), recursive=True)
        )
    filtered_files = [file_path for file_path in all_files if file_path not in ignored_files]

    with Pool(processes=os.cpu_count()) as pool:
        results = []
        with tqdm(total=len(filtered_files), desc='Loading new documents', ncols=80) as pbar:
            for i, doc in enumerate(pool.imap_unordered(load_single_document, filtered_files)):
                results.append(doc)
                pbar.update()

    return results

In [11]:
def process_documents(ignored_files: List[str] = []) -> List[Document]:
    """
    Load documents and split in chunks
    """
    print(f"Loading documents from {source_directory}")
    documents = load_documents(source_directory, ignored_files)
    if not documents:
        print("No new documents to load")
        exit(0)
    print(f"Loaded {len(documents)} new documents from {source_directory}")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    texts = text_splitter.split_documents(documents)
    print(f"Split into {len(texts)} chunks of text (max. {chunk_size} tokens each)")
    return texts

In [12]:
def does_vectorstore_exist(persist_directory: str) -> bool:
    """
    Checks if vectorstore exists
    """
    if os.path.exists(os.path.join(persist_directory, 'index')):
        if os.path.exists(os.path.join(persist_directory, 'chroma-collections.parquet')) and os.path.exists(os.path.join(persist_directory, 'chroma-embeddings.parquet')):
            list_index_files = glob.glob(os.path.join(persist_directory, 'index/*.bin'))
            list_index_files += glob.glob(os.path.join(persist_directory, 'index/*.pkl'))
            # At least 3 documents are needed in a working vectorstore
            if len(list_index_files) > 3:
                return True
    return False

In [None]:
def main():
    # Create embeddings
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)

    if does_vectorstore_exist(persist_directory):
        # Update and store locally vectorstore
        print(f"Appending to existing vectorstore at {persist_directory}")
        db = Chroma(persist_directory=persist_directory, embedding_function=embeddings, client_settings=CHROMA_SETTINGS)
        collection = db.get()
        texts = process_documents([metadata['source'] for metadata in collection['metadatas']])
        print(f"Creating embeddings. May take some minutes...")
        db.add_documents(texts)
    else:
        # Create and store locally vectorstore
        print("Creating new vectorstore")
        texts = process_documents()
        print(f"Creating embeddings. May take some minutes...")
        db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory, client_settings=CHROMA_SETTINGS)
    db.persist()
    db = None

    print(f"Ingestion complete! You can now run privateGPT.py to query your documents")

In [13]:
main()

### Redis

In [1]:
import redis
from redis.commands.search.indexDefinition import (
    IndexDefinition,
    IndexType
)
from redis.commands.search.query import Query
from redis.commands.search.field import (
    TextField,
    VectorField
)
from dotenv import load_dotenv
import os
import json
from azure.data.tables import TableServiceClient, TableEntity
from pandas import DataFrame, to_datetime, read_parquet
from numpy import array, float32
import openai
from datetime import datetime, date
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from io import BytesIO

In [2]:
load_dotenv()


True

In [3]:
OUTLOOK_CONTENT_CONNECTION_STRING = os.environ.get('OUTLOOK_CONTENT_CONNECTION_STRING')
OPENAI_API_KEY= os.getenv("OPENAI_API_KEY")

In [4]:
#OUTLOOK_CONTENT_CONNECTION_STRING

In [5]:
#get data from azure blob storage
def get_data(file_name):
    try:
        # Create the BlobServiceClient object which will be used
        blob_service_client = BlobServiceClient.from_connection_string(OUTLOOK_CONTENT_CONNECTION_STRING)

        container_name = 'outlookcontent'
        #get today's date
        #today = date.today().strftime('%Y-%m-%d')
        # Create a blob client using the local file name as the name for the blob
        
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=file_name)
        
        #download blob
        blob = blob_client.download_blob()
        #convert blob to dataframe
        df = read_parquet(BytesIO(blob.readall()))
        
    except Exception as e:
        return e.message, e.args

    else:
        return df


In [8]:
data = get_data("2023-06-23_outlook_data.parquet")

In [9]:
data.shape

(1375, 12)

In [None]:
data.head()

In [11]:
len(data['content_ada_embedding'][4])

1536

In [12]:
data.columns

Index(['PartitionKey', 'RowKey', 'message_id', 'subject', 'content', 'sender',
       'recipient', 'received_datetime', 'conversation_id', 'web_link',
       'content2embed', 'content_ada_embedding'],
      dtype='object')

In [16]:
#drop rows where content_ada_embedding is empty list
data = data[data['content_ada_embedding'].map(len) > 0]
data.shape

(1371, 12)

In [17]:
REDIS_PASSWORD = os.environ.get('REDIS_PASSWORD')

In [18]:
# Connect to Redis
redis_client = redis.Redis(
    host="redis-17123.c56.east-us.azure.cloud.redislabs.com",
    port=17123,
    password=REDIS_PASSWORD
)
redis_client.ping()

True

In [19]:
# Constants
VECTOR_DIM = 1536 # length of the vectors
VECTOR_NUMBER = len(data)                 # initial number of vectors
INDEX_NAME = "openai_embeddings-index"           # name of the search index
PREFIX = "outlook"                            # prefix for the document keys
DISTANCE_METRIC = "COSINE"                # distance metric for the vectors (ex. COSINE, IP, L2)

In [20]:
# Define RediSearch fields for each of the columns in the dataset
subject = TextField(name="subject")
web_link = TextField(name="web_link")
#conversation_id = TextField(name="conversation_id")
text = TextField(name="content2embed")

content_ada_embedding = VectorField("content_ada_embedding",
    "FLAT", {
        "TYPE": "FLOAT32",
        "DIM": VECTOR_DIM,
        "DISTANCE_METRIC": DISTANCE_METRIC,
        "INITIAL_CAP": VECTOR_NUMBER,
    }
)
fields = [subject, web_link, text, content_ada_embedding]

In [21]:
# Check if index exists
try:
    redis_client.ft(INDEX_NAME).info()
    print("Index already exists")
except:
    # Create RediSearch Index
    redis_client.ft(INDEX_NAME).create_index(
        fields = fields,
        definition = IndexDefinition(prefix=[PREFIX], index_type=IndexType.HASH)
)

Index already exists


In [22]:
def index_documents(client: redis.Redis, prefix: str, documents: DataFrame):
    records = documents.to_dict("records")
    for doc in records:
        key = f"{prefix}:{str(doc['message_id'])}"

        # create byte vectors for title and content
        
        content_ada_embedding = array(doc["content_ada_embedding"], dtype=float32).tobytes()

        # replace list of floats with byte vectors
        
        doc["content_ada_embedding"] = content_ada_embedding

        client.hset(key, mapping = doc)

In [23]:
data1 = data[['message_id', 'subject', 'web_link', 'content2embed', 'content_ada_embedding']]

In [24]:
index_documents(redis_client, PREFIX, data1)


ResponseError: OOM command not allowed when used memory > 'maxmemory'.

In [45]:
print(f"Loaded {redis_client.info()['Test']['keys']} documents in Redis search index with name: {INDEX_NAME}")

KeyError: 'Test'