# Indexing Chat Text

In [1]:
import os
from typing import List

import redis
import pandas as pd
from langchain.document_loaders import TextLoader, CSVLoader
from langchain.vectorstores.redis import Redis
from langchain.schema import Document
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from dotenv import load_dotenv

# Load settings from a local .env file (python-dotenv) so the os.getenv
# lookups in the configuration cell can see them.
load_dotenv()

True

In [2]:
# Runtime configuration, read from the process environment.
# Each value is None when the corresponding variable is not set.
CACHE_TYPE = os.environ.get("CACHE_TYPE")
REDIS_URL = os.environ.get("REDIS_URL")  # connection URL handed to the Redis vector store
OPENAI_COMPLETIONS_ENGINE = os.environ.get("OPENAI_COMPLETIONS_ENGINE")
INDEX_NAME = os.environ.get("INDEX_NAME")

## 1. Loading Chat Text

The [starter notebook](./0.0.1-sample-LLM-Stack-Hackathon-Starter.ipynb) indexed the data manually, using a schema that is not compatible with LangChain's internals.

Therefore, this notebook computes the embeddings again and re-indexes all chat texts through LangChain's Redis vector store.

In [3]:
# Read the chat export; every CSV row becomes one dict keyed by column name.
datasource = pd.read_csv("../data/chats.csv").to_dict(orient="records")

# Wrap each row in a LangChain Document: the message text is the content to be
# embedded, while the channel name and thread id are carried along as metadata.
chat_texts = [
    Document(
        page_content=row["chat_text"],
        metadata={field: row[field] for field in ("channel_name", "thread_id")},
    )
    for row in datasource
]

## 2. Indexing

In [4]:
# Embedding model used to vectorise every chat Document.
embeddings = OpenAIEmbeddings()

# Embed all chat Documents and store them in the Redis index "chat_index".
# NOTE(review): the index name is hard-coded instead of using INDEX_NAME —
# presumably to keep it apart from the starter notebook's index; confirm.
# NOTE(review): the search output further down contains each hit twice, which
# may mean this cell adds the documents again on every run — verify before
# re-running.
vectorstore = Redis.from_documents(
    chat_texts,
    embeddings,
    index_name="chat_index",
    redis_url=REDIS_URL,
)

In [5]:
# Smoke test: run an arbitrary query to confirm the index returns documents.
vectorstore.similarity_search(query="foobar")

[Document(page_content='U019ABXBYET: <https://bard.google.com/>', metadata={'channel_name': 'generative-ai', 'thread_id': '2023-03-24 15:56:52.108999 UTC'}),
 Document(page_content='U019ABXBYET: <https://bard.google.com/>', metadata={'channel_name': 'generative-ai', 'thread_id': '2023-03-24 15:56:52.108999 UTC'}),
 Document(page_content='U01G8F6E38T: <https://www.youtube.com/watch?v=OWIxzE2D7Xk>', metadata={'channel_name': 'random', 'thread_id': '2023-04-24 17:24:07.087339 UTC'}),
 Document(page_content='U01G8F6E38T: <https://www.youtube.com/watch?v=OWIxzE2D7Xk>', metadata={'channel_name': 'random', 'thread_id': '2023-04-24 17:24:07.087339 UTC'})]