# Install Packages

In [21]:
# Install necessary packages
!pip install weaviate-client langchain openai unstructured "unstructured[pdf]" -U langchain-community pymupdf -U langchain-openai pytesseract pillow transformers torch torchvision torchaudio pydub SpeechRecognition

!sudo apt-get update && sudo apt-get install -y tesseract-ocr ffmpeg

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Ign:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:8 https://r2u.stat.illinois.edu/ubuntu jammy Release
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading pack

In [22]:
# youtube-transcript-api: presumably needed by YoutubeLoader to fetch video transcripts — confirm
!pip install youtube-transcript-api



In [23]:
# Upgrade langchain-openai (also listed in the first install cell); OpenAIEmbeddings and
# OpenAI are imported from this package further down.
!pip install -U langchain-openai



In [24]:
# pytube: presumably used by YoutubeLoader when add_video_info=True — confirm
!pip install pytube



In [25]:
pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib



# Import Libraries

In [26]:
# Import Libraries
from langchain.document_loaders import DirectoryLoader, PyMuPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Weaviate
import weaviate
from langchain.schema import Document
from langchain.document_loaders.image import UnstructuredImageLoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
import os
from pydub import AudioSegment
import speech_recognition as sr

# Get the API Secrets

In [27]:
# Mount Google Drive and read the credentials from Colab's secret store
from google.colab import drive, userdata

drive.mount('/content/drive')

OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
WEAVIATE_API_KEY = userdata.get('WEAVIATE_API_KEY')
WEAVIATE_URL = userdata.get('WEAVIATE_URL')
YOUTUBE_URL = userdata.get('YOUTUBE_URL')

# One status line per secret; only presence is reported, the values are never printed.
secret_status_messages = (
    (OPENAI_API_KEY,
     "OpenAI API key retrieved successfully",
     "OpenAI API key not found in Colab secrets"),
    (WEAVIATE_API_KEY,
     "Weaviate API key retrieved successfully",
     "Weaviate API key not found in Colab secrets"),
    (WEAVIATE_URL,
     "Weaviate URL retrieved successfully",
     "Weaviate URL not found in Colab secrets"),
    (YOUTUBE_URL,
     "YOUTUBE URL retrieved successfully",
     "YOUTUBE URL not found in Colab secrets"),
)

for secret_value, found_message, missing_message in secret_status_messages:
    print(found_message if secret_value else missing_message)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
OpenAI API key retrieved successfully
Weaviate API key retrieved successfully
Weaviate URL retrieved successfully
YOUTUBE URL retrieved successfully


# Import Packages to Load Documents

In [28]:
# Import Load document packages
from langchain.document_loaders import (
    DirectoryLoader,
    UnstructuredImageLoader,
    PyMuPDFLoader,
    CSVLoader,
    YoutubeLoader
)
from langchain.document_loaders.image import UnstructuredImageLoader
from langchain.document_loaders.csv_loader import CSVLoader


# Define the loaders for the data

In [29]:
# Define one loader per modality; nothing is read until .load() is called.
# Plain-text files are read with TextLoader (imported at the top of the notebook);
# PyMuPDFLoader is kept for PDFs only.
text_loader = DirectoryLoader('/content/drive/MyDrive/Dallas AI/rag/text/', glob="**/*.txt", loader_cls=TextLoader)
pdf_loader = DirectoryLoader('/content/drive/MyDrive/Dallas AI/rag/text/', glob="**/*.pdf", loader_cls=PyMuPDFLoader)
# pathlib-style globbing has no brace expansion, so "**/*.{png,jpg,jpeg}" matches nothing;
# pass one pattern per extension instead.
image_loader = DirectoryLoader(
    '/content/drive/MyDrive/Dallas AI/rag/images/',
    glob=["**/*.png", "**/*.jpg", "**/*.jpeg"],
    loader_cls=UnstructuredImageLoader,
)
csv_loader = DirectoryLoader('/content/drive/MyDrive/Dallas AI/rag/tables/', glob="**/*.csv", loader_cls=CSVLoader)
# The video comes from the YOUTUBE_URL secret; add_video_info=True also requests the video's metadata.
video_loader = YoutubeLoader.from_youtube_url(YOUTUBE_URL, add_video_info=True)



In [30]:
# Load data from each modality
# Each call is a separate statement so that, if one loader fails, the results of the
# loaders that already ran stay available in the kernel.
text_data = text_loader.load()
pdf_data = pdf_loader.load()
image_data = image_loader.load()
csv_data = csv_loader.load()
video_data = video_loader.load()

In [33]:
# Report how many Document objects each loader produced
print(f"Number of text documents: {len(text_data)}")
print(f"Number of PDF documents: {len(pdf_data)}")
print(f"Number of image documents: {len(image_data)}")
print(f"Number of CSV documents: {len(csv_data)}")
print(f"Number of video documents: {len(video_data)}")


Number of text documents: 0
Number of text documents: 73
Number of image documents: 0
Number of CSV documents: 85
Number of video documents: 1


# Combine all the data into one
# i.e all_data

In [34]:
# Merge the per-modality document lists into one corpus, keeping the
# text -> PDF -> image -> CSV -> video order
all_data = [*text_data, *pdf_data, *image_data, *csv_data, *video_data]

In [35]:
# Preview only the first few documents; displaying the whole corpus floods the notebook output
all_data[:3]

[Document(metadata={'source': '/content/drive/MyDrive/Dallas AI/rag/text/Attention_Is_All_You_Need_Transformer.pdf', 'file_path': '/content/drive/MyDrive/Dallas AI/rag/text/Attention_Is_All_You_Need_Transformer.pdf', 'page': 0, 'total_pages': 15, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'LaTeX with hyperref package', 'producer': 'pdfTeX-1.40.17', 'creationDate': 'D:20171207010315Z', 'modDate': 'D:20171207010315Z', 'trapped': ''}, page_content='Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based o

# Data Splitting
# Split the data (all_data) into chunks

In [36]:
# Break every loaded document (all modalities) into overlapping chunks
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 20

text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)
all_docs = text_splitter.split_documents(all_data)

In [37]:
# Preview only the first few chunks; displaying every chunk floods the notebook output
all_docs[:3]

[Document(metadata={'source': '/content/drive/MyDrive/Dallas AI/rag/text/Attention_Is_All_You_Need_Transformer.pdf', 'file_path': '/content/drive/MyDrive/Dallas AI/rag/text/Attention_Is_All_You_Need_Transformer.pdf', 'page': 0, 'total_pages': 15, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'LaTeX with hyperref package', 'producer': 'pdfTeX-1.40.17', 'creationDate': 'D:20171207010315Z', 'modDate': 'D:20171207010315Z', 'trapped': ''}, page_content='Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based o

# Embeddings
# Embedding Conversions

In [38]:
# Publish the OpenAI key in the process environment
# (presumably so libraries that read OPENAI_API_KEY from the environment can find it)
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [39]:
# Create the embeddings object
from langchain_openai import OpenAIEmbeddings

# NOTE(review): `embeddings` is not passed to the Weaviate vector store created later in
# this notebook; the schema's "text2vec-openai" vectorizer appears to do the embedding on
# the Weaviate side — confirm whether this object is still needed.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [40]:
# Display the embeddings object's configuration (its repr)
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x79d18b8f1810>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x79d18b8f3130>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

# Weaviate Vector Database Connection
# Connect to Weaviate Vector Database to store data in vector store

In [41]:
# Connect to the Weaviate instance that will hold the vectors
import weaviate
from langchain.vectorstores import Weaviate

# API-key credentials for the Weaviate instance itself
auth_config = weaviate.auth.AuthApiKey(api_key=WEAVIATE_API_KEY)

# The OpenAI key is sent as a request header
# (presumably consumed by the text2vec-openai vectorizer — confirm)
connection_options = {
    "url": WEAVIATE_URL,
    "auth_client_secret": auth_config,
    "additional_headers": {"X-OpenAI-Api-key": OPENAI_API_KEY},
    "startup_period": 10,
}
client = weaviate.Client(**connection_options)

# Readiness probe as the cell's last expression; a live instance displays `True`
client.is_ready()

True

# Schema for Multimodal Data

In [42]:
# (Re)create the MultimodalExample class so this cell can be re-run.
# The class is deleted first (presumably together with any objects stored under it).
client.schema.delete_class('MultimodalExample')

# Every property is plain text: the chunk content, its source and its modality type
text_properties = [
    {"dataType": ["text"], "name": property_name}
    for property_name in ("content", "source", "type")
]

schema = {
    "classes": [
        {
            "class": "MultimodalExample",
            "vectorizer": "text2vec-openai",
            "properties": text_properties,
        }
    ]
}

client.schema.create(schema)

In [43]:
# List objects stored in the Weaviate instance.
# NOTE(review): the recorded output contains objects of other classes
# (e.g. 'A_collection_of_articles'), so this listing is not limited to MultimodalExample.
client.data_object.get()

{'deprecations': None,
 'objects': [{'class': 'A_collection_of_articles',
   'creationTimeUnix': 1723422754826,
   'id': '00075531-1281-4a0d-acf9-3f265987132c',
   'lastUpdateTimeUnix': 1723422754826,
   'properties': {'row': 13,
    'source': '/content/drive/MyDrive/Dallas AI/rag/tables/Employment_data.csv',
    'text': '\ufeff"Comparison of All Employee Average Weekly Hours: \nSeasonally Adjusted: \nbefore and after the March 2023 Benchmark": \n: None'},
   'vectorWeights': None},
  {'class': 'A_collection_of_articles',
   'creationTimeUnix': 1723415995433,
   'id': '002961df-dc23-4b0e-bd31-a4cfefc62f46',
   'lastUpdateTimeUnix': 1723415995433,
   'properties': {'author': 'U.S. Bureau of Labor Statistics',
    'creationDate': "D:20240801191110-04'00'",
    'creator': 'Microsoft® Word for Microsoft 365',
    'file_path': '/content/drive/MyDrive/Dallas AI/rag/text/current_employment_statistics_highlights_07_2024.pdf',
    'format': 'PDF 1.7',
    'keywords': '',
    'modDate': "D:20240

# Vector Store
# Load Text data into Vector Store

In [44]:
# Be patient and wait! This takes a few minutes to load
# Initialize the Weaviate vectorstore. text_key names the property that stores the chunk
# text; "content" is the text property declared in the MultimodalExample schema.
vectorstore = Weaviate(client, index_name="MultimodalExample", text_key="content")

# Gather the text and metadata of every chunk, then upload each chunk exactly once in a
# single bulk call, so similarity searches do not return the same chunk twice.
# Plain lists are used so an empty `all_docs` is handled too.
# The loop variable `doc` stays bound to the last chunk; a later cell displays it.
texts, metadatas = [], []
for doc in all_docs:
    texts.append(doc.page_content)
    metadatas.append(doc.metadata)

# Last expression: the list of object ids assigned to the uploaded chunks
vectorstore.add_texts(texts, metadatas)


['8cae7cef-5206-41f8-a460-126d8e0bc745',
 '32e2d4d7-fa4b-4822-85c6-ef47ebb2e1ff',
 '30448fae-7276-43d9-90c9-d6bb787d80e9',
 '29e0f483-220b-45a2-a51f-2d9ce66acae4',
 'f9b60fca-a193-4b95-b97a-5de6e236fb4f',
 'a1ac4078-59a7-497f-b4d8-1fb08602e100',
 '6bbe5266-87c6-4e34-a331-12329ff54a0f',
 'c4382cba-a484-465b-b00d-f3cc107f655b',
 '33f07bae-631f-4ebb-b1f8-9840e5dd10ad',
 '1ee01012-e3bb-4e84-8356-baa9afc64a9a',
 '31f78dfe-e014-465a-ab0e-4e8124d1745c',
 '13df6c8d-c886-4df4-a34d-c9aa658bac10',
 '44cd65b2-4b98-477b-bcbc-75ab6e91734b',
 '375a29df-af0f-4de7-93cd-f400a618e744',
 'e8b0140b-fd21-40d9-8323-fa049af62ae8',
 '311f485d-7639-44b9-b575-fdb4468de13d',
 'bcf16bbc-f792-44d8-b2a4-b1fa8e4bfa78',
 'b721d586-3a0b-4c9b-89c0-3e07ac4fd908',
 '14c00e07-eeab-4e2c-9293-2a93ab9499e9',
 'f0281e17-bf2b-459c-95d9-2110ae130cc3',
 '14728249-806b-443e-bcfb-9c60d193f877',
 '8e8280bb-70af-49b6-9988-6d52c2931b47',
 '4b979623-9bdc-4f3b-b8ca-e89a50eaa51e',
 'fbd75c72-e19b-490b-8b5d-844666b528fd',
 'a4e8e2b3-84df-

In [45]:
# Display the vector store object's repr
vectorstore

<langchain_community.vectorstores.weaviate.Weaviate at 0x79d18b909330>

In [46]:
# Inspect `doc`: the loop variable left over from the ingestion loop over all_docs,
# i.e. the last chunk that was processed
doc

Document(metadata={'source': 'fcogIjcb2gc', 'title': 'RAG Architecture Explained: Practical Example in 5 Minutes!', 'description': 'Unknown', 'view_count': 97, 'thumbnail_url': 'https://i.ytimg.com/vi/fcogIjcb2gc/hq720.jpg?sqp=-oaymwEmCIAKENAF8quKqQMa8AEB-AH-CYAC0AWKAgwIABABGBsgUSh_MA8=&rs=AOn4CLCt-Mb6WoRv0trcJJ76zJGYN3jj8Q', 'publish_date': '2024-06-28 00:00:00', 'length': 308, 'author': 'Ajay Gupta'}, page_content="skills are thiss additionally user is also proficient in the following uh languages tools and Technologies that's what we were looking for so this is what a rag example looks like we'll understand this uh going forward uh that's it for this video thank you for watching have a good day")

In [47]:
# Preview only the first few chunks; displaying every chunk floods the notebook output
all_docs[:3]

[Document(metadata={'source': '/content/drive/MyDrive/Dallas AI/rag/text/Attention_Is_All_You_Need_Transformer.pdf', 'file_path': '/content/drive/MyDrive/Dallas AI/rag/text/Attention_Is_All_You_Need_Transformer.pdf', 'page': 0, 'total_pages': 15, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'LaTeX with hyperref package', 'producer': 'pdfTeX-1.40.17', 'creationDate': 'D:20171207010315Z', 'modDate': 'D:20171207010315Z', 'trapped': ''}, page_content='Attention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based o

# Score the closeness of the vectors for the stored documents

In [52]:
def query_weaviate(query, collection_name, properties=None, limit=10, distance=0.7,
                   weaviate_client=None):
    """Run a nearText search against one Weaviate collection and return the hits.

    Args:
        query: Text to search for; sent as the single nearText concept.
        collection_name: Name of the Weaviate class to query.
        properties: Properties to request for each hit. Defaults to
            ``["title", "content", "url", "_additional {certainty distance}"]``.
        limit: Maximum number of hits requested (default 10).
        distance: Value passed as the nearText ``distance`` threshold (default 0.7).
        weaviate_client: Client to query. Defaults to the notebook-level ``client``.

    Returns:
        The list stored under ``data -> Get -> <collection_name>`` in the response.

    Raises:
        RuntimeError: If the response contains an ``errors`` entry. The message
            carries the error text returned by Weaviate.
    """
    # Resolve the notebook-level client lazily so a stub can be injected instead.
    active_client = client if weaviate_client is None else weaviate_client

    if properties is None:
        properties = [
            "title", "content", "url",
            "_additional {certainty distance}"
        ]

    near_text = {
        "concepts": [query],
        "distance": distance,
    }

    result = (
        active_client.query
        .get(collection_name, list(properties))
        .with_near_text(near_text)
        .with_limit(limit)
        .do()
    )

    # Surface the real error text instead of guessing at the cause: an unknown
    # class or property ends up here just like a rate limit does.
    if "errors" in result:
        messages = "; ".join(
            str(error.get("message", error)) if isinstance(error, dict) else str(error)
            for error in result["errors"]
        )
        raise RuntimeError(f"Weaviate query failed: {messages}")

    return result["data"]["Get"][collection_name]

In [54]:
# NOTE(review): the search text is the literal string "all_docs" (not the all_docs list)
# and the queried class is "Article", while ingestion in this notebook writes to
# "MultimodalExample"; the recorded titles are all None — confirm both arguments.
query_result = query_weaviate("all_docs", "Article")

# Print a 1-based ranking of the hits with the certainty rounded to three decimals
for rank, article in enumerate(query_result, start=1):
    title = article['title']
    certainty = round(article['_additional']['certainty'], 3)
    print(f"{rank}. {title} (Score: {certainty})")

1. None (Score: 0.894)
2. None (Score: 0.894)
3. None (Score: 0.894)
4. None (Score: 0.894)
5. None (Score: 0.894)
6. None (Score: 0.894)
7. None (Score: 0.894)
8. None (Score: 0.894)
9. None (Score: 0.894)
10. None (Score: 0.894)


# Initialize the Q&A chain

In [55]:
from langchain_openai import OpenAI

# Completion model used to write the answers; temperature=0 for the least random output
llm = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)

# Question-answering chain using LangChain's "stuff" document-combination strategy
chain = load_qa_chain(llm, chain_type="stuff")


# Similarity Search
# This searches for vectors (data) that are similar in the vectorstore

In [58]:
query = "what is a Transformer?"

# retrieve the 2 chunks most similar to the query
# (the result limit is `k`; an unrecognised `top_k` keyword is ignored and the default k=4 applies)
docs = vectorstore.similarity_search(query, k=2)

In [59]:
# Generate the answer to `query` from the retrieved chunks in `docs`
answer = chain.run(question=query, input_documents=docs)
answer

' The Transformer is a neural network architecture that relies entirely on self-attention to compute representations of its input and output, without using sequence-aligned RNNs or convolution. It has been shown to outperform other models on tasks such as machine translation and language modeling.'

In [60]:
query = "Why are there two monthly measures of employment?"

# retrieve the 2 chunks most similar to the query
# (the result limit is `k`; an unrecognised `top_k` keyword is ignored and the default k=4 applies)
docs = vectorstore.similarity_search(query, k=2)

In [61]:
# Generate the answer to `query` from the retrieved chunks in `docs`
answer = chain.run(question=query, input_documents=docs)
answer

' The household survey and establishment survey both produce sample-based estimates of employment, and both have strengths and limitations. The establishment survey has a smaller margin of error and a more expansive scope, while the household survey includes self-employed workers and other groups that are excluded by the establishment survey.'

In [62]:
query = "Hurricane Beryl"

# retrieve the 2 chunks most similar to the query
# (the result limit is `k`; an unrecognised `top_k` keyword is ignored and the default k=4 applies)
docs = vectorstore.similarity_search(query, k=2)

In [63]:
# Generate the answer to `query` from the retrieved chunks in `docs`
answer = chain.run(question=query, input_documents=docs)
answer

' Hurricane Beryl had no discernible effect on employment and hours estimates for the reference periods in both the household and establishment surveys. However, severe weather can potentially impact average weekly hours in the establishment survey and the number of people working full time in the household survey. The Bureau of Labor Statistics collects data on the impact of severe weather on employment and hours estimates, but it is not always possible to precisely quantify the effect. '

In [64]:

query = "what is the unemployment rate in July"

# retrieve the 2 chunks most similar to the query
# (the result limit is `k`; an unrecognised `top_k` keyword is ignored and the default k=4 applies)
docs = vectorstore.similarity_search(query, k=2)



In [65]:
# Generate the answer to `query` from the retrieved chunks in `docs`
answer = chain.run(question=query, input_documents=docs)
answer

' The unemployment rate in July was 4.3 percent.'

In [66]:
query = "Health care added how many jobs in July"

# retrieve the 2 chunks most similar to the query
# (the result limit is `k`; an unrecognised `top_k` keyword is ignored and the default k=4 applies)
docs = vectorstore.similarity_search(query, k=2)

In [67]:
# Generate the answer to `query` from the retrieved chunks in `docs`
answer = chain.run(question=query, input_documents=docs)
answer

' 55,000'

In [68]:
query = "What is Encoder"

# retrieve the 2 chunks most similar to the query
# (the result limit is `k`; an unrecognised `top_k` keyword is ignored and the default k=4 applies)
docs = vectorstore.similarity_search(query, k=2)


In [69]:
# Generate the answer to `query` from the retrieved chunks in `docs`
answer = chain.run(question=query, input_documents=docs)
answer

' The encoder is a component of the Transformer model architecture that is composed of a stack of identical layers. Each layer has two sub-layers, a multi-head self-attention mechanism and a simple, position-wise fully connected feed-forward network. The encoder is responsible for mapping an input sequence of symbol representations to a sequence of continuous representations.'

In [70]:
query = "Comparison of All Employee Average Weekly Hours, Seasonally Adjusted, before and after the March 2023 Benchmark"

# retrieve the 2 chunks most similar to the query
# (the result limit is `k`; an unrecognised `top_k` keyword is ignored and the default k=4 applies)
docs = vectorstore.similarity_search(query, k=2)


In [71]:
# Generate the answer to `query` from the retrieved chunks in `docs`
answer = chain.run(question=query, input_documents=docs)
answer

' This context is discussing the comparison of average weekly hours for all employees before and after the March 2023 Benchmark, with the data being seasonally adjusted. It is not providing any specific information or data, but rather just stating the topic of the comparison.'

In [72]:
query = "What is the Current Employment Statistics Summary, July 2024"

# retrieve the 2 chunks most similar to the query
# (the result limit is `k`; an unrecognised `top_k` keyword is ignored and the default k=4 applies)
docs = vectorstore.similarity_search(query, k=2)

In [73]:
# Generate the answer to `query` from the retrieved chunks in `docs`
answer = chain.run(question=query, input_documents=docs)
answer

' The Current Employment Statistics Summary is a report released on August 2, 2024 that provides an overview of employment trends in various industries, including social assistance, construction, nonfarm payroll, health care, transportation and warehousing, information, and government. It also includes revisions to employment data from previous months.'

In [74]:
query = "RAG Architecture Explained: Practical Example in 5 Minutes"

# retrieve the 2 chunks most similar to the query
# (the result limit is `k`; an unrecognised `top_k` keyword is ignored and the default k=4 applies)
docs = vectorstore.similarity_search(query, k=2)


In [75]:
# Generate the answer to `query` from the retrieved chunks in `docs`
answer = chain.run(question=query, input_documents=docs)
answer

' The RAG architecture is a combination of indexing and retrieval/augmented generation. The indexing part involves converting documents into text format and storing them in a vector database. The retrieval/augmented generation part involves sending a user query to the vector database and finding similar text, which is then used to provide context for the LLM to generate an answer. A practical example of this would involve loading a resume into a document, converting it to raw text, and using text splitter and embeddings to load it into the vector database.'