# Install Dependencies

In [None]:
!pip install langchain
!pip install unstructured
!pip install openai
!pip install chromadb
!pip install Cython
!pip install tiktoken
!pip install azure-storage-blob



# Imports and API Key

In [None]:
from langchain.document_loaders import UnstructuredPDFLoader, DirectoryLoader, UnstructuredPowerPointLoader, Docx2txtLoader
from langchain.document_loaders import AzureBlobStorageContainerLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter

In [None]:
import os
from getpass import getpass

# Keep a key that is already exported in the environment; otherwise prompt for
# it without echoing, so the secret is never typed into the notebook source.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")

# Google Drive Mount and Import (don't execute)

In [None]:
from google.colab import drive

# Colab-only step: mount Google Drive at the path the later cells read from.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#don't execute

# Single definition of the Drive folder. "MyDrive" is the spelling used by the
# directory loaders in this notebook, so every cell points at the same path.
root_dir = "/content/drive/MyDrive"
pdf_folder_path = f'{root_dir}/Knowledge Base/'

# Show the folder contents (displayed as the cell output).
os.listdir(pdf_folder_path)

['Northwind_Standard_Benefits_Details.pdf',
 'CauseAndEffectOfHomelessness.txt',
 'CauseAndEffectOfHomelessness2.txt',
 'CauseAndEffectOfHomelessness3.txt',
 'Northwind_Health_Plus_Benefits_Details (1).pdf',
 'Benefit_Options.pdf',
 'PerksPlus.pdf',
 'role_library.pdf',
 'File Viewer Migration factory.pdf']

# File and Directory Loaders (don't execute)

In [None]:
# One UnstructuredPDFLoader per PDF in the folder. The folder also contains
# non-PDF files (e.g. .txt), which must not be handed to the PDF loader.
loaders = [
    UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn))
    for fn in os.listdir(pdf_folder_path)
    if fn.lower().endswith('.pdf')
]

In [None]:
# One DirectoryLoader per file extension, all rooted at the same Drive folder.
kb_dir = '/content/drive/MyDrive/Knowledge Base'
text_loader = DirectoryLoader(kb_dir, glob='**/*.txt')
pdf_loader = DirectoryLoader(kb_dir, glob='**/*.pdf')
readme_loader = DirectoryLoader(kb_dir, glob='**/*.md')
doc_loader = DirectoryLoader(kb_dir, glob='**/*.docx')
ppt_loader = DirectoryLoader(kb_dir, glob='**/*.pptx')

In [None]:
# Run every directory loader in order and flatten the results into one list.
loaders = [pdf_loader, readme_loader, text_loader, doc_loader, ppt_loader]
documents = [doc for loader in loaders for doc in loader.load()]

In [None]:
# Report how many documents the loaders produced.
doc_count = len(documents)
print(f'You have {doc_count} documents in your data')

You have 11 documents in your data


# Vector Store Index Creator for Directory Loader (don't execute)
1. Splitting documents into chunks
2. Creating embeddings for each document
3. Storing documents and embeddings in a vectorstore

In [None]:
# Build the index in one step: split (chunk_size=1000, no overlap), embed with
# OpenAI embeddings, and store in Chroma.
# `loaders` is already a list of loaders, so it is passed as-is; wrapping it in
# another list would hand from_loaders a list object where a loader is expected.
index = VectorstoreIndexCreator(
    vectorstore_cls=Chroma,
    embedding=OpenAIEmbeddings(),
    text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0),
).from_loaders(loaders)
index



VectorStoreIndexWrapper(vectorstore=<langchain.vectorstores.chroma.Chroma object at 0x7f0c3ac27490>)

# Blob Loader with VectorstoreIndex Creator

In [None]:
# Loader that reads every blob in one Azure Storage container.
blob_conn_str = "DefaultEndpointsProtocol=https;AccountName=ishitagptblob1;"
blob_container = "ishita-container-langchain"
loaders = AzureBlobStorageContainerLoader(
    conn_str=blob_conn_str,
    container=blob_container,
)

In [None]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

In [None]:
# Pull the documents from the container loader and report how many arrived.
documents = loaders.load()
doc_total = len(documents)
print(f'You have {doc_total} documents in your data')

You have 17 documents in your data


In [None]:
# Default index creator; `loaders` is a single loader here, so it is wrapped
# in a list for from_loaders.
index_creator = VectorstoreIndexCreator()
index = index_creator.from_loaders([loaders])

# Text Splitter, Embeddings, VectorStore and Index
**Not needed, because `VectorstoreIndexCreator` is a wrapper around this logic.**

In [None]:
# Manual version of the pipeline: split -> embed -> store -> retrieve -> answer.

# Split the loaded documents into chunks (chunk_size=1000, no overlap) and
# report how many chunks resulted.
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
documents = text_splitter.split_documents(documents)
print(len(documents))

# Embedding model used for the chunks.
embeddings = OpenAIEmbeddings()

# Vector store that serves as the index.
db = Chroma.from_documents(documents=documents, embedding=embeddings)

# Retriever interface over the vector store.
retriever = db.as_retriever()

# Question-answering chain that uses the retriever.
qa = RetrievalQA.from_chain_type(
    retriever=retriever,
    chain_type="stuff",
    llm=OpenAI(),
)

# Ask a question; the answer is displayed as the cell output.
query = "What is the Northwind Standard Health Plan"
qa.run(query)

# Query and Source Query with multiple file formats

In [None]:
#with pdf
question = 'What is the Northwind Standard Health Plan'
print(index.query(question))

# Fetch the sources once and reuse them; calling query_with_sources a second
# time with the same question would only repeat the request.
sources = index.query_with_sources(question)['sources']
print(sources)
print(sources.rsplit('/', 1)[-1])  # text after the last '/', i.e. the last listed file name

 The Northwind Standard Health Plan is a comprehensive health plan that provides coverage for medical, vision, and dental services, as well as preventive care services and prescription drug coverage. It offers a variety of in-network providers, including primary care physicians, specialists, hospitals, and pharmacies. It does not offer coverage for emergency services, mental health and substance abuse coverage, or out-of-network services.
/tmp/tmptnb6rux_/ishita-container-langchain/Northwind_Standard_Benefits_Details.pdf, /tmp/tmpb5mb1qi7/ishita-container-langchain/Benefit_Options.pdf
Benefit_Options.pdf


In [None]:
#with md
print(index.query('How to setup CMAV tool'))
print(index.query_with_sources('How to setup CMAV tool')['sources'])
print((index.query_with_sources('How to setup CMAV tool')['sources']).rsplit('/', 1)[-1])

 To setup the CMAV tool, first download and unzip the file. Then open the “CloudMigrationAssessmentAndValidation.sln” file in Visual Studio or any other IDE. After that, open the “Configuration.json” file and set the values for different keys, according to the instructions provided. Finally, make sure that Python is installed in your IDE or system.
/tmp/tmpduz3xsyq/ishita-container-langchain/test5.pdf, /tmp/tmpqluldfpn/ishita-container-langchain/Wiki.md
Wiki.md


In [None]:
#with txt
print(index.query('What are the effects of homelessness'))
print(index.query_with_sources('What are the effects of homelessness')['sources'])
print((index.query_with_sources('What are the effects of homelessnesss')['sources']).rsplit('/', 1)[-1])

 The effects of homelessness can include poor health, personal and psychological decline, decreased access to opportunity, loss of job or income, poverty, substance abuse, violence in the home, and disability and illness.
/tmp/tmp6ccv4j3o/ishita-container-langchain/CauseAndEffectOfHomelessness3.txt, /tmp/tmppcos27zq/ishita-container-langchain/CauseAndEffectOfHomelessness2.txt
CauseAndEffectOfHomelessness2.txt


In [None]:
#with doc
print(index.query('Tell me about criminal violence against Black Americans'))
print(index.query_with_sources('Tell me about criminal violence against Black Americans')['sources'])
print((index.query_with_sources('Tell me about criminal violence against Black Americans')['sources']).rsplit('/', 1)[-1])

 I'm sorry, I don't know.
/tmp/tmpzf4kdryi/ishita-container-langchain/test4.pdf
test4.pdf


In [None]:
#with ppt
print(index.query('Problem Statement Related to Migration'))
print(index.query_with_sources('Problem Statement Related to Migration')['sources'])
print((index.query_with_sources('Problem Statement Related to Migration')['sources']).rsplit('/', 1)[-1])

 The problem statement related to migration is that over a decade there are many artifacts built to support the growing need of Standard Operating Procedures, tools, User guides, Checklists/playbooks, automations, technical trackers and learnings. As an initial prototype, we are planning to build a ChatGPT kind of solution to help Factory team members to identify the artifacts and information that would enable them to consume the info more efficiently and will optimize their delivery.
/tmp/tmpa27iixqq/ishita-container-langchain/test3.pdf, /tmp/tmposl3latb/ishita-container-langchain/OPEN AI-BASED FILE RETRIEVAL SYSTEM (1).pptx
OPEN AI-BASED FILE RETRIEVAL SYSTEM (1).pptx


# Downloadable File Link

In [None]:
from azure.storage.blob import BlobServiceClient

In [None]:
# Connection string of the storage account used in the cells below.
connection_string = "DefaultEndpointsProtocol=https;AccountName=ishitagptblob1;EndpointSuffix=core.windows.net"

# Account-level client built from that connection string.
blob_service_client = BlobServiceClient.from_connection_string(conn_str=connection_string)

In [None]:
container_name = "ishita-container-langchain"

# Trim surrounding whitespace so a stray space does not turn an existing blob
# name into a miss in the exact-match check.
file_name = input("Enter file name: ").strip()

Enter file name: test1.pdf


In [None]:
# Clients for the container and for the requested blob inside it.
container_client = blob_service_client.get_container_client(container_name)
blob_client = container_client.get_blob_client(file_name)

# Collect the name of every blob in the container and print one per line.
blob_list = [item.name for item in container_client.list_blobs()]
print(*blob_list, sep='\n')

Benefit_Options.pdf
CauseAndEffectOfHomelessness.txt
CauseAndEffectOfHomelessness2.txt
CauseAndEffectOfHomelessness3.txt
Crime with Violence in USA and SA.docx
EmpSampledata.csv
Northwind_Health_Plus_Benefits_Details (1).pdf
Northwind_Standard_Benefits_Details.pdf
OPEN AI-BASED FILE RETRIEVAL SYSTEM (1).pptx
PerksPlus.pdf
Wiki.md
role_library.pdf
test1.pdf
test2.pdf
test3.pdf
test4.pdf
test5.pdf


In [None]:
from datetime import datetime, timedelta
from azure.storage.blob import generate_blob_sas, BlobSasPermissions

In [None]:
# Generate the SAS token for the blob
# Grant limited access to Azure Storage resources using shared access signatures (SAS)
expiry = datetime.utcnow() + timedelta(hours=1)
sas_token = generate_blob_sas(
    account_name=blob_service_client.account_name,
    container_name=container_name,
    blob_name=file_name,
    account_key=blob_service_client.credential.account_key,
    permission=BlobSasPermissions(read=True),
    expiry=expiry,
)

In [None]:
# Create the downloadable link by combining the blob URL and the SAS token
blob_url = blob_client.url
download_link = f"{blob_url}?{sas_token}"

# Print the download link
print(download_link)

https://ishitagptblob1.blob.core.windows.net/ishita-container-langchain/test1.pdf?se=2023-06-28T18%3A00%3A08Z&sp=r&sv=2022-11-02&sr=b&sig=o1eT1eXmwshStOetYULt5koK6R35jXGjhfZgwVYudQE%3D


In [None]:
!pip install fuzzywuzzy
from fuzzywuzzy import fuzz, process



In [None]:
#check if blob exists or does not exist
if file_name in blob_list:
    print(f"The blob '{file_name}' exists.")
    blob_url = blob_client.url
    download_link = f"{blob_url}?{sas_token}"
else:
    closest_match, similarity = process.extractOne(file_name, blob_list)
    print(f"The blob '{file_name}' does not exist. Did you mean '{closest_match}'? (Similarity: {similarity})")
    matches = process.extract(file_name, blob_list, scorer=fuzz.ratio, limit=1)

    suggested_blob_name = matches[0][0]
    suggested_blob_client = blob_service_client.get_blob_client(container=container_name, blob=suggested_blob_name)
    sas_token = suggested_blob_client.generate_shared_access_signature(permission="r")
    download_link = f"{suggested_blob_client.url}?{sas_token}"
    print(f"Suggested blob name: {suggested_blob_name}")
    print(f"Downloadable link for the suggested blob: {download_link}")


The blob 'test1.pdf' exists.
