## Financial Services

In [1]:
!ls

qdrant.txt  sample_data


In [2]:
!pip install -r qdrant.txt



In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.document_loaders import UnstructuredURLLoader
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain_community.vectorstores import Qdrant
from langchain.chains import RetrievalQA
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams
from langchain_huggingface import HuggingFaceEmbeddings
from cleantext import clean



In [4]:
from dotenv import load_dotenv
import os
import nltk
import time

In [5]:
from google.colab import userdata

In [6]:
# Load environment variables from .env file
#load_dotenv()

# Access the environment variables
qdrant_api_key = userdata.get('QDRANT_API_KEY')
hf_token = userdata.get('HF_TOKEN')


#### Wikipedia

In [7]:
from langchain_community.document_loaders import WikipediaLoader

In [8]:
docs = WikipediaLoader(query="International Financial Services Centres Authority", doc_content_chars_max = 100000, load_max_docs=1).load()

In [9]:
docs[0].metadata  # meta-information of the Document

{'title': 'International Financial Services Centres Authority',
 'summary': 'The International Financial Services Centres Authority (IFSCA) is the regulatory body for the Indian special economic zones such as the GIFT International Financial Services Centre for International Financial Services and commodity markets under the ownership of the Government of India. It was established in 2020, under the International Financial Services Centres Authority Act, 2019. The International Financial Services Centre (IFSC) is located in Gujarat International Finance Tec-City (GIFT City).',
 'source': 'https://en.wikipedia.org/wiki/International_Financial_Services_Centres_Authority'}

In [10]:
docs[0].metadata['source']

'https://en.wikipedia.org/wiki/International_Financial_Services_Centres_Authority'

In [11]:
raw_text =docs[0].page_content
len(raw_text)

9374

In [12]:
clean_text = clean(text=raw_text,
            fix_unicode=True,
            to_ascii=True,
            lower=True,
            no_line_breaks=False,
            no_urls=False,
            no_emails=False,
            no_phone_numbers=False,
            no_numbers=False,
            no_digits=False,
            no_currency_symbols=False,
            no_punct=False,
            lang="en"
            )

In [13]:
len(clean_text)

9347

In [14]:
from langchain.schema import Document
metadata = {
    "source": docs[0].metadata['source'],
    "title": docs[0].metadata['title']
}

custom_doc = Document(page_content=clean_text, metadata=metadata)
document =[custom_doc]

In [15]:
document

[Document(metadata={'source': 'https://en.wikipedia.org/wiki/International_Financial_Services_Centres_Authority', 'title': 'International Financial Services Centres Authority'}, page_content='the international financial services centres authority (ifsca) is the regulatory body for the indian special economic zones such as the gift international financial services centre for international financial services and commodity markets under the ownership of the government of india. it was established in 2020, under the international financial services centres authority act, 2019. the international financial services centre (ifsc) is located in gujarat international finance tec-city (gift city).\n== history ==\n=== developments that led to the ifsca ===\nin december 2015 the first international financial services centre (ifsc) in india was set up in gujarat international finance tec-city (gift city).\nin 2019 the government of india enacted an act of parliament called the international financi

In [16]:
print(document[0].page_content)

the international financial services centres authority (ifsca) is the regulatory body for the indian special economic zones such as the gift international financial services centre for international financial services and commodity markets under the ownership of the government of india. it was established in 2020, under the international financial services centres authority act, 2019. the international financial services centre (ifsc) is located in gujarat international finance tec-city (gift city).
== history ==
=== developments that led to the ifsca ===
in december 2015 the first international financial services centre (ifsc) in india was set up in gujarat international finance tec-city (gift city).
in 2019 the government of india enacted an act of parliament called the international financial services centres authority act, 2019.
=== establishment ===
on april 27, 2020, the ifsca was established as a statutory body under the international financial services centres authority act, 20

In [17]:
len(document)

1

In [18]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
texts = text_splitter.split_documents(document)
texts

[Document(metadata={'source': 'https://en.wikipedia.org/wiki/International_Financial_Services_Centres_Authority', 'title': 'International Financial Services Centres Authority'}, page_content='the international financial services centres authority (ifsca) is the regulatory body for the indian special economic zones such as the gift international financial services centre for international financial services and commodity markets under the ownership of the government of india. it was established in 2020, under the international financial services centres authority act, 2019. the international financial services centre (ifsc) is located in gujarat international finance tec-city (gift'),
 Document(metadata={'source': 'https://en.wikipedia.org/wiki/International_Financial_Services_Centres_Authority', 'title': 'International Financial Services Centres Authority'}, page_content='tec-city (gift city).'),
 Document(metadata={'source': 'https://en.wikipedia.org/wiki/International_Financial_Servi

In [19]:
len(texts)

26

In [20]:
model_name = 'dunzhang/stella_en_400M_v5'

In [21]:
# Define model parameters
model_kwargs = {'device': 'cuda', 'trust_remote_code': True}
encode_kwargs = {'normalize_embeddings': True}

# Initialize HuggingFace embeddings
embeddings_model = HuggingFaceEmbeddings(model_name=model_name,model_kwargs=model_kwargs,encode_kwargs=encode_kwargs)

  from tqdm.autonotebook import tqdm, trange
Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
# Define the URL for Qdrant
url="https://a1fbbfce-ca47-437d-b308-e79724650692.us-east4-0.gcp.cloud.qdrant.io:6333"

In [23]:
# Initialize Qdrant client
client =  QdrantClient(url=url, api_key=qdrant_api_key)

In [24]:
# Get vector size by embedding a test query
vector_size = len(embeddings_model.embed_query("test"))
# Collection name
collection_name = "finance_collection"

In [25]:
# Check if the collection already exists
if client.collection_exists(collection_name):
    # Create a collection with the calculated vector size
    client.delete_collection(collection_name=collection_name)
    client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vector_size, distance="Dot")
        )
else:
    client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vector_size, distance="Dot")
        )

In [26]:
start =time.time()
# Insert documents into the collection
qdrant = Qdrant.from_documents(texts,embeddings_model,url=url,prefer_grpc=False,api_key=qdrant_api_key,collection_name=collection_name,force_recreate=False,distance_func="Dot")
end =time.time()
print("Time taken is:", end-start,"seconds")

Time taken is: 1.233602523803711 seconds


In [27]:
query = "IN which year the first IFSC wa set up in india?"
query_vector = embeddings_model.embed_query(query)

In [28]:
scored_points = client.search( collection_name=collection_name,
                        query_vector=query_vector,
                        limit=3,
                        with_payload=True
                        )

In [29]:
scored_points[0]

ScoredPoint(id='555bd481-c84e-4fed-8145-61436551c463', version=0, score=0.6917827, payload={'metadata': {'source': 'https://en.wikipedia.org/wiki/International_Financial_Services_Centres_Authority', 'title': 'International Financial Services Centres Authority'}, 'page_content': '== history ==\n=== developments that led to the ifsca ===\nin december 2015 the first international financial services centre (ifsc) in india was set up in gujarat international finance tec-city (gift city).\nin 2019 the government of india enacted an act of parliament called the international financial services centres authority act, 2019.\n=== establishment ==='}, vector=None, shard_key=None, order_value=None)

In [30]:
for point in scored_points:
    score = point.score
    page_content = point.payload['page_content']
    print(f"Score: {score}")
    print(f"Page Content: {page_content}\n")

Score: 0.6917827
Page Content: == history ==
=== developments that led to the ifsca ===
in december 2015 the first international financial services centre (ifsc) in india was set up in gujarat international finance tec-city (gift city).
in 2019 the government of india enacted an act of parliament called the international financial services centres authority act, 2019.
=== establishment ===

Score: 0.61307615
Page Content: on april 27, 2020, the ifsca was established as a statutory body under the international financial services centres authority act, 2019. on april 27, 2020, inauguration of the ifsca by the finance minister of india, nirmala sitharaman.
on may 27, 2020, the ifsca released its first set of regulations, namely the ifsca (banking) regulations, 2020.
=== developments ===

Score: 0.5648826
Page Content: the international financial services centres authority (ifsca) has been established to regulate and develop financial services in the international financial services centre

## LLM result

In [31]:
# from langchain_openai import ChatOpenAI
# from langchain.chains import RetrievalQA
# # completion llm
# llm = ChatOpenAI(
#     openai_api_key=os.getenv("OPENAI_API_KEY"),
#     model_name='gpt-4o-mini',
#     temperature=0.0
# )
# qa = RetrievalQA.from_chain_type(
#     llm=llm,
#     chain_type="stuff",
#     retriever=qdrant.as_retriever()
# )
# qa.run(query)

#### URL

In [32]:
#n_weblinks = int(input("How many web links you want the RAG agent to refer for response generation & insights? Enter here: "))
urls = []
inp = input("Enter Link:")
urls.append(inp)
print("URL:", urls)

Enter Link:https://finance.ec.europa.eu/financial-crime/anti-money-laundering-and-countering-financing-terrorism-eu-level_en#legislation
URL: ['https://finance.ec.europa.eu/financial-crime/anti-money-laundering-and-countering-financing-terrorism-eu-level_en#legislation']


In [33]:
loader = UnstructuredURLLoader(urls=urls, show_progress_bar=True)
loaded_url = loader.load()

100%|██████████| 1/1 [00:00<00:00,  2.43it/s]


In [34]:
loaded_url[0].metadata

{'source': 'https://finance.ec.europa.eu/financial-crime/anti-money-laundering-and-countering-financing-terrorism-eu-level_en#legislation'}

In [35]:
raw_text =loaded_url[0].page_content
len(raw_text)

17220

In [36]:
clean_url_text = clean(text=raw_text,
            fix_unicode=True,
            to_ascii=True,
            lower=True,
            no_line_breaks=False,
            no_urls=False,
            no_emails=False,
            no_phone_numbers=False,
            no_numbers=False,
            no_digits=False,
            no_currency_symbols=False,
            no_punct=False,
            lang="en"
            )

In [37]:
len(clean_url_text)

17037

In [38]:
from langchain.schema import Document
metadata = {
    "source": loaded_url[0].metadata['source']
}

url_doc = Document(page_content=clean_url_text, metadata=metadata)
url_document =[url_doc]

In [39]:
print(url_document[0].page_content)

page contents
what the eu is doing and why
it is essential that gatekeepers (banks and other obliged entities) apply measures to prevent money laundering and terrorist financing. traceability of financial information has an important deterrent effect.
the european commission carries out risk assessments in order to identify and respond to risks affecting the euinternal market.the european union adopted robust legislation to fight against money laundering and terrorist financing (aml). the first anti-money laundering directive was adopted in 1990 in order to prevent the misuse of the financial system for the purpose of money laundering. it provides that obliged entities shall apply customer due diligence requirements when entering into a business relationship (i.e. identify and verify the identity of clients, monitor transactions and report suspicious transactions).
the eulaws have been constantly revised in order to mitigate new risks relating to money laundering and terrorist financin

In [40]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
url_texts = text_splitter.split_documents(url_document)


In [41]:
len(url_texts)

45

In [42]:
collection_name = "finance_collection"

In [43]:
start =time.time()
# Insert documents into the collection
qdrant = Qdrant.from_documents(url_texts,embeddings_model,url=url,prefer_grpc=False,api_key=qdrant_api_key,collection_name=collection_name,force_recreate=False,distance_func="Dot")
end =time.time()
print("Time taken is:", end-start,"seconds")

Time taken is: 2.0419375896453857 seconds


In [44]:
query = "What the EU is doing and why?"
query_vector = embeddings_model.embed_query(query)

In [45]:
scored_points = client.search( collection_name=collection_name,
                        query_vector=query_vector,
                        limit=3,
                        with_payload=True
                        )

In [46]:
scored_points[0]

ScoredPoint(id='bb506bcd-35d6-4a26-96d4-af445cbf9cb5', version=1, score=0.59980524, payload={'metadata': {'source': 'https://finance.ec.europa.eu/financial-crime/anti-money-laundering-and-countering-financing-terrorism-eu-level_en#legislation'}, 'page_content': 'page contents\nwhat the eu is doing and why\nit is essential that gatekeepers (banks and other obliged entities) apply measures to prevent money laundering and terrorist financing. traceability of financial information has an important deterrent effect.'}, vector=None, shard_key=None, order_value=None)

In [47]:
for point in scored_points:
    score = point.score
    page_content = point.payload['page_content']
    print(f"Score: {score}")
    print(f"Page Content: {page_content}\n")

Score: 0.59980524
Page Content: page contents
what the eu is doing and why
it is essential that gatekeepers (banks and other obliged entities) apply measures to prevent money laundering and terrorist financing. traceability of financial information has an important deterrent effect.

Score: 0.56939924
Page Content: 7 may 2020action plan - financial crimethe european commission adopted an action plan for a comprehensive union policy on preventing money laundering and the financing of terrorism built on six pillars. to gather the views of citizens and stakeholder on these measures, the commission launched a public consultation in parallel to the adoption of this action plan.

Score: 0.56109047
Page Content: the new regulation on the anti-money laundering authority (amla) will create a new eu authority. it will establish a new coordination mechanism for aml/cft supervision in the eu and support cooperation amongst financial intelligence units (fius).
more information on the amla
eu-wide c

In [48]:
# from langchain_openai import ChatOpenAI
# from langchain.chains import RetrievalQA
# # completion llm
# llm = ChatOpenAI(
#     openai_api_key=os.getenv("OPENAI_API_KEY"),
#     model_name='gpt-3.5-turbo',
#     temperature=0.0
# )
# qa = RetrievalQA.from_chain_type(
#     llm=llm,
#     chain_type="stuff",
#     retriever=vectorstore.as_retriever()
# )
# print(qa.run(query))

## Data Privacy


#### Wikipedia

In [49]:
from langchain_community.document_loaders import WikipediaLoader

In [50]:
docs = WikipediaLoader(query="General Data Protection Regulation", doc_content_chars_max = 100000, load_max_docs=1).load()

In [51]:
docs[0].metadata  # meta-information of the Document

{'title': 'General Data Protection Regulation',
 'summary': 'The General Data Protection Regulation (Regulation (EU) 2016/679, abbreviated GDPR) is a European Union regulation on information privacy in the European Union (EU) and the European Economic Area (EEA). The GDPR is an important component of EU privacy law and human rights law, in particular Article 8(1) of the Charter of Fundamental Rights of the European Union. It also governs the transfer of personal data outside the EU and EEA. The GDPR\'s goals are to enhance individuals\' control and rights over their personal information and to simplify the regulations for international business.  It supersedes the Data Protection Directive 95/46/EC and, among other things, simplifies the terminology.\nThe European Parliament and Council of the European Union adopted the GDPR on 14 April 2016, to become effective on 25 May 2018. As an EU regulation (instead of a directive), GDPR is directly applicable with force of law on its own withou

In [52]:
raw_text =docs[0].page_content
len(raw_text)

50878

In [53]:
clean_text = clean(text=raw_text,
            fix_unicode=True,
            to_ascii=True,
            lower=True,
            no_line_breaks=False,
            no_urls=False,
            no_emails=False,
            no_phone_numbers=False,
            no_numbers=False,
            no_digits=False,
            no_currency_symbols=False,
            no_punct=False,
            lang="en"
            )

In [54]:
len(clean_text)

50763

In [55]:
from langchain.schema import Document
metadata = {
    "source": docs[0].metadata['source'],
    "title": docs[0].metadata['title']
}

custom_doc = Document(page_content=clean_text, metadata=metadata)
document =[custom_doc]

In [56]:
document



In [57]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
texts = text_splitter.split_documents(document)

In [58]:
len(texts)

145

In [59]:
# Collection name
collection_name = "DataProtection_collection"

In [60]:
# Check if the collection already exists
if client.collection_exists(collection_name):
    # Create a collection with the calculated vector size
    client.delete_collection(collection_name=collection_name)
    client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vector_size, distance="Dot")
        )
else:
    client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vector_size, distance="Dot")
        )

In [61]:
start =time.time()
# Insert documents into the collection
qdrant = Qdrant.from_documents(texts,embeddings_model,url=url,prefer_grpc=False,api_key=qdrant_api_key,collection_name=collection_name,force_recreate=False,distance_func="Dot")
end =time.time()
print("Time taken is:", end-start,"seconds")

Time taken is: 4.9224090576171875 seconds


In [62]:
query = "What is GDPR?"
query_vector = embeddings_model.embed_query(query)

In [63]:
scored_points = client.search( collection_name=collection_name,
                        query_vector=query_vector,
                        limit=3,
                        with_payload=True
                        )

In [64]:
scored_points[0]

ScoredPoint(id='24a79b3e-a972-45eb-9835-5908a5dc0387', version=0, score=0.70240414, payload={'metadata': {'source': 'https://en.wikipedia.org/wiki/General_Data_Protection_Regulation', 'title': 'General Data Protection Regulation'}, 'page_content': "the general data protection regulation (regulation (eu) 2016/679, abbreviated gdpr) is a european union regulation on information privacy in the european union (eu) and the european economic area (eea). the gdpr is an important component of eu privacy law and human rights law, in particular article 8(1) of the charter of fundamental rights of the european union. it also governs the transfer of personal data outside the eu and eea. the gdpr's goals are to enhance individuals' control and rights"}, vector=None, shard_key=None, order_value=None)

In [65]:
for point in scored_points:
    score = point.score
    page_content = point.payload['page_content']
    print(f"Score: {score}")
    print(f"Page Content: {page_content}\n")

Score: 0.70240414
Page Content: the general data protection regulation (regulation (eu) 2016/679, abbreviated gdpr) is a european union regulation on information privacy in the european union (eu) and the european economic area (eea). the gdpr is an important component of eu privacy law and human rights law, in particular article 8(1) of the charter of fundamental rights of the european union. it also governs the transfer of personal data outside the eu and eea. the gdpr's goals are to enhance individuals' control and rights

Score: 0.6717965
Page Content: 2(2)(a) and 88 of the gdpr).

Score: 0.636055
Page Content: related concepts:
convention on cybercrime
data portability
do not track legislation
eprivacy regulation
privacy impact assessment
compliance tactics by certain companies:
consent or pay
== footnotes ==
== references ==
== external links ==
general data protection regulation consolidated text on eur-lex
general data protection regulation initial legal act in the ojeu
data pr

In [66]:
# from langchain_openai import ChatOpenAI
# from langchain.chains import RetrievalQA
# # completion llm
# llm = ChatOpenAI(
#     openai_api_key=os.getenv("OPENAI_API_KEY"),
#     model_name='gpt-4o-mini',
#     temperature=0.0
# )
# qa = RetrievalQA.from_chain_type(
#     llm=llm,
#     chain_type="stuff",
#     retriever=vectorstore.as_retriever()
# )
# qa.run(query)

#### URLs

In [67]:
#n_weblinks = int(input("How many web links you want the RAG agent to refer for response generation & insights? Enter here: "))
urls = []
inp = input("Enter Link:")
urls.append(inp)
print("URL:", urls)

Enter Link:https://www.proofpoint.com/us/threat-reference/ccpa-compliance#:~:text=CCPA%20compliance%20is%20a%20set%20of%20regulations%20that,implement%20reasonable%20security%20measures%20to%20protect%20user%20data
URL: ['https://www.proofpoint.com/us/threat-reference/ccpa-compliance#:~:text=CCPA%20compliance%20is%20a%20set%20of%20regulations%20that,implement%20reasonable%20security%20measures%20to%20protect%20user%20data']


In [68]:
loader = UnstructuredURLLoader(urls=urls, show_progress_bar=True)
loaded_url= loader.load()

100%|██████████| 1/1 [00:00<00:00,  4.15it/s]


In [69]:
loaded_url[0].metadata

{'source': 'https://www.proofpoint.com/us/threat-reference/ccpa-compliance#:~:text=CCPA%20compliance%20is%20a%20set%20of%20regulations%20that,implement%20reasonable%20security%20measures%20to%20protect%20user%20data'}

In [70]:
raw_text =loaded_url[0].page_content
len(raw_text)

14402

In [71]:
clean_url_text = clean(text=raw_text,
            fix_unicode=True,
            to_ascii=True,
            lower=True,
            no_line_breaks=False,
            no_urls=False,
            no_emails=False,
            no_phone_numbers=False,
            no_numbers=False,
            no_digits=False,
            no_currency_symbols=False,
            no_punct=False,
            lang="en"
            )

In [72]:
len(clean_url_text)

14270

In [73]:
from langchain.schema import Document
metadata = {
    "source": loaded_url[0].metadata['source']
}

doc = Document(page_content=clean_url_text, metadata=metadata)
document =[doc]

In [74]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
url_texts = text_splitter.split_documents(document)
#texts

In [75]:
len(url_texts)

37

In [76]:
start =time.time()
# Insert documents into the collection
qdrant = Qdrant.from_documents(url_texts,embeddings_model,url=url,prefer_grpc=False,api_key=qdrant_api_key,collection_name=collection_name,force_recreate=False,distance_func="Dot")
end =time.time()
print("Time taken is:", end-start,"seconds")

Time taken is: 1.606715440750122 seconds


In [77]:
query = "What is CCPA?"
query_vector = embeddings_model.embed_query(query)

In [78]:
scored_points = client.search( collection_name=collection_name,
                        query_vector=query_vector,
                        limit=3,
                        with_payload=True
                        )

In [79]:
scored_points[0]

ScoredPoint(id='923ff0d4-9338-48e5-a614-5b5356b5b5e0', version=3, score=0.6975296, payload={'metadata': {'source': 'https://www.proofpoint.com/us/threat-reference/ccpa-compliance#:~:text=CCPA%20compliance%20is%20a%20set%20of%20regulations%20that,implement%20reasonable%20security%20measures%20to%20protect%20user%20data'}, 'page_content': "glossary\nwhat is ccpa compliance?\nwhat is ccpa compliance?\naccess the archiving & compliance solution brief\nstart your free trial\ntable of contents\ndefinition\nwho must comply with ccpa?\nwhat does ccpa cover?\nwhat are key privacy provisions in ccpa?\nwhat's the difference between ccpa vs. gdpr?\nwhat is ccpa compliance training?\nwhat are ccpa penalties for violating compliance requirements?\nwhat does ccpa mean for cybersecurity?\nhow to become ccpa compliant\nhow proofpoint can help\ndefinition"}, vector=None, shard_key=None, order_value=None)

In [80]:
for point in scored_points:
    score = point.score
    page_content = point.payload['page_content']
    print(f"Score: {score}")
    print(f"Page Content: {page_content}\n")

Score: 0.6975296
Page Content: glossary
what is ccpa compliance?
what is ccpa compliance?
access the archiving & compliance solution brief
start your free trial
table of contents
definition
who must comply with ccpa?
what does ccpa cover?
what are key privacy provisions in ccpa?
what's the difference between ccpa vs. gdpr?
what is ccpa compliance training?
what are ccpa penalties for violating compliance requirements?
what does ccpa mean for cybersecurity?
how to become ccpa compliant
how proofpoint can help
definition

Score: 0.6972457
Page Content: ccpa compliance is a set of regulations that organizations must follow to protect the data privacy rights of california residents. it requires organizations to be transparent about their data collection and usage practices, to respond to consumer requests, and to implement reasonable security measures to protect user data.
cybersecurity education and training begins here
start a free trial
here's how your free trial works:

Score: 0.665061

In [81]:
# from langchain_openai import ChatOpenAI
# from langchain.chains import RetrievalQA
# # completion llm
# llm = ChatOpenAI(
#     openai_api_key=os.getenv("OPENAI_API_KEY"),
#     model_name='gpt-4o-mini',
#     temperature=0.0
# )
# qa = RetrievalQA.from_chain_type(
#     llm=llm,
#     chain_type="stuff",
#     retriever=vectorstore.as_retriever()
# )
# print(qa.run(query))

**HEALTHCARE**

**WIKIPEDIA**

In [82]:
#Food_and_Drug_Administration
docs = WikipediaLoader(query="Food_and_Drug_Administration", doc_content_chars_max = 100000, load_max_docs=1).load()



In [83]:
docs[0].metadata  # meta-information of the Document

{'title': 'Food and Drug Administration',
 'summary': "The United States Food and Drug Administration (FDA or US FDA) is a federal agency of the Department of Health and Human Services. The FDA is responsible for protecting and promoting public health through the control and supervision of food safety, tobacco products, caffeine products, dietary supplements, prescription and over-the-counter pharmaceutical drugs (medications), vaccines, biopharmaceuticals, blood transfusions, medical devices, electromagnetic radiation emitting devices (ERED), cosmetics, animal foods & feed and veterinary products.\nThe FDA's primary focus is enforcement of the Federal Food, Drug, and Cosmetic Act (FD&C). However, the agency also enforces other laws, notably Section 361 of the Public Health Service Act as well as associated regulations. Much of this regulatory-enforcement work is not directly related to food or drugs but involves other factors like regulating lasers, cellular phones, and condoms. In ad

In [84]:
docs[0].metadata['source']

'https://en.wikipedia.org/wiki/Food_and_Drug_Administration'

In [85]:
raw_text =docs[0].page_content
len(raw_text)

50519

In [86]:
clean_text = clean(text=raw_text,
            fix_unicode=True,
            to_ascii=True,
            lower=True,
            no_line_breaks=False,
            no_urls=False,
            no_emails=False,
            no_phone_numbers=False,
            no_numbers=False,
            no_digits=False,
            no_currency_symbols=False,
            no_punct=False,
            lang="en"
            )

In [87]:
len(clean_text)

50388

In [88]:
from langchain.schema import Document
metadata = {
    "source": docs[0].metadata['source'],
    "title": docs[0].metadata['title']
}

custom_doc = Document(page_content=clean_text, metadata=metadata)
document =[custom_doc]

In [89]:
len(document)

1

In [90]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
texts = text_splitter.split_documents(document)

In [91]:
len(texts)

152

In [92]:
collection_name = "Healthcare_collection"

In [93]:
# Check if the collection already exists
if client.collection_exists(collection_name):
    # Create a collection with the calculated vector size
    client.delete_collection(collection_name=collection_name)
    client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vector_size, distance="Dot")
        )
else:
    client.create_collection(
            collection_name=collection_name,
            vectors_config=VectorParams(size=vector_size, distance="Dot")
        )

In [94]:
start =time.time()
# Insert documents into the collection
qdrant = Qdrant.from_documents(texts,embeddings_model,url=url,prefer_grpc=False,api_key=qdrant_api_key,collection_name=collection_name,force_recreate=False,distance_func="Dot")
end =time.time()
print("Time taken is:", end-start,"seconds")

Time taken is: 3.8517978191375732 seconds


In [95]:
query = "Summarize Post-market safety surveillance?"
query_vector = embeddings_model.embed_query(query)

In [96]:
scored_points = client.search( collection_name=collection_name,
                        query_vector=query_vector,
                        limit=3,
                        with_payload=True
                        )

In [97]:
scored_points[0]

ScoredPoint(id='807862dc-63d5-4779-a386-1113d526b260', version=1, score=0.79561704, payload={'metadata': {'source': 'https://en.wikipedia.org/wiki/Food_and_Drug_Administration', 'title': 'Food and Drug Administration'}, 'page_content': '=== post-marketing drug safety monitoring ==='}, vector=None, shard_key=None, order_value=None)

In [98]:
for point in scored_points:
    score = point.score
    page_content = point.payload['page_content']
    print(f"Score: {score}")
    print(f"Page Content: {page_content}\n")

Score: 0.79561704
Page Content: === post-marketing drug safety monitoring ===

Score: 0.68829995
Page Content: fda's procedures for pre- and post-market drug safety regulation.

Score: 0.5879502
Page Content: while this remains the primary tool of post-market safety surveillance, fda requirements for post-marketing risk management are increasing. as a condition of approval, a sponsor may be required to conduct additional clinical trials, called phase iv trials. in some cases, the fda requires risk management plans called risk evaluation and mitigation strategies (rems) for some drugs that require actions to be taken to ensure that the drug is used safely. for example, thalidomide can cause birth



**URLS**

In [99]:
#n_weblinks = int(input("How many web links you want the RAG agent to refer for response generation & insights? Enter here: "))
urls = []
inp = input("Enter Link:")
urls.append(inp)
print("URL:", urls)

Enter Link:https://burgeon.co.in/blog/regulatory-compliance-in-healthcare-industry/
URL: ['https://burgeon.co.in/blog/regulatory-compliance-in-healthcare-industry/']


In [100]:
loader = UnstructuredURLLoader(urls=urls, show_progress_bar=True)
loaded_url = loader.load()

100%|██████████| 1/1 [00:05<00:00,  5.60s/it]


In [101]:
loaded_url[0].metadata

{'source': 'https://burgeon.co.in/blog/regulatory-compliance-in-healthcare-industry/'}

In [102]:
raw_text =loaded_url[0].page_content
len(raw_text)

16831

In [103]:
clean_url_text = clean(text=raw_text,
            fix_unicode=True,
            to_ascii=True,
            lower=True,
            no_line_breaks=False,
            no_urls=False,
            no_emails=False,
            no_phone_numbers=False,
            no_numbers=False,
            no_digits=False,
            no_currency_symbols=False,
            no_punct=False,
            lang="en"
            )

In [104]:
len(clean_url_text)

16658

In [105]:
from langchain.schema import Document
metadata = {
    "source": loaded_url[0].metadata['source']
}

url_doc = Document(page_content=clean_url_text, metadata=metadata)
url_document =[url_doc]

In [106]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
url_texts = text_splitter.split_documents(url_document)

In [107]:
len(url_texts)

44

In [108]:
start =time.time()
# Insert documents into the collection
qdrant = Qdrant.from_documents(url_texts,embeddings_model,url=url,prefer_grpc=False,api_key=qdrant_api_key,collection_name=collection_name,force_recreate=False,distance_func="Dot")
end =time.time()
print("Time taken is:", end-start,"seconds")

Time taken is: 1.7285022735595703 seconds


In [109]:
query = "Summarize The Information Technology Act, 2000 ?"
query_vector = embeddings_model.embed_query(query)

In [110]:
scored_points = client.search( collection_name=collection_name,
                        query_vector=query_vector,
                        limit=3,
                        with_payload=True
                        )

In [111]:
scored_points[0]

ScoredPoint(id='e51b7da5-a0b3-4886-97d4-1a8fa5e2a576', version=3, score=0.60609996, payload={'metadata': {'source': 'https://burgeon.co.in/blog/regulatory-compliance-in-healthcare-industry/'}, 'page_content': 'compliance with the information technology act, 2000 is essential for healthcare industries handling sensitive patient information, as it mandates the implementation of robust data protection measures and ensures the confidentiality and integrity of patient data.\nthis act requires healthcare industries to implement robust data protection measures to safeguard patient data against unauthorized access, disclosure, or misuse.'}, vector=None, shard_key=None, order_value=None)

In [112]:
for point in scored_points:
    score = point.score
    page_content = point.payload['page_content']
    print(f"Score: {score}")
    print(f"Page Content: {page_content}\n")

Score: 0.60609996
Page Content: compliance with the information technology act, 2000 is essential for healthcare industries handling sensitive patient information, as it mandates the implementation of robust data protection measures and ensures the confidentiality and integrity of patient data.
this act requires healthcare industries to implement robust data protection measures to safeguard patient data against unauthorized access, disclosure, or misuse.

Score: 0.5472753
Page Content: the guidelines lay down specific requirements for ensuring the confidentiality, integrity, and security of patient health information during virtual consultations.
healthcare professionals offering telemedicine services are bound by professional codes of conduct and ethics.
compliance with the telemedicine practice guidelines is essential for healthcare industries to operate legally and ethically in the telemedicine space.
5. the information technology act, 2000

Score: 0.51332283
Page Content: key regul

In [113]:
# from langchain_huggingface import HuggingFaceEndpoint
# repo_id="google/flan-t5-xxl"
# llm=HuggingFaceEndpoint(repo_id=repo_id,max_new_token=512,temperature=0.4,token=hf_token)
# query ='What is regulatory compliance in the healthcare industry?'
# from langchain.chains import RetrievalQA

# qa = RetrievalQA.from_chain_type(
#      llm=llm,
#      chain_type="stuff",
#      retriever=qdrant.as_retriever()
#  )
# qa.run(query)