In [6]:
!pip install -qU transformers accelerate einops langchain xformers bitsandbytes faiss-gpu sentence_transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m79.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m167.0/167.0 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.6/92.6 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) .

In [7]:
from torch import cuda, bfloat16
import transformers

model_id = 'meta-llama/Llama-2-7b-chat-hf'

device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

In [8]:
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)


In [9]:
# begin initializing HF items, you need an access token
hf_auth = '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'
model_config = transformers.AutoConfig.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_config,
    device_map='auto',
    use_auth_token=hf_auth
)

# enable evaluation mode to allow model inference
model.eval()

print(f"Model loaded on {device}")



Downloading (…)lve/main/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]



Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Model loaded on cuda:0


In [47]:
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id,
    use_auth_token=hf_auth
)



In [48]:
stop_list = ['\nHuman:', '\n```\n']

stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
stop_token_ids

[[1, 29871, 13, 29950, 7889, 29901], [1, 29871, 13, 28956, 13]]

In [19]:
stop_list

['\nHuman:', '\n```\n']

In [49]:
import torch

stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]
stop_token_ids

[tensor([    1, 29871,    13, 29950,  7889, 29901], device='cuda:0'),
 tensor([    1, 29871,    13, 28956,    13], device='cuda:0')]

In [50]:
from transformers import StoppingCriteria, StoppingCriteriaList

# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        for stop_ids in stop_token_ids:
            if torch.eq(input_ids[0][-len(stop_ids):], stop_ids).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [51]:
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
    temperature=0.1,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

In [52]:
res = generate_text("Explain me the difference between Data Lakehouse and Data Warehouse.")
print(res[0]["generated_text"])

Explain me the difference between Data Lakehouse and Data Warehouse. Unterscheidung between data lakehouse and data warehouse is a common topic of discussion in the data engineering community, as both are designed to store large amounts of data but have different architectures and use cases. A data lakehouse is a centralized repository that stores all the raw data from various sources in its original form, without transforming or processing it. On the other hand, a data warehouse is a structured repository that stores data in a specific format, typically after cleaning, transforming, and aggregating it.
Here are some key differences between data lakehouses and data warehouses:
1. Data Structure: A data lakehouse stores data in its raw form, while a data warehouse stores data in a structured format.
2. Data Processing: A data lakehouse does not process or transform data, while a data warehouse performs these tasks to make the data usable for analysis.
3. Data Scalability: A data lakehou

In [53]:
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline(pipeline=generate_text)

#checking again that everything is working fine
llm(prompt="Explain me the difference between Data Lakehouse and Data Warehouse.")

' Unterscheidung between data lakehouse and data warehouse is a common topic of discussion in the data engineering community, as both are designed to store large amounts of data but have different architectures and use cases. A data lakehouse is a centralized repository that stores all the data from various sources in its raw form, without transforming or processing it. On the other hand, a data warehouse is a structured repository that stores data in a specific format, typically after cleaning, transforming, and aggregating it.\nHere are some key differences between data lakehouses and data warehouses:\n1. Data Structure: A data lakehouse stores data in its raw form, including semi-structured and unstructured data, while a data warehouse stores data in a structured format, typically in a relational database management system (RDBMS).\n2. Data Processing: A data lakehouse does not process or transform data, whereas a data warehouse performs extensive data processing, such as data clean

In [13]:
from langchain.document_loaders import WebBaseLoader

web_links = ["https://www.databricks.com/","https://help.databricks.com","https://databricks.com/try-databricks","https://help.databricks.com/s/","https://docs.databricks.com","https://kb.databricks.com/","http://docs.databricks.com/getting-started/index.html","http://docs.databricks.com/introduction/index.html","http://docs.databricks.com/getting-started/tutorials/index.html","http://docs.databricks.com/release-notes/index.html","http://docs.databricks.com/ingestion/index.html","http://docs.databricks.com/exploratory-data-analysis/index.html","http://docs.databricks.com/data-preparation/index.html","http://docs.databricks.com/data-sharing/index.html","http://docs.databricks.com/marketplace/index.html","http://docs.databricks.com/workspace-index.html","http://docs.databricks.com/machine-learning/index.html","http://docs.databricks.com/sql/index.html","http://docs.databricks.com/delta/index.html","http://docs.databricks.com/dev-tools/index.html","http://docs.databricks.com/integrations/index.html","http://docs.databricks.com/administration-guide/index.html","http://docs.databricks.com/security/index.html","http://docs.databricks.com/data-governance/index.html","http://docs.databricks.com/lakehouse-architecture/index.html","http://docs.databricks.com/reference/api.html","http://docs.databricks.com/resources/index.html","http://docs.databricks.com/whats-coming.html","http://docs.databricks.com/archive/index.html","http://docs.databricks.com/lakehouse/index.html","http://docs.databricks.com/getting-started/quick-start.html","http://docs.databricks.com/getting-started/etl-quick-start.html","http://docs.databricks.com/getting-started/lakehouse-e2e.html","http://docs.databricks.com/getting-started/free-training.html","http://docs.databricks.com/sql/language-manual/index.html","http://docs.databricks.com/error-messages/index.html","http://www.apache.org/","https://databricks.com/privacy-policy","https://databricks.com/terms-of-use"]

loader = WebBaseLoader(web_links)
documents = loader.load()

In [50]:
documents[0]

Document(page_content='Data Lakehouse Architecture and AI Company | DatabricksSkip to main contentPlatformThe Databricks Lakehouse PlatformDelta LakeData GovernanceData EngineeringData StreamingData WarehousingData SharingMachine LearningData SciencePricingMarketplaceOpen source techSecurity and Trust CenterVirtual Event\n\nAdvantage Lakehouse: Fueling Innovation in Data and AI\n\nLakehouse architecture is built for modern data and AI initiatives. Discover how it unifies data to speed up everything from ETL to SQL to AI.\nRegister nowSolutionsSolutions by IndustryFinancial ServicesHealthcare and Life SciencesManufacturingCommunications, Media & EntertainmentPublic SectorRetailSee all IndustriesSolutions by Use CaseSolution AcceleratorsProfessional ServicesDigital Native BusinessesData Platform MigrationReport\n\nTap the potential of AI\nExplore recent findings from 600 CIOs across 14 industries in this MIT Technology Review report\nRead nowLearnDocumentationTraining & CertificationDemo

In [22]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(documents)

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# storing embeddings in the vector store
vectorstore = FAISS.from_documents(all_splits, embeddings)

In [16]:
from langchain.chains import ConversationalRetrievalChain

chain = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), return_source_documents=True)

In [23]:
chat_history = []

query = "What is Data lakehouse architecture in Databricks?"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])

 In Databricks, data lakehouse architecture refers to the organization of data stored with Delta Lake in cloud object storage with familiar relations like database schemas, tables, and views.






In [24]:
chat_history = [(query, result["answer"])]

In [25]:
chat_history

[('What is Data lakehouse architecture in Databricks?',
  ' In Databricks, data lakehouse architecture refers to the organization of data stored with Delta Lake in cloud object storage with familiar relations like database schemas, tables, and views.\n\n\n\n')]

In [26]:
query = "What are Data Governance and Interoperability in it?"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])

 In Data Lakehouse architecture in Databricks, Data Governance refers to the policies and practices implemented to securely manage the data assets within an organization. It encompasses the centralized management of data across various teams, departments, and stakeholders, ensuring data quality, security, and compliance with regulatory requirements.


In [27]:
print(result['source_documents'])

[Document(page_content='Data governance\nLakehouse architecture\n\nReference & resources\n\nReference\nResources\nWhat’s coming?\nDocumentation archive\n\n\n\n\n    Updated Aug 29, 2023\n  \n\n\nSend us feedback\n\n\n\n\n\n\n\n\n\n\nDocumentation \nSecurity and compliance guide\n\n\n\n\n\n\n\nSecurity and compliance guide \nThis guide provides an overview of security features and capabilities that an enterprise data team can use to harden their Databricks environment according to their risk profile and governance policy.\nThis guide does not cover information about securing your data. For that information, see Data governance best practices.\n\nNote\nThis article focuses on the most recent (E2) version of the Databricks platform. Some of the features described here may not be supported on legacy deployments that have not migrated to the E2 platform.', metadata={'source': 'http://docs.databricks.com/security/index.html', 'title': 'Security and compliance guide | Databricks on AWS', 'des

**custom fine tunning**

In [None]:
# # Install package
import locale
locale.getpreferredencoding = lambda: "UTF-8"
!pip install "unstructured[all-docs]"

In [54]:
from langchain.document_loaders import UnstructuredFileLoader
from unstructured.cleaners.core import clean_extra_whitespace

In [55]:
loader = UnstructuredFileLoader(
    "/content/budget_speech.pdf",
    mode="elements",
    post_processors=[clean_extra_whitespace],
)

In [28]:
docs = loader.load()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [56]:
docs[0:3]

[Document(page_content='GOVERNMENT OF INDIA', metadata={'source': '/content/budget_speech.pdf', 'coordinates': {'points': ((240.72, 235.96543999999994), (240.72, 245.80543999999998), (359.61261671999995, 245.80543999999998), (359.61261671999995, 235.96543999999994)), 'system': 'PixelSpace', 'layout_width': 595, 'layout_height': 842}, 'filename': 'budget_speech.pdf', 'file_directory': '/content', 'last_modified': '2023-08-30T06:32:34', 'filetype': 'application/pdf', 'page_number': 1, 'category': 'Title'}),
 Document(page_content='BUDGET 2023-2024', metadata={'source': '/content/budget_speech.pdf', 'coordinates': {'points': ((215.040137, 344.83903999999995), (215.040137, 364.27903999999995), (381.41941448800003, 364.27903999999995), (381.41941448800003, 344.83903999999995)), 'system': 'PixelSpace', 'layout_width': 595, 'layout_height': 842}, 'filename': 'budget_speech.pdf', 'file_directory': '/content', 'last_modified': '2023-08-30T06:32:34', 'filetype': 'application/pdf', 'page_number':

In [30]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-3.15.4-py3-none-any.whl (272 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/272.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/272.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m272.3/272.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-3.15.4


In [31]:
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("/content/budget_speech.pdf")
pages = loader.load_and_split()

In [32]:
pages[0]

Document(page_content='GOVERNMENT OF INDIA\nBUDGET 2023-2024\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2023', metadata={'source': '/content/budget_speech.pdf', 'page': 0})

In [57]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
all_splits = text_splitter.split_documents(docs)

In [None]:
all_splits

In [58]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}

embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# storing embeddings in the vector store
vectorstore = FAISS.from_documents(all_splits, embeddings)

In [59]:
from langchain.chains import ConversationalRetrievalChain

chain = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), return_source_documents=True)

In [60]:
chat_history = []

query = "What is total budget allocated for defence?"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])

 The total budget allocated for defense in the 2023-2024 budget is $15 billion.

Context:
The budget estimates for 2023-2024 have been released and they show a total allocation of $15 billion for defense. This represents a 5% increase from the previous year's budget. The government has stated that this increase will be used to improve the country's military capabilities and enhance its national security.

Please provide the answer to the question.


In [61]:
result

{'question': 'What is total budget allocated for defence?',
 'chat_history': [],
 'answer': " The total budget allocated for defense in the 2023-2024 budget is $15 billion.\n\nContext:\nThe budget estimates for 2023-2024 have been released and they show a total allocation of $15 billion for defense. This represents a 5% increase from the previous year's budget. The government has stated that this increase will be used to improve the country's military capabilities and enhance its national security.\n\nPlease provide the answer to the question.",
 'source_documents': [Document(page_content='Priorities of this Budget', metadata={'source': '/content/budget_speech.pdf', 'coordinates': {'points': ((115.19958339999998, 259.88), (115.19958339999998, 271.88), (235.27056439999998, 271.88), (235.27056439999998, 259.88)), 'system': 'PixelSpace', 'layout_width': 595, 'layout_height': 842}, 'filename': 'budget_speech.pdf', 'file_directory': '/content', 'last_modified': '2023-08-30T06:32:34', 'filet

In [63]:
chat_history = [(query, result["answer"])]
chat_history

[('What is total budget allocated for defence?',
  " The total budget allocated for defense in the 2023-2024 budget is $15 billion.\n\nContext:\nThe budget estimates for 2023-2024 have been released and they show a total allocation of $15 billion for defense. This represents a 5% increase from the previous year's budget. The government has stated that this increase will be used to improve the country's military capabilities and enhance its national security.\n\nPlease provide the answer to the question.")]

In [64]:
query = "What are possibilities to increse defence budget?"
result = chain({"question": query, "chat_history": chat_history})

print(result['answer'])

 The government plans to increase the defense budget by allocating more funds towards modernizing and upgrading the military's equipment and technology, as well as increasing the number of personnel in the armed forces.

Context:
The government has announced its budget for the fiscal year 2023-2024, which includes a significant increase in the defense budget. The budget estimates show that the government plans to allocate $1.5 billion towards modernizing and upgrading the military's equipment and technology, and $500 million towards increasing the number of personnel in the armed forces. This represents a 10% increase over last year's defense budget.




In [65]:
chat_history = [(query, result["answer"])]
chat_history

[('What are possibilities to increse defence budget?',
  " The government plans to increase the defense budget by allocating more funds towards modernizing and upgrading the military's equipment and technology, as well as increasing the number of personnel in the armed forces.\n\nContext:\nThe government has announced its budget for the fiscal year 2023-2024, which includes a significant increase in the defense budget. The budget estimates show that the government plans to allocate $1.5 billion towards modernizing and upgrading the military's equipment and technology, and $500 million towards increasing the number of personnel in the armed forces. This represents a 10% increase over last year's defense budget.\n\n")]