In [5]:
#!pip install langchain chromadb sentence-transformers

In [6]:
#!pip install llama-cpp-python

In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jobdesc/Job Description
/kaggle/input/mistral-7b-instruct/gguf/mistral-7b-instructv0.1/1/mistral-7b-instruct-v0.1.Q8_0.gguf


In [4]:
import pandas as pd
import numpy as np
import requests
import re
from bs4 import BeautifulSoup

# Data Scrapping from Website

In [7]:
url = 'https://python.langchain.com/docs/use_cases/question_answering/'

# Define the headers to mimic a browser request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
}

# Send a GET request to the URL with headers
response = requests.get(url, headers=headers)

In [8]:
# Check if the request was successful (status code 200)
if response.status_code == 200:
    # Parse the HTML content using Beautiful Soup
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extracting the h1 heading of the webpage
    h1_heading = soup.find('h1').text.strip() if soup.find('h1') else "No h1 heading found"
    print("h1 Heading:", h1_heading)
    
    # Extracting all the text content
    all_text = soup.get_text(separator="\n").strip()
else:
    print("Failed to retrieve the webpage. Status code:", response.status_code)

h1 Heading: Q&A with RAG


In [9]:
original_string = all_text
lines = [line for line in original_string.splitlines() if line.strip()]
# Join the lines back together without empty lines
result_string = ' '.join(lines)

In [10]:
formatted_text = re.sub(r'\s+', ' ', result_string)

In [13]:
def write_text_to_file(text, filename):
    words = text.split()
    with open(filename, 'w') as file:
        for i in range(0, len(words), 20):
            file.write(' '.join(words[i:i+20]) + '\n')

In [14]:
write_text_to_file(
    text=formatted_text,
    filename='scraped_content.txt'
)

# Implement LLMs with Langchain

In [15]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings

In [16]:
docs_path = '/kaggle/working/scraped_content.txt'
loader = TextLoader(
    file_path=docs_path,
    autodetect_encoding=False
)

In [17]:
# split the text into small chunks
# Its help us to divide the large input text data
text_splitter = CharacterTextSplitter(
    separator='\n',
    chunk_size=400,
    chunk_overlap=100
)

In [18]:
docs = loader.load_and_split(
    text_splitter=text_splitter
)

In [20]:
docs[0]

Document(page_content='Q&A with RAG | 🦜️🔗 LangChain Skip to main content Components Integrations Guides API Reference More People Versioning Contributing Templates\nCookbooks Tutorials YouTube 🦜️🔗 LangSmith LangSmith Docs LangServe GitHub Templates GitHub Templates Hub LangChain Hub JS/TS Docs 💬 Search Get', metadata={'source': '/kaggle/working/scraped_content.txt'})

In [22]:
embedding_model = 'all-MiniLM-L6-v2'
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model,
    multi_process=False,
    show_progress=False
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Vector Database to store Embeddings and Text Data

In [23]:
from langchain.vectorstores.chroma import Chroma

In [24]:
# create database and create persist_directory
database = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    persist_directory='data1'
)

In [25]:
vector_store = Chroma(
#     collection_name='website_contents',
    persist_directory='data1',
    embedding_function=embeddings
)

In [26]:
retriever = vector_store.as_retriever(
    search_kwargs={'k':1}
)

# Build LLMs Model and Prompts

In [27]:
from langchain.llms import LlamaCpp
from langchain.schema.runnable import RunnablePassthrough
from langchain.prompts import ChatMessagePromptTemplate
from langchain import FewShotPromptTemplate
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import ChatPromptTemplate

In [28]:
llms_model_path = "/kaggle/input/mistral-7b-instruct/gguf/mistral-7b-instructv0.1/1/mistral-7b-instruct-v0.1.Q8_0.gguf"

In [29]:
callbackManager = CallbackManager([StreamingStdOutCallbackHandler()])

In [30]:
llm = LlamaCpp(
    model_path=llms_model_path,
    temperature=0.2,
    max_tokens=1000,
    top_p=1,
    n_gpu_layers=32,
    callback_manager=callbackManager,
    verbose=False
)

In [166]:
template = """
  Answer the question based on the following context only.
  {context}
  
  Question:{question}
  Answer:
"""

In [167]:
prompt = ChatPromptTemplate.from_template(
    template=template
)

In [170]:
rag_chain = {
        "context": retriever, 
        "question": RunnablePassthrough()
} | prompt | llm

In [171]:
rag_chain.invoke('Tell me skills require for this job')

    The job requires skills in Natural Language Processing (NLP), Computer Vision, Neural Networks and Algorithms, Statistics, Python programming language and its backend framework Flask, working with large datasets and databases, and knowledge of machine learning frameworks such as TensorFlow and PyTorch.

'    The job requires skills in Natural Language Processing (NLP), Computer Vision, Neural Networks and Algorithms, Statistics, Python programming language and its backend framework Flask, working with large datasets and databases, and knowledge of machine learning frameworks such as TensorFlow and PyTorch.'