In [None]:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# Directory that downloaded images are written to; print_element passes it to
# save_image and also uses it to build the Markdown image path.
download_folder = 'downloaded_images/'
# URLs scrape_page has already handled. Module-level, so it persists across
# calls until this cell is run again.
visited_urls = set()
# Text file receiving the scraped content: truncated once by the entry-point
# code below, then appended to by print_element.
output_file = 'scraped_content.txt'

def save_image(image_url, folder, timeout=30):
    """Download one image and store it inside ``folder``.

    Args:
        image_url: Absolute URL of the image to fetch.
        folder: Directory the file is written to; created when missing.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The file name (last segment of the URL path) on success, or ``None``
        when the URL has no file name, the server does not answer with
        HTTP 200, or the download / write fails. Failures are reported with
        ``print`` and never raised to the caller.
    """
    # A URL such as "https://host/" has no final path segment; joining an
    # empty name onto the folder would make open() target the directory.
    img_name = os.path.basename(urlparse(image_url).path)
    if not img_name:
        print(f"Skipping image without a file name: {image_url}")
        return None

    try:
        # Without a timeout a single stalled server blocks the whole crawl.
        img_response = requests.get(image_url, timeout=timeout)
        if img_response.status_code != 200:
            print(f"Failed to retrieve image. Status code: {img_response.status_code}")
            return None

        os.makedirs(folder, exist_ok=True)
        img_path = os.path.join(folder, img_name)
        with open(img_path, 'wb') as img_file:
            img_file.write(img_response.content)
        return img_name
    except (requests.exceptions.RequestException, OSError) as e:
        # OSError covers file-system problems (unwritable folder, bad name).
        print(f"An error occurred while saving image: {e}")
        return None

def print_element(element, url, level=0):
    """Append a text / Markdown rendering of one HTML element to ``output_file``.

    Args:
        element: Parsed tag. Only ``h1``-``h3``, ``p``, ``ul``, ``img`` and
            ``table`` produce output; any other tag is ignored.
        url: Address of the page the element came from, used to resolve a
            relative image ``src``.
        level: Page nesting depth; paragraphs, list items, images and table
            rows are indented by four spaces per level (headings are not).

    Side effects:
        For ``img`` tags the image is downloaded through ``save_image`` into
        the module-level ``download_folder``.
    """
    indent = '    ' * level
    heading_marks = {'h1': '#', 'h2': '##', 'h3': '###'}
    tag = element.name
    lines = []

    if tag in heading_marks:
        lines.append(f"\n{heading_marks[tag]} {element.get_text()}\n")
    elif tag == 'p':
        lines.append(f"{indent}{element.get_text()}\n")
    elif tag == 'ul':
        for li in element.find_all('li'):
            lines.append(f"{indent} - {li.get_text()}\n")
    elif tag == 'img':
        src = element.get('src')
        # urljoin(url, None) returns the page URL itself, so an <img> without
        # src would download the page's HTML and store it as an image.
        if src:
            img_name = save_image(urljoin(url, src), download_folder)
            if img_name:
                img_path = os.path.join(download_folder, img_name)
                lines.append(f"{indent}![{element.get('alt', 'Image')}]({img_path})\n")
    elif tag == 'table':
        for row in element.find_all('tr'):
            cols = row.find_all(['th', 'td'])
            col_texts = [col.get_text(strip=True) for col in cols]
            lines.append(indent + ' | '.join(col_texts) + '\n')

    # Only touch the output file when there is something to write.
    if lines:
        with open(output_file, 'a', encoding='utf-8') as f:
            f.writelines(lines)

def scrape_page(url, base_url, download_folder='downloaded_images', level=0):
    """Crawl ``url`` and every same-site page reachable from it, depth first.

    Each page's main content (``div.main-content``, falling back to
    ``div.content``) is handed element by element to ``print_element``; then
    every ``<a href>`` on the page that stays on the site is crawled in turn.

    Args:
        url: Page to start from.
        base_url: ``scheme://host`` of the site; only links with exactly this
            scheme and host are followed.
        download_folder: Kept for backward compatibility. It is not used
            here: ``print_element`` saves images to the module-level
            ``download_folder``.
        level: Depth of ``url``; linked pages are processed at ``level + 1``
            and the value is forwarded to ``print_element`` for indentation.

    Pages already present in the module-level ``visited_urls`` are skipped,
    and every page handled here is added to it. Network errors and non-200
    answers are reported with ``print`` and do not stop the crawl.
    """
    site = urlparse(base_url)
    site_key = (site.scheme, site.netloc.lower())

    # Explicit stack instead of recursion: a long chain of pages would
    # otherwise exceed Python's recursion limit. Pushing a page's links in
    # reverse keeps the same visiting order as the recursive version.
    pending = [(url, level)]
    while pending:
        page_url, depth = pending.pop()
        if page_url in visited_urls:
            continue
        visited_urls.add(page_url)

        try:
            # Without a timeout one stalled server blocks the whole crawl.
            response = requests.get(page_url, timeout=30)
        except requests.exceptions.RequestException as e:
            print(f"An error occurred while retrieving {page_url}: {e}")
            continue

        if response.status_code != 200:
            print(f"Failed to retrieve the website. Status code: {response.status_code}")
            continue

        # PDFs, feeds, images etc. are not HTML; parsing them only yields
        # noise, so skip them when the server declares another type.
        content_type = response.headers.get('Content-Type', '')
        if content_type and 'html' not in content_type.lower():
            print(f"Skipping non-HTML content at {page_url} ({content_type})")
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        main_content = soup.find('div', {'class': 'main-content'})
        if not main_content:
            main_content = soup.find('div', {'class': 'content'})

        if main_content:
            for element in main_content.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'img', 'table', 'a']):
                print_element(element, page_url, depth)
        else:
            print(f"Main content not found on {page_url}. Please check the class name used to identify the main content.")

        children = []
        for link in soup.find_all('a', href=True):
            try:
                # Relative links are relative to the page they appear on,
                # not to the site root. Fragments address a position inside
                # a page, so they are dropped to avoid re-crawling it.
                target = urlparse(urljoin(page_url, link['href']))._replace(fragment='')
            except ValueError:
                continue  # malformed href
            link_url = target.geturl()
            # Compare scheme and host exactly; a substring test would also
            # accept foreign URLs that merely contain base_url.
            same_site = (target.scheme, target.netloc.lower()) == site_key
            if same_site and link_url not in visited_urls:
                children.append((link_url, depth + 1))
        pending.extend(reversed(children))

# Ask for the start page; only absolute http(s) URLs are accepted.
project_url = input("Enter the URL of the project page: ").strip()

if not project_url.startswith(('http://', 'https://')):
    print("Invalid URL. Please ensure the URL starts with http:// or https://")
else:
    # The crawl root is the scheme and host of the start page.
    parsed_start = urlparse(project_url)
    base_url = f"{parsed_start.scheme}://{parsed_start.netloc}"

    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    # Start from a fresh output file; print_element appends to it afterwards.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(f"Scraping website: {project_url}\n\n")

    scrape_page(project_url, base_url)

Enter the URL of the project page: https://ohsl.us
Failed to retrieve the website. Status code: 404


  k = self.parse_starttag(i)


Main content not found on https://ohsl.us/rss.xml. Please check the class name used to identify the main content.
Main content not found on https://ohsl.us/sites/default/files/2020-08/Poster%20November%2019%20Background.pdf. Please check the class name used to identify the main content.
Main content not found on https://ohsl.us/sites/default/files/2020-08/Tony%20Joseph%20Poster.pdf. Please check the class name used to identify the main content.
Main content not found on https://ohsl.us/sites/default/files/2020-08/Philip%20Bourne%20Announcement.pdf. Please check the class name used to identify the main content.
Main content not found on https://ohsl.us/sites/default/files/2020-08/Buetow_Announcement.pdf. Please check the class name used to identify the main content.
Failed to retrieve the website. Status code: 404
Failed to retrieve the website. Status code: 404
Failed to retrieve the website. Status code: 404
Main content not found on https://ohsl.us/sites/default/files/2022-03/Explain

In [None]:
!pip install gradio
!pip install pypdf
!pip install -q transformers einops accelerate langchain bitsandbytes
!pip install sentence_transformers
!pip install llama_index
%pip install llama-index-llms-huggingface
%pip install llama-index-embeddings-langchain
!pip install -U langchain-community

Collecting gradio
  Downloading gradio-4.37.1-py3-none-any.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m87.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==1.0.2 (from gradio)
  Downloading gradio_client-1.0.2-py3-none-any.whl (318 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.2/318.2 kB[0m [31m43.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━

In [None]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Document
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core import PromptTemplate
import torch
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index.core import ServiceContext
from llama_index.embeddings.langchain import LangchainEmbedding
import gradio as gr




In [None]:
# Read the scraped content from the .txt file
def load_text_file(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents

# Wrap the whole scraped text file in a single Document.
text_content = load_text_file('scraped_content.txt')
documents = [Document(text=text_content)]

system_prompt = """
You are a Q&A assistant. Your goal is to answer questions as
accurately as possible based on the instructions and context provided.
"""

# The query string is passed through as-is, with no surrounding template text.
query_wrapper_prompt = PromptTemplate("{query_str}")

# One checkpoint id is used for both the tokenizer and the model weights.
llama2_checkpoint = "daryl149/llama-2-7b-chat-hf"

# NOTE(review): transformers reports `load_in_8bit` as deprecated in favour of
# a `BitsAndBytesConfig` passed as `quantization_config` (see the warning this
# cell prints); migrate when the transformers version is upgraded.
llm = HuggingFaceLLM(
    model_name=llama2_checkpoint,
    tokenizer_name=llama2_checkpoint,
    system_prompt=system_prompt,
    query_wrapper_prompt=query_wrapper_prompt,
    context_window=4096,
    max_new_tokens=256,
    # Sampling is switched off.
    generate_kwargs={"temperature": 0.0, "do_sample": False},
    # Let the loader decide device placement.
    device_map="auto",
    model_kwargs={"torch_dtype": torch.float16, "load_in_8bit": True}
)

# Sentence-transformers encoder, adapted to the llama_index embedding interface.
sentence_encoder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
embed_model = LangchainEmbedding(sentence_encoder)

# Bundle the LLM, the embedder and the chunking settings for indexing.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    chunk_size=512,  # Reduced chunk size
)

# Build the vector index over the document and expose it as a query engine.
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

query_engine = index.as_query_engine()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


pytorch_model.bin.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
  warn_deprecated(


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  service_context = ServiceContext.from_defaults(


In [None]:
# Function to handle chat queries and return responses
def chat(query, history=None):
    """Answer ``query`` with the query engine and record the exchange.

    Args:
        query: Question text to send to ``query_engine``.
        history: List of ``(query, answer)`` pairs so far. The new pair is
            appended to this list in place; a new list is started when it is
            omitted or ``None``.

    Returns:
        A 2-tuple holding the updated history twice, matching the two output
        slots the caller wires to this function.
    """
    # A literal `[]` default is created once and shared by every call that
    # omits `history`, so unrelated conversations would leak into each other.
    if history is None:
        history = []
    response = query_engine.query(query)
    history.append((query, str(response)))
    return history, history

# Gradio interface
# Stretch the Gradio container to the full viewport height.
page_css = ".gradio-container {height: 100vh !important}"

with gr.Blocks(css=page_css) as demo:
    gr.Markdown("# Simple Chatbot", elem_id="title")
    chatbot = gr.Chatbot()
    query = gr.Textbox(placeholder="Enter your query here...", show_label=False)
    submit = gr.Button("Submit")

    def submit_query(query, history):
        """Pass the textbox text and the current chat history on to ``chat``."""
        updated = chat(query, history)
        return updated

    # `chat` returns the history twice, so the Chatbot is listed twice as output.
    submit.click(
        fn=submit_query,
        inputs=[query, chatbot],
        outputs=[chatbot, chatbot],
    )

# share=True requests a public share link in addition to the local server.
demo.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://6a506fe4701eb616cb.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [None]:
# Send one sample question to the query engine and show its answer.
question = "what is project goals"
response = query_engine.query(question)
print(response)




The project aims to create an AI-Powered Sign Language Virtual Assistant to address the challenges faced by deaf or mute individuals in utilizing voice-based virtual assistants. The goal of the project is to develop an interface that will help the Deaf-mutes to use these Virtual Assistants easily. Designing such an interface will make them find their freedom while using such technologies and might boost their confidence in this Digital Age.
