In [1]:
#!pip install farm-haystack[preprocessing] -q
#!pip install farm-haystack[inference] -q
#!pip install farm-haystack[elasticsearch] -q
#!pip install nltk -q

In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def get_visitable_links(base_url):
    # Send a GET request to the base URL
    response = requests.get(base_url)
    # Check if the request was successful
    if response.status_code != 200:
        print(f"Failed to retrieve {base_url}")
        return []
    
    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')
    links = []
    
    # Find all anchor tags in the page
    for a_tag in soup.find_all('a', href=True):
        # Get the full URL
        full_url = urljoin(base_url, a_tag['href'])
        
        # Check if the URL is visitable (not a mailto or javascript link)
        if full_url.startswith(base_url):
            links.append(full_url)
    
    return links

# Example usage
base_url = 'https://github.com/hissain/'  # Replace with your desired URL
visitable_links = get_visitable_links(base_url)

# Print the collected links
for link in visitable_links:
    print(link)


https://github.com/hissain/#start-of-content
https://github.com/hissain/
https://github.com/hissain/
https://github.com/hissain/
https://github.com/hissain/hissain
https://github.com/hissain/#about-me
https://github.com/hissain/#stackoverflow
https://github.com/hissain/CoronaTracker
https://github.com/hissain/CoronaTracker/stargazers
https://github.com/hissain/CoronaTracker/forks
https://github.com/hissain/minimal-cmake
https://github.com/hissain/minimal-cmsis-dsp
https://github.com/hissain/mlworks
https://github.com/hissain/mlworks/stargazers
https://github.com/hissain/dsp-spectrogram


In [3]:
# Step 1: Scrape webpages and prepare documents

import requests
from bs4 import BeautifulSoup
from haystack.schema import Document

# Replace this list with your actual list of blog URLs
#urls = [
#    'https://en.wikipedia.org/wiki/Bangladesh',
#    'https://en.wikipedia.org/wiki/India',
    # Add more URLs as needed
#]

urls = visitable_links

documents = []

for url in urls:
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check for HTTP errors
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract text from all paragraph tags
        paragraphs = soup.find_all('p')
        content = '\n'.join([para.get_text() for para in paragraphs])

        # Create a Haystack Document with content and optional metadata
        doc = Document(content=content, meta={'url': url})
        documents.append(doc)
    except Exception as e:
        print(f'Error processing {url}: {e}')

In [4]:
print(f'Total no of documents: {len(documents)}')

Total no of documents: 15


In [5]:
from haystack.document_stores import InMemoryDocumentStore, ElasticsearchDocumentStore, FAISSDocumentStore
from haystack.nodes import DensePassageRetriever, FARMReader, BM25Retriever
from haystack.pipelines import ExtractiveQAPipeline
from haystack.utils import print_answers
from haystack.nodes import PreProcessor
from haystack.schema import Document

# Step 2: Initialize an in-memory document store
document_store = InMemoryDocumentStore(use_bm25=True)
#document_store = ElasticsearchDocumentStore(host="localhost", port=9200)

# Initialize PreProcessor with desired settings
preprocessor = PreProcessor(
    split_by="word",            # Can also be "sentence" if sentence-based splitting is preferred
    split_length=300,           # Number of words per chunk
    split_respect_sentence_boundary=True,  # Ensure splitting occurs at sentence boundaries
    split_overlap=50            # Overlap between chunks to preserve context
)

# Process the scraped documents to split them
processed_documents = preprocessor.process(documents)

# Now, 'processed_documents' will contain smaller chunks ready for the document store
document_store.write_documents(processed_documents)

# Verify the number of chunks after splitting
print(f"Number of documents after splitting: {document_store.get_document_count()}")

# Example: Print a chunk to inspect
sample_chunk = document_store.get_all_documents()[0]
print(sample_chunk.content)
print(sample_chunk.meta)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/hissain/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Preprocessing: 100%|████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 44.33docs/s]
Updating BM25 representation...: 100%|██████████████████████████████████████████████████████| 13/13 [00:00<00:00, 12997.84 docs/s]

Number of documents after splitting: 13
We read every piece of feedback, and take your input very seriously.

To see all available qualifiers, see our documentation.

Prevent this user from interacting with your repositories and sending you notifications.
Learn more about blocking users.

You must be logged in to block users.

Contact GitHub support about this user’s behavior.
Learn more about reporting abuse.

I am an accomplished Associate Architect with over 13 years of experience in Mobile and Wearables Software Development. With a deep understanding of the challenges inherent in developing performant, robust, testable, and maintainable applications, from requirement analysis to architecture, design, development, and maintenance, I am confident in my ability to lead software development and commercialization for any organization. My experience includes working on Samsung Health for iOS, an application boasting over 6 million users and approximately 160K DAU on the App Store market,




In [9]:
# Step 4: Initialize a reader model (FARMReader)
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)

# Step 4: Initialize BM25Retriever for BM25 retrieval (keyword-based retrieval)
retriever = BM25Retriever(document_store=document_store)

# Step 5: Create a pipeline using BM25Retriever and the FARMReader
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)

In [10]:
# Step 7: Ask a question to the RAG pipeline
query = "What are the challenges for Covid-19?"

# Get answers from the pipeline
prediction = pipeline.run(
    query=query,
    params={
        "Retriever": {"top_k": 5},  # Number of documents to retrieve
        "Reader": {"top_k": 2}      # Number of answers to return
    }
)

print('\n')

# Print the answers
print_answers(prediction, details="minimum")

Inferencing Samples: 100%|████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.25s/ Batches]



'Query: What are the challenges for Covid-19?'
'Answers:'
[   {   'answer': 'information accuracy, information retrieval delay etc.',
        'context': ' this approach has many limitations in terms of '
                   'information accuracy, information retrieval delay etc. due '
                   'to mostly,\n'
                   'Detail architecture can be explo'},
    {   'answer': 'to flatten the curve of infection',
        'context': ' challenges for any government of the affected counties is '
                   'to flatten the curve of infection. Early detection of '
                   'already affected patients is the most'}]





In [13]:
# Step 7: Ask a question to the RAG pipeline
query = "What interests Hissain most?"

# Get answers from the pipeline
prediction = pipeline.run(
    query=query,
    params={
        "Retriever": {"top_k": 5},  # Number of documents to retrieve
        "Reader": {"top_k": 2}      # Number of answers to return
    }
)

print('\n')

# Print the answers
print_answers(prediction, details="minimum")

Inferencing Samples: 100%|████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.45s/ Batches]



'Query: What interests Hissain most?'
'Answers:'
[   {   'answer': 'Technological Innovation, Human-Machine Interaction, '
                  'Information Theory, Astronomy, Probability, Theory of '
                  'Relativity, Philosophy of Science, Piano, Guitar, and '
                  'Poetry',
        'context': ' Technological Innovation, Human-Machine Interaction, '
                   'Information Theory, Astronomy, Probability, Theory of '
                   'Relativity, Philosophy of Science, Piano, Guitar, and '
                   'Poetry'},
    {   'answer': 'Technological Innovation, Human-Machine Interaction',
        'context': 'ss score generation.\n'
                   'My special interests include Technological Innovation, '
                   'Human-Machine Interaction, Information Theory, Astronomy, '
                   'Probability, The'}]



