In [1]:
#!pip install farm-haystack[preprocessing] -q
#!pip install farm-haystack[inference] -q
#!pip install farm-haystack[elasticsearch] -q
#!pip install nltk -q

In [107]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from tqdm.notebook import tqdm

valid_links = []

def get_visitable_links(base_url):
    response = requests.get(base_url, verify=False)
    if response.status_code != 200:
        print(f"Failed to retrieve {base_url}")
        return []
    
    soup = BeautifulSoup(response.text, 'html.parser')
    links = []
    
    for a_tag in soup.find_all('a', href=True):
        full_url = urljoin(base_url, a_tag['href'])
        if '#' not in full_url and '%' not in full_url and full_url.startswith(base_url) and full_url not in links:
            links.append(full_url)
    
    return links


base_url = 'https://en.wikipedia.org/wiki/Bangladesh'
valid_links = get_visitable_links(base_url)


print(f'Total visitable link count: {len(valid_links)}')
for link in valid_links:
    print(link)


Total visitable link count: 57
https://en.wikipedia.org/wiki/Bangladesh
https://en.wikipedia.org/wiki/Bangladesh_(disambiguation)
https://en.wikipedia.org/wiki/Bangladeshi_English
https://en.wikipedia.org/wiki/Bangladeshis
https://en.wikipedia.org/wiki/Bangladeshi_taka
https://en.wikipedia.org/wiki/Bangladesh_Standard_Time
https://en.wikipedia.org/wiki/Bangladesh_Liberation_War
https://en.wikipedia.org/wiki/Bangladesh_Supreme_Court
https://en.wikipedia.org/wiki/Bangladesh_Armed_Forces
https://en.wikipedia.org/wiki/Bangladeshi_Declaration_of_Independence
https://en.wikipedia.org/wiki/Bangladesh_Forces
https://en.wikipedia.org/wiki/Bangladesh_famine_of_1974
https://en.wikipedia.org/wiki/Bangladesh_Krishak_Sramik_Awami_League
https://en.wikipedia.org/wiki/Bangladesh_Rifles_revolt
https://en.wikipedia.org/wiki/Bangladeshi_art
https://en.wikipedia.org/wiki/Bangladesh_Nationalist_Party
https://en.wikipedia.org/wiki/Bangladesh_Environment_Conservation_Act
https://en.wikipedia.org/wiki/Banglad

In [108]:
import requests
from bs4 import BeautifulSoup
from haystack.schema import Document
from tqdm.notebook import tqdm

documents = []

for url in tqdm(valid_links):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check for HTTP errors
        soup = BeautifulSoup(response.text, 'html.parser')

        paragraphs = soup.find_all('p')
        content = '\n'.join([para.get_text() for para in paragraphs])

        doc = Document(content=content, meta={'url': url})
        documents.append(doc)
    except Exception as e:
        print(f'Error processing {url}: {e}')

  0%|          | 0/57 [00:00<?, ?it/s]

In [109]:
print(f'Total no of documents: {len(documents)}')

Total no of documents: 57


In [111]:
from haystack.document_stores import InMemoryDocumentStore, ElasticsearchDocumentStore, FAISSDocumentStore
from haystack.nodes import DensePassageRetriever, FARMReader, BM25Retriever
from haystack.pipelines import ExtractiveQAPipeline
from haystack.utils import print_answers
from haystack.nodes import PreProcessor
from tqdm.notebook import tqdm

document_store = InMemoryDocumentStore(use_bm25=True)
#document_store = ElasticsearchDocumentStore(host="localhost", port=9200)

preprocessor = PreProcessor(
    split_by="word",            # Can also be "sentence" if sentence-based splitting is preferred
    split_length=200,           # Number of words per chunk
    split_respect_sentence_boundary=True,  # Ensure splitting occurs at sentence boundaries
    split_overlap=20            # Overlap between chunks to preserve context
)

processed_documents = preprocessor.process(documents)
document_store.write_documents(processed_documents)
print(f"Number of documents after splitting: {document_store.get_document_count()}")

if len(document_store.get_all_documents()) > 0:
    sample_chunk = document_store.get_all_documents()[0]
    print(sample_chunk.content)
    print(sample_chunk.meta)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/hissain/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Preprocessing: 100%|████████████████████████████████████████████████████████████████████████████| 57/57 [00:01<00:00, 33.95docs/s]
Updating BM25 representation...: 100%|████████████████████████████████████████████████████| 729/729 [00:00<00:00, 13591.36 docs/s]


Number of documents after splitting: 729


Bangladesh,[a] officially the People's Republic of Bangladesh,[b] is a country in South Asia. It is the eighth-most populous country in the world and among the most densely populated with a population of 170 million in an area of 148,460 square kilometres (57,320 sq mi). Bangladesh shares land borders with India to the north, west, and east, and Myanmar to the southeast. To the south, it has a coastline along the Bay of Bengal. To the north, it is separated from Bhutan and Nepal by the Siliguri Corridor, and from China by the mountainous Indian state of Sikkim. Dhaka, the capital and largest city, is the nation's political, financial, and cultural centre. Chittagong is the second-largest city and the busiest port. The official language is Bengali, with Bangladeshi English also used in government.

Bangladesh is part of the historic and ethnolinguistic region of Bengal, which was divided during the Partition of British India in 1947 as the east

In [112]:
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=False)
retriever = BM25Retriever(document_store=document_store)
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)



In [118]:
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import BM25Retriever, Seq2SeqGenerator
from haystack.pipelines import GenerativeQAPipeline

# Initialize FAISS document store
document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")

# Initialize retriever
retriever = BM25Retriever(document_store=document_store)

# Initialize generator (using a Seq2Seq model like BART)
generator = Seq2SeqGenerator(model_name_or_path="facebook/bart-large-cnn")

# Create a pipeline with retriever and generator
pipeline = GenerativeQAPipeline(generator=generator, retriever=retriever)

# Ask a question to the pipeline
query = "Is Bangladesh a secular country?"
result = pipeline.run(query=query, params={"Retriever": {"top_k": 5}})

# Print the generated answers
for answer in result['answers']:
    print(answer.answer)


ImportError: cannot import name 'Seq2SeqGenerator' from 'haystack.nodes' (/Users/hissain/myenv/lib/python3.11/site-packages/haystack/nodes/__init__.py)

In [93]:
# Step 7: Ask a question to the RAG pipeline
query = "How is China Bangladesh relationship?"

# Get answers from the pipeline
prediction = pipeline.run(
    query=query,
    params={
        "Retriever": {"top_k": 10},  # Number of documents to retrieve
        "Reader": {"top_k": 1}      # Number of answers to return
    }
)

print('\n')

# Print the answers
print_answers(prediction, details="minimum")

Inferencing Samples: 100%|████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.97s/ Batches]



'Query: How is China Bangladesh relationship?'
'Answers:'
[   {   'answer': 'highly tilted in favour of Beijing',
        'context': 'porting to China.[27]\n'
                   'Bangladesh-China bilateral trade is highly tilted in '
                   "favour of Beijing,[28][29][30]  and Bangladesh's bilateral "
                   'trade deficit wi'}]





In [106]:
# Step 7: Ask a question to the RAG pipeline
query = "Tell me about Bangladesh?"

# Get answers from the pipeline
prediction = pipeline.run(
    query=query,
    params={
        "Retriever": {"top_k": 10},  # Number of documents to retrieve
        "Reader": {"top_k": 1}      # Number of answers to return
    }
)

print('\n')

# Print the answers
print_answers(prediction, details="all")

Inferencing Samples: 100%|████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.80s/ Batches]



'Query: Tell me about Bangladesh?'
'Answers:'
[   <Answer {'answer': "Bangladesh's flagship export-oriented ready-made garment industry", 'type': 'extractive', 'score': 0.3153889775276184, 'context': "n Bangladesh's formal economy was minimal. Bangladesh's flagship export-oriented ready-made garment industry, however, with female labor accounting fo", 'offsets_in_document': [{'start': 723, 'end': 788}], 'offsets_in_context': [{'start': 43, 'end': 108}], 'document_ids': ['8859bc28fd21591b67d7df7cf70de8bc'], 'meta': {'url': 'https://en.wikipedia.org/wiki/Bangladesh_textile_industry', '_split_id': 20, '_split_overlap': [{'doc_id': '59a8281c4baa6e06eefc890ab2e83999', 'range': (0, 285)}, {'doc_id': 'f58d1e90a66185ecb7e843173691d3bf', 'range': (1048, 1240)}]}}>]



