In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

# Function to fetch and parse a webpage
def fetch_page(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

# Function to get all internal links from a webpage (limited to 5)
def get_internal_links(base_url, soup, limit=5):
    internal_links = set()
    count = 0
    for link in soup.find_all('a', href=True):
        if count >= limit:
            break
        href = link['href']
        full_url = urljoin(base_url, href)
        if urlparse(full_url).netloc == urlparse(base_url).netloc:
            internal_links.add(full_url)
            count += 1
    return internal_links

# Function to crawl the website
def crawl_website(base_url, link_limit=5):
    to_visit = {base_url}
    visited = set()
    while to_visit:
        current_url = to_visit.pop()
        if current_url not in visited:
            print(f"Crawling: {current_url}")
            visited.add(current_url)
            soup = fetch_page(current_url)
            if soup:
                internal_links = get_internal_links(base_url, soup, link_limit)
                to_visit.update(internal_links - visited)
            time.sleep(1)  # Respectful crawling by adding delay
    return visited

# Base URL of the website to crawl
base_url = 'https://www.pratham.org'

# Start crawling
urls = crawl_website(base_url,100)

# Output the results
print(f"Total pages found: {len(urls)}")
for page in urls:
    print(page)

import requests
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Ensure you have the necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Function to scrape a webpage
def scrape_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

# Function to extract relevant data
def extract_data(soup):
    data = []
    for p in soup.find_all('p'):
        data.append(p.get_text())
    return data


# Collect data
all_data = []
for url in urls:
    soup = scrape_page(url)
    data = extract_data(soup)
    all_data.extend(data)

# Convert to DataFrame for further processing
df = pd.DataFrame(all_data, columns=['text'])

# Clean text data
def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

df['cleaned_text'] = df['text'].apply(clean_text)

# Create TF-IDF matrix
vectorizer = TfidfVectorizer(max_features=50)
X = vectorizer.fit_transform(df['cleaned_text'])
feature_names = vectorizer.get_feature_names_out()

# Extract top 50 features for each document
def extract_summary(row):
    indices = X[row.name].nonzero()[1]
    top_features = [feature_names[i] for i in indices]
    return ' '.join(top_features)

df['summary'] = df.apply(extract_summary, axis=1)

# Load model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Function to generate a response
def generate_response(question, knowledge_base):
    context = ' '.join(knowledge_base[:5])  # Using a subset of the knowledge base for clarity
    input_text = f"Context: {context}\nQ: {question}\nA:"
    inputs = tokenizer.encode(input_text, return_tensors='pt')
    max_input_length = inputs.shape[1]
    # Adjust max_length to accommodate both input length and generated tokens
    outputs = model.generate(
        inputs, 
        max_length=max_input_length + 50,  # Ensure space for generating new tokens
        num_return_sequences=1, 
        pad_token_id=tokenizer.eos_token_id
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Combine summaries for the knowledge base
knowledge_base = df['summary'].tolist()

# Example queries
questions = [
    "What is Pratham's mission?",
    "How does Pratham help in education?",
    "Where is Pratham located?",
    "What are the latest updates from Pratham?",
    "How can I contact Pratham?"
]

# Generate responses for each question
for question in questions:
    response = generate_response(question, knowledge_base)
    print(f"Q: {question}\nA: {response}\n")

df['summary']

Crawling: https://www.pratham.org
Crawling: https://www.pratham.org/about/board/
Crawling: https://www.pratham.org/2019/05/21/vibha-paul/
Crawling: https://www.pratham.org/about/leadership/
Crawling: https://www.pratham.org/about/recognition/
Crawling: https://www.pratham.org/get-involved/donate
Error fetching https://www.pratham.org/get-involved/donate: Exceeded 30 redirects.
Crawling: https://www.pratham.org/2019/06/03/csr-leader-award-by-ceo-india-to-co-founders-madhav-chavan-farida-lambay/
Crawling: https://www.pratham.org/slider/second-chance-slider/
Crawling: https://www.pratham.org/programs/education/
Crawling: https://www.pratham.org/contact/
Crawling: https://www.pratham.org/2019/05/04/arvind-sanger/
Crawling: https://www.pratham.org/2019/06/03/agent-of-social-change-as-part-of-gq-men-of-the-year-awards-to-madhav-chavan-2015/
Crawling: https://www.pratham.org/2019/05/21/bbva-foundation-frontiers-of-knowledge-award-2014/
Crawling: https://www.pratham.org/2019/12/25/vilas-gadkar

KeyboardInterrupt: 

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Ensure you have the necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Define important URLs to scrape
important_urls = [
    "https://www.pratham.org/about/",
    "https://www.pratham.org/about/board/",
    "https://www.pratham.org/about/leadership/",
    "https://www.pratham.org/programs/",
    "https://www.pratham.org/programs/learning-with-the-children/",
    "https://www.pratham.org/programs/digital-initiatives/",
    "https://www.pratham.org/programs/vocational-training/",
    "https://www.pratham.org/get-involved/",
    "https://www.pratham.org/get-involved/donate/",
    "https://www.pratham.org/contact/",
    # Add more specific URLs as needed
]

# Function to scrape a webpage
def scrape_page(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

# Function to extract relevant data
def extract_data(soup):
    data = []
    for p in soup.find_all('p'):
        data.append(p.get_text())
    return data

# Collect data from important URLs
all_data = []
for url in important_urls:
    soup = scrape_page(url)
    if soup:  # Check if the page was successfully fetched
        data = extract_data(soup)
        all_data.extend(data)

# Convert to DataFrame for further processing
df = pd.DataFrame(all_data, columns=['text'])

# Clean text data
def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

df['cleaned_text'] = df['text'].apply(clean_text)

# Create TF-IDF matrix
vectorizer = TfidfVectorizer(max_features=50)
X = vectorizer.fit_transform(df['cleaned_text'])
feature_names = vectorizer.get_feature_names_out()

# Extract top 50 features for each document
def extract_summary(row):
    indices = X[row.name].nonzero()[1]
    top_features = [feature_names[i] for i in indices]
    return ' '.join(top_features)

df['summary'] = df.apply(extract_summary, axis=1)

# Load model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Function to generate a response
def generate_response(question, knowledge_base):
    context = ' '.join(knowledge_base[:5])  # Using a subset of the knowledge base for clarity
    input_text = f"Context: {context}\nQ: {question}\nA:"
    inputs = tokenizer.encode(input_text, return_tensors='pt')
    max_input_length = inputs.shape[1]
    # Adjust max_length to accommodate both input length and generated tokens
    outputs = model.generate(
        inputs, 
        max_length=max_input_length + 50,  # Ensure space for generating new tokens
        num_return_sequences=1, 
        pad_token_id=tokenizer.eos_token_id
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Check if the response is relevant (simple check for now)
    if "contact" in response.lower() or len(response.strip()) < 50:
        return "I'm sorry, I couldn't find the information you're looking for. Please visit Pratham's website or contact them directly for more details."

    return response

# Combine summaries for the knowledge base
knowledge_base = df['summary'].tolist()

# Example queries
questions = [
    "What is Pratham's mission?",
    "How does Pratham help in education?",
    "Where is Pratham located?",
    "What are the latest updates from Pratham?",
    "How can I contact Pratham?",
    "What is Pratham's annual budget?"
]

# Generate responses for each question
for question in questions:
    response = generate_response(question, knowledge_base)
    print(f"Q: {question}\nA: {response}\n")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jayan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jayan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Error fetching https://www.pratham.org/programs/learning-with-the-children/: 404 Client Error: Not Found for url: https://www.pratham.org/programs/learning-with-the-children/
Error fetching https://www.pratham.org/get-involved/donate/: Exceeded 30 redirects.
Q: What is Pratham's mission?
A: Context: pratham learning education india children mumbai pratham education india children also school programs state pratham learning programs years pratham learning education children also programs
Q: What is Pratham's mission?
A: Pratham is a non-profit organization that provides educational services to children in the state of India. It is a non-profit organization that provides educational services to children in the state of India. It is a non-profit organization that provides educational services

Q: How does Pratham help in education?
A: Context: pratham learning education india children mumbai pratham education india children also school programs state pratham learning programs years pratha

In [15]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer

# Ensure you have the necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

# Define a hierarchical structure for the URLs
urls_structure = {
    "https://www.pratham.org/about/": [
        "https://www.pratham.org/about/board/",
        "https://www.pratham.org/about/leadership/",
        "https://www.pratham.org/about/partners/",
        "https://www.pratham.org/about/teaching-at-the-right-level/",
        "https://www.pratham.org/about/recognition/",
        "https://www.pratham.org/about/news/",
        "https://www.pratham.org/about/legal-financial-information/"
    ],
    "https://www.pratham.org/programs/": [
        "https://www.pratham.org/programs/education/",
        "https://www.pratham.org/about/hamara-gaon/",
        "https://www.pratham.org/programs/education/early-childhood-education/",
        "https://www.pratham.org/programs/education/elementary/",
        "https://www.pratham.org/programs/education/beyond-elementary/",
        "https://www.pratham.org/programs/education/beyond-elementary/",
        "https://www.pratham.org/programs/education/digital-initiatives/",
        "https://www.pratham.org/programs/digital-initiatives/",
        "https://www.pratham.org/programs/vocational-training/",
        "https://www.pratham.org/programs/pratham-council-for-vulnerablechildren/",
        "https://www.pratham.org/programs/education/aser/"
    ],
    "https://www.pratham.org/get-involved/": [
        "https://www.pratham.org/get-involved/job-opportunities/",
        "https://www.pratham.org/get-involved/internships/"
    ],
    "https://www.pratham.org/contact/": [
        "https://www.pratham.org/contact/"
    ]
}

# Function to scrape a webpage
def scrape_page(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return None

# Function to extract relevant data
def extract_data(soup):
    data = []
    if soup:
        for p in soup.find_all('p'):
            data.append(p.get_text())
    return data

# Collect data from all URLs in the structure
all_data = []
for main_url, sub_urls in urls_structure.items():
    soup = scrape_page(main_url)
    if soup:  # Check if the main page was successfully fetched
        data = extract_data(soup)
        all_data.extend(data)
    for sub_url in sub_urls:
        soup = scrape_page(sub_url)
        if soup:  # Check if the sub-page was successfully fetched
            data = extract_data(soup)
            all_data.extend(data)

# Convert to DataFrame for further processing
df = pd.DataFrame(all_data, columns=['text'])

# Clean text data
def clean_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

df['cleaned_text'] = df['text'].apply(clean_text)

# Create TF-IDF matrix
vectorizer = TfidfVectorizer(max_features=50)
X = vectorizer.fit_transform(df['cleaned_text'])
feature_names = vectorizer.get_feature_names_out()

# Extract top 50 features for each document
def extract_summary(row):
    indices = X[row.name].nonzero()[1]
    top_features = [feature_names[i] for i in indices]
    return ' '.join(top_features)

df['summary'] = df.apply(extract_summary, axis=1)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jayan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jayan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
import pandas as pd

# Load model and tokenizer from Hugging Face
model_name = "deepset/bert-base-cased-squad2"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Initialize the QA pipeline
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Create a combined knowledge base from the summary column
knowledge_base = " ".join(df['summary'].dropna().tolist())

# Function to generate a response
def generate_response(question, knowledge_base):
    if not knowledge_base.strip():
        return "I'm sorry, I couldn't find the information you're looking for. Please visit Pratham's website or contact them directly for more details."

    result = qa_pipeline(question=question, context=knowledge_base)
    answer = result.get('answer', '')

    if not answer:
        return "I'm sorry, I couldn't find the information you're looking for. Please visit Pratham's website or contact them directly for more details."
    return answer

# Example queries
questions = [
    "What is Pratham's mission?",
    "How does Pratham help in education?",
    "Where is Pratham located?",
    "What are the latest updates from Pratham?",
    "How can I contact Pratham?"
]

# Generate responses for each question
for question in questions:
    response = generate_response(question, knowledge_base)
    print(f"Q: {question}\nA: {response}\n")


Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Q: What is Pratham's mission?
A: children  learning   job opportunities

Q: How does Pratham help in education?
A: pratham vocational training

Q: Where is Pratham located?
A: india

Q: What are the latest updates from Pratham?
A: programs education vocational training education annual status report

Q: How can I contact Pratham?
A: contact number chavan maharashtra 022



In [8]:
df['summary']

0       pratham learning education india children mumbai
1                                                pratham
2      education india children also school programs ...
3                        pratham learning programs years
4      pratham learning education children also programs
                             ...                        
349                                         delhi office
350    delhi enclave 1st floor new email infoprathamo...
351                                        mumbai office
352    mumbai floor email infoprathamorg contact numb...
353                                                     
Name: summary, Length: 354, dtype: object