In [None]:
!pip install requests beautifulsoup4  transformers
import sqlite3



In [None]:
import os
import requests
from bs4 import BeautifulSoup
import sqlite3
from transformers import pipeline

# Function to scrape Wikipedia data
def scrape_wikipedia(url, timeout=10):
    """Download a Wikipedia article and flatten it to plain text.

    Headings (h1-h3) are rendered with an '=' underline, paragraphs are kept
    as-is, and every list item is emitted on its own line prefixed with " - ".

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may wait indefinitely on a stalled
            connection.

    Returns:
        ``{"title": ..., "content": ...}`` on success, or ``{"error": ...}``
        when the request fails, the status code is not 200, or the expected
        page elements are missing. No exception is propagated to the caller.
    """
    try:
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            return {"error": f"Failed to retrieve page. Status code: {response.status_code}"}

        soup = BeautifulSoup(response.content, 'html.parser')

        # Both lookups return None on pages that do not follow the Wikipedia
        # layout; report that explicitly instead of failing on attribute access.
        title_tag = soup.find('h1', {'id': 'firstHeading'})
        content_section = soup.find('div', {'id': 'bodyContent'})
        if title_tag is None or content_section is None:
            return {"error": "Page does not look like a Wikipedia article (title or body content not found)."}

        full_content = []
        for element in content_section.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'ol']):
            text = element.text.strip()
            if element.name.startswith('h'):
                # Underline with the length of the stripped heading so the
                # rule matches the text that is actually printed above it.
                full_content.append(f"\n{text}\n{'=' * len(text)}\n")
            elif element.name == 'p':
                full_content.append(text)
            elif element.name in ['ul', 'ol']:
                for li in element.find_all('li'):
                    full_content.append(f" - {li.text.strip()}")

        return {"title": title_tag.text, "content": "\n".join(full_content)}

    except Exception as e:
        # Network failures, timeouts and parser errors all end up here.
        return {"error": f"An error occurred: {str(e)}"}

# Function to create an SQLite database and insert data
def setup_database(db_path='wikipedia_data.db'):
    """Create a fresh SQLite database containing an empty WikipediaPages table.

    Any existing file at ``db_path`` is deleted first, so previously stored
    pages are discarded.

    Args:
        db_path: Location of the database file. Defaults to
            ``'wikipedia_data.db'`` in the current working directory.

    Returns:
        A ``(connection, cursor)`` tuple. The caller is responsible for
        committing further changes and for closing the connection.

    Raises:
        OSError: If the existing file cannot be removed.
        sqlite3.Error: If the database cannot be opened or the table cannot
            be created (the connection is closed before re-raising).
    """
    # Start from a clean slate: drop whatever a previous run left behind.
    if os.path.exists(db_path):
        os.remove(db_path)

    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute('''CREATE TABLE IF NOT EXISTS WikipediaPages (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        title TEXT NOT NULL,
                        content TEXT NOT NULL
                    )''')
        conn.commit()
    except Exception:
        # Do not leak the open connection when the schema cannot be created.
        conn.close()
        raise
    return conn, cursor

# Function to insert scraped data into the SQLite database
def insert_data_into_db(cursor, title, content):
    """Add one scraped page to the WikipediaPages table.

    The values are bound as SQL parameters. Nothing is committed here; that
    is left to the caller.
    """
    statement = 'INSERT INTO WikipediaPages (title, content) VALUES (?, ?)'
    row_values = (title, content)
    cursor.execute(statement, row_values)

# Function to retrieve content from the database
def get_content_from_db(cursor):
    """Return the content of the newest WikipediaPages row, or None.

    "Newest" means the row with the highest ``id``; ``None`` is returned
    when the table holds no rows.
    """
    cursor.execute('SELECT content FROM WikipediaPages ORDER BY id DESC LIMIT 1')
    latest_row = cursor.fetchone()
    if not latest_row:
        return None
    return latest_row[0]

# Function to delete all content from the database
def delete_content_from_db(cursor):
    """Delete every row from WikipediaPages and make the deletion permanent.

    Args:
        cursor: An sqlite3 cursor; its owning connection is committed so the
            deletion is still in effect after the connection is closed.
    """
    cursor.execute('DELETE FROM WikipediaPages')
    # The DELETE runs inside an implicit transaction. Closing the connection
    # without a commit would discard it and the rows would still be in the
    # file, contradicting the message printed below.
    cursor.connection.commit()
    print("Content deleted from database.")

# Main loop for question answering
def question_answer_loop(cursor):
    """Interactively answer questions about the most recently stored page.

    Prompts on stdin until the user types ``quit`` (case-insensitive). On
    quit, the stored content is deleted via ``delete_content_from_db`` and
    the loop ends. Every other non-empty input is answered by an extractive
    question-answering model using the stored page text as context.

    Args:
        cursor: Database cursor used to read (and finally delete) the content.
    """
    # Load the model once; constructing the pipeline is the expensive step.
    qa_model = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

    while True:
        question = input("Ask a question (or type 'quit' to exit): ").strip()

        if question.lower() == "quit":
            delete_content_from_db(cursor)
            break

        if not question:
            # The question-answering pipeline rejects an empty question with
            # an error; prompt again instead of crashing the session.
            print("Please enter a question.")
            continue

        content = get_content_from_db(cursor)
        if not content:
            print("No content found in the database.")
            continue

        result = qa_model(question=question, context=content)
        # Keep at most the first two lines of the extracted answer span.
        truncated_answer = '\n'.join(result['answer'].split('\n')[:2])
        print(f"Answer: {truncated_answer}")

# Input: Wikipedia URL (surrounding whitespace from the prompt is dropped so
# a stray space or newline does not end up in the request).
wikipedia_url = input("Enter a Wikipedia URL: ").strip()
scraped_data = scrape_wikipedia(wikipedia_url)

# Report a scraping failure, or store the page and start answering questions
if 'error' in scraped_data:
    print(scraped_data['error'])
else:
    # Set up a fresh database for this session
    conn, cursor = setup_database()
    try:
        # Store the scraped page and make it durable before the loop starts
        insert_data_into_db(cursor, scraped_data['title'], scraped_data['content'])
        conn.commit()

        # Start the question answering loop (returns when the user quits)
        question_answer_loop(cursor)

        # Persist the clean-up performed when the user quits; closing the
        # connection without a commit would discard it.
        conn.commit()
    finally:
        # Always release the connection, even if the loop is interrupted
        conn.close()

Enter a Wikipedia URL: https://en.wikipedia.org/wiki/Social_media


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Ask a question (or type 'quit' to exit): what is social media?
Answer: Viral Advertising
Ask a question (or type 'quit' to exit): quit
Content deleted from database.


In [1]:
!pip install fastapi uvicorn pyngrok nest_asyncio pydantic transformers beautifulsoup4 requests


Collecting fastapi
  Downloading fastapi-0.115.2-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.31.1-py3-none-any.whl.metadata (6.6 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl.metadata (7.4 kB)
Collecting starlette<0.41.0,>=0.37.2 (from fastapi)
  Downloading starlette-0.40.0-py3-none-any.whl.metadata (6.0 kB)
Collecting h11>=0.8 (from uvicorn)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading fastapi-0.115.2-py3-none-any.whl (94 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.7/94.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading uvicorn-0.31.1-py3-none-any.whl (63 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.0-py3-none-any.whl (22 kB)
Downloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m4.6

In [7]:
import os
import requests
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from bs4 import BeautifulSoup
import sqlite3
from transformers import pipeline
import uvicorn
from pyngrok import ngrok
import nest_asyncio
import logging
import threading

# Patch asyncio through nest_asyncio so nested event-loop calls are allowed;
# the notebook targets Google Colab, where this is needed.
nest_asyncio.apply()

# Configure logging at INFO level and create the logger used by the
# functions and endpoints defined in this cell.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# FastAPI application instance; the route decorator below registers on it and
# run_fastapi() hands it to uvicorn.
app = FastAPI()

# Function to scrape Wikipedia data
def scrape_wikipedia(url, timeout=10):
    """Download a Wikipedia article and flatten it to plain text.

    Headings (h1-h3) are rendered with an '=' underline, paragraphs are kept
    as-is, and every list item is emitted on its own line prefixed with " - ".
    Failures are logged and reported through the return value.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may wait indefinitely on a stalled
            connection.

    Returns:
        ``{"title": ..., "content": ...}`` on success, or ``{"error": ...}``
        when the request fails, the status code is not 200, or the expected
        page elements are missing. No exception is propagated to the caller.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            logger.error(f"Failed to retrieve page. Status code: {response.status_code}")
            return {"error": f"Failed to retrieve page. Status code: {response.status_code}"}

        soup = BeautifulSoup(response.content, 'html.parser')

        # Both lookups return None on pages that do not follow the Wikipedia
        # layout; report that explicitly instead of failing on attribute access.
        title_tag = soup.find('h1', {'id': 'firstHeading'})
        content_section = soup.find('div', {'id': 'bodyContent'})
        if title_tag is None or content_section is None:
            message = "Page does not look like a Wikipedia article (title or body content not found)."
            logger.error(message)
            return {"error": message}

        full_content = []
        for element in content_section.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'ol']):
            text = element.text.strip()
            if element.name.startswith('h'):
                # Underline with the length of the stripped heading so the
                # rule matches the text that is actually printed above it.
                full_content.append(f"\n{text}\n{'=' * len(text)}\n")
            elif element.name == 'p':
                full_content.append(text)
            elif element.name in ['ul', 'ol']:
                for li in element.find_all('li'):
                    full_content.append(f" - {li.text.strip()}")

        return {"title": title_tag.text, "content": "\n".join(full_content)}
    except Exception as e:
        # Network failures, timeouts and parser errors all end up here.
        logger.exception("An error occurred while scraping Wikipedia.")
        return {"error": f"An error occurred: {str(e)}"}

# Function to create an SQLite database and insert data
def setup_database(db_path='wikipedia_data.db'):
    """Create a fresh SQLite database containing an empty WikipediaPages table.

    Any existing file at ``db_path`` is deleted first, so previously stored
    pages are discarded.

    Args:
        db_path: Location of the database file. Defaults to
            ``'wikipedia_data.db'`` in the current working directory.

    Returns:
        A ``(connection, cursor)`` tuple. The caller is responsible for
        committing further changes and for closing the connection.

    Raises:
        OSError: If the existing file cannot be removed.
        sqlite3.Error: If the database cannot be opened or the table cannot
            be created (the connection is closed before re-raising).
    """
    if os.path.exists(db_path):
        os.remove(db_path)
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute('''CREATE TABLE IF NOT EXISTS WikipediaPages (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        title TEXT NOT NULL,
                        content TEXT NOT NULL
                    )''')
        conn.commit()
    except Exception:
        # Do not leak the open connection when the schema cannot be created.
        conn.close()
        raise
    return conn, cursor

# Function to insert scraped data into the SQLite database
def insert_data_into_db(cursor, title, content):
    """Write a single (title, content) row into WikipediaPages.

    Values are passed as bound parameters; the caller commits.
    """
    insert_sql = 'INSERT INTO WikipediaPages (title, content) VALUES (?, ?)'
    cursor.execute(insert_sql, (title, content))

# Pydantic models for request bodies
class LoadDataRequest(BaseModel):
    # Request body accepted by the POST /load endpoint.
    # url: address of the page to scrape; load_data() passes it unchanged to
    # scrape_wikipedia().
    url: str

# Endpoint to load data from Wikipedia URL
@app.post("/load")
async def load_data(request: LoadDataRequest):
    """Scrape the requested page and store it as the only row in the database.

    The database file is recreated on every call, so a successful load
    replaces whatever was stored before.

    Raises:
        HTTPException: With status 400 when scraping reports an error.
    """
    # NOTE(review): request.url comes from the API caller and is fetched
    # as-is by scrape_wikipedia(); since the server is exposed through a
    # public ngrok tunnel, consider restricting it to Wikipedia hosts.
    scraped_data = scrape_wikipedia(request.url)
    if 'error' in scraped_data:
        raise HTTPException(status_code=400, detail=scraped_data['error'])

    conn, cursor = setup_database()
    try:
        insert_data_into_db(cursor, scraped_data['title'], scraped_data['content'])
        conn.commit()
    finally:
        # Release the connection even when the insert or commit fails.
        conn.close()

    logger.info(f"Data loaded successfully for page: {scraped_data['title']}")
    return {"message": f"Data loaded successfully for page: {scraped_data['title']}"}

# Function to run the FastAPI server
def run_fastapi(host="0.0.0.0", port=8000):
    """Serve the FastAPI ``app`` with uvicorn.

    Args:
        host: Interface to bind to. Defaults to all interfaces ("0.0.0.0").
        port: TCP port to listen on. Defaults to 8000, the port the ngrok
            tunnel in this file connects to.
    """
    uvicorn.run(app, host=host, port=port)

# Function to start ngrok tunnel
def start_ngrok(port=8000):
    """Open an ngrok tunnel to a local port and return it.

    Args:
        port: Local port to expose. Defaults to 8000, the port the FastAPI
            server in this file listens on.

    Returns:
        The value returned by ``ngrok.connect`` for the new tunnel.
    """
    logger.info("Starting ngrok...")
    public_url = ngrok.connect(port)
    logger.info(f"Ngrok tunnel \"{public_url}\" is live!")
    return public_url

# Main function to run FastAPI and ngrok in the background
def run_fastapi_and_ngrok():
    """Launch the API server on a background thread, then open the tunnel.

    Returns:
        Whatever ``start_ngrok`` returns for the newly opened tunnel.
    """
    logger.info("Starting FastAPI server in a new thread...")

    # The server call blocks, so it gets its own thread and this function can
    # go on to open the tunnel.
    server_thread = threading.Thread(target=run_fastapi)
    server_thread.start()

    return start_ngrok()

# Start the FastAPI server thread and the ngrok tunnel when this cell runs;
# public_url holds the tunnel object returned by ngrok.connect().
public_url = run_fastapi_and_ngrok()


ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-10' coro=<Server.serve() done, defined at /usr/local/lib/python3.10/dist-packages/uvicorn/server.py:67> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/uvicorn/main.py", line 579, in run
    server.run()
  File "/usr/local/lib/python3.10/dist-packages/uvicorn/server.py", line 65, in run
    return asyncio.run(self.serve(sockets=sockets))
  File "/usr/local/lib/python3.10/dist-packages/nest_asyncio.py", line 30, in run
    return loop.run_until_complete(task)
  File "/usr/local/lib/python3.10/dist-packages/nest_asyncio.py", line 92, in run_until_complete
    self._run_once()
  File "/usr/local/lib/python3.10/dist-packages/nest_asyncio.py", line 133, in _run_once
    handle._run()
  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "/usr/lib/python3.10/asyncio/tasks.py", 