In [1]:
# `typing` is part of the Python standard library, so it is not installed from PyPI here.
pip install beautifulsoup4 youtube-transcript-api requests



In [2]:
import re
import logging
from typing import List, Optional
import requests
from bs4 import BeautifulSoup
from youtube_transcript_api import YouTubeTranscriptApi

In [8]:


class ContentError(Exception):
    """Raised when remote content cannot be fetched or processed."""

def fetch_wikipedia_content(wiki_url: str) -> str:
    """
    Fetch a Wikipedia article and return its paragraph text as one string.

    Args:
        wiki_url: Wikipedia article URL (``http(s)://<lang>.wikipedia.org/wiki/...``)

    Returns:
        str: Text of the article's ``<p>`` elements joined by single spaces,
        with bracketed numeric reference markers removed and whitespace
        normalized

    Raises:
        ContentError: If the URL is not a Wikipedia article URL, the HTTP
            request fails, or no usable text can be extracted
    """
    try:
        # Validate Wikipedia URL
        if not re.match(r'https?://[a-z]+\.wikipedia\.org/wiki/', wiki_url):
            raise ContentError("Invalid Wikipedia URL format")

        # Fetch the page; without a timeout a stalled connection would block forever
        response = requests.get(wiki_url, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove unwanted elements by tag name
        for unwanted in soup.find_all(['table', 'script', 'style', 'sup']):
            unwanted.decompose()

        # find_all() treats list entries as tag names, so the "[edit]" spans
        # have to be matched by CSS class in a separate query
        for edit_link in soup.find_all('span', class_='mw-editsection'):
            edit_link.decompose()

        # Get the main content
        content_div = soup.find(id='mw-content-text')
        if not content_div:
            raise ContentError("Could not find main content")

        # Extract non-empty paragraphs
        paragraph_texts = (p.get_text().strip() for p in content_div.find_all('p'))
        content = ' '.join(text for text in paragraph_texts if text)

        # Clean up leftover reference numbers and extra whitespace
        content = re.sub(r'\[\d+\]', '', content)
        content = re.sub(r'\s+', ' ', content)

        if not content:
            raise ContentError("No content found in the article")

        return content

    except ContentError:
        # Already carries a precise message; do not re-wrap it below
        raise
    except requests.exceptions.RequestException as e:
        raise ContentError(f"Failed to fetch Wikipedia page: {str(e)}") from e
    except Exception as e:
        raise ContentError(f"Error processing Wikipedia content: {str(e)}") from e

def preprocess_content(content: str, chunk_size: int = 200) -> List[str]:
    """
    Split text into sentence-aligned chunks of at most ``chunk_size`` characters.

    Sentences are detected by whitespace following ``.``, ``!`` or ``?`` and
    are packed greedily into chunks, joined by single spaces. The space
    between sentences counts toward the chunk length. A single sentence
    longer than ``chunk_size`` is never split; it is emitted as its own
    (oversized) chunk.

    Args:
        content: Input text content
        chunk_size: Maximum size of each chunk, in characters

    Returns:
        List of text chunks (empty if ``content`` has no non-blank text)
    """
    # Split on sentence boundaries
    sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', content) if s.strip()]
    chunks: List[str] = []
    current_chunk: List[str] = []
    current_length = 0

    for sentence in sentences:
        # Account for the joining space when the chunk already has content,
        # otherwise the joined chunk could exceed chunk_size
        separator = 1 if current_chunk else 0
        projected_length = current_length + separator + len(sentence)

        if current_chunk and projected_length > chunk_size:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_length = len(sentence)
        else:
            current_chunk.append(sentence)
            current_length = projected_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

def query_llama_model(
    api_key: str,
    prompt: str,
    *,
    model: str = "llama-3.1-8b-instant",
    temperature: float = 0.7,
    timeout: float = 30,
) -> Optional[str]:
    """
    Send a prompt to Groq's OpenAI-compatible chat completions endpoint.

    Args:
        api_key: Groq API key, sent as a Bearer token
        prompt: Text sent as the user message
        model: Model identifier placed in the request payload
        temperature: Sampling temperature placed in the request payload
        timeout: Request timeout in seconds

    Returns:
        The stripped text of the first choice, or None if the request fails,
        the server answers with a non-200 status, or the response body does
        not have the expected shape (failures are logged)

    Raises:
        ValueError: If ``api_key`` is empty
    """
    if not api_key:
        raise ValueError("API key is required")

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": model,
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant that answers questions about content based on provided context."
            },
            {
                "role": "user",
                "content": prompt
            }
        ],
        "temperature": temperature
    }

    try:
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=timeout
        )

        if response.status_code != 200:
            logging.error("API Error: %s - %s", response.status_code, response.text)
            return None

        result = response.json()
        if not result.get("choices"):
            raise ValueError("No choices in response")

        return result["choices"][0]["message"]["content"].strip()

    except requests.exceptions.RequestException as e:
        logging.error("API request failed: %s", e)
        return None
    except (KeyError, IndexError, TypeError, AttributeError, ValueError) as e:
        # Covers invalid JSON, a missing key, a non-dict payload, and a null
        # "content" field, so a malformed body yields None instead of a crash
        logging.error("Error parsing API response: %s", e)
        return None

def content_qa_system(url: str, question: str, api_key: str) -> str:
    """
    Answer a question about the content behind a Wikipedia or YouTube URL.

    The content is fetched according to the URL type, split into chunks,
    re-joined into a single context string and sent to the model together
    with the question. Failures are logged and reported in the returned
    string rather than raised.

    Args:
        url: Wikipedia or YouTube URL
        question: User question
        api_key: Groq API key

    Returns:
        Answer string or error message
    """
    try:
        # All three inputs are mandatory
        if not (url and question and api_key):
            raise ValueError("Missing required parameters")

        # Pick the fetcher from the URL
        if 'wikipedia.org' in url:
            source_label = "Wikipedia article"
            text = fetch_wikipedia_content(url)
        elif 'youtu' in url:  # matches both youtube.com and youtu.be
            source_label = "video transcript"
            text = fetch_youtube_transcript(url)
        else:
            raise ValueError("Unsupported URL type. Please provide a Wikipedia or YouTube URL.")

        if not text:
            return f"Could not extract content from {source_label}"

        # Chunk the text, then rebuild one context string from the chunks
        pieces = preprocess_content(text)
        if not pieces:
            return "Failed to process content"
        joined_context = " ".join(pieces)

        # Assemble the prompt
        prompt = "".join([
            f"Based on the following {source_label}, please answer the question.\n\n",
            f"Content: {joined_context}\n\n",
            f"Question: {question}\n\n",
            "Answer:",
        ])

        # Ask the model; a falsy reply counts as a failure
        reply = query_llama_model(api_key, prompt)
        return reply or "Failed to generate answer"

    except Exception as e:
        logging.error(f"Error in QA system: {str(e)}")
        return f"An error occurred: {str(e)}"

# Example usage: ask one question about a Simple English Wikipedia article
if __name__ == "__main__":
    import os

    url = "https://simple.wikipedia.org/wiki/Photosynthesis"
    question = "Explain process of Photosynthesis?"
    # Read the key from the environment so it is not stored in the notebook;
    # an empty key makes content_qa_system return a "Missing required parameters" message
    groq_api_key = os.environ.get("GROQ_API_KEY", "")

    logging.basicConfig(level=logging.INFO)
    answer = content_qa_system(url, question, groq_api_key)
    print("Answer:", answer)

Answer: Photosynthesis is a process in which green plants make their own food from sunlight through the help of leaves to produce sugar. It is the process by which these organisms convert light energy into chemical energy, which is used to produce food. The process of photosynthesis occurs in the chloroplasts in leaves (or other green tissues) and involves two main sets of reactions: light-dependent reactions and light-independent reactions.

Here's a step-by-step explanation of the process:

1. **Light-Dependent Reactions**: Light energy from the sun hits the chloroplasts in the plant, causing an enzyme to split water molecules (photolysis). This results in the production of oxygen, hydrogen, and electrons.

2. **Production of ATP and NADPH**: The sunlight energized electrons convert NADP into NADPH, which is then used in the light-independent reactions. Oxygen gas diffuses out of the plant as a waste product of photosynthesis, and ATP is synthesized from ADP and inorganic phosphate.
