<a href="https://colab.research.google.com/github/kalavakuriGanesh/Aptitude_Practice_Questions/blob/master/ATS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install gradio transformers torch pymupdf requests beautifulsoup4 langdetect deep-translator gtts regex



In [1]:
!pip uninstall torch torchvision -y
!pip install torch torchvision --no-cache-dir


Found existing installation: torch 2.6.0
Uninstalling torch-2.6.0:
  Successfully uninstalled torch-2.6.0
Found existing installation: torchvision 0.21.0
Uninstalling torchvision-0.21.0:
  Successfully uninstalled torchvision-0.21.0
Collecting torch
  Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Downloading torchvision-0.21.0-cp311-cp311-manylinux1_x86_64.whl.metadata (6.1 kB)
Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl (766.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m766.7/766.7 MB[0m [31m145.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torchvision-0.21.0-cp311-cp311-manylinux1_x86_64.whl (7.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m160.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch, torchvision
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This beh

In [2]:
import gradio as gr
from transformers import pipeline
import torch
import fitz  # PyMuPDF
import requests
from bs4 import BeautifulSoup
from langdetect import detect
from deep_translator import GoogleTranslator
from gtts import gTTS
import tempfile
import re

# Build the BART summarization pipeline once, when the cell runs. It is placed
# on GPU 0 when CUDA is available and on the CPU (-1) otherwise.
summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=0 if torch.cuda.is_available() else -1,
)

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path``, concatenated.

    Any exception raised while opening or reading the document is re-raised
    as a ``gr.Error`` prefixed with "PDF processing error".
    """
    try:
        with fitz.open(pdf_path) as document:
            page_texts = [page.get_text() for page in document]
        return "".join(page_texts)
    except Exception as exc:
        raise gr.Error(f"PDF processing error: {str(exc)}")

def extract_text_from_website(url):
    """Download a web page and return the text of its headings and paragraphs.

    The page is fetched with a 10-second timeout. ``script``, ``style``,
    ``nav``, ``footer``, ``header`` and ``form`` elements are removed, and the
    text of every remaining ``h1``-``h6`` and ``p`` element is joined with
    single spaces.

    Args:
        url: Address of the page to download.

    Returns:
        The extracted text with every run of whitespace collapsed to a single
        space and no leading or trailing whitespace.

    Raises:
        gr.Error: If the request fails, the server answers with an error
            status, or no heading/paragraph text is found on the page.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except Exception as e:
        raise gr.Error(f"Website error: {str(e)}")

    try:
        soup = BeautifulSoup(response.text, 'lxml')
    except ValueError:
        # bs4 reports an unavailable parser with FeatureNotFound, a ValueError
        # subclass. lxml is an optional extra, so fall back to the parser
        # that ships with Python instead of failing.
        soup = BeautifulSoup(response.text, 'html.parser')

    # Remove unwanted elements
    for element in soup(['script', 'style', 'nav', 'footer', 'header', 'form']):
        element.decompose()

    # Get text content from main content areas
    main_content = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p'])
    # Keep each element's original whitespace: get_text(strip=True) strips
    # every text fragment and joins them with no separator, which glues the
    # words around inline tags ("Hello <b>world</b>" -> "Helloworld"). The
    # whitespace clean-up below normalises the result.
    text = " ".join(element.get_text() for element in main_content)
    text = re.sub(r'\s+', ' ', text).strip()  # Clean whitespace
    if not text:
        raise gr.Error("Website error: no readable text found on the page")
    return text

def summarize_text(input_text, max_length=200):
    """Summarize ``input_text`` with the module-level ``summarizer`` pipeline.

    Args:
        input_text: Text to summarize.
        max_length: Upper bound passed to the pipeline as ``max_length``.

    Returns:
        The ``summary_text`` field of the pipeline's first result.

    Raises:
        gr.Error: If ``input_text`` is empty or whitespace-only, or if the
            pipeline raises.
    """
    if not input_text or not input_text.strip():
        raise gr.Error("Summarization error: no text to summarize")

    try:
        summary = summarizer(
            input_text,
            max_length=max_length,
            # Keep the lower bound from exceeding a caller-supplied max_length.
            min_length=min(50, max_length),
            do_sample=False,
            # PDFs and web pages are routinely longer than the model's input
            # window; truncate them instead of failing inside the model.
            truncation=True,
        )
        return summary[0]["summary_text"]
    except Exception as e:
        raise gr.Error(f"Summarization error: {str(e)}")

def translate_summary(summary_text, target_language):
    """Translate ``summary_text`` into ``target_language`` when needed.

    Args:
        summary_text: Text to translate.
        target_language: Language code chosen in the UI (e.g. "en", "zh").

    Returns:
        An empty string for empty/whitespace-only input, the input unchanged
        when language detection reports it is already in ``target_language``,
        and otherwise the translation returned by ``GoogleTranslator``.

    Raises:
        gr.Error: If the translation call fails.
    """
    if not summary_text or not summary_text.strip():
        return ""

    # The UI offers the bare code "zh", but deep-translator's Google backend
    # only accepts region-qualified Chinese codes.
    translator_codes = {'zh': 'zh-CN'}
    translator_target = translator_codes.get(target_language, target_language)

    try:
        detected_language = detect(summary_text)
    except Exception:
        # Detection can fail on text without usable features (e.g. digits
        # only); that should not prevent a translation attempt.
        detected_language = None

    if detected_language == target_language:
        return summary_text

    try:
        return GoogleTranslator(source='auto', target=translator_target).translate(summary_text)
    except Exception as e:
        raise gr.Error(f"Translation error: {str(e)}")

def text_to_speech(summary_text, target_lang):
    """Synthesize ``summary_text`` to an MP3 file with gTTS.

    Args:
        summary_text: Text to speak.
        target_lang: UI language code; codes missing from the mapping below
            fall back to English ('en').

    Returns:
        Path of a newly created ``.mp3`` temporary file. The file is created
        with ``delete=False`` and is not removed by this function.

    Raises:
        gr.Error: If gTTS or the temporary-file handling raises.
    """
    lang_mapping = {
        'zh': 'zh',  # Chinese
        'hi': 'hi',  # Hindi
        'ta': 'ta',  # Tamil
        'te': 'te',  # Telugu
        'en': 'en',
        'es': 'es',
        'fr': 'fr',
        'de': 'de'
    }
    lang = lang_mapping.get(target_lang, 'en')

    try:
        tts = gTTS(text=summary_text, lang=lang)
        # Only the reserved name is needed. Close the handle before gTTS
        # writes to that path so no descriptor leaks per request and the file
        # is not held open by two writers at once.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
            audio_path = temp_file.name
        tts.save(audio_path)
        return audio_path
    except Exception as e:
        raise gr.Error(f"Audio conversion error: {str(e)}")

def process_input(input_text=None, pdf_file=None, website_url=None, target_language="en"):
    """Run the extract -> summarize -> translate -> speak pipeline.

    Exactly one of ``input_text``, ``pdf_file`` and ``website_url`` must be
    provided; textbox values that are empty or whitespace-only count as not
    provided.

    Args:
        input_text: Text pasted directly by the user.
        pdf_file: Uploaded PDF, either a path or an object exposing the path
            as ``.name``.
        website_url: Address of a page to fetch and summarize.
        target_language: Language code for the translation and the audio.

    Returns:
        A ``(summary, translated_summary, audio_path)`` tuple.

    Raises:
        gr.Error: On invalid input or when any pipeline step fails.
    """
    try:
        # Validate input
        input_text = (input_text or "").strip()
        website_url = (website_url or "").strip()

        inputs_provided = sum([bool(input_text), bool(pdf_file), bool(website_url)])
        if inputs_provided == 0:
            raise gr.Error("Please provide text, PDF, or website URL")
        if inputs_provided > 1:
            raise gr.Error("Please provide only one input method at a time")

        # Extract text
        if input_text:
            text = input_text
        elif pdf_file:
            # NOTE(review): the upload may arrive as a path string or as a
            # file-like wrapper depending on the Gradio version/`type`
            # setting - confirm. Accept both by preferring `.name`.
            text = extract_text_from_pdf(getattr(pdf_file, "name", pdf_file))
        else:
            text = extract_text_from_website(website_url)

        # Process text
        summary = summarize_text(text)
        translated = translate_summary(summary, target_language)
        audio_path = text_to_speech(translated, target_language)

        return summary, translated, audio_path

    except gr.Error:
        # Validation and helper errors are already user-facing; pass them
        # through unchanged instead of rebuilding them from str(e).
        raise
    except Exception as e:
        raise gr.Error(str(e))

# Gradio Interface
# Two-column Blocks layout: inputs and the trigger button on the left, the
# three results on the right. Components are placed in the order they are
# created inside each context manager.
with gr.Blocks(title="Text Summarization Suite") as interface:
    gr.Markdown("# Text Summarization Suite")
    gr.Markdown("Summarize text from multiple sources, translate, and convert to speech.")

    with gr.Row():
        # Left column: the three alternative input sources (process_input
        # accepts exactly one per run), the target language and the button.
        with gr.Column():
            input_text = gr.Textbox(label="Direct Text Input", lines=5, placeholder="Paste your text here...")
            pdf_file = gr.File(label="PDF Upload", file_types=[".pdf"])
            website_url = gr.Textbox(label="Website URL", placeholder="Enter website URL...")
            # The choices are the same codes that text_to_speech maps to gTTS languages.
            target_lang = gr.Dropdown(
                label="Target Language",
                choices=["en", "es", "fr", "de", "zh", "hi", "ta", "te"],
                value="en"
            )
            submit_btn = gr.Button("Process", variant="primary")

        # Right column: one output component per element of the tuple
        # returned by process_input.
        with gr.Column():
            summary_output = gr.Textbox(label="Original Summary")
            translated_output = gr.Textbox(label="Translated Summary")
            audio_output = gr.Audio(label="Audio Preview", autoplay=False)

    # `inputs` follows process_input's parameter order and `outputs` follows
    # the order of its (summary, translated, audio_path) return value.
    submit_btn.click(
        fn=process_input,
        inputs=[input_text, pdf_file, website_url, target_lang],
        outputs=[summary_output, translated_output, audio_output]
    )

# Start the app. Per the recorded cell output, Gradio enables a public share
# link automatically when run in Colab, and debug=True is needed there to
# show errors in the notebook.
interface.launch(debug=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cuda:0


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c6283569393b54cf09.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


