In [1]:
!pip install docx2txt
!pip install -q gradio transformers python-docx pytesseract
!sudo apt install -y tesseract-ocr poppler-utils



Collecting docx2txt
  Downloading docx2txt-0.9-py3-none-any.whl.metadata (529 bytes)
Downloading docx2txt-0.9-py3-none-any.whl (4.0 kB)
Installing collected packages: docx2txt
Successfully installed docx2txt-0.9
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.8 [186 kB]
Fetched 186 kB in 1s (194 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot b

In [2]:
!pip install pymupdf


Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.3


In [3]:
!pip install transformers pytesseract Pillow fpdf python-docx gradio
!apt install tesseract-ocr -y


Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=3a181abfc62c110d2979bef414a6a875bcc2be5052d8817a77b32c5f0b34080c
  Stored in directory: /root/.cache/pip/wheels/65/4f/66/bbda9866da446a72e206d6484cd97381cbc7859a7068541c36
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [4]:
import gradio as gr
from transformers import pipeline
import pytesseract
from PIL import Image
import numpy as np
import os

from fpdf import FPDF
import docx

# Disable TensorFlow
os.environ["TRANSFORMERS_NO_TF"] = "1"

# Load summarizer
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

# Text extraction from image
def extract_text_from_image(image_input):
    try:
        image = Image.fromarray(image_input.astype('uint8')) if isinstance(image_input, np.ndarray) else Image.open(image_input)
        return pytesseract.image_to_string(image)
    except Exception as e:
        return f"Error reading image: {e}"

# Summarization logic
def summarize_content(text_input, webcam_input, word_limit):
    content = ""

    if webcam_input is not None:
        content = extract_text_from_image(webcam_input)
    elif text_input:
        content = text_input
    else:
        return "⚠️ Please provide text or image.", None

    if content.strip() == "":
        return "⚠️ No content extracted.", None

    max_len = int(word_limit * 1.5)
    min_len = int(word_limit * 0.5)

    try:
        summary = summarizer(content, max_length=max_len, min_length=min_len, do_sample=False)[0]['summary_text']
        return summary, None
    except Exception as e:
        return f"❌ Summarization error: {e}", None

# File download function
def download_summary(summary_text, format_choice):
    if not summary_text.strip():
        return None

    filename = "summary"
    filepath = ""

    if format_choice == ".txt":
        filepath = f"{filename}.txt"
        with open(filepath, "w", encoding="utf-8") as f:
            f.write(summary_text)

    elif format_choice == ".pdf":
        filepath = f"{filename}.pdf"
        pdf = FPDF()
        pdf.add_page()
        pdf.set_auto_page_break(auto=True, margin=15)
        pdf.set_font("Arial", size=12)
        for line in summary_text.split('\n'):
            pdf.multi_cell(0, 10, line)
        pdf.output(filepath)

    elif format_choice == ".docx":
        filepath = f"{filename}.docx"
        doc = docx.Document()
        doc.add_paragraph(summary_text)
        doc.save(filepath)

    return filepath

# Reset function
def reset_fields():
    return "", None, 100, ".txt", "", None

# Gradio interface
with gr.Blocks() as demo:
    gr.HTML("""
        <h2 style='text-align: center;'>📝 Documentation to Summary Converter</h2>
        <style>
            #summarize-btn button {
                background-color: green !important;
                color: white !important;
            }
            #cancel-btn button {
                background-color: red !important;
                color: white !important;
            }
            #download-container {
                display: flex;
                justify-content: flex-end;
                margin-top: 10px;
            }
        </style>
    """)

    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### 🔽 Input Section")
            text_input = gr.Textbox(label="✍️ Paste Text", lines=5, placeholder="Enter your text here...")
            webcam_input = gr.Image(label="📷 Capture or Upload Image", sources=["webcam", "upload"], type="numpy")
            word_limit = gr.Slider(label="🧮 Number of Words in Summary", minimum=30, maximum=300, value=100, step=10)

            with gr.Row():
                submit = gr.Button("🚀 Summarize", elem_id="summarize-btn")
                cancel = gr.Button("❌ Reset", elem_id="cancel-btn")

        with gr.Column(scale=1):
            gr.Markdown("### 📘 Summary Output")
            output = gr.Textbox(label="Summary", lines=12)

            gr.HTML('<div id="download-container">')
            download_btn = gr.Button("📥 Download Summary", elem_id="download-btn")
            gr.HTML('</div>')

            file_format = gr.Dropdown(label="🗂️ Choose File Format to Download", choices=[".txt", ".pdf", ".docx"], value=".txt")
            download_file = gr.File(label="Download Summary", interactive=True)

    # Button functionality
    submit.click(fn=summarize_content, inputs=[text_input, webcam_input, word_limit], outputs=[output, download_file])
    cancel.click(fn=reset_fields, inputs=[], outputs=[text_input, webcam_input, word_limit, file_format, output, download_file])
    download_btn.click(fn=download_summary, inputs=[output, file_format], outputs=[download_file])

# Launch the app
demo.launch()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f17d678a05282dbf23.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


