In [2]:
!pip install gradio textblob spacy langdetect PyPDF2 pytesseract pillow

Collecting gradio
  Downloading gradio-5.44.1-py3-none-any.whl.metadata (16 kB)
Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting brotli>=1.1.0 (from gradio)
  Downloading Brotli-1.1.0-cp312-cp312-macosx_10_13_universal2.whl.metadata (5.5 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.116.1-py3-none-any.whl.metadata (28 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.6.1-py3-none-any.w

In [10]:
import os
import gradio as gr
from textblob import TextBlob
import spacy
from langdetect import detect
import PyPDF2
from PIL import Image
import pytesseract

In [8]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [12]:
# Load the small English spaCy pipeline once, at notebook level, so every call
# to process_file reuses it for POS tags, named entities and sentence splitting.
nlp = spacy.load("en_core_web_sm")

In [14]:
# Largest upload process_file will analyse, in MiB (bytes / 1024**2);
# anything bigger is rejected before text extraction.
MAX_SIZE_MB = 250  # size limit

In [16]:
def extract_text(file):
    """Extract text from a .txt, .pdf, or image (.jpg/.jpeg/.png) file.

    Parameters
    ----------
    file : str, os.PathLike, or object with a ``name`` attribute
        Either a filesystem path, or an upload object that exposes the path of
        the file on disk as ``file.name``.

    Returns
    -------
    str
        The extracted text. This function never raises for unreadable PDFs or
        images, nor for unsupported extensions: in those cases it returns a
        human-readable message starting with ``"Error reading PDF:"``,
        ``"Error processing image:"`` or ``"Unsupported file type."``.
    """
    # Plain paths are used as-is; anything else is expected to carry the path
    # in `.name`. (A pathlib.Path also has `.name`, but that is only the
    # basename, hence the explicit isinstance check first.)
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    ext = os.path.splitext(path)[-1].lower()

    if ext == ".txt":
        # errors="ignore" drops undecodable bytes instead of failing the upload.
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()

    if ext == ".pdf":
        try:
            reader = PyPDF2.PdfReader(path)
            # extract_text() may yield nothing for a page (e.g. scanned pages);
            # treat that as an empty string.
            return "".join(page.extract_text() or "" for page in reader.pages)
        except Exception as e:
            return f"Error reading PDF: {e}"

    if ext in (".jpg", ".jpeg", ".png"):
        try:
            # Context manager closes the underlying image file once OCR is done.
            with Image.open(path) as img:
                return pytesseract.image_to_string(img)
        except Exception as e:
            return f"Error processing image: {e}"

    return "Unsupported file type. Please upload .txt, .pdf, .jpg, or .png"


In [20]:
# Prefixes of the failure messages extract_text returns in place of real text.
_EXTRACTION_ERROR_PREFIXES = (
    "Error reading PDF:",
    "Error processing image:",
    "Unsupported file type.",
)


def process_file(file):
    """Analyse an uploaded file and return a Markdown report.

    Parameters
    ----------
    file : object with a ``name`` attribute (or a path), or None
        The upload handed over by the UI; ``None`` when nothing was uploaded.

    Returns
    -------
    str
        Either a short status/error message, or a Markdown report containing
        the file size, detected language, sentiment polarity, up to 15
        noun/proper-noun keywords (in order of first appearance), the named
        entities, and the first three sentences as a summary.
    """
    if file is None:
        return "No file uploaded."

    # Upload objects carry the on-disk path in `.name`; plain paths are used as-is.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        path = file.name

    # Reject oversized files before doing any expensive work.
    size_mb = os.path.getsize(path) / (1024 * 1024)
    if size_mb > MAX_SIZE_MB:
        return f"❌ File size {size_mb:.2f} MB > {MAX_SIZE_MB} MB. Not supportable."

    text = extract_text(file)
    if not isinstance(text, str) or not text.strip():
        return "No readable text found."

    # extract_text reports failures as strings; show them to the user instead
    # of running sentiment/NER on the error message itself.
    if text.startswith(_EXTRACTION_ERROR_PREFIXES):
        return text

    # spaCy raises ValueError for inputs longer than nlp.max_length, so only
    # the leading part of very large documents is analysed.
    limit = nlp.max_length
    truncated = len(text) > limit
    if truncated:
        text = text[:limit]

    # Language detection can fail on text without usable features
    # (e.g. digits or punctuation only); fall back to "Unknown".
    try:
        lang = detect(text)
    except Exception:
        lang = "Unknown"

    sentiment = TextBlob(text).sentiment

    doc = nlp(text)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # 15-keyword sample is the same on every run (a set's order is not).
    keywords = list(
        dict.fromkeys(
            token.text for token in doc if token.pos_ in ("NOUN", "PROPN")
        )
    )[:15]

    entities = [(ent.text, ent.label_) for ent in doc.ents]

    # Naive summary: the first three sentences.
    summary = " ".join([sent.text for sent in doc.sents][:3])

    report = [
        f"File size: {size_mb:.2f} MB",
        f"**Language:** {lang}",
        f"**Sentiment (polarity):** {sentiment.polarity:.2f}",
        f"**Keywords (sample):** {keywords}",
        f"**Entities:** {entities}",
        f"**Summary:** {summary}",
    ]
    if truncated:
        report.append(f"**Note:** only the first {limit:,} characters were analysed.")

    # Two trailing spaces force a Markdown line break. Lines carry no leading
    # indentation, so they cannot be read as an indented code block.
    return "  \n".join(report)


In [22]:
# Gradio app: one file-upload input wired to process_file, whose Markdown
# report is rendered as the output.
demo = gr.Interface(
    fn=process_file,
    inputs=gr.File(
        file_types=[".txt", ".pdf", ".jpg", ".jpeg", ".png"],
        label="Upload a File",
    ),
    outputs="markdown",
    # Take the advertised limit from MAX_SIZE_MB so the title cannot drift
    # from the limit process_file actually enforces.
    title=f"Multi-format NLP Application (with {MAX_SIZE_MB}MB limit)",
)

demo.launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


