# NLP Workbench (All-in-one) 
This consolidated notebook meets Documentation & Deliverables requirements and adds bonus features.

## Submission Checklist
- Functional .ipynb (this file)
- Clear quick-start instructions
- Rigorous evaluation (CV, macro-F1, confusion matrix, CI)
- Visualizations (metrics & topics)
- Report export (PDF, or .docx when ReportLab is not installed; ≤10 pages)
- Extra features: Explain Prediction, Batch Classify, Export CSV, model caching

In [1]:
%gui tk

In [None]:
# Packages and libraries needed
from __future__ import annotations

import os
import threading
import tkinter as tk
from tkinter import filedialog, messagebox, simpledialog, ttk
from typing import Iterable, List, Tuple

import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import spacy

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    f1_score,
    confusion_matrix,
    precision_recall_fscore_support,
)
from sklearn.model_selection import StratifiedKFold, cross_val_score

from datasets import load_dataset
from PyPDF2 import PdfReader
import docx as docx_mod
import joblib

import matplotlib
matplotlib.use("TkAgg")
import matplotlib.pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg

# Conditional ReportLab import
try:
    from reportlab.lib.pagesizes import A4
    from reportlab.lib.styles import getSampleStyleSheet
    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
    from reportlab.lib import colors
    REPORTLAB_AVAILABLE = True
except Exception:
    REPORTLAB_AVAILABLE = False
# ------------------------
# Resources
# -------------------------
def ensure_nltk_resources() -> None:
    """Download every NLTK data package this module needs that is not installed.

    Each entry pairs the lookup path understood by ``nltk.data.find`` with the
    package name understood by ``nltk.download``. Besides the classic
    ``punkt`` / ``averaged_perceptron_tagger`` packages, the ``punkt_tab`` and
    ``averaged_perceptron_tagger_eng`` packages are requested as well, because
    newer NLTK releases load those for ``word_tokenize`` and ``pos_tag``.

    Downloads run quietly; a failed download is not raised here and will
    surface later as a ``LookupError`` when the resource is first used.
    """
    required = (
        ("tokenizers/punkt", "punkt"),
        ("tokenizers/punkt_tab", "punkt_tab"),
        ("corpora/stopwords", "stopwords"),
        ("corpora/wordnet", "wordnet"),
        ("taggers/averaged_perceptron_tagger", "averaged_perceptron_tagger"),
        ("taggers/averaged_perceptron_tagger_eng", "averaged_perceptron_tagger_eng"),
    )
    for resource_path, package in required:
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package, quiet=True)
# Fetch any missing NLTK data at import time, before the corpora are read below.
ensure_nltk_resources()
# English stop-word set; preprocess_text() drops tokens found in it.
STOPWORDS = set(stopwords.words("english"))
# Shared WordNet lemmatizer instance used by preprocess_text().
LEMMATIZER = WordNetLemmatizer()
# -------------------------
# Classifier wrapper
# -------------------------
# Dataset identifier passed to load_dataset() by ArxivTextClassifier.train().
ARXIV_DATASET_ID = "ccdv/arxiv-classification"
# File path (relative to the working directory) of the joblib model cache.
MODEL_CACHE = "arxiv_textclf.joblib"
class ArxivTextClassifier:
    """TF-IDF + linear SGD text classifier for the arXiv classification dataset.

    State (all ``None``/empty until :meth:`train` succeeds or a caller restores
    a cached model):

    * ``pipeline`` - fitted ``Pipeline`` with steps ``"tfidf"`` and ``"clf"``.
    * ``label_names`` - class names, indexed by integer label.
    * ``report_text`` - human-readable evaluation report.
    * ``test_f1_per_class`` - per-class F1 on the test split.
    * ``test_cm`` - confusion matrix on the test split.
    """

    def __init__(self) -> None:
        self.pipeline: Pipeline | None = None
        self.label_names: List[str] | None = None
        self.report_text: str = ""
        self.test_f1_per_class: np.ndarray | None = None
        self.test_cm: np.ndarray | None = None

    def _build_pipeline(self, max_features: int = 100_000) -> Pipeline:
        """Return a fresh, unfitted TF-IDF -> SGDClassifier pipeline.

        Args:
            max_features: Vocabulary size cap for the TF-IDF vectorizer.
        """
        tfidf = TfidfVectorizer(
            max_features=max_features,
            stop_words="english",
            strip_accents="ascii",
            max_df=0.7,
            ngram_range=(1, 2),
            min_df=3,
            sublinear_tf=True,
        )
        clf = SGDClassifier(
            loss="hinge",
            alpha=1e-5,
            max_iter=2_000,
            tol=1e-4,
            class_weight="balanced",
            random_state=42,
        )
        return Pipeline([("tfidf", tfidf), ("clf", clf)])

    def _bootstrap_acc_ci(self, y_true: np.ndarray, y_pred: np.ndarray, n_boot: int = 200, seed: int = 42) -> Tuple[float, float]:
        """Return a 95% percentile-bootstrap confidence interval for accuracy.

        Args:
            y_true: True labels.
            y_pred: Predicted labels, same length as ``y_true``.
            n_boot: Number of bootstrap resamples.
            seed: Seed for the resampling generator (makes the CI reproducible).

        Returns:
            ``(low, high)`` - the 2.5th and 97.5th percentiles of resampled accuracy.
        """
        # Coerce so that fancy indexing below also works for plain sequences.
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        rng = np.random.default_rng(seed)
        n = len(y_true)
        accs = []
        for _ in range(n_boot):
            idx = rng.integers(0, n, n)
            accs.append(accuracy_score(y_true[idx], y_pred[idx]))
        low, high = np.percentile(accs, [2.5, 97.5])
        return float(low), float(high)

    def train(self, max_features: int = 100_000) -> None:
        """Cross-validate, fit on the train split and evaluate on the test split.

        On success the fitted pipeline, label names, metrics and report text
        are stored on the instance. Nothing on the instance is modified if any
        step raises, so a previously loaded model stays usable.

        Args:
            max_features: Vocabulary size cap forwarded to :meth:`_build_pipeline`.
        """
        # Local import: cross_validate is not among the file-level sklearn imports.
        from sklearn.model_selection import cross_validate

        ds = load_dataset(ARXIV_DATASET_ID)
        label_names = ds["train"].features["label"].names
        train_df = ds["train"].to_pandas()
        test_df = ds["test"].to_pandas()
        X_train = train_df["text"].astype(str).values
        y_train = train_df["label"].astype(int).values
        X_test = test_df["text"].astype(str).values
        y_test = test_df["label"].astype(int).values
        # Fix the label order so every metric lines up with label_names even
        # if some class happens to be absent from the test split.
        label_ids = np.arange(len(label_names))

        # One multi-metric CV run: each fold is fitted once and scored twice,
        # instead of re-fitting all folds separately per metric.
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        cv_scores = cross_validate(
            self._build_pipeline(max_features),
            X_train,
            y_train,
            cv=skf,
            scoring=("accuracy", "f1_macro"),
        )
        cv_acc = cv_scores["test_accuracy"]
        cv_f1 = cv_scores["test_f1_macro"]

        # Fit into a local first; the instance only sees a fully fitted model.
        pipeline = self._build_pipeline(max_features)
        pipeline.fit(X_train, y_train)

        y_pred = pipeline.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        f1m = f1_score(y_test, y_pred, average="macro")
        cm = confusion_matrix(y_test, y_pred, labels=label_ids)
        ci_low, ci_high = self._bootstrap_acc_ci(y_test, y_pred)
        _, _, f1s, _ = precision_recall_fscore_support(
            y_test, y_pred, labels=label_ids, average=None
        )

        lines = [
            "=== Cross-Validation (5-fold, train set) ===",
            f"Accuracy: {cv_acc.mean():.3f} ± {cv_acc.std():.3f}",
            f"Macro-F1: {cv_f1.mean():.3f} ± {cv_f1.std():.3f}",
            "",
            "=== Hold-out Test (official split) ===",
            f"Accuracy: {acc:.3f} (95% bootstrap CI: {ci_low:.3f}–{ci_high:.3f})",
            f"Macro-F1: {f1m:.3f}",
            "Confusion Matrix (counts):",
            str(cm),
            "",
            "Classification Report:",
            classification_report(
                y_test, y_pred, labels=label_ids, target_names=label_names
            ),
        ]

        # Publish everything together, pipeline last, now that training succeeded.
        self.label_names = label_names
        self.test_cm = cm
        self.test_f1_per_class = f1s
        self.report_text = "\n".join(lines)
        self.pipeline = pipeline

    def predict_label(self, text: str) -> str:
        """Return the predicted class name for ``text``.

        Raises:
            RuntimeError: If no trained pipeline / label names are available.
        """
        if self.pipeline is None or not self.label_names:
            raise RuntimeError("Classifier not trained. Call train() first.")
        pred_idx = int(self.pipeline.predict([text])[0])
        return self.label_names[pred_idx]

    def explain(self, text: str, top_k: int = 10) -> List[Tuple[str, float]]:
        """Return the features contributing most to the prediction for ``text``.

        A feature's contribution is its TF-IDF value multiplied by the linear
        model's weight for the predicted class.

        Args:
            text: Document to explain.
            top_k: Maximum number of ``(feature, contribution)`` pairs returned.

        Returns:
            Pairs sorted by absolute contribution, largest first.

        Raises:
            RuntimeError: If no trained pipeline is available.
        """
        if self.pipeline is None:
            raise RuntimeError("Classifier not trained. Call train() first.")
        tfidf = self.pipeline.named_steps["tfidf"]
        clf = self.pipeline.named_steps["clf"]
        vec = tfidf.transform([text])
        predicted = clf.predict(vec)[0]

        # predict() yields a class label; coef_ rows follow classes_ order.
        classes = getattr(clf, "classes_", None)
        if classes is None:
            class_pos = int(predicted)
        else:
            class_pos = int(np.flatnonzero(np.asarray(classes) == predicted)[0])

        coef = np.asarray(clf.coef_)
        if coef.shape[0] == 1:
            # Binary model: the single weight row scores classes_[1]; evidence
            # for classes_[0] is the negated weights.
            weights = coef[0] if class_pos == 1 else -coef[0]
        else:
            weights = coef[class_pos]

        # Build the vocabulary array once rather than once per feature.
        feature_names = tfidf.get_feature_names_out()
        # COO form keeps column indices and values aligned by construction.
        entries = vec.tocoo()
        contribs = [
            (str(feature_names[col]), float(val * weights[col]))
            for col, val in zip(entries.col, entries.data)
        ]
        contribs.sort(key=lambda item: abs(item[1]), reverse=True)
        return contribs[:top_k]

# -------------------------
# Text utilities
# -------------------------

def treebank_to_wordnet(tag: str) -> str:
    """Map a Penn Treebank POS tag to the corresponding WordNet POS constant.

    Only the first character of ``tag`` matters: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Anything else (including
    an empty tag) falls back to noun.
    """
    by_initial = {
        "J": wordnet.ADJ,
        "V": wordnet.VERB,
        "N": wordnet.NOUN,
        "R": wordnet.ADV,
    }
    return by_initial.get(tag[:1], wordnet.NOUN)

def preprocess_text(text: str) -> str:
    """Normalise ``text`` into a space-joined string of lemmas.

    The text is lower-cased and tokenised; tokens that are not purely
    alphabetic or that are English stop words are dropped; the remaining
    tokens are POS-tagged and lemmatised with the tag-appropriate WordNet POS.
    """
    kept = [
        token
        for token in word_tokenize(text.lower())
        if token.isalpha() and token not in STOPWORDS
    ]
    lemmas = (
        LEMMATIZER.lemmatize(word, treebank_to_wordnet(pos))
        for word, pos in nltk.pos_tag(kept)
    )
    return " ".join(lemmas)

# -------------------------
# GUI Application
# -------------------------

class NLPWorkbench:
    TECH_TERMS = [
        "Python", "Java", "C", "C#", "C++", "SQL", "TypeScript", "JavaScript",
        "HTML", "CSS", "PHP",
        "NumPy", "Keras", "TensorFlow", "PyTorch", "OpenCV", "Librosa",
        "Pandas", "sklearn", "YOLO", "matplotlib",
        "Power BI", "Excel", "Excel Analytics", "SQL Server", "Microsoft SQL", "n8n",
        "React", ".NET", "Spring Boot", "Postman", "MongoDB", "MySQL",
        "Azure", "Git", "GitHub", "Jira", "Word", "PowerPoint", "Teams",
        "NLP", "Machine Learning", "Deep Learning", "CRUD",
        "Object Oriented Programming", "Database", "Data Modeling",
    ]

    def __init__(self, root: tk.Tk) -> None:
        """Build the window on ``root`` and initialise NLP and model state.

        Order: widgets, spaCy pipeline (with the TECH entity ruler), classifier
        and LDA state, then the cached-model load and keyboard shortcuts, both
        of which rely on the widgets and state created before them.
        """
        self.root = root
        root.title("NLP Workbench (All-in-one)")

        # Widgets and theme.
        self._init_style()
        self._init_layout()

        # spaCy pipeline used by the NER action.
        self.nlp = spacy.load("en_core_web_sm")
        self._add_entity_ruler()

        # Results of the most recent LDA run (read by the topic visualisation).
        self._lda_model: LatentDirichletAllocation | None = None
        self._lda_vectorizer: CountVectorizer | None = None
        self._lda_top_words: List[List[str]] | None = None

        # Classifier plus background-training bookkeeping.
        self.classifier = ArxivTextClassifier()
        self._model_ready = False
        self._training_thread: threading.Thread | None = None
        self._train_error = None

        self._load_cached_model()
        self._bind_shortcuts()

    # ---------- UI ----------
    def _init_style(self) -> None:
        """Select the "clam" ttk theme and apply the application colour palette.

        The palette is also stored on the instance (``bg_color``, ``panel_bg``,
        ``accent_color``, ``accent_dark``, ``text_color``) for use when the
        plain Tk text widgets are configured.
        """
        style = ttk.Style()
        style.theme_use("clam")

        self.bg_color = "#f4f4f5"
        self.panel_bg = "#ffffff"
        self.accent_color = "#2563eb"
        self.accent_dark = "#1d4ed8"
        self.text_color = "#111827"
        self.root.configure(bg=self.bg_color)

        widget_options = (
            ("TLabelFrame", {"background": self.bg_color, "borderwidth": 1}),
            ("TFrame", {"background": self.bg_color}),
            ("TLabel", {"background": self.bg_color, "foreground": self.text_color}),
            ("Accent.TButton", {"foreground": "white", "background": self.accent_color}),
        )
        for style_name, options in widget_options:
            style.configure(style_name, **options)
        # Darker accent while the pointer is pressing/hovering the button.
        style.map("Accent.TButton", background=[("active", self.accent_dark)])

    def _init_layout(self) -> None:
        """Create and pack every widget of the main window.

        Builds, in order: the menu bar (File / View / Help), the title and
        subtitle, the side-by-side input and output text panes, the four
        control groups (Data, Docs & Export, Basic NLP, Modeling), and the
        status bar with its progress bar.

        Widgets other methods need are stored on the instance:
        ``text_input``, ``output``, ``report_btn``, ``batch_btn``,
        ``train_button``, ``classify_button``, ``report_button``,
        ``explain_button``, ``visual_button``, ``topic_vis_button``,
        ``status_var`` and ``progress``.
        """
        default_font = ("Segoe UI", 10)
        title_font = ("Segoe UI", 16, "bold")

        # --- Menu bar ---
        menubar = tk.Menu(self.root)
        file_menu = tk.Menu(menubar, tearoff=0)
        file_menu.add_command(label="Load File", command=self.load_file)
        file_menu.add_separator()
        file_menu.add_command(label="Export CSV", command=self.export_csv)
        file_menu.add_separator()
        file_menu.add_command(label="Exit", command=self.root.quit)
        menubar.add_cascade(label="File", menu=file_menu)

        view_menu = tk.Menu(menubar, tearoff=0)
        view_menu.add_command(label="Clear Input", command=self.clear_input)
        view_menu.add_command(label="Clear Output", command=self.clear_output)
        view_menu.add_command(label="Clear Input & Output", command=self.clear_both)
        menubar.add_cascade(label="View", menu=view_menu)

        help_menu = tk.Menu(menubar, tearoff=0)
        help_menu.add_command(label="Quick Start", command=self.quick_start)
        help_menu.add_command(label="About", command=self.show_about)
        help_menu.add_command(label="Submission Checklist", command=self.submission_checklist)
        menubar.add_cascade(label="Help", menu=help_menu)
        self.root.config(menu=menubar)

        # --- Title and workflow hint ---
        main_frame = ttk.Frame(self.root, padding=10)
        main_frame.pack(fill="both", expand=True)
        title_label = ttk.Label(main_frame, text="NLP Workbench", font=title_font)
        title_label.pack(anchor="center", pady=(0, 6))
        subtitle = ttk.Label(main_frame, text=(
            "Step 1: Load → Step 2: Preprocess/Sentiment/NER → Step 3: LDA or Train Classifier (bg) → Step 4: Classify/Visualize/Report"
        ), foreground="#6b7280")
        subtitle.pack(anchor="center", pady=(0, 10))

        # --- Input (left) and output (right) text panes, each with a scrollbar ---
        io_frame = ttk.Frame(main_frame)
        io_frame.pack(fill="both", expand=True)

        input_frame = ttk.LabelFrame(io_frame, text="Input Text")
        input_frame.pack(side="left", fill="both", expand=True, padx=(0, 5))
        self.text_input = tk.Text(input_frame, height=18, wrap="word", font=default_font)
        self.text_input.pack(side="left", fill="both", expand=True, padx=5, pady=5)
        self.text_input.configure(bg=self.panel_bg, fg=self.text_color, insertbackground=self.accent_color)
        input_scroll = ttk.Scrollbar(input_frame, orient="vertical", command=self.text_input.yview)
        input_scroll.pack(side="right", fill="y")
        self.text_input.configure(yscrollcommand=input_scroll.set)

        output_frame = ttk.LabelFrame(io_frame, text="Output")
        output_frame.pack(side="left", fill="both", expand=True, padx=(5, 0))
        self.output = tk.Text(output_frame, height=18, wrap="word", font=default_font, state="normal")
        self.output.pack(side="left", fill="both", expand=True, padx=5, pady=5)
        self.output.configure(bg=self.panel_bg, fg=self.text_color, insertbackground=self.accent_color)
        output_scroll = ttk.Scrollbar(output_frame, orient="vertical", command=self.output.yview)
        output_scroll.pack(side="right", fill="y")
        self.output.configure(yscrollcommand=output_scroll.set)

        # --- Control groups, laid out left to right ---
        controls_frame = ttk.Frame(main_frame)
        controls_frame.pack(fill="x", pady=(10, 5))

        file_frame = ttk.LabelFrame(controls_frame, text="Data")
        file_frame.pack(side="left", padx=5, pady=5, fill="x", expand=True)
        btn_load = ttk.Button(file_frame, text="Load File", command=self.load_file, style="Accent.TButton")
        btn_load.pack(side="left", padx=5, pady=5)

        docs_frame = ttk.LabelFrame(controls_frame, text="Docs & Export")
        docs_frame.pack(side="left", padx=5, pady=5, fill="x", expand=True)
        # Button text and behavior depend on reportlab availability
        report_btn_text = "Save Report (PDF)" if REPORTLAB_AVAILABLE else "Save Report (.docx)"
        self.report_btn = ttk.Button(docs_frame, text=report_btn_text, command=self.save_report_auto, style="Accent.TButton")
        self.report_btn.pack(side="left", padx=5, pady=5)
        self.batch_btn = ttk.Button(docs_frame, text="Batch Classify (lines)", command=self.batch_classify, style="Accent.TButton")
        self.batch_btn.pack(side="left", padx=5, pady=5)

        basic_frame = ttk.LabelFrame(controls_frame, text="Basic NLP")
        basic_frame.pack(side="left", padx=5, pady=5, fill="x", expand=True)
        btn_pp = ttk.Button(basic_frame, text="Preprocess", command=self.preprocess, style="Accent.TButton")
        btn_pp.pack(side="left", padx=5, pady=5)
        btn_sent = ttk.Button(basic_frame, text="Sentiment", command=self.sentiment, style="Accent.TButton")
        btn_sent.pack(side="left", padx=5, pady=5)
        btn_ner = ttk.Button(basic_frame, text="NER", command=self.ner, style="Accent.TButton")
        btn_ner.pack(side="left", padx=5, pady=5)

        model_frame = ttk.LabelFrame(controls_frame, text="Modeling")
        model_frame.pack(side="left", padx=5, pady=5, fill="x", expand=True)
        btn_lda = ttk.Button(model_frame, text="Topic Modeling (LDA)", command=self.lda, style="Accent.TButton")
        btn_lda.pack(side="left", padx=5, pady=5)
        self.train_button = ttk.Button(model_frame, text="Train Classifier (bg)", command=self.train_classifier_async, style="Accent.TButton")
        self.train_button.pack(side="left", padx=5, pady=5)
        self.classify_button = ttk.Button(model_frame, text="Text Classification", command=self.text_classify, style="Accent.TButton")
        self.classify_button.pack(side="left", padx=5, pady=5)
        self.report_button = ttk.Button(model_frame, text="Model Report", command=self.model_report, style="Accent.TButton")
        self.report_button.pack(side="left", padx=5, pady=5)
        self.explain_button = ttk.Button(model_frame, text="Explain Prediction", command=self.explain_prediction, style="Accent.TButton")
        self.explain_button.pack(side="left", padx=5, pady=5)
        self.visual_button = ttk.Button(model_frame, text="Visualize Metrics", command=self.visualize_metrics, style="Accent.TButton")
        self.visual_button.pack(side="left", padx=5, pady=5)
        self.topic_vis_button = ttk.Button(model_frame, text="Visualize Topics", command=self.visualize_topics, style="Accent.TButton")
        self.topic_vis_button.pack(side="left", padx=5, pady=5)

        # --- Status bar and (initially idle) training progress bar ---
        self.status_var = tk.StringVar(value="Ready")
        status_bar = ttk.Label(self.root, textvariable=self.status_var, relief="sunken", anchor="w", padding=(5, 2))
        status_bar.pack(side="bottom", fill="x")
        self.progress = ttk.Progressbar(self.root, mode="indeterminate")
        self.progress.pack(side="bottom", fill="x")
        self.progress.stop()

        # Classify/Explain stay disabled until a model is trained or loaded from cache.
        self._set_classify_enabled(False)

    def _add_entity_ruler(self) -> None:
        ruler = self.nlp.add_pipe("entity_ruler", before="ner", config={"phrase_matcher_attr": "LOWER"})
        patterns = [{"label": "TECH", "pattern": term} for term in self.TECH_TERMS]
        ruler.add_patterns(patterns)

    def _bind_shortcuts(self) -> None:
        self.root.bind("<Control-l>", lambda e: self.load_file())
        self.root.bind("<Control-p>", lambda e: self.preprocess())
        self.root.bind("<Control-s>", lambda e: self.sentiment())
        self.root.bind("<Control-n>", lambda e: self.ner())
        self.root.bind("<Control-t>", lambda e: self.lda())
        self.root.bind("<Control-r>", lambda e: self.train_classifier_async())
        self.root.bind("<Control-c>", lambda e: self.text_classify())
        self.root.bind("<Control-m>", lambda e: self.model_report())
        self.root.bind("<Control-e>", lambda e: self.explain_prediction())
        self.root.bind("<Control-v>", lambda e: self.visualize_metrics())
        self.root.bind("<Control-Shift-v>", lambda e: self.visualize_topics())
        self.root.bind("<Control-b>", lambda e: self.batch_classify())
        self.root.bind("<Control-w>", lambda e: self.save_report_auto())
        self.root.bind("<Control-x>", lambda e: self.export_csv())

    # ---------- Helpers ----------
    def _set_classify_enabled(self, enabled: bool) -> None:
        state = "normal" if enabled else "disabled"
        self.classify_button.configure(state=state)
        self.explain_button.configure(state=state)

    def _load_cached_model(self) -> None:
        """Restore the classifier from ``MODEL_CACHE`` if that file exists.

        The cached pipeline, labels, report and test metrics are copied onto
        ``self.classifier``. When both a pipeline and label names are present
        the model is marked ready and the classify buttons are enabled. Any
        failure while loading is reported on the status bar, not raised.
        """
        if not os.path.exists(MODEL_CACHE):
            return
        try:
            # NOTE(review): joblib.load unpickles arbitrary objects; only load
            # cache files this application wrote itself.
            cache = joblib.load(MODEL_CACHE)
            clf = self.classifier
            clf.pipeline = cache.get("pipeline")
            clf.label_names = cache.get("labels")
            clf.report_text = cache.get("report", "")
            clf.test_cm = cache.get("cm")
            clf.test_f1_per_class = cache.get("f1s")
            if clf.pipeline and clf.label_names:
                self._model_ready = True
                self._set_classify_enabled(True)
                self.update_status("Loaded cached classifier.")
        except Exception as exc:
            self.update_status(f"Failed to load cache: {exc}")

    def _save_cached_model(self) -> None:
        if self.classifier.pipeline and self.classifier.label_names:
            joblib.dump({
                "pipeline": self.classifier.pipeline,
                "labels": self.classifier.label_names,
                "report": self.classifier.report_text,
                "cm": self.classifier.test_cm,
                "f1s": self.classifier.test_f1_per_class,
            }, MODEL_CACHE)

    def update_status(self, message: str) -> None:
        self.status_var.set(message)
        self.root.update_idletasks()

    def clear_input(self) -> None:
        self.text_input.delete("1.0", tk.END)
        self.update_status("Input cleared.")

    def clear_output(self) -> None:
        self.output.delete("1.0", tk.END)
        self.update_status("Output cleared.")

    def clear_both(self) -> None:
        self.text_input.delete("1.0", tk.END)
        self.output.delete("1.0", tk.END)
        self.update_status("Input and output cleared.")

    def show_about(self) -> None:
        """Open the "About" dialog listing the application's main features."""
        about_lines = [
            "NLP Workbench (All-in-one)",
            "",
            "• Async training + caching",
            "• Rigorous evaluation (CV, macro-F1, CI)",
            "• Visualizations & Explainability",
            "• Docs: report export & submission checklist",
        ]
        messagebox.showinfo("About", "\n".join(about_lines))

    def quick_start(self) -> None:
        """Open the "Quick Start" dialog with the workflow steps and shortcuts."""
        steps = [
            "Quick Start:",
            "",
            "1) Load file (Ctrl+L)",
            "2) Preprocess (Ctrl+P), Sentiment (Ctrl+S), NER (Ctrl+N)",
            "3) LDA (Ctrl+T) or Train Classifier (Ctrl+R)",
            "4) Classify (Ctrl+C); Visualize Metrics (Ctrl+V); Explain (Ctrl+E)",
            "5) Batch Classify (Ctrl+B); Export CSV (Ctrl+X); Save Report (Ctrl+W)",
            "",  # keeps the trailing newline of the message
        ]
        messagebox.showinfo("Quick Start", "\n".join(steps))

    def submission_checklist(self) -> None:
        """Open the "Submission Checklist" dialog with the deliverables list."""
        items = [
            "Submission Checklist:",
            "",
            "• Functional notebook (.ipynb) — this all-in-one build",
            "• Code quality: PEP8, documentation, error handling",
            "• Evaluation: CV + test metrics, confusion matrix, macro-F1, CI",
            "• Visualizations: metrics & topics",
            "• Report (PDF or DOCX ≤ 10 pages): use 'Save Report'",
            "• Extra includes: Explain Prediction, Batch Classify, Export CSV",
            "",  # keeps the trailing newline of the message
        ]
        messagebox.showinfo("Submission Checklist", "\n".join(items))

    # ---------- File loading ----------
    def load_file(self) -> None:
        """Ask the user for a file and load its text into the input pane.

        Supported types are those handled by ``_read_file_content``. Cancelling
        the dialog does nothing. A read failure (including an unsupported type
        picked through the "All files" filter) is shown in an error dialog and
        leaves the current input untouched.
        """
        file_path = filedialog.askopenfilename(
            title="Select a file",
            filetypes=[
                ("Text files", "*.txt"),
                ("Word documents", "*.docx"),
                ("PDF files", "*.pdf"),
                ("Excel files", "*.xlsx *.xls"),
                # "*" is the pattern that matches every file; "." matched none.
                ("All files", "*"),
            ],
        )
        if not file_path:
            return
        try:
            content = self._read_file_content(file_path)
            self.text_input.delete("1.0", tk.END)
            self.text_input.insert(tk.END, content)
            self.update_status(f"Loaded file: {file_path}")
        except Exception as exc:
            messagebox.showerror("Error", f"Could not read file:\n{exc}")
            self.update_status("Failed to load file.")

    def _read_file_content(self, file_path: str) -> str:
        lower = file_path.lower()
        if lower.endswith(".txt"):
            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()
        if lower.endswith(".docx"):
            document = docx_mod.Document(file_path)
            return "\n".join(p.text for p in document.paragraphs if p.text.strip())
        if lower.endswith(".pdf"):
            reader = PdfReader(file_path)
            pages_text: List[str] = []
            for page in reader.pages:
                text = page.extract_text()
                if text:
                    pages_text.append(text.strip())
            return "\n\n".join(pages_text)
        if lower.endswith((".xlsx", ".xls")):
            engine = "openpyxl" if lower.endswith(".xlsx") else "xlrd"
            df = pd.read_excel(file_path, engine=engine)
            rows_as_text: List[str] = []
            for row in df.itertuples(index=False):
                cells = [str(v) for v in row if pd.notna(v)]
                if cells:
                    rows_as_text.append(" ".join(cells))
            return "\n".join(rows_as_text) if rows_as_text else ""
        raise ValueError("Unsupported file type. Use .txt, .pdf, .docx, .xlsx, or .xls.")

    # ---------- NLP actions ----------
    def preprocess(self) -> None:
        text = self.text_input.get("1.0", tk.END).strip()
        if not text:
            self.update_status("No input to preprocess.")
            return
        processed = preprocess_text(text)
        self.output.delete("1.0", tk.END)
        self.output.insert(tk.END, processed)
        self.update_status("Preprocessing completed!")

    def sentiment(self) -> None:
        text = self.text_input.get("1.0", tk.END).strip()
        if not text:
            self.update_status("No input for sentiment analysis.")
            return
        blob = TextBlob(text)
        polarity = blob.sentiment.polarity
        if polarity > 0:
            sentiment = f"Positive ({polarity:.2f})"
            advice = "Highlight strengths or positive cues."
        elif polarity < 0:
            sentiment = f"Negative ({polarity:.2f})"
            advice = "Address concerns or clarify issues."
        else:
            sentiment = "Neutral"
            advice = "Add specific details or examples for clarity."
        self.output.delete("1.0", tk.END)
        self.output.insert(tk.END, sentiment + "\n" + advice)
        self.update_status("Sentiment analysis completed!")

    def ner(self) -> None:
        text = self.text_input.get("1.0", tk.END).strip()
        if not text:
            self.update_status("No input for NER.")
            return
        doc = self.nlp(text)
        results = "\n".join(f"{ent.text} — {ent.label_}" for ent in doc.ents)
        self.output.delete("1.0", tk.END)
        self.output.insert(tk.END, results)
        self.update_status(f"NER completed with {len(doc.ents)} entities.")

    def _topic_modeling(self, docs: Iterable[str], num_topics: int = 4, num_words: int = 5) -> str:
        """Fit an LDA model on ``docs`` and return a textual topic summary.

        Each document is passed through ``preprocess_text`` and
        bag-of-words-vectorised before fitting. The fitted model, the
        vectorizer and the per-topic top words are stored on the instance
        (``_lda_model``, ``_lda_vectorizer``, ``_lda_top_words``).

        Args:
            docs: Raw documents.
            num_topics: Number of LDA components.
            num_words: How many highest-weighted words to list per topic.

        Returns:
            One ``Topic N: w1, w2, ...`` line per topic, a blank line, then the
            model's perplexity on the fitted data.
        """
        cleaned = [preprocess_text(doc) for doc in docs]
        vectorizer = CountVectorizer()
        counts = vectorizer.fit_transform(cleaned)

        model = LatentDirichletAllocation(n_components=num_topics, random_state=42)
        model.fit(counts)

        vocab = vectorizer.get_feature_names_out()
        # argsort is ascending; the reversed tail slice yields the top words first.
        top_words_all: List[List[str]] = [
            [vocab[i] for i in weights.argsort()[:-num_words - 1:-1]]
            for weights in model.components_
        ]

        summary = [
            f"Topic {number}: {', '.join(words)}"
            for number, words in enumerate(top_words_all, start=1)
        ]
        perplexity = model.perplexity(counts)
        summary += ["", f"Perplexity (lower is better): {perplexity:.2f}"]

        self._lda_model = model
        self._lda_vectorizer = vectorizer
        self._lda_top_words = top_words_all
        return "\n".join(summary)

    def lda(self) -> None:
        raw_text = self.text_input.get("1.0", tk.END).strip()
        if not raw_text:
            self.update_status("No input for LDA.")
            return
        docs = [line.strip() for line in raw_text.split("\n") if line.strip()]
        if not docs:
            self.update_status("No documents found for LDA.")
            return
        max_docs = simpledialog.askinteger("Number of Documents", f"How many documents to use? (max {len(docs)})", minvalue=1, maxvalue=len(docs), parent=self.root)
        if max_docs is None:
            return
        docs = docs[:max_docs]
        num_topics = simpledialog.askinteger("Number of Topics", "Enter number of topics:", minvalue=1, maxvalue=10, parent=self.root)
        if num_topics is None:
            return
        num_words = simpledialog.askinteger("Words per Topic", "Enter number of words per topic:", minvalue=1, maxvalue=20, parent=self.root)
        if num_words is None:
            return
        result = self._topic_modeling(docs, num_topics=num_topics, num_words=num_words)
        self.output.delete("1.0", tk.END)
        self.output.insert(tk.END, result)
        self.update_status(f"LDA completed with {num_topics} topics on {len(docs)} documents.")

    # ---------- Classification ----------
    def train_classifier_async(self) -> None:
        if self._training_thread and self._training_thread.is_alive():
            self.update_status("Training already in progress…")
            return
        self.update_status("Training classifier in background (CV + test)…")
        self.progress.start(10)
        self.train_button.configure(state="disabled")
        self._training_thread = threading.Thread(target=self._train_worker, daemon=True)
        self._training_thread.start()
        self.root.after(250, self._check_training_done)

    def _train_worker(self) -> None:
        try:
            self.classifier.train()
            self._model_ready = True
            self._save_cached_model()
        except Exception as exc:
            self._train_error = exc
        else:
            self._train_error = None

    def _check_training_done(self) -> None:
        if self._training_thread and self._training_thread.is_alive():
            self.root.after(250, self._check_training_done)
            return
        self.progress.stop()
        self.train_button.configure(state="normal")
        if self._train_error:
            messagebox.showerror("Training Error", str(self._train_error))
            self.update_status("Classifier training failed.")
        else:
            self._set_classify_enabled(True)
            self.update_status("Classifier trained (cached + evaluated). Use 'Model Report'.")

    def text_classify(self) -> None:
        if not self._model_ready:
            self.update_status("Classifier not ready yet.")
            return
        text = self.text_input.get("1.0", tk.END).strip()
        if not text:
            self.update_status("No input text to classify.")
            return
        try:
            result = self.classifier.predict_label(text)
        except Exception as exc:
            messagebox.showerror("Classification Error", str(exc))
            self.update_status("Classification failed.")
            return
        self.output.delete("1.0", tk.END)
        self.output.insert(tk.END, result)
        self.update_status(f"Text classified as: {result}")

    def batch_classify(self) -> None:
        if not self._model_ready:
            self.update_status("Classifier not ready yet.")
            return
        raw = self.text_input.get("1.0", tk.END).strip()
        docs = [ln.strip() for ln in raw.split("\n") if ln.strip()]
        if not docs:
            self.update_status("No lines to classify.")
            return
        preds = []
        for d in docs:
            try:
                preds.append(self.classifier.predict_label(d))
            except Exception as exc:
                preds.append(f"ERROR: {exc}")
        out = "\n".join(f"{i+1}. {p}" for i, p in enumerate(preds))
        self.output.delete("1.0", tk.END)
        self.output.insert(tk.END, out)
        self.update_status(f"Batch classified {len(docs)} lines.")

    def export_csv(self) -> None:
        if not self._model_ready:
            self.update_status("Classifier not ready yet.")
            return
        raw = self.text_input.get("1.0", tk.END).strip()
        docs = [ln.strip() for ln in raw.split("\n") if ln.strip()]
        if not docs:
            self.update_status("No lines to export.")
            return
        preds = []
        for d in docs:
            try:
                preds.append(self.classifier.predict_label(d))
            except Exception as exc:
                preds.append(f"ERROR: {exc}")
        df = pd.DataFrame({"text": docs, "prediction": preds})
        save_path = filedialog.asksaveasfilename(defaultextension=".csv", filetypes=[("CSV", "*.csv")], title="Save predictions CSV")
        if not save_path:
            return
        try:
            df.to_csv(save_path, index=False)
            self.update_status(f"Saved CSV to {save_path}")
        except Exception as exc:
            messagebox.showerror("Export Error", str(exc))
            self.update_status("CSV export failed.")

    def explain_prediction(self) -> None:
        """Show the tokens that contribute most to the prediction for the input.

        Takes the stripped contents of the text widget, requests the top 10
        contributions from ``self.classifier.explain`` and replaces the output
        widget's contents with a header line followed by one
        ``token: value`` line per contribution (value formatted to four
        decimals). If ``explain`` raises, an error dialog is shown and the
        output widget is left untouched.
        """
        if not self._model_ready:
            self.update_status("Classifier not ready yet.")
            return

        query = self.text_input.get("1.0", tk.END).strip()
        if not query:
            self.update_status("No input to explain.")
            return

        try:
            contribs = self.classifier.explain(query, top_k=10)
        except Exception as exc:
            messagebox.showerror("Explain Error", str(exc))
            self.update_status("Explain failed.")
            return

        # Render everything first so the output widget is only cleared once
        # the full text is ready.
        rendered = "\n".join(
            ["Top contributing tokens (feature × weight):"]
            + [f"{token}: {weight:.4f}" for token, weight in contribs]
        )
        self.output.delete("1.0", tk.END)
        self.output.insert(tk.END, rendered)
        self.update_status("Explanation generated.")

    def model_report(self) -> None:
        """Replace the output widget's contents with the classifier's report text.

        When the classifier has no report text (``None`` or empty), a short
        placeholder message is shown instead. Does nothing beyond a status
        update while the model is not ready.
        """
        if not self._model_ready:
            self.update_status("Classifier not ready yet.")
            return

        body = self.classifier.report_text
        if not body:
            body = "No report available. Train the classifier first."

        self.output.delete("1.0", tk.END)
        self.output.insert(tk.END, body)
        self.update_status("Model report displayed.")

    # ---------- Visualizations ----------
    def visualize_metrics(self) -> None:
        """Open a window showing the confusion matrix and per-class F1 scores.

        Reads ``test_cm``, ``test_f1_per_class`` and ``label_names`` from
        ``self.classifier``. If the model is not ready, or any of the three
        values is ``None``, only a status message is emitted. Otherwise a
        two-panel figure is embedded in a new ``Toplevel`` window:

        * left: the confusion matrix as a heat map with each cell's value
          printed on top (rows = true class, columns = predicted class);
        * right: a bar chart of the per-class F1 scores on a 0–1 axis.

        Both panels label classes by index (0..n-1), not by name.
        """
        if not self._model_ready:
            self.update_status("Train classifier first or load cache.")
            return
        cm = self.classifier.test_cm
        f1s = self.classifier.test_f1_per_class
        labels = self.classifier.label_names
        if cm is None or f1s is None or labels is None:
            self.update_status("Metrics not available.")
            return

        # Build a standalone Figure instead of going through pyplot. A figure
        # created with plt.subplots() is registered in pyplot's global figure
        # list (with its own hidden backend window) and is never released when
        # our Toplevel is closed, so every click would leak one figure.
        from matplotlib.figure import Figure

        class_ticks = np.arange(len(labels))
        fig = Figure(figsize=(12, 4))
        cm_ax, f1_ax = fig.subplots(1, 2)

        # Left panel: confusion matrix heat map.
        im = cm_ax.imshow(cm, cmap="Blues")
        cm_ax.set_title("Confusion Matrix")
        cm_ax.set_xlabel("Predicted")
        cm_ax.set_ylabel("True")
        cm_ax.set_xticks(class_ticks)
        cm_ax.set_yticks(class_ticks)
        cm_ax.set_xticklabels(range(len(labels)))
        cm_ax.set_yticklabels(range(len(labels)))
        # imshow puts column j on the x axis and row i on the y axis, hence
        # the (j, i) order when annotating cell cm[i, j].
        for i in range(cm.shape[0]):
            for j in range(cm.shape[1]):
                cm_ax.text(j, i, cm[i, j], ha="center", va="center", color="black", fontsize=8)
        fig.colorbar(im, ax=cm_ax, fraction=0.046, pad=0.04)

        # Right panel: per-class F1 bars.
        f1_ax.bar(class_ticks, f1s, color="#2563eb")
        f1_ax.set_title("Per-class F1 (macro view)")
        f1_ax.set_xlabel("Class index")
        f1_ax.set_ylabel("F1 score")
        f1_ax.set_ylim(0, 1)
        f1_ax.grid(True, axis="y", linestyle="--", alpha=0.4)
        fig.tight_layout()

        # Embed the figure in its own top-level window.
        win = tk.Toplevel(self.root)
        win.title("Model Metrics")
        canvas = FigureCanvasTkAgg(fig, master=win)
        canvas.draw()
        canvas.get_tk_widget().pack(fill="both", expand=True)
        self.update_status("Metrics visualized.")

    def visualize_topics(self) -> None:
        """Plot the ten highest-weighted words of a user-chosen LDA topic.

        Requires ``self._lda_model`` and ``self._lda_vectorizer`` to be set;
        otherwise only a status message is emitted. The user is asked for a
        1-based topic number (bounded by the model's ``n_components``);
        cancelling the dialog aborts silently. The selected topic's top words
        are drawn as a horizontal bar chart, largest weight at the top, in a
        new ``Toplevel`` window.
        """
        if not self._lda_model or not self._lda_vectorizer:
            self.update_status("Run LDA first to visualize topics.")
            return
        num_topics = self._lda_model.n_components
        topic_idx = simpledialog.askinteger(
            "Visualize Topic",
            f"Enter topic number (1–{num_topics})",
            minvalue=1,
            maxvalue=num_topics,
            parent=self.root,
        )
        if topic_idx is None:
            return

        # The dialog is 1-based; components_ rows are 0-based.
        topic = self._lda_model.components_[topic_idx - 1]
        words = self._lda_vectorizer.get_feature_names_out()
        # Indices of the 10 largest weights, in descending order.
        top_idx = topic.argsort()[:-10 - 1:-1]
        top_words = [words[i] for i in top_idx]
        top_vals = topic[top_idx]

        # Build a standalone Figure instead of going through pyplot. A figure
        # created with plt.subplots() is registered in pyplot's global figure
        # list (with its own hidden backend window) and is never released when
        # our Toplevel is closed, so every visualization would leak a figure.
        from matplotlib.figure import Figure

        fig = Figure(figsize=(8, 4))
        ax = fig.subplots()
        # barh draws the first entry at the bottom, so reverse to put the
        # heaviest word on top.
        ax.barh(top_words[::-1], top_vals[::-1], color="#10b981")
        ax.set_title(f"Topic {topic_idx}: Top words (weights)")
        ax.set_xlabel("Weight")
        ax.set_ylabel("Word")
        ax.grid(True, axis="x", linestyle="--", alpha=0.4)
        fig.tight_layout()

        # Embed the figure in its own top-level window.
        win = tk.Toplevel(self.root)
        win.title("Topic Visualization")
        canvas = FigureCanvasTkAgg(fig, master=win)
        canvas.draw()
        canvas.get_tk_widget().pack(fill="both", expand=True)
        self.update_status("Topic visualized.")

    # ---------- Report (Auto: PDF or DOCX) ----------
    def save_report_auto(self) -> None:
        """Export the report as PDF when ReportLab is available, else as DOCX.

        Emits only a status message when the model is not ready or the
        classifier has no report text.
        """
        has_report = self._model_ready and self.classifier.report_text
        if not has_report:
            self.update_status("Train classifier first to generate a report.")
            return

        # REPORTLAB_AVAILABLE records whether the optional import succeeded.
        exporter = self._save_report_pdf if REPORTLAB_AVAILABLE else self._save_report_docx
        exporter()

    def _save_report_pdf(self) -> None:
        """Ask for a destination and write the submission report as a PDF.

        The document is assembled as a ReportLab "story": title and metadata,
        an overview, the classifier's ``report_text`` (one paragraph per
        line, in the ``Code`` style), a one-column table of feature bullets,
        and two closing sections. Cancelling the save dialog returns without
        writing. Any exception raised while prompting or building is shown in
        an error dialog and reported on the status bar.
        """
        # Local import so this method stays self-contained.
        from xml.sax.saxutils import escape

        try:
            save_path = filedialog.asksaveasfilename(defaultextension=".pdf", filetypes=[("PDF", "*.pdf")], title="Save Report PDF")
            if not save_path:
                return
            styles = getSampleStyleSheet()
            doc = SimpleDocTemplate(save_path, pagesize=A4)
            story = []
            story.append(Paragraph("NLP Workbench — Submission Report", styles['Title']))
            story.append(Spacer(1, 12))
            story.append(Paragraph("Author: Student", styles['Normal']))
            story.append(Paragraph("Tool: NLP Workbench (All-in-one)", styles['Normal']))
            story.append(Spacer(1, 12))
            story.append(Paragraph("1. Overview", styles['Heading2']))
            story.append(Paragraph("This report summarizes the dataset, methodology, evaluation metrics, and UI/UX features. The classifier is TF–IDF + linear SVM (SGD) with stratified 5-fold CV. Topics are extracted via LDA.", styles['Normal']))
            story.append(Spacer(1, 8))
            story.append(Paragraph("2. Methodology & Evaluation", styles['Heading2']))
            for line in self.classifier.report_text.split("\n"):
                # Paragraph interprets its text as XML-like markup, so a raw
                # '<', '>' or '&' in the generated report (e.g. in a label
                # name) could break parsing or silently drop text. Escape the
                # dynamic content so it is always rendered literally.
                story.append(Paragraph(escape(line), styles['Code']))
            story.append(Spacer(1, 8))
            story.append(Paragraph("3. UI/UX & Deliverables", styles['Heading2']))
            bullets = [
                "Responsive GUI (async training, progress bar)",
                "Workflow cues, tooltips, keyboard shortcuts",
                "Visualizations: confusion matrix, per-class F1, topic bars",
                "Explain Prediction (top contributing tokens)",
                "Batch classify + CSV export",
                "Report export",
            ]
            # One table row per bullet, styled uniformly across all cells.
            tbl = Table([[b] for b in bullets])
            tbl.setStyle(TableStyle([
                ('BACKGROUND', (0,0), (-1,-1), colors.whitesmoke),
                ('TEXTCOLOR', (0,0), (-1,-1), colors.black),
                ('FONTNAME', (0,0), (-1,-1), 'Helvetica'),
                ('FONTSIZE', (0,0), (-1,-1), 10),
                ('LEFTPADDING', (0,0), (-1,-1), 6),
            ]))
            story.append(tbl)
            story.append(Spacer(1, 8))
            story.append(Paragraph("4. Insights & Recommendations", styles['Heading2']))
            story.append(Paragraph("Use per-class F1 and confusion hotspots to target categories with lower recall.", styles['Normal']))
            story.append(Spacer(1, 8))
            story.append(Paragraph("5. Limitations & Future Work", styles['Heading2']))
            story.append(Paragraph("Model uses linear SVM without probability calibration.", styles['Normal']))
            doc.build(story)
            self.update_status(f"Saved report to {save_path}")
        except Exception as exc:
            # UI boundary: surface the failure to the user instead of raising.
            messagebox.showerror("Report Error", str(exc))
            self.update_status("Report generation failed.")

    def _save_report_docx(self) -> None:
        """Ask for a destination and write the submission report as a DOCX file.

        The document contains a title, two metadata paragraphs, an overview,
        the classifier's ``report_text`` (one paragraph per line), a bulleted
        list of features, and two closing sections. Cancelling the save dialog
        returns without writing. Any exception raised while prompting or
        building is shown in an error dialog and reported on the status bar.
        """
        metadata = (
            "Author: Student",
            "Tool: NLP Workbench (All-in-one)",
        )
        deliverables = (
            "Responsive GUI (async training, progress bar)",
            "Workflow cues, tooltips, keyboard shortcuts",
            "Visualizations: confusion matrix, per-class F1, topic bars",
            "Explain Prediction (top contributing tokens)",
            "Batch classify + CSV export",
            "Report export",
        )
        try:
            target = filedialog.asksaveasfilename(
                defaultextension=".docx",
                filetypes=[("Word Document", "*.docx")],
                title="Save Report DOCX",
            )
            if not target:
                # Dialog was cancelled; nothing to write.
                return

            report = docx_mod.Document()
            report.add_heading("NLP Workbench — Submission Report", 0)
            for entry in metadata:
                report.add_paragraph(entry)

            report.add_heading("1. Overview", level=1)
            report.add_paragraph("This report summarizes the dataset, methodology, evaluation metrics, and UI/UX features. The classifier is TF–IDF + linear SVM (SGD) with stratified 5-fold CV. Topics are extracted via LDA.")

            report.add_heading("2. Methodology & Evaluation", level=1)
            for row in self.classifier.report_text.split("\n"):
                report.add_paragraph(row)

            report.add_heading("3. UI/UX & Deliverables", level=1)
            for item in deliverables:
                report.add_paragraph(item, style="List Bullet")

            report.add_heading("4. Insights & Recommendations", level=1)
            report.add_paragraph("Use per-class F1 and confusion hotspots to target categories with lower recall.")

            report.add_heading("5. Limitations & Future Work", level=1)
            report.add_paragraph("Model uses linear SVM without probability calibration.")

            report.save(target)
            self.update_status(f"Saved report to {target}")
        except Exception as exc:
            # UI boundary: surface the failure to the user instead of raising.
            messagebox.showerror("Report Error", str(exc))
            self.update_status("DOCX report generation failed.")

# ---------- Entrypoint ----------
if __name__ == "__main__":
    # Script entry point: create the Tk root window, construct the workbench
    # on top of it, then hand control to Tk's event loop.
    root = tk.Tk()
    # `app` is kept as a module-level name alongside `root`.
    app = NLPWorkbench(root)
    # Runs the Tk event loop; returns once the loop is exited.
    root.mainloop()


Note: The report export chooses its format automatically. If `reportlab` is installed, the app uses **Save Report (PDF)**; if it is not installed, the app falls back to **Save Report (DOCX)**.