# In [None]:
# hybrid_summarization_system_colab.py
# Complete Google Colab Optimized Version

print("🚀 Initializing HybridSumm-Pro for Google Colab...")

# ==================== INSTALLATION & SETUP ====================
import subprocess
import sys

def install_colab_dependencies():
    """Install all required dependencies for Google Colab.

    Every install step is an IPython shell escape (``!pip ...``), so this
    function can only run inside an IPython/Colab session, not under a plain
    Python interpreter. Packages are installed one command at a time, in the
    order listed. Nothing is returned; progress is printed to stdout.
    """
    print("📦 Installing dependencies...")

    # Install core ML and NLP libraries
    # torch is pulled from the PyTorch cu118 wheel index; transformers is the only pinned package.
    !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
    !pip install transformers==4.35.0
    !pip install sentence-transformers
    !pip install datasets
    !pip install accelerate

    # Install text processing libraries
    !pip install nltk
    !pip install spacy
    !pip install scikit-learn
    !pip install networkx
    !pip install textstat
    !pip install rouge-score
    !pip install bert-score
    !pip install lime
    !pip install shap

    # Install file processing libraries
    !pip install PyPDF2
    !pip install python-docx
    !pip install beautifulsoup4
    !pip install requests

    # Install multimodal processing libraries
    !pip install opencv-python
    !pip install pytesseract
    !pip install Pillow
    !pip install SpeechRecognition
    !pip install librosa
    !pip install soundfile

    # Install web framework and UI
    !pip install streamlit
    !pip install fastapi
    !pip install uvicorn
    !pip install python-multipart
    !pip install aiohttp
    !pip install websockets
    !pip install celery

    # Install visualization libraries
    !pip install matplotlib
    !pip install seaborn
    !pip install plotly
    !pip install pandas
    !pip install numpy

    # NOTE(review): this message is printed unconditionally; the exit status of
    # the pip commands above is not checked.
    print("✅ Packages installed successfully!")

def setup_colab_environment():
    """Set up the Colab-specific runtime environment.

    Installs system packages with ``apt``, downloads the spaCy English model
    and the NLTK corpora listed below, and sets ``TESSDATA_PREFIX`` in the
    process environment. The ``!`` lines are IPython shell escapes, so this
    only runs inside an IPython/Colab session. Nothing is returned.
    """
    print("⚙️ Setting up Colab environment...")

    # Install system dependencies
    !apt update
    !apt install tesseract-ocr -y
    !apt install libtesseract-dev -y
    !apt install portaudio19-dev -y
    !apt install ffmpeg -y

    # Download spaCy model
    !python -m spacy download en_core_web_sm -q

    # Download NLTK data
    import nltk
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    nltk.download('averaged_perceptron_tagger', quiet=True)
    nltk.download('wordnet', quiet=True)

    # Fix Colab paths
    # NOTE(review): the tessdata directory is hard-coded to the "4.00" layout —
    # confirm it matches the tesseract version that apt actually installs.
    import os
    os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/4.00/tessdata'

    print("✅ Colab environment setup complete!")

# Run installation (uncomment if running for first time)
try:
    import SpeechRecognition
    print("✅ SpeechRecognition already installed")
except ImportError:
    print("❌ SpeechRecognition not found, installing...")
    !pip install SpeechRecognition

try:
    import streamlit
    print("✅ Streamlit already installed")
except ImportError:
    print("❌ Streamlit not found, installing...")
    !pip install streamlit

# Uncomment the lines below if you want to force reinstall everything
# install_colab_dependencies()
# setup_colab_environment()

# ==================== IMPORTS WITH ERROR HANDLING ====================
print("📚 Loading libraries...")

import torch
import torch.nn as nn
import torch.nn.functional as F

# Try to import transformers with error handling
try:
    from transformers import (
        T5ForConditionalGeneration, T5Tokenizer,
        BartForConditionalGeneration, BartTokenizer,
        CLIPModel, CLIPProcessor,
        AutoModel, AutoTokenizer,
        pipeline
    )
    print("✅ Transformers loaded successfully")
except ImportError as e:
    print(f"❌ Transformers import error: {e}")
    !pip install transformers
    from transformers import (
        T5ForConditionalGeneration, T5Tokenizer,
        BartForConditionalGeneration, BartTokenizer,
        CLIPModel, CLIPProcessor,
        AutoModel, AutoTokenizer,
        pipeline
    )

import numpy as np
import pandas as pd

# NLTK with error handling
try:
    import nltk
    from nltk.tokenize import sent_tokenize, word_tokenize
    print("✅ NLTK loaded successfully")
except ImportError as e:
    print(f"❌ NLTK import error: {e}")
    !pip install nltk
    import nltk
    from nltk.tokenize import sent_tokenize, word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

# spaCy with error handling
try:
    import spacy
    print("✅ spaCy loaded successfully")
except ImportError as e:
    print(f"❌ spaCy import error: {e}")
    !pip install spacy
    import spacy

# Computer vision libraries with error handling
try:
    import cv2
    print("✅ OpenCV loaded successfully")
except ImportError as e:
    print(f"❌ OpenCV import error: {e}")
    !pip install opencv-python
    import cv2

try:
    import pytesseract
    print("✅ pytesseract loaded successfully")
except ImportError as e:
    print(f"❌ pytesseract import error: {e}")
    !pip install pytesseract
    import pytesseract

from PIL import Image

# Audio processing with error handling
try:
    import speech_recognition as sr
    print("✅ SpeechRecognition loaded successfully")
except ImportError as e:
    print(f"❌ SpeechRecognition import error: {e}")
    !pip install SpeechRecognition
    import speech_recognition as sr

try:
    import librosa
    print("✅ librosa loaded successfully")
except ImportError as e:
    print(f"❌ librosa import error: {e}")
    !pip install librosa
    import librosa

# UI and evaluation libraries
try:
    import streamlit as st
    print("✅ Streamlit loaded successfully")
except ImportError as e:
    print(f"❌ Streamlit import error: {e}")
    !pip install streamlit
    import streamlit as st

try:
    from rouge_score import rouge_scorer
    print("✅ rouge-score loaded successfully")
except ImportError as e:
    print(f"❌ rouge-score import error: {e}")
    !pip install rouge-score
    from rouge_score import rouge_scorer

try:
    from bert_score import BERTScorer
    print("✅ bert-score loaded successfully")
except ImportError as e:
    print(f"❌ bert-score import error: {e}")
    !pip install bert-score
    from bert_score import BERTScorer

try:
    import textstat
    print("✅ textstat loaded successfully")
except ImportError as e:
    print(f"❌ textstat import error: {e}")
    !pip install textstat
    import textstat

# Optional libraries (continue even if these fail)
try:
    from lime.lime_text import LimeTextExplainer
    print("✅ LIME loaded successfully")
except ImportError:
    print("⚠️ LIME not available, continuing without it")
    LimeTextExplainer = None

try:
    import shap
    print("✅ SHAP loaded successfully")
except ImportError:
    print("⚠️ SHAP not available, continuing without it")
    shap = None

import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Tuple, Optional, Union
import asyncio
import aiohttp

# Web framework libraries (optional).
# When the import fails, every name it would have provided is bound to None so
# that later references (for example `UploadFile` inside type annotations) do
# not raise NameError while the classes below are being defined.
try:
    from fastapi import FastAPI, UploadFile, File, HTTPException, WebSocket
    from fastapi.middleware.cors import CORSMiddleware
    import uvicorn
    print("✅ FastAPI loaded successfully")
except ImportError:
    print("⚠️ FastAPI not available, API mode will not work")
    FastAPI = UploadFile = File = HTTPException = WebSocket = None
    CORSMiddleware = uvicorn = None

try:
    from celery import Celery
    print("✅ Celery loaded successfully")
except ImportError:
    print("⚠️ Celery not available")
    Celery = None

import json
import base64
from io import BytesIO
import warnings
warnings.filterwarnings('ignore')

# ==================== COLAB FIXES ====================
import os
import sys
# Make modules placed under /content importable.
sys.path.append('/content')

# Initialize spaCy for Colab
# spacy.load raises OSError when the model package is not installed; in that
# case the model is downloaded through an IPython shell escape and loaded again.
try:
    nlp = spacy.load("en_core_web_sm")
    print("✅ spaCy model loaded successfully")
except OSError:
    print("Downloading spaCy model...")
    !python -m spacy download en_core_web_sm -q
    nlp = spacy.load("en_core_web_sm")

# Pick the compute device once (CUDA when available, otherwise CPU) and report it.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"🎯 Using device: {device}")
if device.type == 'cuda':
    # Describe the first visible GPU: its name and total memory in GB.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# ==================== 1. MULTIMODAL DATA PROCESSOR ====================

class MultimodalDataProcessor:
    """Process multiple input types: plain text, PDF, DOCX, images, audio, URLs.

    Every extractor returns a dict that always contains ``'text'`` and
    ``'metadata'``; some add ``'images'``, ``'image_features'`` or
    ``'audio_features'``. Extraction failures are reported in-band: ``'text'``
    is set to an ``"Error processing ..."`` message instead of raising.
    """

    # Extension groups shared by the file-path and upload code paths.
    IMAGE_EXTENSIONS = ('.jpg', '.png', '.jpeg')
    AUDIO_EXTENSIONS = ('.mp3', '.wav', '.flac')

    def __init__(self):
        """Load the spaCy pipeline, the speech recognizer and the CLIP model.

        The speech recognizer and CLIP are optional: when either cannot be
        created the corresponding attributes are set to ``None``.
        """
        self.nlp = spacy.load("en_core_web_sm")
        self.ocr_processor = pytesseract

        # Initialize speech recognizer with error handling
        try:
            self.speech_recognizer = sr.Recognizer()
        except Exception:
            self.speech_recognizer = None
            print("⚠️ Speech recognition not available")

        # Load CLIP model with error handling for Colab
        try:
            self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
            self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
            self.clip_model.to(device)
            print("✅ CLIP model loaded successfully")
        except Exception as e:
            print(f"⚠️ CLIP model loading failed: {e}")
            self.clip_processor = None
            self.clip_model = None

    async def process_input(self, input_data: Union[str, "UploadFile"]) -> Dict:
        """Dispatch *input_data* to the matching extractor.

        Args:
            input_data: an ``http://``/``https://`` URL, a local file path, a
                plain-text string, or an upload object exposing ``filename``
                and an async ``read()``.

        Returns:
            The extractor's result dict (see the class docstring).
        """
        # The annotation is quoted so the class can still be defined when
        # FastAPI (and therefore UploadFile) is not installed.
        if isinstance(input_data, str):
            if input_data.startswith(('http://', 'https://')):
                return await self._process_url(input_data)
            return self._process_file_path(input_data)
        return await self._process_upload_file(input_data)

    def _process_file_path(self, file_path: str) -> Dict:
        """Process a local file, or treat the string as raw text.

        The extension is matched case-insensitively. A string with no known
        extension that does not name an existing file is returned unchanged as
        the text to summarize; an existing file of unknown type yields empty
        text.
        """
        result = {'text': '', 'metadata': {}, 'images': [], 'audio': None}
        lowered = file_path.lower()

        if lowered.endswith('.pdf'):
            result.update(self._extract_from_pdf(file_path))
        elif lowered.endswith('.docx'):
            result.update(self._extract_from_docx(file_path))
        elif lowered.endswith(self.IMAGE_EXTENSIONS):
            result.update(self._extract_from_image(file_path))
        elif lowered.endswith(self.AUDIO_EXTENSIONS):
            result.update(self._extract_from_audio(file_path))
        elif lowered.endswith('.txt'):
            result['text'] = self._read_text_file(file_path)
        elif not os.path.exists(file_path):
            # Not a file on disk: the caller passed the document text itself.
            result['text'] = file_path

        return result

    @staticmethod
    def _read_text_file(file_path: str) -> str:
        """Read a text file as UTF-8, falling back to latin-1 on decode errors."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except UnicodeDecodeError:
            with open(file_path, 'r', encoding='latin-1') as f:
                return f.read()

    async def _process_upload_file(self, file: "UploadFile") -> Dict:
        """Process an uploaded file, choosing the extractor from its filename."""
        try:
            contents = await file.read()
            result = {'text': '', 'metadata': {}, 'images': [], 'audio': None}
            filename = (file.filename or '').lower()

            if filename.endswith('.pdf'):
                result.update(self._extract_from_pdf_bytes(contents))
            elif filename.endswith('.docx'):
                result.update(self._extract_from_docx(BytesIO(contents)))
            elif filename.endswith(self.IMAGE_EXTENSIONS):
                result.update(self._extract_from_image_bytes(contents))
            elif filename.endswith(self.AUDIO_EXTENSIONS):
                result.update(self._extract_from_audio_bytes(contents))
            else:
                # Anything else is assumed to be text.
                try:
                    result['text'] = contents.decode('utf-8')
                except UnicodeDecodeError:
                    result['text'] = contents.decode('latin-1')

            return result
        except Exception as e:
            return {'text': f"Error processing file: {str(e)}", 'metadata': {}}

    @staticmethod
    def _read_pdf(stream) -> Dict:
        """Extract text and metadata from an open binary PDF stream."""
        import PyPDF2
        pdf_reader = PyPDF2.PdfReader(stream)
        metadata = pdf_reader.metadata
        # extract_text() may yield None for a page and metadata may be None for
        # the document; neither should discard the text that was extracted.
        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
        return {'text': text, 'metadata': dict(metadata) if metadata else {}}

    def _extract_from_pdf(self, file_path: str) -> Dict:
        """Extract text and metadata from a PDF file on disk."""
        try:
            with open(file_path, 'rb') as file:
                return self._read_pdf(file)
        except Exception as e:
            return {'text': f"Error processing PDF: {str(e)}", 'metadata': {}}

    def _extract_from_pdf_bytes(self, pdf_bytes: bytes) -> Dict:
        """Extract text and metadata from in-memory PDF bytes."""
        try:
            return self._read_pdf(BytesIO(pdf_bytes))
        except Exception as e:
            return {'text': f"Error processing PDF: {str(e)}", 'metadata': {}}

    def _extract_from_docx(self, file_path) -> Dict:
        """Extract paragraph text from a DOCX file.

        Args:
            file_path: a path, or a binary file-like object holding the document.
        """
        try:
            from docx import Document
            doc = Document(file_path)
            text = "\n".join(paragraph.text for paragraph in doc.paragraphs)
            return {'text': text, 'metadata': {}}
        except Exception as e:
            return {'text': f"Error processing DOCX: {str(e)}", 'metadata': {}}

    def _analyze_image(self, image, metadata: Dict) -> Dict:
        """Run OCR and, when CLIP is loaded, compute image features."""
        ocr_text = pytesseract.image_to_string(image)

        # CLIP features (optional)
        image_features = None
        if self.clip_processor is not None and self.clip_model is not None:
            inputs = self.clip_processor(images=image, return_tensors="pt", padding=True)
            # The model was moved to `device` in __init__; its inputs must follow.
            inputs = {k: v.to(device) for k, v in inputs.items()}
            with torch.no_grad():
                image_features = self.clip_model.get_image_features(**inputs)

        return {
            'text': ocr_text,
            'images': [image],
            'image_features': image_features,
            'metadata': metadata
        }

    def _extract_from_image(self, file_path: str) -> Dict:
        """Extract text from an image file using OCR."""
        try:
            image = Image.open(file_path)
            return self._analyze_image(image, {'source': 'image', 'dimensions': image.size})
        except Exception as e:
            return {'text': f"Error processing image: {str(e)}", 'metadata': {}}

    def _extract_from_image_bytes(self, image_bytes: bytes) -> Dict:
        """Extract text from in-memory image bytes using OCR."""
        try:
            image = Image.open(BytesIO(image_bytes))
            return self._analyze_image(image, {'source': 'image'})
        except Exception as e:
            return {'text': f"Error processing image: {str(e)}", 'metadata': {}}

    def _extract_from_audio(self, file_path: str) -> Dict:
        """Describe an audio file (duration, sample rate).

        No transcription is performed; the returned text is a notice asking
        for text input.
        """
        try:
            # Using librosa for audio features
            audio, sr_val = librosa.load(file_path, sr=None)
            audio_features = {
                'duration': len(audio) / sr_val,
                'sample_rate': sr_val,
            }

            # Simple text extraction (Colab-friendly)
            return {
                'text': f"Audio file detected: {file_path}. Duration: {audio_features['duration']:.2f}s. Please use text input for summarization.",
                'audio_features': audio_features,
                'metadata': {'source': 'audio', 'duration': audio_features['duration']}
            }
        except Exception as e:
            return {'text': f"Error processing audio: {str(e)}", 'metadata': {}}

    def _extract_from_audio_bytes(self, audio_bytes: bytes) -> Dict:
        """Return a notice for uploaded audio; the bytes are not inspected."""
        return {'text': "Audio content detected. Please use text input for summarization.", 'metadata': {'source': 'audio'}}

    async def _process_url(self, url: str) -> Dict:
        """Fetch *url* and return its page title plus the text of all ``<p>`` tags."""
        try:
            import requests
            from bs4 import BeautifulSoup

            # requests is blocking; run it in the default executor so the
            # event loop is not stalled for up to the 10 s timeout.
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(
                None, lambda: requests.get(url, timeout=10)
            )
            soup = BeautifulSoup(response.content, 'html.parser')

            text = ' '.join(p.get_text() for p in soup.find_all('p'))
            # <title> may exist without a single string child (.string is None).
            title = (soup.title.string or "") if soup.title else ""

            return {
                'text': f"{title}\n\n{text}",
                'metadata': {'source': 'web', 'url': url, 'title': title}
            }
        except Exception as e:
            return {'text': f"Error processing URL: {str(e)}", 'metadata': {}}

# ==================== 2. HIERARCHICAL EXTRACTIVE SUMMARIZER ====================

class TextRankSummarizer:
    """Implementation of the TextRank algorithm.

    Sentences are nodes of a graph whose edge weights are TF-IDF cosine
    similarities above ``similarity_threshold``; PageRank over that graph
    gives each sentence its score.
    """

    def __init__(self):
        # Similarities at or below this value are treated as "no edge".
        self.similarity_threshold = 0.1

    def score_sentences(self, document: str, num_sentences: int = 50) -> Dict[str, float]:
        """Return a ``{sentence: PageRank score}`` mapping for *document*.

        Args:
            document: text to split into sentences and rank.
            num_sentences: accepted for interface compatibility; not used.

        Returns:
            Scores keyed by sentence text. A single-sentence document maps that
            sentence to 1.0. An empty dict is returned for an empty document or
            when any step fails (the error is printed).
        """
        try:
            sentences = sent_tokenize(document)
            if len(sentences) <= 1:
                return {sentences[0]: 1.0} if sentences else {}

            similarity_matrix = self._build_similarity_matrix(sentences)
            nx_graph = nx.from_numpy_array(similarity_matrix)
            scores = nx.pagerank(nx_graph)

            # Graph nodes are the sentence indices.
            return {sentences[i]: score for i, score in scores.items()}
        except Exception as e:
            print(f"TextRank error: {e}")
            return {}

    def _build_similarity_matrix(self, sentences: List[str]) -> np.ndarray:
        """Build the thresholded TF-IDF cosine-similarity matrix.

        All pairwise similarities are computed in one vectorised call instead
        of one call per sentence pair. Falls back to the identity matrix when
        vectorisation fails.
        """
        try:
            tfidf = TfidfVectorizer().fit_transform(sentences)
            return self._apply_threshold(cosine_similarity(tfidf))
        except Exception as e:
            print(f"Similarity matrix error: {e}")
            return np.eye(len(sentences))

    def _apply_threshold(self, similarities: np.ndarray) -> np.ndarray:
        """Zero the diagonal and every entry not above ``similarity_threshold``."""
        matrix = np.asarray(similarities, dtype=float)
        matrix = np.where(matrix > self.similarity_threshold, matrix, 0.0)
        np.fill_diagonal(matrix, 0.0)  # no self-loops
        return matrix

class BERTExtractiveSummarizer:
    """Extractive scorer built on BERT sentence embeddings.

    Each sentence is represented by the first-token vector of the model's last
    hidden state; a sentence's score is its summed cosine similarity to all
    sentences, divided by the largest such sum.
    """

    def __init__(self):
        """Load ``bert-base-uncased``; leave model and tokenizer as None on failure."""
        self.model_name = "bert-base-uncased"
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            self.model = AutoModel.from_pretrained(self.model_name)
            self.model.to(device)
        except Exception as e:
            print(f"BERT model loading error: {e}")
            self.model = self.tokenizer = None

    def _embed(self, sentence: str):
        """Return the first-token embedding of *sentence* as a NumPy array."""
        encoded = self.tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512, padding=True)
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
        with torch.no_grad():
            hidden = self.model(**encoded).last_hidden_state
            return hidden[:, 0, :].squeeze().cpu().numpy()

    def score_sentences(self, document: str, num_sentences: int = 50) -> Dict[str, float]:
        """Return ``{sentence: centrality}`` for *document*.

        ``num_sentences`` is accepted but not used. An empty dict is returned
        when the model is unavailable, the document has no sentences, or
        scoring fails (the error is printed).
        """
        if self.model is None:
            return {}

        try:
            sentences = sent_tokenize(document)
            if not sentences:
                return {}

            vectors = [self._embed(sentence) for sentence in sentences]

            # Row sums of the pairwise similarity matrix act as centrality.
            centrality = np.sum(cosine_similarity(vectors), axis=1)
            peak = np.max(centrality)
            if peak > 0:
                centrality = centrality / peak

            return {sentence: float(value) for sentence, value in zip(sentences, centrality)}
        except Exception as e:
            print(f"BERT scoring error: {e}")
            return {}

class HierarchicalExtractiveSummarizer:
    """Advanced hierarchical extractive summarization.

    Combines a weighted ensemble of sentence scorers with a position-based
    document score, then returns the best sentences in document order.
    """

    def __init__(self):
        """Load the sentence encoder and create the ensemble scorers."""
        # Use smaller model for Colab efficiency
        # NOTE(review): sentence_model / sentence_tokenizer are loaded here but
        # not referenced by any method of this class.
        try:
            self.sentence_model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
            self.sentence_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
            self.sentence_model.to(device)
        except Exception as e:
            print(f"Sentence transformer loading error: {e}")
            self.sentence_model = None
            self.sentence_tokenizer = None

        self.graph_models = {
            'textrank': TextRankSummarizer(),
            'bert_extractive': BERTExtractiveSummarizer()
        }

        self.ensemble_weights = {'textrank': 0.4, 'bert_extractive': 0.6}

    @staticmethod
    def _split_sentences(document: str) -> List[str]:
        """Split *document* into sentences.

        Uses ``sent_tokenize``; if that is unavailable or fails (for example
        because its data is missing) a punctuation-based regex split is used so
        callers still get sentences back.
        """
        try:
            return sent_tokenize(document)
        except Exception:
            import re
            parts = re.split(r'(?<=[.!?])\s+', document.strip())
            return [part.strip() for part in parts if part.strip()]

    def hierarchical_extraction(self, document: str, max_sentences: int = 30) -> List[str]:
        """Select up to *max_sentences* key sentences, in document order.

        Args:
            document: the text to extract from.
            max_sentences: upper bound on the number of distinct sentences kept.

        Returns:
            The selected sentences. When no scorer produces scores, or scoring
            raises, the first sentences of the document (at most 5, and at most
            *max_sentences*) are returned instead. An empty or whitespace-only
            document yields ``[]``.
        """
        if not document.strip():
            return []

        try:
            doc_scores = self.document_level_scoring(document)
            ensemble_scores = self.ensemble_scoring(document)

            # 60% ensemble score, 40% position score.
            combined_scores = {
                sentence: 0.6 * score + 0.4 * doc_scores.get(sentence, 0)
                for sentence, score in ensemble_scores.items()
            }

            if combined_scores:
                ranked = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:max_sentences]
                # A set keeps the document-order filter linear.
                chosen = {sentence for sentence, _ in ranked}
                return [s for s in self._split_sentences(document) if s in chosen]
        except Exception as e:
            print(f"Extraction error: {e}")

        # Fallback: return first few sentences
        return self._split_sentences(document)[:max(0, min(5, max_sentences))]

    def document_level_scoring(self, document: str) -> Dict[str, float]:
        """Score sentences by position with a triangular weight centred mid-document.

        Returns an empty dict for an empty document or on failure.
        """
        try:
            sentences = self._split_sentences(document)
            if not sentences:
                return {}

            half = len(sentences) / 2
            return {
                sentence: 1.0 - abs(i - half) / half
                for i, sentence in enumerate(sentences)
            }
        except Exception:
            return {}

    def ensemble_scoring(self, document: str) -> Dict[str, float]:
        """Return the weighted sum of each scorer's max-normalised scores.

        A scorer that raises is reported and skipped; a scorer that returns no
        scores contributes nothing.
        """
        scores = {}

        for name, model in self.graph_models.items():
            try:
                model_scores = model.score_sentences(document)
                if not model_scores:
                    continue

                # Normalise by the scorer's maximum; non-positive maxima are left as-is.
                max_score = max(model_scores.values())
                scale = max_score if max_score > 0 else 1.0
                weight = self.ensemble_weights[name]

                for sent, score in model_scores.items():
                    scores[sent] = scores.get(sent, 0) + weight * (score / scale)
            except Exception as e:
                print(f"Ensemble scoring error in {name}: {e}")
                continue

        return scores

# ==================== 3. ADVANCED ABSTRACTIVE GENERATOR ====================

class AdvancedAbstractiveGenerator:
    """Abstractive summarization optimized for Colab.

    Uses a BART summarization pipeline when it loads; a T5 model is loaded
    only as a fallback when BART loading fails.
    """

    def __init__(self):
        self.models = {}
        self.tokenizers = {}
        self._load_models()

    def _load_models(self):
        """Load BART, or T5 when BART cannot be loaded. Failures are printed."""
        try:
            # Use BART pipeline for reliable summarization
            print("Loading BART model...")
            self.models['bart-large-cnn'] = pipeline("summarization",
                                                   model="facebook/bart-large-cnn",
                                                   tokenizer="facebook/bart-large-cnn",
                                                   device=0 if torch.cuda.is_available() else -1)
            print("✅ BART model loaded successfully")

        except Exception as e:
            print(f"BART model loading error: {e}")
            # Fallback to T5
            try:
                print("Loading T5 model as fallback...")
                self.models['t5-small'] = T5ForConditionalGeneration.from_pretrained('t5-small')
                self.tokenizers['t5-small'] = T5Tokenizer.from_pretrained('t5-small')
                self.models['t5-small'].to(device)
                print("✅ T5 model loaded successfully")
            except Exception as e2:
                print(f"T5 model loading error: {e2}")

    def generate_ensemble_summary(self, text: str, target_length: int = 150) -> str:
        """Summarize *text*.

        Args:
            text: the text to summarize.
            target_length: upper bound passed to the model as ``max_length``.

        Returns:
            The summary, or a fixed message when the text is blank or no model
            produces a result.
        """
        if not text.strip():
            return "No content to summarize."

        # Use BART pipeline for reliable summarization
        if 'bart-large-cnn' in self.models:
            try:
                return self._summarize_with_bart(text, target_length)
            except Exception as e:
                print(f"BART summarization failed: {e}")

        # Fallback to T5
        if 't5-small' in self.models:
            try:
                return self._summarize_with_t5(text, target_length)
            except Exception as e:
                print(f"T5 summarization failed: {e}")

        return "Error: No working summarization models available. Please check your internet connection and try again."

    @staticmethod
    def _pack_sentences(sentences: List[str], limit: int = 1000) -> List[str]:
        """Greedily pack sentences into chunks of fewer than *limit* characters.

        A single sentence longer than *limit* becomes its own chunk. Empty
        chunks are never emitted.
        """
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            if len(current_chunk + sentence) < limit:
                current_chunk += " " + sentence
            else:
                if current_chunk.strip():
                    chunks.append(current_chunk.strip())
                current_chunk = sentence

        if current_chunk.strip():
            chunks.append(current_chunk.strip())
        return chunks

    def _run_bart(self, text: str, target_length: int) -> str:
        """Run the BART pipeline once on *text* and return the summary text."""
        # Never ask for a summary longer than the input (measured in words,
        # with a small floor); otherwise min_length forces the model to pad
        # short inputs with invented or repeated text.
        max_len = min(target_length, max(16, len(text.split())))
        outputs = self.models['bart-large-cnn'](
            text,
            max_length=max_len,
            min_length=max_len // 2,
            do_sample=False,
            truncation=True,  # a single over-long chunk must not overflow the model
        )
        return outputs[0]['summary_text']

    def _summarize_with_bart(self, text: str, target_length: int) -> str:
        """Summarize with BART, chunking texts longer than 1024 characters."""
        if len(text) <= 1024:
            # Direct summarization for shorter texts
            return self._run_bart(text, target_length)

        chunks = self._pack_sentences(sent_tokenize(text))

        # Only summarize chunks with meaningful content.
        summaries = [
            self._run_bart(chunk, target_length)
            for chunk in chunks
            if len(chunk) > 50
        ]
        if not summaries:
            return "Unable to generate summary from the provided text."

        combined_text = " ".join(summaries)
        # Condense again only when several chunk summaries together exceed the
        # target; word count is the proxy for the token budget.
        if len(summaries) > 1 and len(combined_text.split()) > target_length:
            return self._run_bart(combined_text, target_length)
        return combined_text

    def _summarize_with_t5(self, text: str, target_length: int) -> str:
        """Summarize with the T5 fallback model (input truncated to 512 tokens)."""
        tokenizer = self.tokenizers['t5-small']
        model = self.models['t5-small']

        input_text = f"summarize: {text}"
        inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=512, truncation=True).to(device)

        with torch.no_grad():
            summary_ids = model.generate(
                inputs,
                max_length=target_length,
                min_length=target_length//2,
                length_penalty=2.0,
                num_beams=4,
                early_stopping=True
            )

        return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# ==================== 4. HYBRID SUMMARIZATION SYSTEM ====================

class HybridSummarizationSystem:
    """Main hybrid summarization system.

    Pipeline: input processing -> extractive sentence selection -> abstractive
    generation -> explanations and quick metrics.
    """

    def __init__(self):
        """Create every pipeline component (this loads the underlying models)."""
        print("Initializing Hybrid Summarization System...")
        self.data_processor = MultimodalDataProcessor()
        self.extractive_summarizer = HierarchicalExtractiveSummarizer()
        self.abstractive_generator = AdvancedAbstractiveGenerator()
        self.evaluator = MultiMetricEvaluator()
        self.interpretability_module = ModelInterpretabilitySuite()
        print("✅ Hybrid Summarization System initialized successfully!")

    async def summarize(self, input_data: Union[str, "UploadFile"],
                       summary_type: str = "general",
                       length: str = "medium") -> Dict:
        """Summarize *input_data* end to end.

        Args:
            input_data: whatever ``data_processor.process_input`` accepts.
            summary_type: accepted for interface compatibility; not used here.
            length: ``"short"``, ``"medium"`` or ``"long"`` (100 / 150 / 200);
                any other value falls back to 150.

        Returns:
            A dict with ``summary``, ``extracted_sentences``, ``metrics``,
            ``explanations`` and ``processed_data``. When there is no text or a
            step raises, ``error`` is added and the other fields are empty.
            This method does not raise.
        """
        try:
            processed_data = await self.data_processor.process_input(input_data)
            # Tolerate processors that report a missing text as None.
            text = processed_data.get('text') or ''

            if not text.strip():
                return {
                    'summary': "No content found to summarize.",
                    'extracted_sentences': [],
                    'metrics': {},
                    'explanations': {},
                    'processed_data': processed_data.get('metadata', {}),
                    'error': 'No text content'
                }

            # Extract key sentences
            extracted_sentences = self.extractive_summarizer.hierarchical_extraction(text)
            extracted_text = " ".join(extracted_sentences)
            if not extracted_text.strip():
                # Extraction produced nothing: summarize the full text rather
                # than handing the generator an empty string.
                extracted_text = text

            # Determine target length
            length_map = {"short": 100, "medium": 150, "long": 200}
            target_length = length_map.get(length, 150)

            # Generate abstractive summary
            final_summary = self.abstractive_generator.generate_ensemble_summary(extracted_text, target_length)

            # Generate explanations and metrics
            explanations = self.interpretability_module.explain_summary(text, final_summary)
            metrics = self.evaluator.quick_evaluate(final_summary, text)

            return {
                'summary': final_summary,
                'extracted_sentences': extracted_sentences,
                'metrics': metrics,
                'explanations': explanations,
                'processed_data': processed_data.get('metadata', {})
            }

        except Exception as e:
            # Same shape as the success result so callers can index it uniformly.
            return {
                'summary': f"Error during summarization: {str(e)}",
                'extracted_sentences': [],
                'metrics': {},
                'explanations': {},
                'processed_data': {},
                'error': str(e)
            }

# ==================== 5. EVALUATION FRAMEWORK ====================

class MultiMetricEvaluator:
    """Score a generated summary with ROUGE, readability and simple statistics.

    Every metric is best-effort: if a scorer raises, that metric falls back
    to a neutral default instead of failing the whole evaluation.
    """

    def __init__(self):
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def comprehensive_evaluation(self, generated_summary: str, reference_summary: str,
                               source_text: str = "") -> Dict:
        """Evaluate a summary against a reference summary.

        Args:
            generated_summary: The summary to score.
            reference_summary: The reference text for the ROUGE comparison.
            source_text: Original document; only used for the compression
                ratio. With the default "" the ratio divides by 1.

        Returns:
            Dict with three sections: 'rouge' (rouge1/rouge2/rougeL
            F-measures), 'readability' (Flesch reading ease and
            Flesch-Kincaid grade) and 'statistics' (word count,
            character-based compression ratio, unique word count).
            A section whose scorer fails is filled with zeros.
        """
        results = {}

        try:
            rouge_scores = self.rouge_scorer.score(reference_summary, generated_summary)
            results['rouge'] = {
                'rouge1': rouge_scores['rouge1'].fmeasure,
                'rouge2': rouge_scores['rouge2'].fmeasure,
                'rougeL': rouge_scores['rougeL'].fmeasure
            }
        except Exception:
            # Catch Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
            results['rouge'] = {'rouge1': 0, 'rouge2': 0, 'rougeL': 0}

        try:
            results['readability'] = {
                'flesch_reading_ease': textstat.flesch_reading_ease(generated_summary),
                'flesch_kincaid_grade': textstat.flesch_kincaid_grade(generated_summary),
            }
        except Exception:
            results['readability'] = {'flesch_reading_ease': 0, 'flesch_kincaid_grade': 0}

        summary_words = generated_summary.split()
        results['statistics'] = {
            'summary_length': len(summary_words),
            # Ratio of character counts; max(..., 1) avoids dividing by zero on empty source.
            'compression_ratio': len(generated_summary) / max(len(source_text), 1),
            'unique_words': len(set(summary_words))
        }

        return results

    def quick_evaluate(self, generated_summary: str, source_text: str) -> Dict:
        """Reference-free evaluation of a summary against its source.

        Returns:
            Dict with 'readability' (Flesch reading ease, 0 on failure),
            'summary_length' (whitespace-separated word count),
            'compression_ratio' (summary characters / source characters) and
            'coherence_score' (see ``_estimate_coherence``).
        """
        try:
            readability = textstat.flesch_reading_ease(generated_summary)
        except Exception:
            readability = 0

        return {
            'readability': readability,
            'summary_length': len(generated_summary.split()),
            'compression_ratio': len(generated_summary) / max(len(source_text), 1),
            'coherence_score': self._estimate_coherence(generated_summary)
        }

    def _estimate_coherence(self, text: str) -> float:
        """Estimate coherence as the mean TF-IDF cosine similarity of adjacent sentences.

        Returns 1.0 for text with at most one sentence and 0.5 when the
        computation fails for any reason.
        """
        try:
            sentences = sent_tokenize(text)
            if len(sentences) <= 1:
                return 1.0

            vectors = TfidfVectorizer().fit_transform(sentences).toarray()

            # Row slices keep each vector 2-D, as cosine_similarity expects.
            similarities = [
                cosine_similarity(vectors[i:i + 1], vectors[i + 1:i + 2])[0][0]
                for i in range(len(sentences) - 1)
            ]

            # Convert the NumPy scalar so callers get the annotated plain float.
            return float(np.mean(similarities)) if similarities else 0.0
        except Exception:
            return 0.5

# ==================== 6. INTERPRETABILITY SUITE ====================

class ModelInterpretabilitySuite:
    """Explain a generated summary relative to its source text.

    Produces per-sentence importance scores, key phrases, an overall
    semantic-alignment score and word-coverage ratios. If the sentence
    embedding model cannot be loaded, similarity scores fall back to a
    fixed 0.5 so the rest of the report still works.
    """

    _EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

    def __init__(self):
        try:
            self.sentence_model = AutoModel.from_pretrained(self._EMBEDDING_MODEL_NAME)
            self.sentence_tokenizer = AutoTokenizer.from_pretrained(self._EMBEDDING_MODEL_NAME)
            self.sentence_model.to(device)
        except Exception as e:
            # Degrade gracefully, but say why instead of hiding the failure.
            print(f"Sentence embedding model unavailable, using fallback similarity: {e}")
            self.sentence_model = None
            self.sentence_tokenizer = None

    def explain_summary(self, source_text: str, generated_summary: str) -> Dict:
        """Build the full explanation report for one summary.

        Returns:
            Dict with 'sentence_importance', 'key_phrases',
            'semantic_alignment' and 'coverage'. If assembling the report
            raises, a neutral default report is returned instead.
        """
        explanations = {}

        try:
            explanations['sentence_importance'] = self._analyze_sentence_importance(source_text, generated_summary)
            explanations['key_phrases'] = self._extract_key_phrases(source_text)
            explanations['semantic_alignment'] = self._calculate_semantic_alignment(source_text, generated_summary)
            explanations['coverage'] = self._analyze_coverage(source_text, generated_summary)
        except Exception as e:
            print(f"Interpretability error: {e}")
            explanations = {
                'sentence_importance': {},
                'key_phrases': [],
                'semantic_alignment': 0.5,
                'coverage': {'word_coverage': 0, 'unique_coverage_ratio': 0}
            }

        return explanations

    def _analyze_sentence_importance(self, source_text: str, summary: str) -> Dict:
        """Score each source sentence by its best similarity to any summary sentence.

        Returns:
            Dict keyed "Sentence_1", "Sentence_2", ... where each value holds
            the sentence text (cut to 100 characters plus "..." when longer)
            and its 'importance_score'. Empty dict when either text has no
            sentences or on failure.
        """
        try:
            source_sentences = sent_tokenize(source_text)
            summary_sentences = sent_tokenize(summary)

            if not source_sentences or not summary_sentences:
                return {}

            importance_scores = {}
            for i, source_sent in enumerate(source_sentences, start=1):
                max_similarity = 0
                for summary_sent in summary_sentences:
                    similarity = self._calculate_sentence_similarity(source_sent, summary_sent)
                    max_similarity = max(max_similarity, similarity)

                preview = (source_sent[:100] + "...") if len(source_sent) > 100 else source_sent
                importance_scores[f"Sentence_{i}"] = {
                    'text': preview,
                    'importance_score': max_similarity
                }

            return importance_scores
        except Exception:
            return {}

    def _calculate_sentence_similarity(self, sent1: str, sent2: str) -> float:
        """Cosine similarity of mean-pooled embeddings of two texts, floored at 0.

        Each input is truncated to 128 tokens. Returns 0.5 when no embedding
        model is loaded and 0.0 when the computation fails.
        """
        if self.sentence_model is None:
            return 0.5  # Default similarity

        try:
            inputs1 = self.sentence_tokenizer(sent1, return_tensors="pt", padding=True, truncation=True, max_length=128)
            inputs2 = self.sentence_tokenizer(sent2, return_tensors="pt", padding=True, truncation=True, max_length=128)
            inputs1 = {k: v.to(device) for k, v in inputs1.items()}
            inputs2 = {k: v.to(device) for k, v in inputs2.items()}

            with torch.no_grad():
                outputs1 = self.sentence_model(**inputs1)
                outputs2 = self.sentence_model(**inputs2)
                emb1 = outputs1.last_hidden_state.mean(dim=1)
                emb2 = outputs2.last_hidden_state.mean(dim=1)
                similarity = F.cosine_similarity(emb1, emb2).item()
                # Floor negatives at 0.0 and keep the result a float.
                return max(0.0, similarity)
        except Exception:
            return 0.0

    def _extract_key_phrases(self, text: str, top_k: int = 10) -> List[str]:
        """Return up to ``top_k`` highest-scoring unigrams/bigrams from ``text``.

        Scores come from a TF-IDF vectorizer fitted on this single text with
        English stop words removed. Returns [] when ``top_k`` is not
        positive or on failure.
        """
        if top_k <= 0:
            # scores.argsort()[-0:] would select every phrase, not none.
            return []

        try:
            vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words='english', max_features=1000)
            tfidf_matrix = vectorizer.fit_transform([text])
            feature_names = vectorizer.get_feature_names_out()
            scores = tfidf_matrix.toarray().flatten()
            top_indices = scores.argsort()[-top_k:][::-1]
            return [feature_names[i] for i in top_indices if scores[i] > 0]
        except Exception:
            return []

    def _calculate_semantic_alignment(self, source: str, summary: str) -> float:
        """Similarity of the whole source to the whole summary (same scale as sentence similarity)."""
        return self._calculate_sentence_similarity(source, summary)

    def _analyze_coverage(self, source: str, summary: str) -> Dict:
        """Measure lowercase token overlap between source and summary.

        Returns:
            'word_coverage': shared distinct tokens / distinct source tokens.
            'unique_coverage_ratio': shared distinct tokens / distinct
            summary tokens. Both are 0 for empty inputs or on failure.
        """
        try:
            source_words = set(word_tokenize(source.lower()))
            summary_words = set(word_tokenize(summary.lower()))
            common_words = source_words & summary_words

            return {
                'word_coverage': len(common_words) / len(source_words) if source_words else 0,
                'unique_coverage_ratio': len(common_words) / len(summary_words) if summary_words else 0
            }
        except Exception:
            return {'word_coverage': 0, 'unique_coverage_ratio': 0}

# ==================== 7. STREAMLIT DASHBOARD FOR COLAB ====================

def _render_summary_result(result):
    """Draw the summary text, the four metric tiles and the analysis expander."""
    st.header("📊 Summary Results")

    st.subheader("Generated Summary")
    st.write(result['summary'])

    st.subheader("Quality Metrics")
    columns = st.columns(4)
    metrics = result['metrics']
    # (tile label, metrics key, display template), in left-to-right order.
    tiles = (
        ("Readability", 'readability', "{:.1f}"),
        ("Length", 'summary_length', "{} words"),
        ("Compression", 'compression_ratio', "{:.1%}"),
        ("Coherence", 'coherence_score', "{:.2f}"),
    )
    for column, (label, key, template) in zip(columns, tiles):
        with column:
            st.metric(label, template.format(metrics.get(key, 0)))

    with st.expander("🔍 Detailed Analysis"):
        if 'explanations' not in result:
            return
        explanations = result['explanations']

        if explanations.get('key_phrases'):
            st.write("**Key Phrases:**", ", ".join(explanations['key_phrases'][:5]))
        else:
            st.write("**Key Phrases:** Not available")

        if 'coverage' in explanations:
            coverage = explanations['coverage']
            st.write(f"**Content Coverage:** {coverage.get('word_coverage', 0):.1%}")

        if 'semantic_alignment' in explanations:
            st.write(f"**Semantic Alignment:** {explanations['semantic_alignment']:.2f}")


def create_colab_dashboard():
    """Render the Streamlit page: configuration sidebar, input widgets and results.

    The summarization system is created once and kept in
    ``st.session_state``; if creating it fails, the error is shown and the
    page stops rendering. Errors raised while summarizing are reported on
    the page rather than propagated.
    """
    st.set_page_config(
        page_title="HybridSumm-Pro Colab",
        page_icon="🧠",
        layout="wide"
    )

    st.title("🧠 HybridSumm-Pro - Colab Edition")
    st.markdown("Advanced AI Text Summarization System - Optimized for Google Colab")

    # Build the (expensive) system only on the first run of the session.
    if 'system' not in st.session_state:
        with st.spinner("Loading AI models... This may take a minute."):
            try:
                st.session_state.system = HybridSummarizationSystem()
                st.success("✅ Models loaded successfully!")
            except Exception as e:
                st.error(f"❌ Error loading models: {e}")
                return

    # Sidebar options
    st.sidebar.title("⚙️ Configuration")
    summary_type = st.sidebar.selectbox("Summary Type", ["general", "technical", "concise"])
    length = st.sidebar.selectbox("Length", ["short", "medium", "long"])

    # Input area
    st.header("📥 Input Your Content")
    input_method = st.radio("Choose input method:", ["Text Input", "File Upload"], horizontal=True)

    if input_method == "Text Input":
        typed_text = st.text_area("Enter text:", height=200,
                                placeholder="Paste your text here...\n\nExample: 'Artificial intelligence is transforming how we process information...'")
        # Whitespace-only input counts as no input.
        input_content = typed_text if typed_text.strip() else None
    else:
        input_content = st.file_uploader("Upload file:", type=['txt', 'pdf'])

    # Nothing more to draw until the button is pressed.
    if not st.button("🚀 Generate Summary", type="primary", use_container_width=True):
        return

    if not input_content:
        st.warning("Please provide some text or upload a file.")
        return

    with st.spinner("🤖 Analyzing content and generating summary... This may take a few seconds."):
        try:
            import asyncio
            result = asyncio.run(st.session_state.system.summarize(input_content, summary_type, length))
            _render_summary_result(result)
        except Exception as e:
            st.error(f"Error during summarization: {str(e)}")
            st.info("💡 Try using shorter text or a different input method.")

# ==================== 8. MAIN EXECUTION ====================

def main():
    """Entry point: build and render the dashboard."""
    # The Streamlit dashboard is the default front end when running in Colab.
    create_colab_dashboard()

if __name__ == "__main__":
    main()

# Launch instructions; these print unconditionally whenever this module's
# top-level code runs, after the entry point above.
print("\n".join([
    "✅ HybridSumm-Pro Colab Edition Ready!",
    "🎯 To launch the dashboard in Colab, run this command in a new cell:",
    "   !streamlit run /content/hybrid_summarization_system_colab.py --server.port 8501 --server.address 0.0.0.0",
    "🔗 Then click on the public URL that appears!",
]))

🚀 Initializing HybridSumm-Pro for Google Colab...
❌ SpeechRecognition not found, installing...
Collecting SpeechRecognition
  Downloading speechrecognition-3.14.3-py3-none-any.whl.metadata (30 kB)
Downloading speechrecognition-3.14.3-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.14.3
❌ Streamlit not found, installing...
Collecting streamlit
  Downloading streamlit-1.51.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.51.0-py3-none-any.whl (10.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m46.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

2025-11-16 16:46:36.789 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-11-16 16:46:36.817 Session state does not function when running a script without `streamlit run`


Initializing Hybrid Summarization System...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

✅ CLIP model loaded successfully


config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading BART model...


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cpu


✅ BART model loaded successfully




✅ Hybrid Summarization System initialized successfully!
✅ HybridSumm-Pro Colab Edition Ready!
🎯 To launch the dashboard in Colab, run this command in a new cell:
   !streamlit run /content/hybrid_summarization_system_colab.py --server.port 8501 --server.address 0.0.0.0
🔗 Then click on the public URL that appears!


In [None]:
!pip install celery

Collecting celery
  Downloading celery-5.5.3-py3-none-any.whl.metadata (22 kB)
Collecting billiard<5.0,>=4.2.1 (from celery)
  Downloading billiard-4.2.2-py3-none-any.whl.metadata (4.8 kB)
Collecting kombu<5.6,>=5.5.2 (from celery)
  Downloading kombu-5.5.4-py3-none-any.whl.metadata (3.5 kB)
Collecting vine<6.0,>=5.1.0 (from celery)
  Downloading vine-5.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting click-didyoumean>=0.3.0 (from celery)
  Downloading click_didyoumean-0.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting click-repl>=0.2.0 (from celery)
  Downloading click_repl-0.3.0-py3-none-any.whl.metadata (3.6 kB)
Collecting click-plugins>=1.1.1 (from celery)
  Downloading click_plugins-1.1.1.2-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting amqp<6.0.0,>=5.1.1 (from kombu<5.6,>=5.5.2->celery)
  Downloading amqp-5.3.1-py3-none-any.whl.metadata (8.9 kB)
Downloading celery-5.5.3-py3-none-any.whl (438 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m438.8/438.8 kB[0m

In [None]:
!pip install lime

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━[0m [32m174.1/275.7 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=ebd10fcfc4ca80cec330713f118c59cb588df9a251edcf81990202bde0960ded
  Stored in directory: /root/.cache/pip/wheels/e7/5d/0e/4b4fff9a47468fed5633211fb3b76d1db43fe806a17fb7486a
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [None]:
# SIMPLIFIED SUMMARIZATION SYSTEM
import asyncio
from google.colab import files
import nest_asyncio

nest_asyncio.apply()

class SimpleSummarizer:
    """Rule-based summarizer that needs no models.

    Very short inputs are expanded into a descriptive sentence, short
    phrases are wrapped in a fixed template, and longer texts are reduced to
    their first two sentences (or first 15 words when there is only one
    sentence).
    """

    _GREETING = "A friendly greeting exchanged between people."

    # Canned descriptions for known one-word inputs (matched case-insensitively).
    _KNOWN_TERMS = {
        'hello': _GREETING,
        'hi': _GREETING,
        'hey': _GREETING,
        'ai': "Artificial Intelligence refers to computer systems designed to perform tasks that typically require human intelligence.",
        'cat': "A small domesticated carnivorous mammal with soft fur, often kept as a pet.",
    }

    _SENTENCE_ENDINGS = ('.', '!', '?')

    async def summarize(self, text):
        """Summarize ``text`` with simple length-based rules.

        Args:
            text: The text to summarize. Leading/trailing whitespace (such as
                the trailing newline of an uploaded file) is ignored when
                choosing and building the summary.

        Returns:
            Dict with 'original' (the input exactly as given), 'summary',
            and 'is_different' (whether the stripped input and the stripped
            summary differ).
        """
        cleaned = text.strip()
        words = cleaned.split()

        if not words:
            summary = "No content found to summarize."
        elif len(words) <= 2:
            # Very short input: expand it instead of shortening it.
            summary = self._KNOWN_TERMS.get(
                cleaned.lower(),
                f"This appears to be referring to '{cleaned}'. The term represents a concept, object, or idea that can be explored in greater detail.",
            )
        elif len(words) <= 10:
            # Drop a trailing period so the template does not produce "..".
            topic = cleaned.rstrip('.')
            summary = f"The text discusses: {topic}. This content addresses a specific topic or statement."
        else:
            sentences = cleaned.split('. ')
            if len(sentences) > 1:
                summary = '. '.join(sentences[:2])
                # The split removed the period of every sentence but the last;
                # only add one back when the kept text does not already end a sentence.
                if not summary.endswith(self._SENTENCE_ENDINGS):
                    summary += '.'
            else:
                # One long sentence: keep its first 15 words.
                summary = ' '.join(words[:15]) + '... [condensed]'

        return {
            'original': text,
            'summary': summary,
            'is_different': cleaned != summary.strip()
        }

# Interactive cell: prompt for file uploads, then print each file's original
# text next to its summary.
print("📁 UPLOAD FILES FOR SUMMARIZATION")
print("=" * 50)
uploaded = files.upload()

if uploaded:
    summarizer = SimpleSummarizer()

    for filename, raw_bytes in uploaded.items():
        print(f"\n🔄 Processing: {filename}")
        print("—" * 40)

        # Read the file content. 'utf-8-sig' also accepts plain UTF-8 and
        # drops a leading byte-order mark if one is present.
        try:
            original_text = raw_bytes.decode('utf-8-sig')
        except UnicodeDecodeError as exc:
            # A binary or differently-encoded file should not abort the
            # remaining uploads.
            print(f"❌ Skipping {filename}: not valid UTF-8 text ({exc.reason})")
            continue

        # Summarize the content
        result = asyncio.run(summarizer.summarize(original_text))

        # Display ORIGINAL TEXT
        print("📄 ORIGINAL TEXT:")
        print("=" * 40)
        print(result['original'])
        print("=" * 40)

        # Display SUMMARIZED TEXT
        print("\n📝 SUMMARIZED TEXT:")
        print("=" * 40)
        print(result['summary'])
        print("=" * 40)

        # Report whether the summary differs from the input
        print(f"\n✅ Texts are different: {result['is_different']}")

else:
    print("❌ No files uploaded!")

📁 UPLOAD FILES FOR SUMMARIZATION


Saving 1st.txt to 1st (6).txt

🔄 Processing: 1st (6).txt
————————————————————————————————————————
📄 ORIGINAL TEXT:
Food is any substance, typically plant, animal, or fungal in origin, that provides nutritional support to an organism to maintain life and growth. It contains essential nutrients like carbohydrates, fats, and proteins, which the body uses for energy, and vitamins and minerals for growth and repair. Food is fundamental to all living organisms, from humans and animals to plants, and is processed through the body via digestion to provide energy and sustain vital processes. 

📝 SUMMARIZED TEXT:
Food is any substance, typically plant, animal, or fungal in origin, that provides nutritional support to an organism to maintain life and growth. It contains essential nutrients like carbohydrates, fats, and proteins, which the body uses for energy, and vitamins and minerals for growth and repair.

✅ Texts are different: True
