In [29]:
import warnings
warnings.filterwarnings("ignore")

In [30]:
!pip install gensim --upgrade



In [31]:
pip install transformers sentencepiece torch



In [32]:
from typing import List
import re
import math
import argparse
import heapq
import json
import sys

try:
    from transformers import pipeline, AutoTokenizer
    import torch
except Exception:
    pipeline = None
    AutoTokenizer = None
    torch = None
    _TRANSFORMERS_IMPORT_ERROR = None
STOPWORDS = {
    "a", "an", "the", "and", "or", "but", "if", "while", "with", "without",
    "to", "from", "by", "for", "of", "on", "in", "at", "is", "are", "was", "were",
    "this", "that", "these", "those", "it", "its", "be", "as", "which", "not",
}

_SENT_SPLIT_RE = re.compile(r'(?<=[.!?])\s+')

def split_sentences(text: str) -> List[str]:
    sents = [s.strip() for s in _SENT_SPLIT_RE.split(text) if s.strip()]
    return sents or [text.strip()]

def word_tokens(text: str) -> List[str]:
    return [w.lower() for w in re.findall(r"\w+", text) if w.lower() not in STOPWORDS]

def extractive_reduce(text: str, ratio: float = 0.3, min_sentences: int = 1, max_sentences: int = 8) -> str:

    sentences = split_sentences(text)
    if len(sentences) <= 1:
        return text

    freq = {}
    for sent in sentences:
        for w in word_tokens(sent):
            freq[w] = freq.get(w, 0) + 1

    scores = []
    for i, sent in enumerate(sentences):
        s = sum(freq.get(w, 0) for w in word_tokens(sent))
        scores.append((s, i, sent))

    keep = max(min_sentences, min(max_sentences, math.ceil(len(sentences) * ratio)))
    top = heapq.nlargest(keep, scores, key=lambda x: (x[0], -x[1]))
    top_sorted = sorted(top, key=lambda x: x[1])
    reduced = " ".join([s for (_score, _i, s) in top_sorted])
    return reduced

def make_abstractive_pipeline(model_name: str = "t5-small"):
    if pipeline is None:
        raise RuntimeError("transformers/torch not installed. Install: pip install transformers sentencepiece torch")
    device = 0 if torch and torch.cuda.is_available() else -1
    return pipeline("summarization", model=model_name, tokenizer=model_name, device=device)

def trim_for_model(text: str, model_name: str, fraction_of_model_max: float = 0.9) -> str:
    if AutoTokenizer is None:
        return text

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model_max = getattr(tokenizer, "model_max_length", 512) or 512
    if model_max > 16384:
        model_max = 1024
    budget = max(64, int(model_max * fraction_of_model_max))

    sentences = split_sentences(text)
    if not sentences:
        return text

    def token_count(s: str) -> int:
        ids = tokenizer.encode(s, add_special_tokens=False, truncation=False)
        return len(ids)

    joined = " ".join(sentences)
    if token_count(joined) <= budget:
        return joined


    left = 0
    right = len(sentences) - 1
    while left <= right:
        candidate = sentences[:left + 1] + sentences[right:]
        if token_count(" ".join(candidate)) <= budget:
            return " ".join(candidate)

        right -= 1
        if right < left:
            break


    first = sentences[0]
    ids = tokenizer.encode(first, add_special_tokens=False)
    ids = ids[:max(1, budget)]
    return tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

def abstractive_summarize_text(text: str, model_name: str = "t5-small",
                               max_length: int = 120, min_length: int = 20,
                               use_extractive_reduced: bool = True) -> str:
    if pipeline is None:
        raise RuntimeError("transformers not installed. Install: pip install transformers sentencepiece torch")


    if use_extractive_reduced:
        reduced = extractive_reduce(text, ratio=0.25, min_sentences=1, max_sentences=8)
    else:
        reduced = text


    trimmed = trim_for_model(reduced, model_name)
    summarizer = make_abstractive_pipeline(model_name)
    out = summarizer(trimmed, max_length=max_length, min_length=min_length, do_sample=False)
    if isinstance(out, list) and out:
        return out[0].get("summary_text", "").strip()
    return str(out)

def read_input_text(args) -> str:
    if args.text:
        return args.text
    if args.input_file:
        with open(args.input_file, "r", encoding="utf-8") as f:
            return f.read()
    if not sys.stdin.isatty():
        return sys.stdin.read()
    return ""

def main(args):
    text = read_input_text(args)
    if not text or not text.strip():
        print("No input text provided. Use --text, --input-file, or pipe text via stdin.", file=sys.stderr)
        sys.exit(2)

    extractive = None
    abstractive = None

    if not args.no_extractive:
        extractive = extractive_reduce(text, ratio=args.ratio)
    if not args.no_abstractive:
        try:

            if args.no_gpu and torch is not None:

                import os
                os.environ["CUDA_VISIBLE_DEVICES"] = ""
            abstractive = abstractive_summarize_text(text, model_name=args.model,
                                                     max_length=args.max_len, min_length=args.min_len,
                                                     use_extractive_reduced=not args.use_full_for_abstractive)
        except Exception as e:
            abstractive = None
            print(f"[warning] abstractive step failed: {e}", file=sys.stderr)
    if extractive is not None:
        print("=== Extractive Summary ===")
        print(extractive)
        print()

    if abstractive is not None:
        print("=== Abstractive (Generative) Summary ===")
        print(abstractive)
        print()
    if args.json_out:
        out = {"extractive": extractive, "abstractive": abstractive}
        with open(args.json_out, "w", encoding="utf-8") as f:
            json.dump(out, f, ensure_ascii=False, indent=2)

if __name__ == "__main__":
    pass

In [33]:
input_text = input("Please enter the text you want to summarize: ")
main(argparse.Namespace(text=input_text, input_file=None, model="t5-base", max_len=120, min_len=20, ratio=0.5, no_extractive=False, no_abstractive=False, use_full_for_abstractive=False, json_out=None, no_gpu=False))

Please enter the text you want to summarize: This product is a complete sham. The wireless charging has zero reliability. Don't purchase it just because of the Spigen brand attached to it. Had thoughtvthat this would be a novel charging solution, but it fails on all fronts - desk charging, wireless lortable charging and enen the 2 in 1 charging. Never buying Spigen products again.A not much space taking and elegant product, yet wireless charging not that successful as it heats up devices very soon. Prices could be a little less. fast charging is also a little slow while using a Samsung phone and comparing the same with a Samsung power bank.Wireless charging not working On off switch not able use easily Overall waste of money, Don’t buy this product.Pros:- 1. Works as a Multiple Wireless Charger, Charging Iphone, Airpods & Watch all at once while plugged in with 20W Apple adapter. 2. Works as a Powerbank to Fastcharge with both wired Type A (18W) & Type C (20W) Capabilities.Cons:- 1. Po

Device set to use cpu
Both `max_new_tokens` (=256) and `max_length`(=120) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


=== Extractive Summary ===
Had thoughtvthat this would be a novel charging solution, but it fails on all fronts - desk charging, wireless lortable charging and enen the 2 in 1 charging. Never buying Spigen products again.A not much space taking and elegant product, yet wireless charging not that successful as it heats up devices very soon. fast charging is also a little slow while using a Samsung phone and comparing the same with a Samsung power bank.Wireless charging not working On off switch not able use easily Overall waste of money, Don’t buy this product.Pros:- 1. Works as a Multiple Wireless Charger, Charging Iphone, Airpods & Watch all at once while plugged in with 20W Apple adapter. Works as a Powerbank to Fastcharge with both wired Type A (18W) & Type C (20W) Capabilities.Cons:- 1. The power bank should have a strong magnet or magsafe compatible to keep it securely in place on the charging pad, so it doesn't move easily.

=== Abstractive (Generative) Summary ===
not much space

checking for halucination

In [34]:
input_text = "The quick brown fox jumps over the lazy dog. This is a test sentence to check for hallucination."
abstractive_summary = abstractive_summarize_text(input_text)
extractive_summary = extractive_reduce(input_text)

print("Original Text:")
print(input_text)
print("\nAbstractive Summary:")
print(abstractive_summary)
print("\nExtractive Summary:")
print(extractive_summary)

Device set to use cpu
Your max_length is set to 120, but your input_length is only 15. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=7)
Both `max_new_tokens` (=256) and `max_length`(=120) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Original Text:
The quick brown fox jumps over the lazy dog. This is a test sentence to check for hallucination.

Abstractive Summary:
the quick brown fox jumps over the lazy dog . the quick fox is a fox that can jump over the fox .

Extractive Summary:
The quick brown fox jumps over the lazy dog.


if halucinating:
the quick fox is a fox that can jump over the fox

Compare with different models
Fine-tune the model

using some of the metrics here in transformers.
we have options such as T5-small , t5-base ,bert in my model I am using t5-base and t5-small

In [35]:
!pip install rouge_score



In [36]:
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(abstractive_summary, input_text)

print("\nROUGE Scores (Abstractive Summary vs. Original Text):")
for key, value in scores.items():
    print(f"{key}: {value}")

scores_extractive = scorer.score(extractive_summary, input_text)
print("\nROUGE Scores (Extractive Summary vs. Original Text):")
for key, value in scores_extractive.items():
    print(f"{key}: {value}")


ROUGE Scores (Abstractive Summary vs. Original Text):
rouge1: Score(precision=0.6111111111111112, recall=0.5238095238095238, fmeasure=0.5641025641025642)
rouge2: Score(precision=0.5294117647058824, recall=0.45, fmeasure=0.48648648648648646)
rougeL: Score(precision=0.6111111111111112, recall=0.5238095238095238, fmeasure=0.5641025641025642)

ROUGE Scores (Extractive Summary vs. Original Text):
rouge1: Score(precision=0.5, recall=1.0, fmeasure=0.6666666666666666)
rouge2: Score(precision=0.47058823529411764, recall=1.0, fmeasure=0.6399999999999999)
rougeL: Score(precision=0.5, recall=1.0, fmeasure=0.6666666666666666)


In [37]:
import nltk
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize

nltk.download('punkt_tab')

reference = [word_tokenize(input_text)]
hypothesis = word_tokenize(abstractive_summary)

bleu_score_abstractive = sentence_bleu(reference, hypothesis)

print("\nBLEU Score (Abstractive Summary vs. Original Text):")
print(bleu_score_abstractive)

hypothesis_extractive = word_tokenize(extractive_summary)

bleu_score_extractive = sentence_bleu([word_tokenize(input_text)], hypothesis_extractive)

print("\nBLEU Score (Extractive Summary vs. Original Text):")
print(bleu_score_extractive)


BLEU Score (Abstractive Summary vs. Original Text):
0.38222431380970806

BLEU Score (Extractive Summary vs. Original Text):
0.36787944117144233


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
