In [None]:
pip install sumy rouge-score nltk glob

In [None]:
# Imports and global variables

from sumy.parsers.plaintext import PlaintextParser
from sumy.parsers.html import HtmlParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

# Initialize language for stemmer and the number of most important sentences returned
# LANGUAGE is passed to Tokenizer, Stemmer and get_stop_words by every summarizer in this notebook.
LANGUAGE = "english"
# SENTENCES_COUNT is the number of sentences requested from each summarizer call.
SENTENCES_COUNT = 3

# **IMPORTANT Note**: Since we are using the Sumy package instead of the original PyTLDR, there is no direct counterpart for the Relevance sentence scoring algorithm. We use the closest equivalent, LexRank, which also uses cosine similarity.

### Task 1: Three summarization algorithms

In [3]:
# Small hand-written document used to compare the three algorithms
example_document = """
Natural language processing and text mining is a course in Master's degree program in the Univeristy of Oulu.
Video games are good for passing time efficiently.
University of Oulu has multiple restaurants which are cheap and offer healthy food.
Multiple different courses require experience on programming languages.
Oulu is known for many technological companies such as Oura and Fingersoft.
"""
parser = PlaintextParser.from_string(example_document, Tokenizer(LANGUAGE))

print("--- Top 3 Most Important Sentences by Each Algorithm ---")

# One summarizer per algorithm; each one stems words and ignores stop words
textrank_summarizer = TextRankSummarizer(Stemmer(LANGUAGE))
lsa_summarizer = LsaSummarizer(Stemmer(LANGUAGE))
lexrank_summarizer = LexRankSummarizer(Stemmer(LANGUAGE))

demo_runs = (
    ("\nTextRank Summary:", textrank_summarizer),
    ("\nLSA Summary:", lsa_summarizer),
    ("\nLexRank Summary:", lexrank_summarizer),
)

# Print the heading of each algorithm followed by its top sentences, one per line
for heading, summarizer in demo_runs:
    summarizer.stop_words = get_stop_words(LANGUAGE)
    print(heading)
    for sentence in summarizer(parser.document, SENTENCES_COUNT):
        print(sentence)

--- Top 3 Most Important Sentences by Each Algorithm ---

TextRank Summary:
Natural language processing and text mining is a course in Master's degree program in the Univeristy of Oulu.
Video games are good for passing time efficiently.
University of Oulu has multiple restaurants which are cheap and offer healthy food.

LSA Summary:
Natural language processing and text mining is a course in Master's degree program in the Univeristy of Oulu.
University of Oulu has multiple restaurants which are cheap and offer healthy food.
Multiple different courses require experience on programming languages.

LexRank Summary:
Natural language processing and text mining is a course in Master's degree program in the Univeristy of Oulu.
Video games are good for passing time efficiently.
University of Oulu has multiple restaurants which are cheap and offer healthy food.


### Task 2: Text summarizer GUI

In [4]:
# The summarizer GUI will open in a new window and this cell will run as long as the window is open.

import tkinter as tk
from tkinter import filedialog, scrolledtext, messagebox

# Helper function for text summarization
def summarize_text(source, is_url=False):
    """Summarize a web page or a plain-text file with TextRank, LSA and LexRank.

    Args:
        source: URL of the page when ``is_url`` is true, otherwise the path of
            a plain-text file.
        is_url: When true the source is loaded with ``HtmlParser.from_url``,
            otherwise with ``PlaintextParser.from_file``.

    Returns:
        A dict with the keys "TextRank", "LSA" and "LexRank". Each value is the
        top ``SENTENCES_COUNT`` sentences of that algorithm separated by blank
        lines. ``None`` is returned when loading or summarizing fails; in that
        case an error dialog describing the failure has already been shown.
    """
    try:
        if is_url:
            parser = HtmlParser.from_url(source, Tokenizer(LANGUAGE))
        else:
            parser = PlaintextParser.from_file(source, Tokenizer(LANGUAGE))

        # Every algorithm gets the same configuration: stemming and stop word removal
        algorithms = (
            ("TextRank", TextRankSummarizer),
            ("LSA", LsaSummarizer),
            ("LexRank", LexRankSummarizer),
        )
        summaries = {}
        for name, summarizer_class in algorithms:
            summarizer = summarizer_class(Stemmer(LANGUAGE))
            summarizer.stop_words = get_stop_words(LANGUAGE)
            sentences = summarizer(parser.document, SENTENCES_COUNT)
            summaries[name] = "\n\n".join(str(s) for s in sentences)

        return summaries

    except Exception as e:
        # The caller has already verified that something was entered, so report
        # what actually went wrong (unreachable URL, missing file, bad encoding, ...)
        # instead of asking the user to enter a source again.
        messagebox.showerror("Summarization Error", f"Could not summarize '{source}':\n{e}")
        print("Error: ", e)
        return None

# Helper function for browsing a file
def browse_file():
    """Let the user pick a text file and write its path into the source entry.

    Cancelling the dialog leaves the current content of the entry untouched.
    """
    filename = filedialog.askopenfilename(filetypes=[("Text files", "*.txt")])
    # The dialog yields an empty value when it is cancelled; overwriting the
    # entry in that case would throw away a URL or path the user already typed.
    if not filename:
        return
    entry_source.delete(0, tk.END)
    entry_source.insert(0, filename)

# Helper function for running summary
def run_summary():
    """Summarize the source typed into the entry and show the three summaries.

    Shows a warning and does nothing when the entry is empty. Otherwise the
    source is passed to ``summarize_text``; when that returns summaries, each
    summary text area is cleared and filled with the matching summary.

    Returns:
        None in every case.
    """
    source = entry_source.get().strip()
    if not source:
        messagebox.showwarning("Input Error", "Please enter a URL or choose a file")
        return None

    # Only a real http(s) scheme marks a URL; a plain "http" prefix test would
    # send local files such as "http_notes.txt" to the HTML parser.
    is_url = source.lower().startswith(("http://", "https://"))
    summaries = summarize_text(source, is_url)

    # Replace old summaries with the new ones (the summarizers may be run multiple times)
    if summaries:
        targets = (
            (text_textrank, "TextRank"),
            (text_lsa, "LSA"),
            (text_lexrank, "LexRank"),
        )
        for text_area, key in targets:
            # "1.0" is the Tk text index of the first character (line 1, column 0)
            text_area.delete("1.0", tk.END)
            text_area.insert(tk.END, summaries[key])

# GUI setup using tkinter library
def _add_summary_pane(parent, heading, **heading_pack_options):
    """Create one summary column: a heading label above a scrollable text area.

    Extra keyword arguments are forwarded to the ``pack`` call of the heading
    label. Returns the column frame and its text area.
    """
    pane = tk.Frame(parent)
    pane.pack(side=tk.LEFT, fill=tk.BOTH, expand=True, padx=5)
    tk.Label(pane, text=heading).pack(pady=5, **heading_pack_options)
    text_area = scrolledtext.ScrolledText(pane, wrap=tk.WORD, width=40)
    text_area.pack(fill=tk.BOTH, expand=True)
    return pane, text_area


# Main window
root = tk.Tk()
root.title("Text Summarizer")
root.geometry("1200x700")

# Top row: prompt, source entry and the two action buttons
frame_top = tk.Frame(root)
frame_top.pack(pady=10)

prompt_text = "Enter URL or choose a file to get 3 most important sentences:"
tk.Label(frame_top, text=prompt_text).pack(anchor="w", padx=5)

entry_source = tk.Entry(frame_top, width=70)
entry_source.pack(side=tk.LEFT, padx=5)

btn_browse = tk.Button(frame_top, text="Browse File", command=browse_file)
btn_browse.pack(side=tk.LEFT, padx=5)

btn_summarize = tk.Button(frame_top, text="Summarize", command=run_summary)
btn_summarize.pack(side=tk.LEFT, padx=5)

# Lower area: three side-by-side columns, one per algorithm
frame_texts = tk.Frame(root)
frame_texts.pack(fill=tk.BOTH, expand=True)

frame_textrank, text_textrank = _add_summary_pane(frame_texts, "TextRank Summary:", anchor="n")
frame_lsa, text_lsa = _add_summary_pane(frame_texts, "LSA Summary:")
frame_lexrank, text_lexrank = _add_summary_pane(frame_texts, "LexRank Summary:")

# mainloop() blocks this cell until the window is closed
print("Text Summarizer GUI is running on a separate window!")
root.mainloop()

Text Summarizer GUI is running on a separate window!


### Task 3

In [None]:
# Code here

### Task 4

In [None]:
# Code here

### Task 5

In [None]:
# Code here

### Task 6: Algorithm performance on Opinosis dataset

In [5]:
import os
from rouge_score import rouge_scorer
from glob import glob

# Helper function for text summarization
def summarize_text(text, summarizer):
    """Return the summary of ``text`` produced by ``summarizer`` as one string.

    The text is parsed as plain text, the summarizer is asked for
    ``SENTENCES_COUNT`` sentences, and the sentences are joined with single
    spaces.

    NOTE: this definition rebinds the name ``summarize_text``, replacing the
    GUI helper with the same name that takes ``(source, is_url)``.
    """
    document = PlaintextParser.from_string(text, Tokenizer(LANGUAGE)).document
    top_sentences = summarizer(document, SENTENCES_COUNT)
    return " ".join(map(str, top_sentences))

# Helper function for ROUGE evaluation
def evaluate_rouge(system_summary, reference_summary):
    """Score a system summary against one reference summary.

    Returns a dict mapping "rouge1" and "rouge2" to the F-measure of the
    corresponding ROUGE score, computed with stemming enabled.
    """
    scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2"], use_stemmer=True)
    # The scorer takes the reference (target) first and the prediction second
    detailed_scores = scorer.score(reference_summary, system_summary)
    f_measures = {}
    for metric, score in detailed_scores.items():
        f_measures[metric] = score.fmeasure
    return f_measures

# One instance per algorithm, each built with a stemmer and given the stop-word list
summarizers = {
    algorithm_name: summarizer_class(Stemmer(LANGUAGE))
    for algorithm_name, summarizer_class in (
        ("TextRank", TextRankSummarizer),
        ("LSA", LsaSummarizer),
        ("LexRank", LexRankSummarizer),
    )
}
for configured_summarizer in summarizers.values():
    configured_summarizer.stop_words = get_stop_words(LANGUAGE)

# Loop through the Opinosis dataset and collect per-topic ROUGE scores for each summarizer
topics_path = "Data/OpinosisDataset/topics"
summaries_path = "Data/OpinosisDataset/summaries-gold"
topic_suffix = ".txt.data"


def _read_dataset_file(path):
    """Read a dataset file as UTF-8, falling back to Latin-1 (the dataset has mixed encodings)."""
    try:
        with open(path, "r", encoding="utf-8") as file:
            return file.read()
    except UnicodeDecodeError:
        with open(path, "r", encoding="latin-1") as file:
            return file.read()


results = {name: [] for name in summarizers}

# Sorted so that the processing order does not depend on the file system
for topic_file in sorted(os.listdir(topics_path)):
    if not topic_file.endswith(topic_suffix):
        continue

    # Strip the complete suffix, including its leading dot. Removing only
    # "txt.data" leaves a trailing "." on the topic name, and the resulting
    # directory path is only found on file systems that ignore trailing dots.
    # NOTE(review): assumes each gold directory is named exactly like the topic
    # file without ".txt.data" - confirm against the dataset layout.
    topic_name = topic_file[: -len(topic_suffix)]
    topic_path = os.path.join(topics_path, topic_file)
    summaries_dir = os.path.join(summaries_path, topic_name)

    if not os.path.isdir(summaries_dir):
        continue

    # Read all reviews for the topic
    text = _read_dataset_file(topic_path)

    # Read all gold summaries for the topic, with the same encoding fallback as the reviews
    summaries_paths = sorted(glob(os.path.join(summaries_dir, "*.gold")))
    gold_summaries = [_read_dataset_file(gold_path) for gold_path in summaries_paths]
    if not gold_summaries:
        # Nothing to score against; averaging over zero references would divide by zero
        continue

    # Evaluate each summarizer
    for name, summarizer in summarizers.items():
        system_summary = summarize_text(text, summarizer)
        all_rouge = [evaluate_rouge(system_summary, gold) for gold in gold_summaries]

        # Compute average across all gold summaries
        avg_rouge1 = sum(r["rouge1"] for r in all_rouge) / len(all_rouge)
        avg_rouge2 = sum(r["rouge2"] for r in all_rouge) / len(all_rouge)
        results[name].append({"rouge1": avg_rouge1, "rouge2": avg_rouge2})

# Compute overall averages (mean of the per-topic averages) for every summarizer
print("--- Performance of TextRank, Latent Semantic and LexRank algorithms on Opinosis dataset in terms of Rouge-1 and Rouge-2 ---")
for name, scores in results.items():
    if not scores:
        # No topic was evaluated for this summarizer (e.g. dataset folders not found);
        # report that instead of failing with a ZeroDivisionError.
        print(f"\n{name} - no topics were evaluated")
        continue
    avg_r1 = sum(s["rouge1"] for s in scores) / len(scores)
    avg_r2 = sum(s["rouge2"] for s in scores) / len(scores)
    print(f"\n{name} - ROUGE-1: {avg_r1:.3f} | ROUGE-2: {avg_r2:.3f}")

--- Performance of TextRank, Latent Semantic and LexRank algorithms on Opinosis dataset in terms of Rouge-1 and Rouge-2 ---

TextRank - ROUGE-1: 0.153 | ROUGE-2: 0.036

LSA - ROUGE-1: 0.129 | ROUGE-2: 0.023

LexRank - ROUGE-1: 0.238 | ROUGE-2: 0.066


### Task 7

In [None]:
# Code here

### Task 8

In [None]:
# Code here

### Task 10

In [None]:
# Code here