# Final Project: Chinese Text Summarization Comparison (TextRank, TF-IDF, MT5)

Run each cell one by one. This will compare three Chinese summarization methods and outputs ROUGE evaluation scores.

Import Libraries

In [3]:
import jieba
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from rouge_score import rouge_scorer


Define TextRank Functions

In [4]:
def segment_text_to_sentence(text):
    sentences = [s.strip() for s in text.replace('。', '。|').split('|') if s.strip()]
    return sentences

def segment_text_to_words(text):
    return list(set(jieba.lcut(text)))

def original_similarity_matrix(sentences):
    sentence_words = [set(segment_text_to_words(s)) for s in sentences]
    size = len(sentences)
    sim_matrix = np.zeros((size, size))
    for i in range(size):
        for j in range(size):
            if i != j:
                overlap = len(sentence_words[i] & sentence_words[j])
                sim_matrix[i][j] = overlap / (np.log(len(sentence_words[i]) + 1) + np.log(len(sentence_words[j]) + 1))
    return sim_matrix

def cosine_tfidf_similarity_matrix(sentences):
    joined = [' '.join(segment_text_to_words(s)) for s in sentences]
    tfidf = TfidfVectorizer().fit_transform(joined)
    sim_matrix = cosine_similarity(tfidf)
    np.fill_diagonal(sim_matrix, 0)
    return sim_matrix

def text_rank(text, use_tfidf=False, top_n=3, d=0.85, max_iter=200):
    sentences = segment_text_to_sentence(text)
    sim_matrix = cosine_tfidf_similarity_matrix(sentences) if use_tfidf else original_similarity_matrix(sentences)
    weights = np.ones(len(sentences))
    for _ in range(max_iter):
        new_weights = (1 - d) + d * sim_matrix.T.dot(weights / (sim_matrix.sum(axis=1) + 1e-10))
        if np.allclose(new_weights, weights):
            break
        weights = new_weights
    ranked = np.argsort(-weights)
    summary = ' '.join([sentences[i] for i in sorted(ranked[:top_n])])
    return summary

Load MT5 Model

In [5]:
model_name = "google/mt5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def mt5_summary(text):
    inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    summary_ids = model.generate(inputs.input_ids, max_length=84, num_beams=4, no_repeat_ngram_size=2)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/702 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

ImportError: 
 requires the protobuf library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


Provide Example Input

In [None]:
example_text = "近日,网红李炮儿再次完成一项挑战,带300多位舞者齐跳科目三,打破了世界纪录,但没想到却遭到全网嘲讽。"


Generate Summaries

In [None]:
print("\n--- TextRank Summary ---")
print(text_rank(example_text, use_tfidf=False))

print("\n--- TextRank + TF-IDF Summary ---")
print(text_rank(example_text, use_tfidf=True))

print("\n--- MT5 Summary ---")
print(mt5_summary(example_text))

Evaluate with ROUGE

In [None]:
reference = example_text  # 
candidate = mt5_summary(example_text)  # use MT5 change to text_rank() output

from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
scores = scorer.score(reference, candidate)
print("--- ROUGE Scores ---")
print(scores)


In [3]:
text = "近日,网红李炮儿再次完成一项挑战,带300多位舞者齐跳科目三,打破了世界纪录,但没想到却遭到全网嘲讽。"

inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
summary_ids = model.generate(inputs.input_ids, max_length=84, num_beams=4, no_repeat_ngram_size=2)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("Summary:")
print(summary)


NameError: name 'tokenizer' is not defined