<a href="https://colab.research.google.com/github/hegame1998/NLP-Assignment/blob/main/NLP%20Assignment3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

I will do this approach in this code:


* **Loads two documents:** one to summarize, one as style reference.

* **Estimates token length** using word count (proxy for 4000-token limit).

* **Performs chunk-based summarization** using TextRank-style TF-IDF cosine similarity.

* **If the summary is too large**, it recursively shrinks it.

* **Saves the summaries.**

* **Prints a query prompt** to generate a style-following summary.

In [18]:
# =======================
# 📥 Upload Files
# =======================
from google.colab import files

print("🔼 Upload the style file (style.txt)")
uploaded_style = files.upload()

print("🔼 Upload the document to summarize (to_summarize.txt)")
uploaded_target = files.upload()

# =======================
# 📄 Load Text
# =======================
def load_documents(style_path, target_path):
    with open(style_path, 'r', encoding='utf-8') as f:
        style_text = f.read()
    with open(target_path, 'r', encoding='utf-8') as f:
        target_text = f.read()
    return style_text, target_text

# =======================
# 🔧 Install & Download NLTK Resources
# =======================
import nltk
nltk.download('punkt', download_dir='/usr/local/share/nltk_data')
nltk.data.path.append('/usr/local/share/nltk_data')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

# =======================
# 🧼 Preprocessing
# =======================
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import string
from collections import Counter

def preprocess(text):
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    return sentences, words

def get_token_count(words):
    return len(words)

def split_into_chunks(sentences, max_tokens):
    chunks = []
    current_chunk = []
    token_count = 0
    for sent in sentences:
        tokens = word_tokenize(sent)
        if token_count + len(tokens) > max_tokens:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sent]
            token_count = len(tokens)
        else:
            current_chunk.append(sent)
            token_count += len(tokens)
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

# =======================
# ✨ Extract Style Features
# =======================
def extract_style_features(text):
    words = word_tokenize(text)
    tagged = nltk.pos_tag(words)
    pos_counts = Counter(tag for word, tag in tagged)
    sentences = sent_tokenize(text)
    avg_len = sum(len(word_tokenize(s)) for s in sentences) / len(sentences)
    return {'pos_distribution': pos_counts, 'avg_sentence_length': avg_len}

# =======================
# 💬 Sentence Scoring & Chunk Summary
# =======================
def score_sentences(sentences, style_features):
    stop_words = set(stopwords.words('english'))
    scores = {}
    for sent in sentences:
        words = word_tokenize(sent.lower())
        words = [w for w in words if w not in stop_words and w not in string.punctuation]
        score = len(words)
        if abs(len(words) - style_features['avg_sentence_length']) < 5:
            score += 2  # stylistic bonus
        scores[sent] = score
    return scores

def summarize_chunk(chunk, style_features, target_sentences=5):
    sentences = sent_tokenize(chunk)
    scores = score_sentences(sentences, style_features)
    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    selected = [sent for sent, score in ranked[:target_sentences]]
    return ' '.join(selected)

# =======================
# 📏 Check Summary Length
# =======================
def check_summary_length(summary, token_limit=4000):
    words = word_tokenize(summary)
    return len(words) <= token_limit

# =======================
# 🧠 Main Hierarchical Summarization
# =======================
def hierarchical_summarize(style_text, target_text, token_limit=4000):
    _, style_words = preprocess(style_text)
    style_features = extract_style_features(style_text)

    target_sentences, target_words = preprocess(target_text)
    if len(target_words) <= token_limit:
        return summarize_chunk(target_text, style_features)

    chunks = split_into_chunks(target_sentences, token_limit)
    summaries = [summarize_chunk(chunk, style_features) for chunk in chunks]

    final_summary = ' '.join(summaries)

    while not check_summary_length(final_summary, token_limit):
        final_summary = summarize_chunk(final_summary, style_features)

    return final_summary

# =======================
# 🚀 Run Summarization
# =======================
style_text, target_text = load_documents('style.txt', 'to_summarize.txt')
final_summary = hierarchical_summarize(style_text, target_text)

# =======================
# 💾 Save & Download
# =======================
with open("summary.txt", "w", encoding="utf-8") as f:
    f.write(final_summary)

print("\n✅ Summary generated and saved as 'summary.txt'.")
files.download("summary.txt")


🔼 Upload the style file (style.txt)


Saving style.txt to style (5).txt
🔼 Upload the document to summarize (to_summarize.txt)


Saving to_summarize.txt to to_summarize (3).txt


[nltk_data] Downloading package punkt to /usr/local/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/usr/local/share/nltk_data'
**********************************************************************


#Data Collection

This is where I load my input and style reference documents.<br> Read two input text files (T1: style source, T2: text to summarize).




In [2]:
# Data Collection
def load_documents(style_path, target_path):
    with open(style_path, 'r', encoding='utf-8') as f:
        style_text = f.read()
    with open(target_path, 'r', encoding='utf-8') as f:
        target_text = f.read()
    return style_text, target_text

#Preprocessing

Clean and tokenize the text.

* Tokenize T1 and T2.

* Count token lengths.

* Define target lengths proportionally.

In [3]:
# Preprocessing
import nltk
nltk.download('punkt')

from nltk.tokenize import sent_tokenize, word_tokenize

def preprocess(text):
    sentences = sent_tokenize(text)
    words = word_tokenize(text)
    return sentences, words

def get_token_count(words):
    return len(words)

def split_into_chunks(sentences, max_tokens):
    chunks = []
    current_chunk = []
    token_count = 0
    for sent in sentences:
        tokens = word_tokenize(sent)
        if token_count + len(tokens) > max_tokens:
            chunks.append(' '.join(current_chunk))
            current_chunk = [sent]
            token_count = len(tokens)
        else:
            current_chunk.append(sent)
            token_count += len(tokens)
    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


#Style Feature Extraction

Use TF-IDF and cosine similarity to rank sentences for summarization.

* Extract stylistic features from T1 (sentence length, punctuation usage, common POS tags).

* Optionally, compute average sentence length and POS tag distribution.


In [4]:
# Feature Extraction
nltk.download('averaged_perceptron_tagger')

from collections import Counter

def extract_style_features(text):
    words = word_tokenize(text)
    tagged = nltk.pos_tag(words)
    pos_counts = Counter(tag for word, tag in tagged)
    sentences = sent_tokenize(text)
    avg_len = sum(len(word_tokenize(s)) for s in sentences) / len(sentences)
    return {'pos_distribution': pos_counts, 'avg_sentence_length': avg_len}

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


#Model Training (Summarization Logic)

Iteratively summarize large texts to fit the context window (e.g., 4000 tokens).

* We’ll implement an extractive summarization method using scoring (TF-IDF or frequency-based).

* No external model training is required.

* For style adaptation, re-rank or rewrite based on stylistic features from T1.

In [5]:
# Summarization Logic
nltk.download('stopwords')

from nltk.corpus import stopwords
import string

def score_sentences(sentences, style_features):
    stop_words = set(stopwords.words('english'))
    scores = {}
    for sent in sentences:
        words = word_tokenize(sent.lower())
        words = [w for w in words if w not in stop_words and w not in string.punctuation]
        score = len(words)
        if abs(len(words) - style_features['avg_sentence_length']) < 5:
            score += 2  # stylistic bonus
        scores[sent] = score
    return scores

def summarize_chunk(chunk, style_features, target_sentences=5):
    sentences = sent_tokenize(chunk)
    scores = score_sentences(sentences, style_features)
    ranked = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    selected = [sent for sent, score in ranked[:target_sentences]]
    return ' '.join(selected)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


#Evaluation Function

Output and save summaries, simulate a style-following prompt.

* Manual or ROUGE-based metrics (if available).

* Summary length check vs context window.

In [6]:
# Evaluation
def check_summary_length(summary, token_limit=4000):
    words = word_tokenize(summary)
    return len(words) <= token_limit

#Main Pipeline

To tie everything together.

* Orchestrates all components.

* Repeats summarization until the result fits within the token limit.


In [7]:
# Main Function
def hierarchical_summarize(style_text, target_text, token_limit=4000):
    _, style_words = preprocess(style_text)
    style_features = extract_style_features(style_text)

    target_sentences, target_words = preprocess(target_text)
    if len(target_words) <= token_limit:
        return summarize_chunk(target_text, style_features)

    chunks = split_into_chunks(target_sentences, token_limit)
    summaries = [summarize_chunk(chunk, style_features) for chunk in chunks]

    final_summary = ' '.join(summaries)

    # Recursively summarize if still too long
    while not check_summary_length(final_summary, token_limit):
        final_summary = summarize_chunk(final_summary, style_features)

    return final_summary


#Run the Code

In [8]:
style_text, target_text = load_documents('style.txt', 'to_summarize.txt')
final_summary = hierarchical_summarize(style_text, target_text)

FileNotFoundError: [Errno 2] No such file or directory: 'style.txt'

#Save Output

In [None]:
with open("summary.txt", "w", encoding="utf-8") as f:
    f.write(final_summary)

print("✅ Summary generated and saved as summary.txt")
files.download("summary.txt")