<a href="https://colab.research.google.com/github/hegame1998/NLP-Assignment/blob/main/NLP_Assignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# No need to download nltk resources
# Basic implementation using naive sentence tokenizer

# ------------------------
# Data Collection
# ------------------------
import requests

# Raw GitHub URLs of the two input texts. Use raw.githubusercontent.com links
# so the response body is the file content itself rather than a GitHub web page.
source_url = "https://raw.githubusercontent.com/hegame1998/NLP-Assignment/main/source_text.txt"
style_url = "https://raw.githubusercontent.com/hegame1998/NLP-Assignment/main/style_text.txt"

# Download both texts. A timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops here on an HTTP error (e.g. 404)
# instead of silently treating the error page as the input text.
source_response = requests.get(source_url, timeout=30)
source_response.raise_for_status()
source_text = source_response.text

style_response = requests.get(style_url, timeout=30)
style_response.raise_for_status()
style_text = style_response.text

# Save local copies of the inputs (optional, for reference or reuse)
with open("source_text.txt", "w", encoding="utf-8") as f:
    f.write(source_text)

with open("style_text.txt", "w", encoding="utf-8") as f:
    f.write(style_text)

# ------------------------
# Preprocessing (no punkt)
# ------------------------

def naive_sentence_tokenize(text):
    """Split *text* into sentences with a single regular expression.

    A boundary is any run of whitespace that comes directly after ".", "?"
    or "!" and directly before an ASCII capital letter. The whitespace is
    dropped; the punctuation stays attached to the preceding sentence.

    Returns:
        A list of sentence strings. Text without any boundary (including
        the empty string) comes back as a one-element list.
    """
    import re

    # Look-behind / look-ahead keep the punctuation and the capital letter
    # in the output; only the whitespace between them is consumed.
    boundary = re.compile(r'(?<=[.?!])\s+(?=[A-Z])')
    return boundary.split(text)

def preprocess_text(text):
    """Return the sentences of *text*, trimmed, with blank entries removed.

    The text is split by ``naive_sentence_tokenize``; each piece has its
    surrounding whitespace stripped, and pieces that end up empty are
    discarded.
    """
    trimmed = (piece.strip() for piece in naive_sentence_tokenize(text))
    return [piece for piece in trimmed if piece]

# ------------------------
# Feature Extraction
# ------------------------

def compute_target_lengths(len1, len2, max_token=4000):
    """Split a total budget between two texts in proportion to their lengths.

    Args:
        len1: Length of the first text.
        len2: Length of the second text, in the same unit as ``len1``.
        max_token: Total budget to divide between the two texts.

    Returns:
        A ``(target1, target2)`` tuple of ints. Each share is rounded down,
        so the two targets may add up to slightly less than ``max_token``.
        ``(0, 0)`` is returned when both lengths are zero.
    """
    total = len1 + len2
    if total == 0:
        # Nothing to apportion; avoid dividing by zero on two empty inputs.
        return 0, 0
    # Multiply first and floor-divide so integer inputs are handled exactly.
    # Going through a float proportion can land just below the true value
    # (100 * (29 / 100) == 28.999999999999996) and truncate to one too few.
    return int(max_token * len1 // total), int(max_token * len2 // total)

# ------------------------
# Summarization Logic (extractive; no model is trained)
# ------------------------

def hierarchical_summarize(sentences, target_len, slice_size=20):
    """Build a summary by summarizing *sentences* one slice at a time.

    The input is walked in consecutive slices of ``slice_size`` sentences.
    Each slice is passed to ``simple_extract_summary`` with the full
    ``target_len`` as its limit, and the results are concatenated in order.
    Processing stops as soon as at least ``target_len`` sentences have been
    gathered.

    Args:
        sentences: Ordered sentences of the text.
        target_len: Maximum number of sentences to return.
        slice_size: Number of sentences handed to the extractor per step.

    Returns:
        A list holding at most ``target_len`` sentences.
    """
    gathered = []
    for offset in range(0, len(sentences), slice_size):
        window = sentences[offset:offset + slice_size]
        gathered.extend(simple_extract_summary(window, target_len))
        # Later slices are not needed once the budget is already met.
        if len(gathered) >= target_len:
            break
    # The last slice may have overshot the budget; trim the excess.
    return gathered[:target_len]

def simple_extract_summary(sentences, max_sentences):
    """Return the leading ``max_sentences`` entries of *sentences*.

    This is the simplest possible extractive summary: no scoring or
    ranking is done, the input order is kept, and shorter inputs are
    returned in full.
    """
    leading = sentences[0:max_sentences]
    return leading

# ------------------------
# Evaluation
# ------------------------

def evaluate_summary(original, summary, label):
    """Print a short report comparing a summary with its source sentences.

    The report shows a header containing *label*, the number of sentences
    in *original* and in *summary*, up to the first five summary sentences
    (one per line), and a dashed divider surrounded by blank lines.

    Args:
        original: Sentences of the full text.
        summary: Sentences selected for the summary.
        label: Name shown in the report header.
    """
    preview = "\n".join(summary[:5])
    divider = "\n" + "-" * 50 + "\n"
    report_lines = [
        f"=== {label} Summary Evaluation ===",
        f"Original sentences: {len(original)}",
        f"Summary sentences: {len(summary)}",
        "Sample summary:",
        preview,
        divider,
    ]
    # One print with a newline separator emits the same text as printing
    # each entry on its own.
    print(*report_lines, sep="\n")

# ------------------------
# Main Function
# ------------------------

def main_pipeline(source_text, style_text, max_summary_sentences=50):
    """Summarize the source and style texts and print a report for each.

    Both texts are split into sentences, a shared sentence budget is divided
    between them in proportion to their sentence counts, each text is
    summarized to its share, and an evaluation report is printed per text.

    Args:
        source_text: Raw source text.
        style_text: Raw style text.
        max_summary_sentences: Total number of summary sentences to divide
            between the two texts. Defaults to 50, the value that was
            previously hard-coded.

    Returns:
        A ``(source_summary, style_summary)`` tuple of sentence lists.
    """
    # Preprocessing
    source_sentences = preprocess_text(source_text)
    style_sentences = preprocess_text(style_text)

    # Proportional length calculation. compute_target_lengths names its
    # budget "max_token", but the unit here is sentences.
    source_target_len, style_target_len = compute_target_lengths(
        len(source_sentences), len(style_sentences), max_token=max_summary_sentences
    )

    # Hierarchical summarization
    source_summary = hierarchical_summarize(source_sentences, source_target_len)
    style_summary = hierarchical_summarize(style_sentences, style_target_len)

    # Evaluation
    evaluate_summary(source_sentences, source_summary, "Source")
    evaluate_summary(style_sentences, style_summary, "Style")

    return source_summary, style_summary

# Run the full pipeline on the two downloaded texts. main_pipeline prints an
# evaluation report for each text and returns the two summaries as lists of
# sentences.
source_summary, style_summary = main_pipeline(source_text, style_text)


=== Source Summary Evaluation ===
Original sentences: 60
Summary sentences: 45
Sample summary:
Natural Language Processing (NLP) is a sub-field of artificial intelligence that focuses on the interaction between computers and humans through natural language.
The ultimate objective of NLP is to read, decipher, understand, and make sense of the human languages in a manner that is valuable.
Most NLP techniques rely on machine learning to derive meaning from human languages.
Applications of NLP include speech recognition, text summarization, machine translation, sentiment analysis, and more.
The field of NLP combines computational linguistics with statistical, machine learning, and deep learning models.

--------------------------------------------------

=== Style Summary Evaluation ===
Original sentences: 6
Summary sentences: 4
Sample summary:
In the beginning, language was simple.
It served only to convey the most basic of messages—danger, food, shelter.
As human societies grew more comp