<a href="https://colab.research.google.com/github/khemsu/TextSumarization/blob/main/GradientBoostcodeSingleCell.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install numpy pandas nltk scikit-learn rouge-score transformers
!python -m nltk.downloader punkt stopwords averaged_perceptron_tagger_eng

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=450cff7984df39c1a473b11445e1feec5c7c0b9ea28c07572fd00914b0da0475
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_e

In [None]:
import os
import re
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from rouge_score import rouge_scorer
import nltk
nltk.download('punkt_tab')

# Dataset location. The root folder can be overridden with the
# GRADIENT_MODEL_DATA_DIR environment variable so the notebook also runs on
# machines other than the author's; the default is the original local path.
data_root = os.environ.get(
    "GRADIENT_MODEL_DATA_DIR", "/Users/sundippahimlimbu/Gradient_Model"
)
articles_path = os.path.join(data_root, "News Articles", "business")
summaries_path = os.path.join(data_root, "Summaries", "business")

def read_text_files(filepath_list, folder_path):
    """Return the text of each named file inside ``folder_path``.

    Files are read in the order given by ``filepath_list`` and decoded as
    latin1; the result is a list with one string per file.
    """
    def _load(filename):
        full_path = os.path.join(folder_path, filename)
        with open(full_path, 'r', encoding='latin1') as handle:
            return handle.read()

    return [_load(filename) for filename in filepath_list]

article_files = sorted(f for f in os.listdir(articles_path) if f.endswith('.txt'))
summary_files = sorted(f for f in os.listdir(summaries_path) if f.endswith('.txt'))

# Articles and summaries are paired purely by their position in the sorted
# listings. Fail early, with a readable message, when the counts differ, and
# warn when paired filenames differ (a sign that the pairing may be wrong).
if len(article_files) != len(summary_files):
    raise ValueError(
        f"Found {len(article_files)} articles but {len(summary_files)} summaries; "
        "each article needs exactly one summary."
    )
mismatched_names = [(a, s) for a, s in zip(article_files, summary_files) if a != s]
if mismatched_names:
    print(
        f"Warning: {len(mismatched_names)} article/summary filename pairs differ, "
        f"e.g. {mismatched_names[:3]}; verify the pairing."
    )

article_texts = read_text_files(article_files, articles_path)
summary_texts = read_text_files(summary_files, summaries_path)

print(f"Found {len(article_files)} articles and {len(summary_files)} summaries")
print("First 5 article filenames:", article_files[:5])
print("First 5 summary filenames:", summary_files[:5])

# One row per document: raw article text and its reference summary.
df = pd.DataFrame({'article': article_texts, 'summary': summary_texts})

def preprocess(text):
    """Split ``text`` into sentences and reduce each one to its content words.

    Every sentence is lower-cased and word-tokenized. Tokens that are not
    purely alphanumeric (punctuation, hyphenated forms, decimals, ...) or that
    are English stopwords are dropped; the remaining tokens are re-joined with
    single spaces.

    Args:
        text: raw document text.

    Returns:
        list[str]: one cleaned string per sentence found by ``sent_tokenize``,
        in the original order. A sentence with no surviving tokens becomes
        ``''``, so positions stay aligned with ``sent_tokenize(text)``.
    """
    # Build the stopword set once per call: calling stopwords.words() for
    # every token rebuilds the list each time, and list membership is a
    # linear scan.
    stop_words = set(stopwords.words('english'))

    clean_sentences = []
    for sent in sent_tokenize(text):
        words = [
            w for w in word_tokenize(sent.lower())
            if w.isalnum() and w not in stop_words
        ]
        clean_sentences.append(' '.join(words))
    return clean_sentences

# Clean every article and every summary: one list of cleaned sentences per document.
preprocessed_articles = list(map(preprocess, article_texts))
preprocessed_summaries = list(map(preprocess, summary_texts))

def label_sentences(articles, summaries):
    """Label each article sentence 1 if it overlaps a summary sentence, else 0.

    A match is a substring relation in either direction between an article
    sentence and any sentence of the paired summary. Empty (or whitespace-only)
    strings never match: the empty string is a substring of every string, so
    without this guard an empty article sentence would always be labelled 1
    and a single empty summary sentence would label the whole article 1.

    Args:
        articles: iterable of sentence lists, one list of strings per article.
        summaries: iterable of sentence lists, paired positionally with
            ``articles``.

    Returns:
        list[list[int]]: for each article, one 0/1 label per sentence.
    """
    y_labels = []
    for art_sents, sum_sents in zip(articles, summaries):
        usable_summary = [s for s in sum_sents if s.strip()]
        labels = []
        for art_sent in art_sents:
            matched = bool(art_sent.strip()) and any(
                sum_sent in art_sent or art_sent in sum_sent
                for sum_sent in usable_summary
            )
            labels.append(1 if matched else 0)
        y_labels.append(labels)
    return y_labels

def postprocess_labels(article, labels, min_sentence_length=5):
    """Zero the label of every sentence shorter than ``min_sentence_length`` words.

    ``labels`` is modified in place and the same list object is returned.
    Sentence length is the number of whitespace-separated tokens. Only
    positions present in both ``article`` and ``labels`` are examined.
    """
    for idx, sentence in zip(range(len(labels)), article):
        too_short = len(sentence.split()) < min_sentence_length
        if too_short:
            labels[idx] = 0
    return labels

def extract_advanced_features(sentences):
    """Build an ``(n_sentences, 8)`` feature matrix for one article.

    Columns, in order:
        0. position      - evenly spaced in [0, 1] (0 = first sentence, 1 = last)
        1. tfidf         - row mean of a TF-IDF matrix fitted on these sentences only
        2. numbers       - count of numeric tokens (optional ``$`` prefix / ``%`` suffix)
        3. proper_nouns  - count of tokens POS-tagged ``NNP``
        4. word_count    - number of word tokens
        5. char_count    - character length divided by 100
        6. is_question   - 1 if the sentence ends with ``?``
        7. has_connector - 1 if 'however', 'therefore' or 'although' occurs in it

    NOTE(review): the callers in this notebook pass the output of
    ``preprocess``, which is lower-cased and has non-alphanumeric tokens
    removed. On such input ``is_question`` cannot fire and ``$``/``%`` never
    appear; ``proper_nouns`` is presumably affected by the lower-casing too.
    Confirm whether raw sentences were intended here.

    Args:
        sentences: sequence of sentence strings belonging to one article.

    Returns:
        np.ndarray: float array of shape ``(len(sentences), 8)``. An empty
        input yields shape ``(0, 8)`` so the result can still be stacked with
        the matrices of other articles.
    """
    sentences = list(sentences)
    n_features = 8
    if not sentences:
        return np.empty((0, n_features))

    positions = np.linspace(0, 1, num=len(sentences))
    try:
        tfidf = TfidfVectorizer().fit_transform(sentences)
        tfidf_avg = np.asarray(tfidf.mean(axis=1)).ravel()
    except ValueError:
        # TfidfVectorizer raises ValueError when no sentence contributes a
        # vocabulary term (e.g. every sentence is empty); fall back to zeros
        # instead of aborting the whole feature pass.
        tfidf_avg = np.zeros(len(sentences))

    number_pattern = re.compile(r'\$?\d+(?:\.\d+)?%?')
    connectors = ('however', 'therefore', 'although')

    features = []
    for i, sent in enumerate(sentences):
        # Tokenize once and reuse the tokens for tagging and counting.
        tokens = word_tokenize(sent)

        # Quantitative features
        numbers = len(number_pattern.findall(sent))
        proper_nouns = sum(1 for _, tag in pos_tag(tokens) if tag == 'NNP')

        # Structural features
        word_count = len(tokens)
        char_count = len(sent)
        is_question = 1 if sent.strip().endswith('?') else 0

        # Semantic feature (simplified): substring test for a discourse connector
        has_connector = 1 if any(word in sent for word in connectors) else 0

        features.append([
            positions[i],
            tfidf_avg[i],
            numbers,
            proper_nouns,
            word_count,
            char_count / 100,
            is_question,
            has_connector,
        ])

    return np.array(features, dtype=float)

import nltk

# Fetch the NLTK resources: first the punkt_tab tokenizer tables, then the
# remaining packages in a single download call.
nltk.download('punkt_tab')

required_nltk_packages = [
    'punkt',                           # tokenizer models
    'stopwords',                       # stopword lists
    'wordnet',                         # WordNet corpus
    'averaged_perceptron_tagger_eng',  # POS tagger
]
nltk.download(required_nltk_packages)


# Preprocess and label
# `preprocessed_articles` / `preprocessed_summaries` were computed earlier from
# the same texts, in the same order, that populate `df`; reuse them rather than
# running the slow preprocess pass over the whole corpus a second time.
df['clean_article'] = pd.Series(preprocessed_articles, index=df.index)
df['clean_summary'] = pd.Series(preprocessed_summaries, index=df.index)
y_labels = label_sentences(df['clean_article'], df['clean_summary'])

## 4. Model Training =====================================================

# Feature extraction
# Articles with no usable sentence (empty file, or every sentence cleaned down
# to '') are skipped together with their labels: they carry no signal and
# would abort feature extraction.
X_features = []
kept_labels = []
for sentences, labels in zip(df['clean_article'], y_labels):
    if not any(sentences):
        continue
    X_features.append(extract_advanced_features(sentences))
    kept_labels.append(labels)

# Prepare training data: one row of X and one entry of y per sentence
X = np.vstack(X_features)
y = np.concatenate(kept_labels)
assert len(X) == len(y), "feature rows and labels are misaligned"

# Hold out 20% of the sentences, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Gradient-boosted trees wrapped in 5-fold probability calibration.
gb_params = dict(n_estimators=200, learning_rate=0.01, max_depth=3, random_state=42)
model = GradientBoostingClassifier(**gb_params)
calibrated_model = CalibratedClassifierCV(model, cv=5)
calibrated_model.fit(X_train, y_train)

train_accuracy = calibrated_model.score(X_train, y_train)
test_accuracy = calibrated_model.score(X_test, y_test)
print(f"\nTraining Accuracy: {train_accuracy:.2f}")
print(f"Test Accuracy: {test_accuracy:.2f}")

def generate_summary(article, model, top_n=3):
    """Build an extractive summary from the ``top_n`` highest-scoring sentences.

    Each sentence is scored with the positive-class probability from
    ``model.predict_proba``; the ``top_n`` best are kept and emitted in their
    original article order. No diversity or redundancy control is applied.

    Args:
        article: raw article text.
        model: fitted classifier exposing ``predict_proba``.
        top_n: number of sentences to keep.

    Returns:
        str: the joined sentences, with the first character upper-cased and a
        trailing '.' added if the text does not already end in '.', '!' or
        '?'. The article is returned unchanged when it has at most ``top_n``
        sentences, and ``''`` is returned when ``top_n`` is not positive.
    """
    if top_n <= 0:
        # Nothing to select; previously this path crashed on summary[0].
        return ''

    sentences = sent_tokenize(article)
    if len(sentences) <= top_n:
        return article

    clean_sents = preprocess(article)
    features = extract_advanced_features(clean_sents)
    probas = model.predict_proba(features)[:, 1]

    # A stable sort on the negated scores picks the same sentences as repeated
    # argmax: ties go to the earliest sentence.
    top_indices = np.argsort(-probas, kind='stable')[:top_n]

    # Maintain original order
    selected_indices = sorted(int(i) for i in top_indices)
    summary = ' '.join(sentences[i] for i in selected_indices)

    # Post-processing
    summary = summary[:1].upper() + summary[1:]
    if not summary.endswith(('.', '!', '?')):
        summary = summary + '.'
    return summary

# ROUGE-1 / ROUGE-2 / ROUGE-L scorer with stemming enabled
rouge_metrics = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_metrics, use_stemmer=True)

# Summarize one article and fetch its reference summary
sample_idx = 0
sample_article = df['article'][sample_idx]
generated = generate_summary(sample_article, calibrated_model)
reference = df['summary'][sample_idx]

# Show the (truncated) article, the reference and the generated summary
for section_title, section_text in (
    ("\n=== Original Article ===", sample_article[:500] + "..."),
    ("\n=== Reference Summary ===", reference),
    ("\n=== Generated Summary ===", generated),
):
    print(section_title)
    print(section_text)

# ROUGE Evaluation: one line per metric
scores = scorer.score(reference, generated)
print("\n=== ROUGE Scores ===")
for rouge_label, rouge_key in (("ROUGE-1", 'rouge1'), ("ROUGE-2", 'rouge2'), ("ROUGE-L", 'rougeL')):
    rouge_result = scores[rouge_key]
    print(f"{rouge_label}: F1={rouge_result.fmeasure:.3f} (Recall={rouge_result.recall:.3f}, Precision={rouge_result.precision:.3f})")

# NOTE(review): this redefinition shadows the `generate_summary` defined
# earlier in this cell; keep only one of the two.
def generate_summary(article, model, top_n=3):
    """Build an extractive summary from the ``top_n`` highest-scoring sentences.

    Each sentence is scored with the positive-class probability from
    ``model.predict_proba``; the ``top_n`` best are kept and emitted in their
    original article order. No diversity or redundancy control is applied.

    Args:
        article: raw article text.
        model: fitted classifier exposing ``predict_proba``.
        top_n: number of sentences to keep.

    Returns:
        str: the joined sentences, with the first character upper-cased and a
        trailing '.' added if the text does not already end in '.', '!' or
        '?'. The article is returned unchanged when it has at most ``top_n``
        sentences, and ``''`` is returned when ``top_n`` is not positive.
    """
    if top_n <= 0:
        # Nothing to select; previously this path crashed on summary[0].
        return ''

    sentences = sent_tokenize(article)
    if len(sentences) <= top_n:
        return article

    clean_sents = preprocess(article)
    features = extract_advanced_features(clean_sents)
    probas = model.predict_proba(features)[:, 1]

    # A stable sort on the negated scores picks the same sentences as repeated
    # argmax: ties go to the earliest sentence.
    top_indices = np.argsort(-probas, kind='stable')[:top_n]

    # Maintain original order
    selected_indices = sorted(int(i) for i in top_indices)
    summary = ' '.join(sentences[i] for i in selected_indices)

    # Post-processing
    summary = summary[:1].upper() + summary[1:]
    if not summary.endswith(('.', '!', '?')):
        summary = summary + '.'
    return summary

# NOTE(review): this repeats the evaluation performed just above in this cell.
# ROUGE-1 / ROUGE-2 / ROUGE-L scorer with stemming enabled
rouge_metrics = ['rouge1', 'rouge2', 'rougeL']
scorer = rouge_scorer.RougeScorer(rouge_metrics, use_stemmer=True)

# Summarize one article and fetch its reference summary
sample_idx = 0
sample_article = df['article'][sample_idx]
generated = generate_summary(sample_article, calibrated_model)
reference = df['summary'][sample_idx]

# Show the (truncated) article, the reference and the generated summary
for section_title, section_text in (
    ("\n=== Original Article ===", sample_article[:500] + "..."),
    ("\n=== Reference Summary ===", reference),
    ("\n=== Generated Summary ===", generated),
):
    print(section_title)
    print(section_text)

# ROUGE Evaluation: one line per metric
scores = scorer.score(reference, generated)
print("\n=== ROUGE Scores ===")
for rouge_label, rouge_key in (("ROUGE-1", 'rouge1'), ("ROUGE-2", 'rouge2'), ("ROUGE-L", 'rougeL')):
    rouge_result = scores[rouge_key]
    print(f"{rouge_label}: F1={rouge_result.fmeasure:.3f} (Recall={rouge_result.recall:.3f}, Precision={rouge_result.precision:.3f})")
