<a href="https://colab.research.google.com/github/khemsu/NLP_Training-models/blob/main/GradientBoostingClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install numpy pandas nltk scikit-learn rouge-score transformers
!python -m nltk.downloader punkt stopwords averaged_perceptron_tagger

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=7df61891b151ff53673b1740128df294fb8158daa62e8e8654796dbf9928dc12
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [2]:
import os
import re
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from rouge_score import rouge_scorer
from google.colab import drive

In [3]:
# Mount Google Drive
drive.mount('/content/drive')

# Set paths to your BBC dataset
articles_path = "/content/drive/MyDrive/BBC News Summary/News Articles/business"
summaries_path = "/content/drive/MyDrive/BBC News Summary/Summaries/business"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
def read_text_files(filepath_list, folder_path):
    """Read all .txt files and return their contents with proper encoding"""
    contents = []
    for filename in filepath_list:
        with open(os.path.join(folder_path, filename), 'r', encoding='latin1') as f:
            contents.append(f.read())
    return contents

def preprocess(text):
    """Clean and tokenize text while preserving sentence structure"""
    sentences = sent_tokenize(text)
    clean_sentences = []
    for sent in sentences:
        # Basic cleaning
        words = word_tokenize(sent.lower())
        words = [w for w in words if w.isalnum() and w not in stopwords.words('english')]
        clean_sentences.append(' '.join(words))
    return clean_sentences


In [5]:
# Load and pair data
article_files = sorted(os.listdir(articles_path))
summary_files = sorted(os.listdir(summaries_path))

articles = read_text_files(article_files, articles_path)
summaries = read_text_files(summary_files, summaries_path)

df = pd.DataFrame({
    'article': articles,
    'summary': summaries,
    'article_filename': article_files,
    'summary_filename': summary_files
})

In [6]:
print(f"Loaded {len(df)} article-summary pairs")
print("\nSample pair verification:")
print("Article:", df['article'][0][:100] + "...")
print("Summary:", df['summary'][0][:100] + "...")

Loaded 510 article-summary pairs

Sample pair verification:
Article: Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.1...
Summary: TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn.For the full-year, TimeWarner p...


In [7]:
def extract_advanced_features(sentences):
    """Extract multiple linguistic features for each sentence"""
    features = []
    positions = np.linspace(0, 1, num=len(sentences))
    tfidf = TfidfVectorizer().fit_transform(sentences)
    tfidf_avg = np.array(tfidf.mean(axis=1)).flatten()

    for i, sent in enumerate(sentences):
        # Quantitative features
        numbers = len(re.findall(r'\$?\d+(?:\.\d+)?%?', sent))
        proper_nouns = len([word for word, tag in pos_tag(word_tokenize(sent)) if tag == 'NNP'])

        # Structural features
        word_count = len(word_tokenize(sent))
        char_count = len(sent)
        is_question = 1 if sent.strip().endswith('?') else 0

        # Semantic features (simplified)
        has_connector = 1 if any(word in sent for word in ['however', 'therefore', 'although']) else 0

        features.append([
            positions[i],      # Normalized position (0=start, 1=end)
            tfidf_avg[i],     # TF-IDF importance
            numbers,          # Count of numerical values
            proper_nouns,     # Count of proper nouns
            word_count,       # Word length
            char_count/100,   # Normalized character length
            is_question,      # Is question sentence
            has_connector     # Contains discourse connector
        ])

    return np.array(features)

In [8]:
def label_sentences(articles, summaries):
    """Generate binary labels (1=in summary, 0=not in summary)"""
    y_labels = []
    for art_sents, sum_sents in zip(articles, summaries):
        labels = []
        for art_sent in art_sents:
            # Check if any summary sentence is contained in article sentence
            match = 0
            for sum_sent in sum_sents:
                if sum_sent in art_sent or art_sent in sum_sent:
                    match = 1
                    break
            labels.append(match)
        y_labels.append(labels)
    return y_labels

In [26]:
import nltk

# Download all required NLTK data
nltk.download([
    'punkt',          # Tokenizer
    'stopwords',      # Stopwords list
    'wordnet',        # WordNet lemmatizer
    'averaged_perceptron_tagger_eng'  # POS tagger
])


# Preprocess and label
df['clean_article'] = df['article'].apply(preprocess)
df['clean_summary'] = df['summary'].apply(preprocess)
y_labels = label_sentences(df['clean_article'], df['clean_summary'])

## 4. Model Training =====================================================

# Feature extraction
X_features = []
for sentences in df['clean_article']:
    features = extract_advanced_features(sentences)
    X_features.append(features)

# Prepare training data
X = np.vstack(X_features)
y = np.concatenate(y_labels)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Train calibrated model
model = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)
calibrated_model = CalibratedClassifierCV(model, cv=5)
calibrated_model.fit(X_train, y_train)

print(f"\nTraining Accuracy: {calibrated_model.score(X_train, y_train):.2f}")
print(f"Test Accuracy: {calibrated_model.score(X_test, y_test):.2f}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!



Training Accuracy: 0.92
Test Accuracy: 0.92


In [27]:
def generate_summary(article, model, top_n=3):
    """Generate extractive summary with diversity and coherence"""
    sentences = sent_tokenize(article)
    if len(sentences) <= top_n:
        return article

    clean_sents = preprocess(article)
    features = extract_advanced_features(clean_sents)
    probas = model.predict_proba(features)[:, 1]

    # Select top sentences with diversity
    selected_indices = []
    for _ in range(top_n):
        remaining = [i for i in range(len(sentences)) if i not in selected_indices]
        next_idx = remaining[np.argmax(probas[remaining])]
        selected_indices.append(next_idx)

    # Maintain original order
    selected_indices.sort()
    summary = ' '.join([sentences[i] for i in selected_indices])

    # Post-processing
    summary = summary[0].upper() + summary[1:]
    if not summary.endswith(('.', '!', '?')):
        summary = summary + '.'
    return summary

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Evaluate on sample
sample_idx = 0
generated = generate_summary(df['article'][sample_idx], calibrated_model)
reference = df['summary'][sample_idx]

print("\n=== Original Article ===")
print(df['article'][sample_idx][:500] + "...")

print("\n=== Reference Summary ===")
print(reference)

print("\n=== Generated Summary ===")
print(generated)

# ROUGE Evaluation
scores = scorer.score(reference, generated)
print("\n=== ROUGE Scores ===")
print(f"ROUGE-1: F1={scores['rouge1'].fmeasure:.3f} (Recall={scores['rouge1'].recall:.3f}, Precision={scores['rouge1'].precision:.3f})")
print(f"ROUGE-2: F1={scores['rouge2'].fmeasure:.3f} (Recall={scores['rouge2'].recall:.3f}, Precision={scores['rouge2'].precision:.3f})")
print(f"ROUGE-L: F1={scores['rougeL'].fmeasure:.3f} (Recall={scores['rougeL'].recall:.3f}, Precision={scores['rougeL'].precision:.3f})")


=== Original Article ===
Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time ...

=== Reference Summary ===
TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn.For the full-year, TimeWarner posted a profit of $3.36bn, up 27% from its 2003 performance, while revenues grew 6.4% to $42.09bn.Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger in