<a href="https://colab.research.google.com/github/khemsu/Data_Warehosuing_and_mining_labs/blob/main/GradientBoostingnormalFeature.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install numpy pandas nltk scikit-learn sentence-transformers xgboost rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-many

In [6]:
import os
import re
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag, ne_chunk
from nltk.tree import Tree
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from rouge_score import rouge_scorer
import nltk
import xgboost as xgb

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

articles_path = "/content/drive/MyDrive/BBC News Summary/News Articles/business"
summaries_path = "/content/drive/MyDrive/BBC News Summary/Summaries/business"

# === File Reading ===
def read_text_files(filepath_list, folder_path):
    contents = []
    for filename in filepath_list:
        with open(os.path.join(folder_path, filename), 'r', encoding='latin1') as f:
            contents.append(f.read())
    return contents

article_files = sorted([f for f in os.listdir(articles_path) if f.endswith('.txt')])
summary_files = sorted([f for f in os.listdir(summaries_path) if f.endswith('.txt')])

article_texts = read_text_files(article_files, articles_path)
summary_texts = read_text_files(summary_files, summaries_path)

df = pd.DataFrame({'article': article_texts, 'summary': summary_texts})

# === Preprocessing ===
def preprocess(text):
    sentences = sent_tokenize(text)
    clean_sentences = []
    for sent in sentences:
        words = word_tokenize(sent.lower())
        words = [w for w in words if w.isalnum() and w not in stopwords.words('english')]
        clean_sentences.append(' '.join(words))
    return clean_sentences

# === Semantic Sentence Labeling ===
def label_sentences_semantic(articles, summaries):
    from sentence_transformers import SentenceTransformer
    model_sbert = SentenceTransformer('all-MiniLM-L6-v2')
    labels = []
    for art_sents, sum_sents in zip(articles, summaries):
        if not art_sents or not sum_sents:
            labels.append([0]*len(art_sents))
            continue
        emb_art = model_sbert.encode(art_sents)
        emb_sum = model_sbert.encode(sum_sents)
        sim_matrix = cosine_similarity(emb_art, emb_sum)
        max_sim = sim_matrix.max(axis=1)
        labels.append((max_sim >= 0.7).astype(int).tolist())
    return labels

# === Named Entity Count ===
def count_named_entities(sent):
    chunks = ne_chunk(pos_tag(word_tokenize(sent)))
    return len([chunk for chunk in chunks if isinstance(chunk, Tree)])

# === Feature Extraction (TF-IDF Based) ===
def extract_advanced_features(sentences):
    features = []
    positions = np.linspace(0, 1, num=len(sentences))
    tfidf = TfidfVectorizer().fit_transform(sentences)
    tfidf_avg = np.array(tfidf.mean(axis=1)).flatten()

    for i, sent in enumerate(sentences):
        numbers = len(re.findall(r'\$?\d+(?:\.\d+)?%?', sent))
        proper_nouns = len([word for word, tag in pos_tag(word_tokenize(sent)) if tag == 'NNP'])
        word_count = len(word_tokenize(sent))
        char_count = len(sent)
        is_question = 1 if sent.strip().endswith('?') else 0
        has_connector = 1 if any(word in sent for word in ['however', 'therefore', 'although']) else 0

        features.append([
            positions[i],
            tfidf_avg[i],
            numbers,
            proper_nouns,
            word_count,
            char_count/100,
            is_question,
            has_connector
        ])

    return np.array(features)

# === Prepare Dataset ===
df['clean_article'] = df['article'].apply(preprocess)
df['clean_summary'] = df['summary'].apply(preprocess)
y_labels = label_sentences_semantic(df['clean_article'], df['clean_summary'])

X_features, y_all = [], []
for i, row in df.iterrows():
    feats = extract_advanced_features(row['clean_article'])
    X_features.append(feats)
    y_all.extend(y_labels[i])

X = np.vstack(X_features)
y = np.array(y_all)

# === Scaling ===
scaler = StandardScaler()
X = scaler.fit_transform(X)

# === Train Model ===
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
model = xgb.XGBClassifier(n_estimators=300, max_depth=4, learning_rate=0.05)
model.fit(X_train, y_train)

print("\nClassification Report:\n", classification_report(y_test, model.predict(X_test)))

# === MMR Summary Generator ===
def mmr_summary(article, model, top_n=3):
    from sentence_transformers import SentenceTransformer
    model_sbert = SentenceTransformer('all-MiniLM-L6-v2')
    sents = sent_tokenize(article)
    if len(sents) <= top_n:
        return article
    clean_sents = preprocess(article)
    features = extract_advanced_features(clean_sents)
    features = scaler.transform(features)
    probas = model.predict_proba(features)[:, 1]
    emb = model_sbert.encode(clean_sents)

    selected_idx = []
    for _ in range(top_n):
        remaining = [i for i in range(len(sents)) if i not in selected_idx]
        if not remaining:
            break
        mmr_scores = []
        for i in remaining:
            score = probas[i]
            redundancy = max([cosine_similarity([emb[i]], [emb[j]])[0][0] for j in selected_idx], default=0)
            mmr = 0.8 * score - 0.2 * redundancy
            mmr_scores.append((i, mmr))
        next_idx = max(mmr_scores, key=lambda x: x[1])[0]
        selected_idx.append(next_idx)

    selected_idx.sort()
    summary = ' '.join([sents[i] for i in selected_idx])
    return summary

# === Evaluate on a Sample ===
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
sample_idx = 0
ref = df['summary'][sample_idx]
gen = mmr_summary(df['article'][sample_idx], model)
print("\n=== Generated Summary ===\n", gen)
print("\n=== Reference Summary ===\n", ref)
scores = scorer.score(ref, gen)
print("\n=== ROUGE Scores ===")
for key in scores:
    s = scores[key]
    print(f"{key}: F1={s.fmeasure:.3f} (R={s.recall:.3f}, P={s.precision:.3f})")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.97      0.91      1316
           1       0.62      0.22      0.32       284

    accuracy                           0.84      1600
   macro avg       0.74      0.59      0.62      1600
weighted avg       0.81      0.84      0.80      1600


=== Generated Summary ===
 Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers and will try to sign up AOL's existing customers for high-speed broadband. But its film division saw profits slump 27% to $284m, helped by box-office flops Alexander and Catwoman, a sharp contrast to year-earlier, when the third and final film in the Lord of the Rings trilogy boosted results.

=== Reference Summary ===
 TimeWarner said fourth q

In [5]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [1]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True