<a href="https://colab.research.google.com/github/khemsu/NLP_Training-models/blob/main/RandomForestClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
import os

# Mount Google Drive into the Colab filesystem so the dataset folders below are reachable.
drive.mount('/content/drive')

# Set paths to your folders
# (business-category articles and their reference summaries; edit these for another category)
articles_path = "/content/drive/MyDrive/BBC News Summary/News Articles/business"
summaries_path = "/content/drive/MyDrive/BBC News Summary/Summaries/business"

# Verify files
# Both listings are sorted by filename; later cells pair article i with summary i by position.
article_files = sorted(os.listdir(articles_path))
summary_files = sorted(os.listdir(summaries_path))

# Quick sanity check: the two counts are expected to match.
print(f"Found {len(article_files)} articles and {len(summary_files)} summaries")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Found 510 articles and 510 summaries


In [None]:
def read_text_files(filepath_list, folder_path):
    """Read the named files from a folder and return their text contents.

    Each file is decoded as UTF-8 first. If that fails, it is re-read as
    latin1, which accepts any byte sequence. Decoding UTF-8 files as latin1
    unconditionally turns multi-byte characters (for example the pound sign)
    into two-character mojibake.

    Args:
        filepath_list: Iterable of file names located directly in ``folder_path``.
            No extension filtering is applied; every listed name is opened.
        folder_path: Directory that contains the files.

    Returns:
        list[str]: File contents, in the same order as ``filepath_list``.

    Raises:
        OSError: If a file cannot be opened.
    """
    contents = []
    for filename in filepath_list:
        path = os.path.join(folder_path, filename)
        try:
            with open(path, 'r', encoding='utf-8') as f:
                text = f.read()
        except UnicodeDecodeError:
            # Not valid UTF-8: fall back to the single-byte encoding used before.
            with open(path, 'r', encoding='latin1') as f:
                text = f.read()
        contents.append(text)
    return contents

# Pull every article and every reference summary into memory
articles = read_text_files(folder_path=articles_path, filepath_list=article_files)
summaries = read_text_files(folder_path=summaries_path, filepath_list=summary_files)

# One row per article/summary pair, keeping the source filenames for traceability
import pandas as pd
df = pd.DataFrame(
    dict(
        article=articles,
        summary=summaries,
        article_filename=article_files,
        summary_filename=summary_files,
    )
)

print(f"Loaded {len(df)} article-summary pairs")
print(df.head(3))

Loaded 510 article-summary pairs
                                             article  \
0  Ad sales boost Time Warner profit\n\nQuarterly...   
1  Dollar gains on Greenspan speech\n\nThe dollar...   
2  Yukos unit buyer faces loan claim\n\nThe owner...   

                                             summary article_filename  \
0  TimeWarner said fourth quarter sales rose 2% t...          001.txt   
1  The dollar has hit its highest level against t...          002.txt   
2  Yukos' owner Menatep Group says it will ask Ro...          003.txt   

  summary_filename  
0          001.txt  
1          002.txt  
2          003.txt  


In [None]:
# Spot-check three random rows: each article preview should match the summary printed after it
for _, pair in df.sample(3).iterrows():
    print(
        "=== Article ===",
        pair["article"][:500] + "...",   # preview only: first 500 characters
        "\n=== Summary ===",
        pair["summary"],
        "\n" + "="*50 + "\n",
        sep="\n",
    )

=== Article ===
Fosters buys stake in winemaker

Australian brewer Fosters has bought a large stake in Australian winemaker Southcorp, sparking rumours of a possible takeover.

Fosters bought 18.8% of Southcorp, the global winemaker behind the Penfolds, Lindemans and Rosemount brands, for 4.17 Australian dollars per share. A bid at that price would value the company at A$3.1bn ($2.4bn; Â£1.25bn ). Fosters said it was currently in discussions "which may lead to a major corporate announcement".

In a separate sta...

=== Summary ===
Australian brewer Fosters has bought a large stake in Australian winemaker Southcorp, sparking rumours of a possible takeover.Fosters bought 18.8% of Southcorp, the global winemaker behind the Penfolds, Lindemans and Rosemount brands, for 4.17 Australian dollars per share.Fosters bought the 18.8% stake from Reline Investments, the family investment firm for the Oatleys, who founded the Rosemount Estates label and sold it to Southcorp in 2001.Since then, it ha

## Preprocessing

In [None]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK data used by the tokenizers and the stopword filter.
# 'punkt_tab' is included because it was reported as a missing resource.
for resource in ('punkt', 'stopwords', 'punkt_tab'):
    nltk.download(resource)

def preprocess(text):
    """Split text into sentences and reduce each one to its content words.

    Every sentence is lower-cased and word-tokenised. Tokens that are not
    purely alphanumeric, or that are English stopwords, are dropped. The
    remaining tokens are re-joined with single spaces.

    Args:
        text: Raw document text.

    Returns:
        list[str]: One cleaned string per sentence found by ``sent_tokenize``,
        in the original order. A sentence with no surviving tokens yields
        ``''`` instead of being removed, so positions stay aligned with
        ``sent_tokenize(text)`` (``generate_summary`` relies on this).
    """
    # Look the stopwords up once per call instead of once per token; a set
    # also makes each membership test constant-time.
    stop_words = set(stopwords.words('english'))

    clean_sentences = []
    for sent in sent_tokenize(text):
        words = [
            w for w in word_tokenize(sent.lower())
            if w.isalnum() and w not in stop_words
        ]
        clean_sentences.append(' '.join(words))
    return clean_sentences

# Add a cleaned-sentence-list column for each raw text column
for raw_col, clean_col in (('article', 'clean_article'), ('summary', 'clean_summary')):
    df[clean_col] = df[raw_col].apply(preprocess)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def extract_features(articles):
    """Build a two-column feature matrix for every article.

    Args:
        articles: Iterable of articles, each given as a list of cleaned
            sentence strings.

    Returns:
        list[numpy.ndarray]: One array of shape ``(n_sentences, 2)`` per
        article. Column 0 is the sentence position scaled to ``[0, 1]``
        (a single-sentence article gets 0). Column 1 is the mean TF-IDF
        weight of the sentence, using a vectorizer fitted on that article
        alone. An article with no sentences yields an empty ``(0, 2)`` array.
    """
    all_features = []
    for sentences in articles:
        n_sentences = len(sentences)
        if n_sentences == 0:
            # Nothing to vectorise; keep the column count so vstack still works.
            all_features.append(np.empty((0, 2)))
            continue

        # Position feature (normalized 0-1)
        positions = np.linspace(0, 1, num=n_sentences)

        # TF-IDF features
        try:
            tfidf = TfidfVectorizer().fit_transform(sentences)
        except ValueError:
            # fit_transform raises when no sentence contributes a token
            # (empty vocabulary); treat every sentence as carrying no weight.
            tfidf_avg = np.zeros(n_sentences)
        else:
            tfidf_avg = np.array(tfidf.mean(axis=1)).flatten()

        # Combine features
        features = np.column_stack((positions, tfidf_avg))
        all_features.append(features)
    return all_features

# One (n_sentences, 2) array per article: [normalised position, mean TF-IDF weight].
X_features = extract_features(df['clean_article'])

In [None]:
# Preview only the first two articles; the full list holds one array per article.
X_features[:2]

[array([[0.        , 0.02449732],
        [0.05263158, 0.01901744],
        [0.10526316, 0.01421   ],
        [0.15789474, 0.01906455],
        [0.21052632, 0.01416708],
        [0.26315789, 0.01281812],
        [0.31578947, 0.01740448],
        [0.36842105, 0.02159626],
        [0.42105263, 0.02161367],
        [0.47368421, 0.02248348],
        [0.52631579, 0.01735804],
        [0.57894737, 0.02547379],
        [0.63157895, 0.01632923],
        [0.68421053, 0.02256235],
        [0.73684211, 0.02166743],
        [0.78947368, 0.01910192],
        [0.84210526, 0.01752047],
        [0.89473684, 0.01946125],
        [0.94736842, 0.02324261],
        [1.        , 0.01474515]]),
 array([[0.        , 0.02749431],
        [0.07142857, 0.0240039 ],
        [0.14285714, 0.01753324],
        [0.21428571, 0.01628057],
        [0.28571429, 0.02902273],
        [0.35714286, 0.02845524],
        [0.42857143, 0.02052052],
        [0.5       , 0.01512512],
        [0.57142857, 0.02489431],
        [0.6

In [None]:
def label_sentences(articles, summaries):
    """Mark which article sentences appear in the reference summary.

    Args:
        articles: Iterable of articles, each a list of cleaned sentence strings.
        summaries: Iterable of summaries aligned with ``articles``, each a list
            of cleaned sentence strings.

    Returns:
        list[list[int]]: For every article, one label per sentence: 1 when
        some non-blank summary sentence is a substring of the article
        sentence, otherwise 0.
    """
    y_labels = []
    for art_sents, sum_sents in zip(articles, summaries):
        # Blank summary sentences must be ignored: '' is a substring of every
        # string and would label every article sentence as positive.
        targets = [sum_sent for sum_sent in sum_sents if sum_sent.strip()]
        labels = [
            1 if any(target in sent for target in targets) else 0
            for sent in art_sents
        ]
        y_labels.append(labels)
    return y_labels

# Per article, a list of 0/1 labels: 1 where a cleaned summary sentence occurs inside the cleaned article sentence.
y_labels = label_sentences(df['clean_article'], df['clean_summary'])

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Collapse the per-article lists into one sentence-level feature matrix and label vector
X = np.vstack(X_features)
y = np.concatenate(y_labels)

# Hold out 20% of the sentences for evaluation (fixed seed for repeatability)
# NOTE(review): the split is per sentence, so sentences from one article can land on both sides.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Fit a 100-tree forest; fit() hands back the fitted estimator
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Mean accuracy on the training and held-out sentences
print(f"Training Accuracy: {model.score(X_train, y_train):.2f}")
print(f"Test Accuracy: {model.score(X_test, y_test):.2f}")

Training Accuracy: 1.00
Test Accuracy: 0.97


In [None]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
from rouge import Rouge

def generate_summary(article, model, top_n=2):
    """Build an extractive summary from the highest-scoring sentences.

    Args:
        article: Raw article text.
        model: Fitted classifier exposing ``predict_proba``. Column 1 of its
            output is used as the score for "belongs in the summary"
            (assumes the model was trained on both labels 0 and 1).
        top_n: Maximum number of sentences to keep. Values larger than the
            sentence count keep every sentence; values of 0 or below keep none.

    Returns:
        str: The selected original sentences joined by single spaces, in the
        order they appear in the article. Returns ``''`` when the article has
        no sentences or ``top_n`` is not positive.
    """
    sentences = sent_tokenize(article)

    # Clamp the request. A plain [-top_n:] slice would return every sentence
    # for top_n == 0, and an empty article cannot be vectorised at all.
    n_keep = min(max(top_n, 0), len(sentences))
    if n_keep == 0:
        return ''

    # preprocess() yields one cleaned string per tokenised sentence, so row i
    # of the features corresponds to sentences[i].
    clean_sents = preprocess(article)
    features = extract_features([clean_sents])[0]
    probas = model.predict_proba(features)[:, 1]  # Probability of being in summary

    ranked = np.argsort(probas)        # ascending: best candidates are at the end
    top_indices = ranked[-n_keep:]     # Get top N sentences
    return ' '.join([sentences[i] for i in sorted(top_indices)])  # Keep original order

# Test on first article
# Generate a summary for row 0 and keep its reference summary for comparison.
sample_idx = 0
generated_summary = generate_summary(df['article'][sample_idx], model)
reference_summary = df['summary'][sample_idx]

print("=== Original Article ===")
print(df['article'][sample_idx][:1000] + "...")  # Show first 1000 chars

print("\n=== Reference Summary ===")
print(reference_summary)

print("\n=== Generated Summary ===")
print(generated_summary)

# ROUGE Evaluation
# The generated text is passed first and the reference second;
# only the F1 component of each metric is reported below.
rouge = Rouge()
scores = rouge.get_scores(generated_summary, reference_summary)
print("\n=== ROUGE Scores ===")
print(f"ROUGE-1: F1={scores[0]['rouge-1']['f']:.3f}")
print(f"ROUGE-2: F1={scores[0]['rouge-2']['f']:.3f}")
print(f"ROUGE-L: F1={scores[0]['rouge-l']['f']:.3f}")

=== Original Article ===
Ad sales boost Time Warner profit

Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (Â£600m) for the three months to December, from $639m year-earlier.

The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.

Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers a