#### Imports

In [7]:
# imports

import pandas as pd
import nltk
nltk.download('punkt')
from datasets import Dataset, DatasetDict
import warnings
# Suppress all warnings
warnings.filterwarnings('ignore')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kiddstudio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


#### Load Data

In [2]:
# load data

test_dataset = pd.read_csv('../data/processed/new_ds_test_dataset.csv')
train_dataset = pd.read_csv('../data/processed/new_ds_train_dataset.csv')
#validation_dataset = pd.read_csv('../data/validation_set.csv')

In [3]:
train = Dataset.from_pandas(train_dataset)
test = Dataset.from_pandas(train_dataset)

# reconstruct both datasets into a Dataset Dict object
new_ds = DatasetDict(
    {
        'train': train,
        'test': test
    }
)

#### Apply Tokenization to Documents Corpus using TF-IDF

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Ensure all documents are strings
train_dataset['document'] = train_dataset['document'].astype(str)
train_dataset['summary'] = train_dataset['summary'].astype(str)
test_dataset['document'] = test_dataset['document'].astype(str)
test_dataset['summary'] = test_dataset['summary'].astype(str)

# Handle missing values (remove rows with missing documents or summaries)
train_dataset.dropna(subset=['document', 'summary'], inplace=True)
test_dataset.dropna(subset=['document', 'summary'], inplace=True)

# Extract documents
train_documents = train_dataset['document'].tolist()
test_documents = test_dataset['document'].tolist()


##### Apply TF-IDF Vectorization


In [5]:
# Apply TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # You can adjust the max_features parameter
X_train = vectorizer.fit_transform(train_documents)
X_test = vectorizer.transform(test_documents)

# Print some information about the TF-IDF vectors
print("TF-IDF Train shape:", X_train.shape)
print("TF-IDF Test shape:", X_test.shape)


TF-IDF Train shape: (44972, 5000)
TF-IDF Test shape: (5622, 5000)


In [6]:
feature_names = vectorizer.get_feature_names_out()
print("Sample features:", feature_names[:10])

Sample features: ['10' '100' '1000' '10000' '100000' '11' '12' '120' '13' '130']


#### Build a Model

##### Preprocess Data

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt')

# Extract documents
train_documents = train_dataset['document'].tolist()
test_documents = test_dataset['document'].tolist()

# Preprocess the documents
def preprocess_text(text):
    sentences = sent_tokenize(text)
    return sentences

train_sentences = [preprocess_text(doc) for doc in train_documents]
test_sentences = [preprocess_text(doc) for doc in test_documents]

train_sentences_flat = [sentence for sublist in train_sentences for sentence in sublist]


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kiddstudio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


##### Apply TF-IDF Vectorization

In [9]:
# Apply TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X_train = vectorizer.fit_transform(train_sentences_flat)

# Create labels for training (1 for important sentences, 0 for others)
y_train = []
for i, summary in enumerate(train_dataset['summary']):
    summary_sentences = preprocess_text(summary)
    doc_sentences = train_sentences[i]
    labels = [1 if sentence in summary_sentences else 0 for sentence in doc_sentences]
    y_train.extend(labels)

# Check that the number of sentences matches the number of labels
assert len(train_sentences_flat) == len(y_train), "Mismatch between number of sentences and labels"


##### Train a Random Forest Model

In [10]:
from sklearn.ensemble import RandomForestClassifier

# Train a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)



##### Generate Summaries

In [18]:
# Define the summary generation function with edge case handling
def generate_summary(document, model, vectorizer, top_n=5):
    sentences = preprocess_text(document)
    if len(sentences) == 0:
        return ""
    X = vectorizer.transform(sentences)
    if X.shape[0] == 0:
        return ""
    if X.shape[0] == 1:
        return sentences[0]  # If there's only one sentence, return it as the summary
    probs = model.predict_proba(X)
    if probs.shape[1] > 1:
        probs = probs[:, 1]
    else:
        probs = probs[:, 0]
    top_sentence_indices = probs.argsort()[-top_n:][::-1]
    summary = ' '.join([sentences[i] for i in top_sentence_indices])
    return summary

# Test the model on the test dataset
generated_summaries = [generate_summary(doc, model, vectorizer) for doc in test_documents]


##### Evaluate the model

In [19]:
from datasets import load_metric
import pandas as pd

# Load ROUGE metric
rouge = load_metric('rouge')

# Get reference summaries from the test dataset
reference_summaries = test_dataset['summary'].tolist()

# Calculate ROUGE scores
results = rouge.compute(predictions=generated_summaries, references=reference_summaries)

# Create a DataFrame to display the results
def format_rouge_scores(results):
    rows = []
    for key in results.keys():
        score_dict = results[key]
        row = {
            'metric': key,
            'precision': score_dict.mid.precision,
            'recall': score_dict.mid.recall,
            'fmeasure': score_dict.mid.fmeasure
        }
        rows.append(row)
    return pd.DataFrame(rows)

# Format the results into a DataFrame
rouge_df = format_rouge_scores(results)

# Display the DataFrame
print(rouge_df)


      metric  precision    recall  fmeasure
0     rouge1   0.150111  0.517855  0.215439
1     rouge2   0.036897  0.122005  0.052441
2     rougeL   0.076659  0.251571  0.108279
3  rougeLsum   0.076687  0.251599  0.108354


##### Summmary of results

-**ROUGE-1:** The model has a relatively better performance in capturing relevant single words (unigrams) from the reference summaries. The high recall indicates that the generated summaries contain a significant portion of the words from the reference summaries, but the low precision suggests that the model may include a lot of irrelevant words.

-**ROUGE-2:** The performance drops significantly for bigrams, indicating that the model struggles to capture meaningful phrases from the reference summaries.

-**ROUGE-L and ROUGE-LSum:** The performance for capturing the longest common subsequences is also low, suggesting that the coherence and structure of the generated summaries could be improved.

This implementation demonstrates how to build a document summarization model using a Random Forest classifier trained on TF-IDF features. The model classifies sentences as important or not, and the top classified sentences are used to form a summary. The model's performance is evaluated using ROUGE metrics and displayed in a tabular format for easy comparison. This approach ensures that the model is trained and tested on the preprocessed data, providing a complete workflow from data preparation to evaluation