In [2]:
import ast
import os
import re

import dask.dataframe as dd
import matplotlib.pyplot as plt
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm
from transformers import pipeline, AutoTokenizer

In [3]:
# Choose the compute device: CUDA when PyTorch reports a usable GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [13]:
# Sentence-embedding model shared by the embedding and similarity helpers below.
# It is created on the device chosen earlier in the notebook.
model = SentenceTransformer(
    'all-MiniLM-L6-v2',
    device=device,
)

In [None]:
# Tokenizer used only to COUNT tokens while grouping sentences into chunks.
# It is loaded from the same checkpoint as the summarization pipeline
# ("allenai/led-base-16384") so the counts reflect what the summarizer will
# actually see; a tokenizer from a different model family has a different
# vocabulary and would measure chunk sizes incorrectly.
tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384")

# Per-chunk token budget used when splitting long text. This is a chunking
# budget chosen by the notebook, not a value read from the model config;
# adjust it if the summarization checkpoint changes.
token_limit = 1024

In [12]:
def load_large_csv_with_dask(filepath):
    """
    Read a CSV file with Dask and return it as a single pandas DataFrame.

    Parameters
    ----------
    filepath : str
        Path (or glob) handed to ``dask.dataframe.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The fully materialised table with a fresh 0..n-1 index. Note that
        ``compute()`` loads the whole dataset into memory, so the result must
        still fit in RAM.
    """
    print("Loading large CSV file using Dask...")
    # assume_missing=True is passed so Dask treats unspecified integer columns
    # as possibly containing missing values.
    ddf = dd.read_csv(filepath, assume_missing=True)
    frame = ddf.compute()
    # Dask reads the file in partitions and each partition carries its own
    # row numbering, so the concatenated frame can contain repeated index
    # labels. Reset to a unique RangeIndex so label-based operations
    # downstream (.loc, joins, alignment) behave as they would for a frame
    # read by pandas directly.
    return frame.reset_index(drop=True)

In [6]:
def compute_embeddings(df, batch_size=1000):
    """
    Encode the 'combined_text' column slice by slice and attach the vectors.

    The notebook-level ``model`` encodes ``batch_size`` rows at a time. The
    resulting vectors are stored, one per row, in a new
    'combined_embedding' column that is added to ``df`` in place; the same
    DataFrame object is returned.
    """
    print("Computing embeddings...")
    texts = df['combined_text']
    vectors = []
    slice_starts = range(0, len(df), batch_size)
    for start in tqdm(slice_starts, desc="Embedding Batches"):
        stop = start + batch_size
        encoded = model.encode(texts.iloc[start:stop].tolist(), convert_to_tensor=True)
        # Move to host memory and add one array per input row.
        for row_vector in encoded.cpu().numpy():
            vectors.append(row_vector)
    df['combined_embedding'] = vectors
    return df

In [14]:
def filter_by_keyword_with_embeddings(df, keyword, similarity_threshold=0.5):
    """
    Keep the rows whose stored embedding is semantically close to ``keyword``.

    The keyword is encoded with the notebook-level ``model`` and compared, by
    cosine similarity, with every entry of the 'combined_embedding' column.
    The scores are written to a 'similarity' column on ``df`` (in place), and
    the rows scoring strictly above ``similarity_threshold`` are returned.
    """
    print(f"Filtering rows for keyword: {keyword}")
    query_vector = model.encode(keyword, convert_to_tensor=True).cpu().numpy()

    scores = []
    for row_vector in df['combined_embedding']:
        scores.append(util.cos_sim(row_vector, query_vector).item())
    df['similarity'] = scores

    is_relevant = df['similarity'] > similarity_threshold
    return df[is_relevant]

In [None]:
# Summarization pipeline backed by the LED checkpoint. The device chosen
# earlier in the notebook is passed explicitly so the summarizer runs on the
# same device as the embedding model instead of relying on the library's
# default placement.
summarizer = pipeline("summarization", model="allenai/led-base-16384", device=device)

In [None]:
def generate_summary_with_model(text, max_length=130, min_length=30):
    """
    Summarize ``text`` by grouping its sentences into chunks and summarizing each chunk.

    Sentences (split on ". ") are accumulated into a chunk for as long as the
    notebook-level ``tokenizer`` counts at most ``token_limit`` tokens for it.
    Each chunk is then passed to the notebook-level ``summarizer`` and the
    per-chunk summaries are joined with single spaces.

    Parameters
    ----------
    text : str
        Text to summarize. Empty, whitespace-only or ``None`` input yields "".
    max_length, min_length : int
        Forwarded unchanged to the summarizer for every chunk.

    Returns
    -------
    str
        The concatenated chunk summaries.
    """
    if not text or not text.strip():
        # Nothing to summarize; avoid sending a bare "." to the model.
        return ""

    # Drop blank fragments so they never turn into stray "." sentences.
    sentences = [part.strip() for part in text.split(". ")]
    sentences = [part for part in sentences if part]

    chunks = []
    chunk = ""
    for sentence in sentences:
        # split(". ") removed the separator; restore one period unless the
        # sentence already ends with a terminator (typically the last one),
        # which previously produced a doubled "..".
        if not sentence.endswith((".", "!", "?")):
            sentence += "."

        candidate = f"{chunk} {sentence}" if chunk else sentence
        if len(tokenizer(candidate)["input_ids"]) <= token_limit:
            chunk = candidate
        else:
            # Only flush a chunk that has content: when the very first sentence
            # is already over the limit there is nothing to flush yet.
            if chunk:
                chunks.append(chunk)
            # A single sentence longer than the limit still becomes its own chunk.
            chunk = sentence

    if chunk:
        chunks.append(chunk)

    summaries = []
    for chunk_text in chunks:
        result = summarizer(chunk_text, max_length=max_length, min_length=min_length, do_sample=False)
        summaries.append(result[0]["summary_text"])

    return " ".join(summaries)


In [None]:
def preprocess_text(text):
    """
    Remove repeated sentences from ``text`` while keeping the original order.

    The text is split at whitespace that follows a "." or "?" (the
    look-behinds skip some abbreviation-like patterns), exact duplicate
    sentences are dropped, and the remaining sentences are re-joined with
    single spaces. The first occurrence of each sentence is the one kept.

    Parameters
    ----------
    text : str
        Input text. Empty or ``None`` input yields "".

    Returns
    -------
    str
        The de-duplicated text.
    """
    if not text:
        return ""
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    # dict.fromkeys de-duplicates while preserving first-seen order; a plain
    # set() would shuffle the sentences and make the output vary between runs.
    return " ".join(dict.fromkeys(sentences))

In [None]:
def summarize_by_category(df, keyword, max_sentences=2):
    """
    Build one summary per SWOT category from the rows tagged with that category.

    For each of "Strengths", "Weaknesses", "Opportunities" and "Threats", the
    rows whose 'categories' value contains the label are selected, their
    'combined_text' values are joined with spaces, and the result is passed to
    ``generate_summary_with_model``. A category with no rows, or with only
    blank text, gets a placeholder sentence instead.

    ``keyword`` and ``max_sentences`` are accepted but not used by this function.

    Returns a dict mapping each category label to its summary string.
    """
    results = {}
    for label in ("Strengths", "Weaknesses", "Opportunities", "Threats"):
        placeholder = f"No relevant content for {label}."

        # Rows tagged with the current label
        has_label = df["categories"].apply(lambda tags: label in tags)
        matching = df[has_label]
        if matching.empty:
            results[label] = placeholder
            continue

        merged_text = " ".join(matching["combined_text"].tolist())
        if merged_text.strip():
            results[label] = generate_summary_with_model(merged_text)
        else:
            # Rows exist but carry no usable text
            results[label] = placeholder
    return results


In [None]:
def filter_by_relevance(df, keyword, model, similarity_threshold=0.5):
    """
    Filter rows by relevance to the keyword using semantic similarity.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'combined_text' column.
    keyword : str
        Query text to compare every row against.
    model
        Object exposing ``encode(texts, convert_to_tensor=True)``.
    similarity_threshold : float
        Rows whose cosine similarity is greater than or equal to this value
        are kept.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df`` with their original index labels, columns
        and dtypes. When nothing matches, the result is empty but keeps the
        columns of ``df``.
    """
    print(f"Filtering rows for keyword: {keyword}")
    if len(df) == 0:
        # Nothing to score; return an empty frame that keeps the schema.
        return df.iloc[0:0]

    keyword_embedding = model.encode(keyword, convert_to_tensor=True)

    # Encode all texts in one call so the model can batch internally, instead
    # of invoking it once per row.
    text_embeddings = model.encode(df['combined_text'].tolist(), convert_to_tensor=True)

    keep_positions = [
        position
        for position, text_embedding in enumerate(text_embeddings)
        if util.cos_sim(text_embedding, keyword_embedding).item() >= similarity_threshold
    ]
    # Positional selection keeps columns and dtypes even when no row matches;
    # rebuilding a frame from a list of row Series would lose both.
    return df.iloc[keep_positions]

In [16]:
def visualize_category_distribution(df, output_path='static/publication_distribution.png'):
    """
    Save a bar chart of the number of articles per publisher.

    Despite the function name, the chart counts values of the 'publication'
    column, not SWOT categories.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'publication' column.
    output_path : str
        Image file to write. Its parent directory is created if missing.

    Returns
    -------
    None
        The chart is written to ``output_path``; nothing is written when
        ``df`` has no publication values.
    """
    publication_counts = df['publication'].value_counts()
    if publication_counts.empty:
        # There are no bars to draw, so skip the chart instead of failing.
        print("No publication data to plot.")
        return

    # savefig does not create directories; make sure the target folder exists.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    fig = plt.figure(figsize=(10, 6))
    try:
        publication_counts.plot(kind='bar', color='skyblue')
        plt.title('Number of Articles by Publisher')
        plt.xlabel('Publisher')
        plt.ylabel('Number of Articles')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.savefig(output_path)
    finally:
        # Always release the figure, even if plotting or saving raised.
        plt.close(fig)

In [17]:
def _parse_categories(raw):
    """Convert one raw 'categories' cell into a Python container of labels."""
    if isinstance(raw, (list, tuple, set)):
        return raw
    if not isinstance(raw, str) or not raw.strip():
        # Missing or blank cell: treat as "no categories".
        return []
    # NOTE(security): this used to be eval(), which executes arbitrary code
    # found in the CSV. literal_eval only accepts Python literals (lists,
    # tuples, strings, ...) and raises ValueError/SyntaxError for anything else.
    return ast.literal_eval(raw)


def run_summarization(filepath, keyword, similarity_threshold=0.5, max_sentences=5):
    """
    Run the complete summarization pipeline for a given keyword.

    Steps: load the CSV, build 'combined_text' from title and content, parse
    the 'categories' column, compute embeddings, keep rows similar to
    ``keyword``, summarize them per SWOT category and save the publication
    chart.

    Parameters
    ----------
    filepath : str
        CSV file with at least 'title', 'content' and 'categories' columns.
        The summarization and chart steps read 'categories', 'combined_text'
        and 'publication'.
    keyword : str
        Query used for the semantic filter.
    similarity_threshold : float
        Forwarded to ``filter_by_keyword_with_embeddings``.
    max_sentences : int
        Forwarded to ``summarize_by_category``.

    Returns
    -------
    tuple
        ``(summaries, filtered_df)``. ``summaries`` maps each SWOT category to
        its summary; ``filtered_df`` holds the rows that passed the filter.
        NOTE(review): when no row passes the filter, the category values are
        one-element lists and the frame is a new empty DataFrame, whereas the
        normal path returns whatever ``summarize_by_category`` produces. This
        is kept as is because callers may rely on it.

    Raises
    ------
    ValueError, SyntaxError
        If a 'categories' cell is a string that is not a valid Python literal.
    """
    print(f"Running summarization for keyword: {keyword}")

    # Step 1: Load dataset
    df = load_large_csv_with_dask(filepath)
    df['title'] = df['title'].fillna('')
    df['content'] = df['content'].fillna('')
    df['combined_text'] = df['title'] + ". " + df['content']
    df['categories'] = df['categories'].apply(_parse_categories)

    # Step 2: Compute embeddings
    df = compute_embeddings(df)

    # Step 3: Filter by keyword
    filtered_df = filter_by_keyword_with_embeddings(df, keyword, similarity_threshold=similarity_threshold)

    if filtered_df.empty:
        print("No relevant rows found for the given keyword.")
        return {
            "Strengths": ["No relevant content."],
            "Weaknesses": ["No relevant content."],
            "Opportunities": ["No relevant content."],
            "Threats": ["No relevant content."],
        }, pd.DataFrame()

    # Step 4: Summarize by SWOT category
    summaries = summarize_by_category(filtered_df, keyword, max_sentences=max_sentences)

    # Step 5: Visualize publication distribution
    visualize_category_distribution(filtered_df)

    return summaries, filtered_df