In [None]:
# Install necessary packages and SpaCy model in Colab
!pip install spacy
!pip install pandas scikit-learn
!python -m spacy download en_core_web_md

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy

# Load the SpaCy "en_core_web_md" pipeline once, at module import time.
# process_articles() and generate_headlines() both call this shared `nlp` object.
nlp = spacy.load("en_core_web_md")

def process_articles(input_file):
    """
    Step 1: Process articles for POS tagging and Named Entity Recognition (NER) using SpaCy.

    Args:
        input_file: Path (or anything ``pandas.read_csv`` accepts) of a CSV with a
            ``text`` column and, optionally, an ``article_number`` column.

    Returns:
        A list with one dict per CSV row, in file order, holding:
            - ``article_number``: the row's ``article_number`` value, or the
              DataFrame index label when that column is absent.
            - ``original_text``: the article text as a string (``""`` for empty cells).
            - ``token_data``: list of ``(token text, coarse POS tag)`` tuples.
            - ``named_entities``: list of ``(entity text, entity label)`` tuples.

    Raises:
        KeyError: If the CSV has rows but no ``text`` column.
    """
    df = pd.read_csv(input_file)
    processed_articles = []

    for index, row in df.iterrows():
        article_number = row.get("article_number", index)

        # Empty CSV cells are read as float NaN and numeric-looking cells as
        # numbers; SpaCy only accepts strings, so normalise before parsing.
        raw_text = row["text"]
        text = "" if pd.isna(raw_text) else str(raw_text)

        doc = nlp(text)

        processed_articles.append({
            "article_number": article_number,
            "original_text": text,
            "token_data": [(token.text, token.pos_) for token in doc],
            "named_entities": [(ent.text, ent.label_) for ent in doc.ents],
        })

    return processed_articles

def generate_headlines(processed_articles, *, max_tokens=15, max_entities=2):
    """
    Step 2: Build a keyword-style headline for each article from SpaCy's dependency parse and NER.

    The headline is the first ``max_tokens`` tokens that are either a subject,
    root, direct object or prepositional object, or belong to any named entity,
    followed by `` - `` and up to ``max_entities`` distinct PERSON/ORG/GPE
    entities for context.

    Args:
        processed_articles: Iterable of dicts with ``article_number`` and
            ``original_text`` keys (as produced by ``process_articles``).
        max_tokens: Maximum number of selected tokens kept in the headline.
        max_entities: Maximum number of context entities appended.

    Returns:
        A list of ``{"article_number": ..., "headline": ...}`` dicts, one per
        input article, in input order.
    """
    key_dependencies = {"nsubj", "ROOT", "dobj", "pobj"}
    context_labels = {"PERSON", "ORG", "GPE"}

    headlines = []
    for article in processed_articles:
        doc = nlp(article["original_text"])

        # Keep the grammatical core of each sentence plus every entity token.
        important_tokens = [
            token.text
            for token in doc
            if token.dep_ in key_dependencies or token.ent_type_
        ]

        # De-duplicate while preserving first-seen order so the context suffix
        # never repeats the same entity (e.g. "Obama, Obama").
        named_entities = list(dict.fromkeys(
            ent.text for ent in doc.ents if ent.label_ in context_labels
        ))

        headline = " ".join(important_tokens[:max_tokens])
        context = ", ".join(named_entities[:max_entities])
        if context:
            headline = f"{headline} - {context}" if headline else context

        # Upper-case only the first character: str.capitalize() would also
        # lower-case the rest and destroy the casing of proper nouns.
        headline = headline[:1].upper() + headline[1:]

        headlines.append({
            "article_number": article["article_number"],
            "headline": headline,
        })
    return headlines

def calculate_cosine_similarity(original_texts, generated_headlines):
    """
    Step 3: Score every generated headline against every original text.

    A single TF-IDF vocabulary is fitted on the originals and the headlines
    together, then the two halves of the resulting matrix are compared.

    Args:
        original_texts: Iterable of article texts.
        generated_headlines: Sequence of dicts carrying a ``headline`` key.

    Returns:
        The matrix returned by ``cosine_similarity``: rows follow
        ``original_texts``, columns follow ``generated_headlines``.
    """
    sources = list(original_texts)
    headline_texts = [entry["headline"] for entry in generated_headlines]
    boundary = len(sources)

    # Fit once on the combined corpus so both halves share the same features.
    vectorizer = TfidfVectorizer()
    combined = vectorizer.fit_transform(sources + headline_texts)

    source_vectors = combined[:boundary]
    headline_vectors = combined[boundary:]
    return cosine_similarity(source_vectors, headline_vectors)

def get_average_cosine_similarity(similarities):
    """
    Calculate the average cosine similarity across all articles.

    Only the diagonal is averaged, i.e. entry ``[i][i]``, which pairs row ``i``
    with column ``i`` of the matrix.

    Args:
        similarities: Square 2-D matrix (nested sequences or a NumPy array).

    Returns:
        The mean of the diagonal entries, or ``0.0`` when the matrix has no
        rows (instead of raising ``ZeroDivisionError``).
    """
    # len() rather than truthiness: a NumPy array has no unambiguous bool value.
    row_count = len(similarities)
    if row_count == 0:
        return 0.0

    diagonal_total = sum(row[i] for i, row in enumerate(similarities))
    return diagonal_total / row_count

def main(input_file, output_file):
    """
    Execute every pipeline stage in order and write the results to a CSV.

    Args:
        input_file: CSV of articles, passed to ``process_articles``.
        output_file: Destination CSV with one row per generated headline
            (article number, headline, and its similarity to its own article).
    """
    print("Processing articles...")
    articles = process_articles(input_file)

    print("Generating headlines...")
    headline_entries = generate_headlines(articles)

    print("Calculating cosine similarity...")
    source_texts = [entry["original_text"] for entry in articles]
    scores = calculate_cosine_similarity(source_texts, headline_entries)

    # Report the mean score over matching article/headline pairs.
    average_similarity = get_average_cosine_similarity(scores)
    print(f"Average Cosine Similarity: {average_similarity:.4f}")

    # One output row per headline, scored against the article at the same position.
    rows = [
        {
            "article_number": entry["article_number"],
            "generated_headline": entry["headline"],
            "cosine_similarity": scores[position][position],
        }
        for position, entry in enumerate(headline_entries)
    ]
    pd.DataFrame(rows).to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

# Run the pipeline only when this file is executed as a script: read
# "subset_news_data_fifty.csv" and write "headline_pipeline_results.csv"
# (both are relative paths).
if __name__ == "__main__":
    main("subset_news_data_fifty.csv", "headline_pipeline_results.csv")