In [None]:
# /**
#  * Project Name: Automated News Headline Generation Using Natural Language Processing
#  * File Name: NLP_Final_Project.ipynb
#  * 
#  * Description:
#  * This project preprocesses news articles using spaCy and generates concise, informative headlines for each article.
#  * It evaluates the quality of the generated headlines using TF-IDF cosine similarity against the original article text.
#  * 
#  * Contributors:
#  * - Grace Li - Handled data preprocessing and integration of TF-IDF for evaluation.
#  * - Nick Yi - Focused on implementing tokenization and cosine similarity evaluation.
#  * - Golam Raiyan - Developed headline generation logic and named entity recognition (NER) functionality.
#  * 
#  * Date: 12/12/2024
#  */

#install necessary libraries
!pip install spacy
!pip install pandas scikit-learn
!python -m spacy download en_core_web_md

#import required modules
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy

#load the spaCy model for natural language processing
#("en_core_web_md" is the model downloaded above; this module-level pipeline
#is the one called by process_articles and generate_headlines)
nlp = spacy.load("en_core_web_md")

#reads and processes articles from a CSV file
#extracts token data and named entities using spaCy
def process_articles(input_file):
    """Read articles from a CSV file and annotate each one with spaCy.

    The CSV must contain a "text" column. An "article_number" column is
    optional; when it is absent the DataFrame row index is used instead.

    Args:
        input_file: Path (or file-like object) accepted by ``pd.read_csv``.

    Returns:
        A list with one dict per row, holding:
          - "article_number": the article id (or row index as a fallback)
          - "original_text": the article text as a string
          - "token_data": list of (token text, part-of-speech tag) tuples
          - "named_entities": list of (entity text, entity label) tuples
    """
    df = pd.read_csv(input_file)
    processed_articles = []
    for row_index, row in df.iterrows():
        article_number = row.get("article_number", row_index)

        #pandas reads an empty cell as NaN (a float), which spaCy cannot parse;
        #treat missing text as an empty article and coerce anything else to str
        #so the row is kept and stays aligned with the rest of the pipeline
        raw_text = row["text"]
        text = "" if pd.isna(raw_text) else str(raw_text)

        doc = nlp(text)
        processed_articles.append({
            "article_number": article_number,
            "original_text": text,
            "token_data": [(token.text, token.pos_) for token in doc],
            "named_entities": [(ent.text, ent.label_) for ent in doc.ents],
        })
    return processed_articles

#generates headlines for articles based on content
def generate_headlines(processed_articles):
    """Build one extractive headline per processed article.

    Each article's "original_text" is parsed with spaCy. The headline is made
    of the first 15 tokens that are either a subject, root, direct object or
    prepositional object, or that belong to a named entity. When the text
    contains PERSON, ORG or GPE entities, the first two are appended after
    " - ".

    Args:
        processed_articles: Iterable of dicts with at least the keys
            "article_number" and "original_text".

    Returns:
        A list of dicts with the keys "article_number" and "headline".
    """
    key_dependencies = {"nsubj", "ROOT", "dobj", "pobj"}
    headline_entity_labels = {"PERSON", "ORG", "GPE"}
    max_headline_tokens = 15
    max_appended_entities = 2

    headlines = []
    for article in processed_articles:
        doc = nlp(article["original_text"])
        important_phrases = [
            token.text
            for token in doc
            if token.dep_ in key_dependencies or token.ent_type_
        ]
        named_entities = [
            ent.text for ent in doc.ents if ent.label_ in headline_entity_labels
        ]

        headline = " ".join(important_phrases[:max_headline_tokens])
        if named_entities:
            headline += " - " + ", ".join(named_entities[:max_appended_entities])

        #uppercase only the first character: str.capitalize() would also
        #lowercase the rest of the string and strip the casing from names,
        #organisations and places that the headline is built from
        headline = headline[:1].upper() + headline[1:]

        headlines.append({
            "article_number": article["article_number"],
            "headline": headline,
        })
    return headlines

#calculates cosine similarity using TF-IDF
def calculate_cosine_similarity(original_texts, generated_headlines):
    """Compute TF-IDF cosine similarity between articles and headlines.

    A single TF-IDF vocabulary is fitted on the article texts and the
    headline strings together, so both sides share the same feature space.

    Args:
        original_texts: Iterable of article text strings.
        generated_headlines: Iterable of dicts that each hold a "headline" key.

    Returns:
        The matrix returned by ``cosine_similarity``; entry [i][j] compares
        article i with headline j.
    """
    vectorizer = TfidfVectorizer()
    article_corpus = list(original_texts)
    headline_corpus = [entry["headline"] for entry in generated_headlines]

    #articles occupy the first rows of the combined matrix, headlines the rest
    boundary = len(article_corpus)
    combined = vectorizer.fit_transform(article_corpus + headline_corpus)
    return cosine_similarity(combined[:boundary], combined[boundary:])

#computes the average cosine similarity
def get_average_cosine_similarity(similarities):
    """Return the mean of the diagonal of a similarity matrix.

    Entry [i][i] is taken as the score of item i against its own counterpart,
    so the matrix is expected to be square (as produced by
    calculate_cosine_similarity for equally many articles and headlines).

    Args:
        similarities: Square 2-D sequence or array of similarity scores.

    Returns:
        The average of the diagonal entries, or 0.0 when the matrix is empty.
    """
    #len() instead of truthiness: a NumPy array has no unambiguous bool value
    pair_count = len(similarities)
    if pair_count == 0:
        #nothing to average; avoid dividing by zero
        return 0.0
    diagonal_total = sum(row[i] for i, row in enumerate(similarities))
    return diagonal_total / pair_count

#driver function
def main(input_file, output_file):
    """Run the full headline pipeline and write the results to a CSV file.

    Steps: process the articles, generate a headline for each, score every
    headline against its own article, print the average score, and save one
    row per article to the output file.

    Args:
        input_file: Path of the CSV file holding the articles.
        output_file: Path of the CSV file the results are written to.
    """
    print("Processing articles...")
    articles = process_articles(input_file)

    print("Generating headlines...")
    headline_entries = generate_headlines(articles)

    print("Calculating cosine similarity...")
    article_texts = [article["original_text"] for article in articles]
    similarity_matrix = calculate_cosine_similarity(article_texts, headline_entries)

    average_similarity = get_average_cosine_similarity(similarity_matrix)
    print(f"Average Cosine Similarity: {average_similarity:.4f}")

    #the diagonal entry pairs each headline with the article it came from
    result_rows = [
        {
            "article_number": entry["article_number"],
            "generated_headline": entry["headline"],
            "cosine_similarity": similarity_matrix[position][position],
        }
        for position, entry in enumerate(headline_entries)
    ]

    #save results to a CSV file
    results_frame = pd.DataFrame(result_rows)
    results_frame.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

#script entry point: run the pipeline only when executed directly
#input articles are read from "test.csv"; results go to "headline_pipeline_results.csv"
if __name__ == "__main__":
    main("test.csv", "headline_pipeline_results.csv")