# News Report Generator

This notebook fetches news articles on a specified topic, summarizes each article and analyzes its sentiment, and generates a Word report.


## Step 1: Install Dependencies


In [None]:
# Install required libraries
!pip install newsapi-python transformers pillow python-docx


Collecting newsapi-python
  Downloading newsapi_python-0.2.7-py2.py3-none-any.whl (7.9 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-docx, newsapi-python
Successfully installed newsapi-python-0.2.7 python-docx-1.1.2


## Step 2: Create a Configuration File

In [None]:
# Create a config.txt file with the API key and topic.
# The key is taken from the NEWSAPI_KEY environment variable, falling back to
# an interactive prompt, so the secret is never stored in the notebook itself.
# NOTE(review): any key that was previously committed in this cell should be
# treated as leaked and rotated.
import os
from getpass import getpass

news_api_key = os.environ.get('NEWSAPI_KEY') or getpass('NewsAPI key: ')

with open('config.txt', 'w', encoding='utf-8') as file:
    file.write(f'api_key={news_api_key.strip()}\n')
    file.write('topic=Northvolt\n')



## Step 3: Define Functions

In [None]:
import os
from newsapi import NewsApiClient
from transformers import pipeline
from datetime import datetime, timedelta
from docx import Document
from docx.shared import Inches
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
import requests
from PIL import Image
from io import BytesIO

def init_news_api(api_key):
    """Build a NewsApiClient authenticated with ``api_key`` and return it."""
    client = NewsApiClient(api_key=api_key)
    return client

def fetch_news(newsapi, query, from_date, page_size=10, language='en', sort_by='relevancy'):
    """Fetch news articles matching ``query`` from NewsAPI.

    Args:
        newsapi: Client object exposing ``get_everything`` (for example the
            one returned by ``init_news_api``).
        query: Search phrase, forwarded as ``q``.
        from_date: Oldest publication date to include, forwarded as
            ``from_param``.
        page_size: Maximum number of articles to request (default 10).
        language: Language filter forwarded to the API (default 'en').
        sort_by: Result ordering forwarded to the API (default 'relevancy').

    Returns:
        list: The article dicts from the response. An empty list is returned
        (after printing a message) when the request raises or when the
        response carries no ``articles`` list, so callers can always use
        ``len()`` and iterate over the result.
    """
    try:
        response = newsapi.get_everything(q=query,
                                          from_param=from_date,
                                          language=language,
                                          sort_by=sort_by,
                                          page_size=page_size)
    except Exception as e:
        # Best-effort by design: a failed request yields an empty report
        # rather than stopping the notebook.
        print(f"Error fetching news: {e}")
        return []

    articles = response.get('articles') if isinstance(response, dict) else None
    if articles is None:
        print("Error fetching news: response did not contain an 'articles' list")
        return []
    return articles

# Pipelines are expensive to construct, so each one is built on first use and
# then reused by every later analyze_data() call.
_PIPELINE_CACHE = {}


def _get_pipeline(task, model):
    """Return the cached pipeline for (task, model), building it on first use."""
    key = (task, model)
    if key not in _PIPELINE_CACHE:
        _PIPELINE_CACHE[key] = pipeline(task, model=model)
    return _PIPELINE_CACHE[key]


def analyze_data(articles):
    """Summarize each article and classify its sentiment.

    Args:
        articles: Iterable of dicts; the 'title' and 'description' entries
            are read from each one.

    Returns:
        tuple: ``(summaries, sentiments)``, two lists aligned with
        ``articles``. Each summary is a string. Each sentiment is the first
        result dict of the sentiment pipeline, or
        ``{"label": "NEUTRAL", "score": 0.0}`` for an article without a
        description (its summary is then "No description available.").
    """
    summaries = []
    sentiments = []

    for article in articles:
        # dict.get() only falls back to its default when the key is absent.
        # A key that is present with a None value must be handled explicitly,
        # otherwise the string concatenation below raises TypeError.
        title = article.get('title')
        if title is None:
            title = 'No Title'
        description = article.get('description')
        if description is None:
            description = 'No Description'

        if description == 'No Description':
            # Nothing to analyze; no model needs to be loaded for this article.
            summaries.append("No description available.")
            sentiments.append({"label": "NEUTRAL", "score": 0.0})
            continue

        text = title + ". " + description

        # Cheap character-level cap for the sentiment input; truncation=True
        # below additionally enforces the model's token limit.
        truncated_text = text[:512]

        # Scale the summary length with the input: about two thirds of the
        # word count, clamped to the range 30..300.
        input_length = len(text.split())
        max_length = max(30, min(300, input_length * 2 // 3))

        summarizer = _get_pipeline("summarization", "sshleifer/distilbart-cnn-12-6")
        sentiment_analyzer = _get_pipeline(
            "sentiment-analysis", "distilbert-base-uncased-finetuned-sst-2-english")

        # truncation=True keeps over-long inputs within the model's maximum
        # input size instead of failing inside the model.
        summary = summarizer(text,
                             max_length=max_length,
                             min_length=max_length // 2,
                             do_sample=False,
                             truncation=True)[0]['summary_text']
        sentiment = sentiment_analyzer(truncated_text, truncation=True)[0]

        summaries.append(summary)
        sentiments.append(sentiment)

    return summaries, sentiments

def set_run_color(run, color):
    """Color a run by appending a ``w:color`` element to its run properties.

    ``color`` is written verbatim to the element's ``w:val`` attribute.
    """
    run_properties = run._element.get_or_add_rPr()
    color_tag = OxmlElement('w:color')
    color_tag.set(qn('w:val'), color)
    run_properties.append(color_tag)

def generate_word_report(query, summary, articles, summaries, sentiments):
    """Generate a Word report with the analysis and save it in the working directory.

    Args:
        query: Topic name; used in the report title and the output file name.
        summary: Overall summary text placed under the 'Summary' heading.
        articles: Article dicts; 'title', 'url' and 'urlToImage' are read.
        summaries: Per-article summary strings, aligned with ``articles``.
        sentiments: Per-article dicts with 'label' and 'score', aligned with
            ``articles``.

    Returns:
        str: Name of the .docx file that was written.
    """
    # Longest image edge in pixels. The picture is rendered 5 inches wide, so
    # this is roughly 300 px per inch. PIL's thumbnail() takes pixels, not the
    # EMU values produced by Inches().
    max_image_px = 1500
    sentiment_colors = {
        'POSITIVE': '006400',  # Dark Green
        'NEGATIVE': '8B0000',  # Dark Red
    }

    doc = Document()

    # Get the current month and year for the report
    report_date = datetime.now().strftime("%B %Y")

    doc.add_heading(f'{query} Information Summary ({report_date})', 0)

    # Overall Summary
    doc.add_heading('Summary', level=1)
    doc.add_paragraph(summary)

    # Articles
    doc.add_heading('Articles', level=1)
    article_number = 1  # Numbering counts only the articles actually written
    for article, article_summary, sentiment in zip(articles, summaries, sentiments):
        # `or` also covers keys that are present with a None value.
        title = article.get('title') or 'No Title'
        url = article.get('url') or 'No URL'

        # Skip problematic articles
        if "removed" in title.lower() or "removed" in article_summary.lower():
            continue

        # Any label other than POSITIVE/NEGATIVE is rendered in black.
        color = sentiment_colors.get(sentiment['label'], '000000')

        # Title
        heading = doc.add_heading(level=2)
        run = heading.add_run(f"{article_number}. {title}")
        set_run_color(run, color)

        # Link (in black)
        doc.add_paragraph(f"Link: {url}")

        # Sentiment
        paragraph = doc.add_paragraph()
        run = paragraph.add_run(f"Sentiment: {sentiment['label']} (Score: {sentiment['score']:.2f})")
        set_run_color(run, color)

        # Summary (in black)
        doc.add_paragraph(f"Summary: {article_summary}")

        # Image (best effort: a failing image must not abort the report)
        image_url = article.get('urlToImage')
        if image_url:
            try:
                # The timeout keeps one unresponsive host from hanging the run.
                response = requests.get(image_url, timeout=10)
                response.raise_for_status()
                img = Image.open(BytesIO(response.content)).convert("RGB")
                img.thumbnail((max_image_px, max_image_px), Image.LANCZOS)
                # Hand the picture over in memory so no temporary file is
                # left behind on disk.
                image_stream = BytesIO()
                img.save(image_stream, format="JPEG")
                image_stream.seek(0)
                doc.add_picture(image_stream, width=Inches(5))
            except Exception as e:
                print(f"Error loading image: {e}")

        # Add a separator line
        doc.add_paragraph("\n" + "-"*40 + "\n")

        article_number += 1

    # Generate the output file name based on the current date and time
    output_file = f"{query.replace(' ', '_')}_Report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.docx"
    doc.save(output_file)
    print(f"Report saved to {output_file}")
    return output_file

def read_config(config_file):
    """Read ``key=value`` settings (such as the API key and topic) from a file.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    ignored. Whitespace around both key and value is stripped, and only the
    first ``=`` separates key from value, so values may themselves contain
    ``=``.

    Args:
        config_file: Path of the configuration file.

    Returns:
        dict: Mapping of setting name to string value; empty when the file
        does not exist (a message is printed in that case).
    """
    config = {}
    if not os.path.exists(config_file):
        print(f"Configuration file {config_file} not found.")
        return config
    with open(config_file, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line or line.startswith('#') or '=' not in line:
                continue
            key, value = line.split('=', 1)
            key = key.strip()
            # Strip the key so 'api_key = x' can be looked up as 'api_key'.
            if key:
                config[key] = value.strip()
    return config


## Step 4: Fetch News Articles

In [None]:
import pickle

# Read the API key and topic from the configuration file
config_file = 'config.txt'
config = read_config(config_file)

api_key = config.get('api_key')
query = config.get('topic')

# Stop the notebook when the configuration is incomplete. Printing alone would
# let the following cells run against a missing or stale articles.pkl.
if not api_key or not query:
    raise RuntimeError("API key or topic not found in the configuration file. Exiting.")

newsapi = init_news_api(api_key)

# Set the date from which to search for news (the last 30 days)
from_date = (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d')

# Fetch news articles
articles = fetch_news(newsapi, query, from_date)

# Save articles for debugging
with open('articles.pkl', 'wb') as f:
    pickle.dump(articles, f)

print(f"Fetched {len(articles)} articles.")


Fetched 10 articles.


## Step 5: Analyze Data and Generate Report

In [None]:
# Load articles from the saved file
# NOTE(review): pickle.load can execute arbitrary code. Only load an
# articles.pkl that was written by the fetch step of this notebook.
import pickle
with open('articles.pkl', 'rb') as f:
    articles = pickle.load(f)

# Analyze data
summaries, sentiments = analyze_data(articles)

# Create overall summary text.
# `or ''` covers articles whose title/description key is present but None,
# which would otherwise break the string concatenation.
overall_summary_texts = " ".join(
    (article.get('title') or '') + ". " + (article.get('description') or '')
    for article in articles
)
if articles:
    overall_summary = analyze_data([{'title': '', 'description': overall_summary_texts}])[0][0]
else:
    # Without articles there is nothing meaningful to feed the summarizer.
    overall_summary = "No articles were found for this topic."

# Generate the report
generate_word_report(query, overall_summary, articles, summaries, sentiments)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Report saved to Northvolt_Report_20240708_174822.docx
