<a href="https://colab.research.google.com/github/jbloewencolon/AI-News-Themes-of-the-Week/blob/main/AI_News_draft_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
from bs4 import BeautifulSoup
import nltk
import sklearn
nltk.download('punkt')  # One-time download for sentence splitting
nltk.download('stopwords')  # Download common 'stop words'

def get_ai_news_headlines():
    """ I fetch the latest AI news headlines using the News API. """

    api_key = "b506809c382c4befb258e4febc1fdf78"
    base_url = "https://newsapi.org/v2/everything"

    # Build the request parameters
    parameters = {
        'q': 'artificial intelligence',  # Your main search term
        'apiKey': api_key
    }

    response = requests.get(base_url, params=parameters)
    response.raise_for_status()  # Check for successful API response

    data = response.json()

    headlines = [article['title'] for article in data['articles']]
    return headlines

if __name__ == "__main__":
    latest_headlines = get_ai_news_headlines()
    print("This Week's Top AI News Headlines:")
    for headline in latest_headlines:
        print(headline)

def analyze_headlines(headlines):
    """ I process the headlines to find meaningful keywords"""
    all_words = []

    for headline in headlines:
        words = nltk.word_tokenize(headline)
        # Lowercase for simplicity
        words = [word.lower() for word in words if word.isalpha()]
        # Filter out stop words (the, is, a, etc.)
        stop_words = set(nltk.corpus.stopwords.words('english'))
        words = [w for w in words if not w in stop_words]

        all_words.extend(words)

    # Find the most frequent words
    freq_dist = nltk.FreqDist(all_words)
    top_keywords = [word for word, count in freq_dist.most_common(10)]  # Adjust the number for more/fewer keywords

    return top_keywords

if __name__ == "__main__":
    latest_headlines = get_ai_news_headlines()
    print("This Week's Top AI News Headlines:")
    for headline in latest_headlines:
        print(headline)
    keywords = analyze_headlines(latest_headlines)
    print("This Week's Top AI Keywords:")
    print(keywords)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


This Week's Top AI News Headlines:
Here's How Generative AI Depicts Queer People
Join The Verge at the 2024 Chicago Humanities Spring Festival
To Build a Better AI Supercomputer, Let There Be Light
Jon Stewart Confirms Apple Wouldn't Let Him Do Show on AI With FTC Chair
[Removed]
Microsoft Copilot Is Offering GPT-4 Turbo for Free: What to Know - CNET
Microsoft and OpenAI plan to build a $100 billion supercomputer to power artificial intelligence: report
[Removed]
Google wants to leverage AI to make weather forecasting more efficient
AI is now analyzing your garbage
Utah Passes Artificial Intelligence Legislation
[Removed]
EU finally adopts AI Act, marking a ‘new era’ for artificial intelligence
[Removed]
Europe is trying to regulate AI. That could backfire.
Read AI raises $21M to bring connected intelligence to meetings, email, and messaging
Musk Predicts AI Will Overtake Human Intelligence Next Year
Trump Says Humiliating Videos of His Brain Farts Are AI-Generated
MSI Prestige 13 AI E

In [2]:
def preprocess_text(text):
    """ A basic function to clean our text """
    tokens = nltk.word_tokenize(text)
    tokens = [t.lower() for t in tokens if t.isalpha()]
    # Note: No stop word removal for now, let's see the raw clusters
    return tokens

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

def create_bag_of_words(articles):
    """ Takes a list of full article texts """
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(articles)  # Creates the word frequency representation
    return X, vectorizer

In [5]:
from sklearn.cluster import KMeans

def find_themes(bag_of_words, num_themes=5):
    """Applies K-Means clustering to the bag-of-words representation to group articles into themes.

    Args:
        bag_of_words: The bag-of-words representation (typically a sparse matrix)
        num_themes: The desired number of themes (clusters)

    """

    kmeans = KMeans(n_clusters=num_themes)  # Create a K-Means clustering object
    kmeans.fit(bag_of_words)  # Fit the model to the data

    # Access cluster centroids and sort words within each cluster by importance
    order_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]

    terms = vectorizer.get_feature_names_out()  # Get the vocabulary

    # Print the top words for each theme
    for i in range(num_themes):
        print("Theme %d:" % i)
        top_words = [terms[ind] for ind in order_centroids[i, :10]]  # Top 10 words
        print(" ".join(top_words))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_keyphrases(articles, num_phrases=3):
    """Extract top keyphrases from a collection of articles.

    Args:
        articles: A list of article texts.
        num_phrases: The desired number of keyphrases per article.
    """

    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(articles)
    feature_names = vectorizer.get_feature_names_out()

    for i, article in enumerate(articles):
        response = X[i]
        top_n_idx = response.toarray().argsort()[0][-num_phrases:][::-1]
        keyphrases = [feature_names[idx] for idx in top_n_idx]
        print(f"Article {i} Keyphrases:")
        print(", ".join(keyphrases))