# Setup

In [1]:
# !pip install python-dotenv  

import os 
import pandas as pd
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
# Read both service credentials from the process environment.
# Each value is None when the corresponding variable is not set.
NEWSAPI_API_KEY = os.environ.get("NEWSAPI_API_KEY")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Helper for looking up a credential by its environment-variable name
def get_api_key(key_name):
    """Return the value of the environment variable ``key_name``.

    ``load_dotenv()`` is invoked on every call before the lookup. The result
    is ``None`` when the variable is not set.
    """
    load_dotenv()
    api_key = os.environ.get(key_name)
    return api_key

In [3]:
from newsapi import NewsApiClient

# Look the NewsAPI key up once and hand that same value to the client.
newsapi_key = get_api_key("NEWSAPI_API_KEY")
newsapi = NewsApiClient(api_key=newsapi_key)


# LLM Implementation and Similarity Search

### LLM Interface, Similarity Search

In [4]:
# LLM Interface
#!pip install ollama dspy
import ollama
import dspy
# Build the language-model handle and register it as the DSPy default.
lm = dspy.LM("ollama/mistral")  # model identifier; presumably another "ollama/<name>" model could be substituted here — TODO confirm
dspy.configure(lm=lm)

In [5]:
class UserPreferenceChat(dspy.Module):
    """DSPy module that asks the LLM for a MOOD / INTERESTS reply to a chat message."""

    def __init__(self):
        super().__init__()
        # Chain-of-thought predictor with signature chat_input -> structured_output.
        self.gen = dspy.ChainOfThought("chat_input -> structured_output")

    def forward(self, chat_input):
        """Wrap ``chat_input`` in the instruction template and return the
        predictor's ``structured_output`` field."""
        instructions = f"""
        Act as a friendly news personalization assistant. Based on this chat input: '{chat_input}'
        
        1. Extract or infer the user's current mood
        2. Extract or infer their news interests
        3. Return the information in this exact format:
        MOOD: [extracted mood]
        INTERESTS: [comma-separated list of interests]
        
        Keep the response concise and only return the formatted information.
        """

        # The full instruction text is what gets passed as the predictor's chat_input.
        prediction = self.gen(chat_input=instructions)
        return prediction.structured_output


def _parse_preference_reply(reply):
    """Extract ``(mood, interests)`` from a ``MOOD:`` / ``INTERESTS:`` formatted reply.

    Each line is matched after stripping surrounding whitespace and without
    regard to the letter case of the label; the first line carrying each label
    wins.

    Raises:
        ValueError: if either label is missing from ``reply``.
    """
    labels = ("MOOD:", "INTERESTS:")
    found = {}
    for raw_line in str(reply).splitlines():
        line = raw_line.strip()
        for label in labels:
            # Compare only the prefix so the rest of the line keeps its case.
            if label not in found and line[: len(label)].upper() == label:
                found[label] = line[len(label):].strip()

    missing = [label for label in labels if label not in found]
    if missing:
        raise ValueError(f"reply is missing {', '.join(missing)}")
    return found["MOOD:"], found["INTERESTS:"]


def llm_chat_preferences():
    """
    LLM-powered chat function to collect user preferences.

    Prompts the user on stdin, sends the reply through ``UserPreferenceChat``
    and parses the formatted answer.

    Returns:
        tuple: ``(mood_in, q_entries)`` as strings, where ``q_entries`` is the
        comma-separated interests text; ``(None, None)`` when the LLM reply
        does not contain both a MOOD and an INTERESTS line.
    """
    print("Hello! I'm your news personalization assistant.")
    print("Tell me how you're feeling and what kind of news interests you.")
    print(
        "For example: 'I'm feeling energetic today and would love to read about tech startups and space exploration'"
    )

    user_input = input("\nYou: ").strip()

    # Process with LLM
    processor = UserPreferenceChat()
    result = processor(user_input)

    # Parse LLM response; only a malformed reply is treated as recoverable here.
    try:
        mood_in, q_entries = _parse_preference_reply(result)
    except ValueError as e:
        print(f"Error processing response: {str(e)}")
        return None, None

    # Confirm with user
    print("\nI understood your preferences as:")
    print(f"Mood: {mood_in}")
    print(f"Interests: {q_entries}")

    return mood_in, q_entries


# Use the function and create user_prefs.
# Start from None so a failed run never leaves these names undefined or
# carrying values from an earlier execution of this cell.
mood_in = q_entries = user_prefs = None
try:
    mood_in, q_entries = llm_chat_preferences()
    if mood_in and q_entries:
        user_prefs = {"mood": mood_in, "news_interests": q_entries}
    else:
        print("Failed to get preferences. Please try again.")

except Exception as e:
    print(f"An error occurred: {str(e)}")

Hello! I'm your news personalization assistant.
Tell me how you're feeling and what kind of news interests you.
For example: 'I'm feeling energetic today and would love to read about tech startups and space exploration'

I understood your preferences as:
Mood: Sadness or Disappointment
Interests: Health, Universities


In [6]:
# Sanity check: show the parsed mood and interests on one line
print(f"{mood_in} {q_entries}")

Sadness or Disappointment Health, Universities


In [7]:
# save topics for later use: split the comma-separated interests, remove quote
# characters and surrounding whitespace, and drop entries that end up empty
# (e.g. from a trailing comma) so no blank topic is searched for later.
topics = [
    cleaned
    for cleaned in (
        part.replace("'", "").replace('"', "").strip()
        for part in q_entries.split(",")
    )
    if cleaned
]
print(topics)

['Health', 'Universities']


### Fetch the news articles

In [8]:
sort_criteria = "popularity"
# First-pass filter: request up to 100 English articles matching the user's interests.
json_out = newsapi.get_everything(q=q_entries, sort_by=sort_criteria, language="en", page_size=100)
# Print a short summary rather than dumping the whole payload into the cell output.
print(
    f"status={json_out.get('status')}, "
    f"totalResults={json_out.get('totalResults')}, "
    f"articles={len(json_out.get('articles', []))}"
)



In [9]:
# Populate dataframe with all relevant articles (built from json_out["articles"])
df = pd.DataFrame(json_out["articles"])
# Optional column pruning, currently disabled:
#df.drop(columns=["source", "author", "urlToImage", "publishedAt"], inplace=True)
# Preview the first 10 rows
df.head(10)

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content
0,"{'id': None, 'name': 'BBC News'}",,Too much debt or value for money? Students div...,"University fees are rising to £9,535, meaning ...",https://www.bbc.com/news/articles/crr92910r55o,https://ichef.bbci.co.uk/news/1024/branded_new...,2024-11-10T00:30:22Z,"Isobel, 18, decided she ""did not want the debt..."
1,"{'id': None, 'name': 'NPR'}",Rob Stein,"With Trump coming into power, the NIH is in th...","The National Institutes of Health, the crown j...",https://www.npr.org/2024/11/12/nx-s1-5183014/t...,https://npr.brightspotcdn.com/dims3/default/st...,2024-11-12T18:16:08Z,As the next Trump administration overhauls the...
2,"{'id': None, 'name': 'NPR'}",Martha Bebinger,"Instead of sodas, these vending machines offer...",Groups trying to reduce overdose deaths and th...,https://www.npr.org/sections/shots-health-news...,https://npr.brightspotcdn.com/dims3/default/st...,2024-11-14T13:00:00Z,"The small city of North Adams, nestled in the ..."
3,"{'id': None, 'name': 'Yahoo Entertainment'}",Lou Carlozo,5 surprising things American retirees should s...,"No, travel isn't one of them.",https://finance.yahoo.com/news/5-surprising-th...,https://media.zenfs.com/en/moneywise_327/73059...,2024-12-05T11:49:00Z,Youll find plenty of articles and social media...
4,"{'id': None, 'name': 'Digital Trends'}",Andrew Tarantola,The brain-computer interface revolution is jus...,Brain-Computer Interfaces fascinate the sci-fi...,https://www.digitaltrends.com/computing/tech-f...,https://www.digitaltrends.com/wp-content/uploa...,2024-11-16T00:13:17Z,Whether it’s jacking into the Matrix or becomi...
5,"{'id': None, 'name': 'The Atlantic'}",David Brooks,How the Ivy League Broke America,The meritocracy isn’t working. We need somethi...,https://www.theatlantic.com/magazine/archive/2...,https://cdn.theatlantic.com/thumbor/ISbKcAvvYE...,2024-11-14T11:00:00Z,Every coherent society has a social idealan im...
6,"{'id': None, 'name': 'Popular Science'}",RJ Mackenzie,What happens when you donate your body to scie...,"'They will be teaching over 3,000 healthcare p...",https://www.popsci.com/health/what-happens-whe...,https://www.popsci.com/wp-content/uploads/2024...,2024-11-24T13:03:00Z,We may earn revenue from the products availabl...
7,"{'id': None, 'name': 'Android Police'}",Jon Gilbert,Empire: Total War for Android is much more tha...,Empire: Total War's new features are only foun...,https://www.androidpolice.com/total-war-empire...,https://static1.anpoimages.com/wordpress/wp-co...,2024-11-19T14:53:17Z,"Over the years, Feral Interactive has ported s..."
8,"{'id': None, 'name': 'Android Police'}",Nathaniel Wilkins,Google's Audio Overview can transform bland re...,No more slogging through long documents. Find ...,https://www.androidpolice.com/google-audio-ove...,https://static1.anpoimages.com/wordpress/wp-co...,2024-11-11T15:17:03Z,Google's AI-powered NotebookLM has fully arriv...
9,"{'id': None, 'name': 'Science Daily'}",,"Building green and blue spaces, such as parks,...",With house building a priority for the new UK ...,https://www.sciencedaily.com/releases/2024/12/...,https://www.sciencedaily.com/images/scidaily-i...,2024-12-03T20:44:33Z,With house building a priority for the new UK ...


I am using similarity search to find more relevant articles, even after filtering by topic within the newsapi method call. Treat the newsapi method call parameter as a first pass filter, and the similarity search as a more refined 'search'. 

This approach lets me request the maximum number of results from the API while keeping only the most relevant articles.

In [10]:
# Comparing two vector-index libraries (FAISS and HNSW) for similarity search
# !pip install faiss-cpu hnswlib --quiet 

In [11]:
# Import libraries
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import hnswlib

# Set max results: default number of neighbours (k) used by search_faiss and search_hnsw
MAX_RESULTS = 20

# Initialize the sentence transformer; create_embeddings encodes text with this model
model = SentenceTransformer("all-MiniLM-L6-v2")

def create_embeddings(texts):
    """Encode ``texts`` with the module-level sentence-transformer ``model``.

    The progress bar is disabled and the encoder's output is returned as-is.
    """
    vectors = model.encode(texts, show_progress_bar=False)
    return vectors

# FAISS

def search_faiss(query_embedding, embeddings, k=MAX_RESULTS):
    """Return positions of the ``k`` embeddings nearest to the query (exact L2 search).

    Args:
        query_embedding: 1-D vector for the query; reshaped to a single row.
        embeddings: 2-D array with one row per indexed item.
        k: Maximum number of neighbours to return.

    Returns:
        1-D integer array of row positions into ``embeddings``. It holds fewer
        than ``k`` entries when fewer rows are indexed, and is empty when
        there is nothing to search.
    """
    embeddings = np.ascontiguousarray(embeddings, dtype="float32")

    # Never ask for more neighbours than exist: FAISS fills the missing slots
    # with -1, which positional indexing would read as "the last row".
    k = min(k, len(embeddings))
    if k <= 0:
        return np.empty(0, dtype=np.int64)

    # Initialize FAISS index
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # Search (the query must be float32 like the indexed vectors)
    query = np.ascontiguousarray(query_embedding, dtype="float32").reshape(1, -1)
    _distances, indices = index.search(query, k)

    hits = indices[0]
    return hits[hits >= 0]  # defensive: drop any padding labels

# HNSW

def search_hnsw(query_embedding, embeddings, k=MAX_RESULTS):
    """Return labels of the ``k`` embeddings nearest to the query (approximate L2 search).

    Args:
        query_embedding: 1-D vector for the query; reshaped to a single row.
        embeddings: 2-D array with one row per indexed item.
        k: Maximum number of neighbours to return.

    Returns:
        1-D array of item labels. Items are added without explicit ids, so the
        labels are expected to be row positions into ``embeddings``. It holds
        fewer than ``k`` entries when fewer rows are indexed, and is empty
        when there is nothing to search.
    """
    # The index cannot return more neighbours than it holds, so cap k first.
    k = min(k, len(embeddings))
    if k <= 0:
        return np.empty(0, dtype=np.int64)

    # Initialize HNSW index
    dimension = embeddings.shape[1]
    index = hnswlib.Index(space="l2", dim=dimension)
    index.init_index(max_elements=len(embeddings), ef_construction=200, M=16)
    index.add_items(embeddings)

    # Search
    labels, _distances = index.knn_query(query_embedding.reshape(1, -1), k=k)
    return labels[0]


# Create embeddings for all articles. Missing article bodies are embedded as
# empty strings so the encoder only ever receives text and row i of the
# embeddings still corresponds to row i of df.
content_embeddings = create_embeddings(df["content"].fillna("").astype(str).tolist())

In [12]:
# Collect one result frame per topic and concatenate once at the end; calling
# pd.concat inside the loop recopies every earlier row on each iteration.
faiss_frames = []
hnsw_frames = []

for topic in topics:
    # Create query embedding
    query_embedding = create_embeddings([topic])[0]

    # FAISS search
    faiss_indices = search_faiss(query_embedding, content_embeddings)
    faiss_frames.append(df.iloc[faiss_indices].assign(topic=topic))

    # HNSW search
    hnsw_indices = search_hnsw(query_embedding, content_embeddings)
    hnsw_frames.append(df.iloc[hnsw_indices].assign(topic=topic))

# With no topics there is nothing to concatenate; fall back to an empty frame
# that still carries the article columns plus 'topic'.
faiss_results = (
    pd.concat(faiss_frames) if faiss_frames else pd.DataFrame(columns=[*df.columns, "topic"])
)
hnsw_results = (
    pd.concat(hnsw_frames) if hnsw_frames else pd.DataFrame(columns=[*df.columns, "topic"])
)

# Example to view results: for each backend, list the matched titles per topic
for heading, results in (
    ("\nFAISS Results:", faiss_results),
    ("\nHNSW Results:", hnsw_results),
):
    print(heading)
    for topic in topics:
        print(f"\nTop articles for '{topic}':")
        topic_mask = results['topic'] == topic
        print(results[topic_mask]['title'])


FAISS Results:

Top articles for 'Health':
52                                  No Meta Status Orgs
53    Suggested questions for Donald Trump’s health ...
1     With Trump coming into power, the NIH is in th...
77    Survey: Supporting Students Postelection, in 4...
88    Can RFK Jr. Fix Our Dysfunctional Public Healt...
15    Q&A: Researcher discusses protecting the healt...
96    Karachi’s Air Quality Crisis: A Call for Urgen...
38    1 In 5 Gen Z Healthcare Workers Are Considerin...
78         Imagining More Sustainable Higher Ed Careers
54    RFK Jr. vs. the NIH: Say goodbye to the greate...
73    ‘How the World Ran Out of Everything’ and ‘Rec...
65                  Trump Taps Bhattacharya to Lead NIH
69    Robert F. Kennedy Jr.'s Appointment Would 'Put...
48             Fight Abuse by Becoming More like Christ
11    America’s fractured trust in science, explaine...
55    Dr. Vinay Prasad “Sabotaging RFK Jr’s Confirma...
16    Landmark study reveals stark failure to halt M...
10  

In [13]:
# Show the first 40 rows of each result set, then their sizes side by side
for preview_source in (faiss_results, hnsw_results):
    print(preview_source.head(40))
print(len(faiss_results), len(hnsw_results))

                                              source  \
52        {'id': None, 'name': 'Overcomingbias.com'}   
53  {'id': None, 'name': 'Sciencebasedmedicine.org'}   
1                        {'id': None, 'name': 'NPR'}   
77          {'id': None, 'name': 'Inside Higher Ed'}   
88                    {'id': None, 'name': 'Reason'}   
15                  {'id': None, 'name': 'Phys.Org'}   
96              {'id': None, 'name': 'The Diplomat'}   
38                    {'id': None, 'name': 'Forbes'}   
78          {'id': None, 'name': 'Inside Higher Ed'}   
54  {'id': None, 'name': 'Sciencebasedmedicine.org'}   
73          {'id': None, 'name': 'Inside Higher Ed'}   
65          {'id': None, 'name': 'Inside Higher Ed'}   
69          {'id': None, 'name': 'Inside Higher Ed'}   
48    {'id': None, 'name': 'Thegospelcoalition.org'}   
11                       {'id': None, 'name': 'Vox'}   
55  {'id': None, 'name': 'Sciencebasedmedicine.org'}   
16                  {'id': None, 'name': 'Phys.O

### Sentiment Analysis Filtering

In [14]:
# Import sentiment analyzer from transformers
from transformers import pipeline
# Sentiment classifier called by analyze_sentiment, which reads the 'label' of its first result
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

def analyze_sentiment(text):
    """Classify ``text`` as positive (1) or not positive (0).

    Only the first 256 characters are sent to the classifier, for speed.
    Missing, non-string or blank input yields 0 without calling the
    classifier, and so does any classifier failure. Note that 0 is the same
    value returned for a non-POSITIVE label; there is no separate "neutral"
    score.
    """
    if not isinstance(text, str) or not text.strip():
        return 0
    try:
        result = sentiment_analyzer(text[:256])[0]  # Limit text length for speed
        return 1 if result['label'] == 'POSITIVE' else 0
    except Exception as exc:
        # Best effort: report the failure and fall back to "not positive",
        # while letting KeyboardInterrupt / SystemExit propagate.
        print(f"Sentiment analysis failed, treating text as not positive: {exc}")
        return 0

# Score the user's mood text with the same 0/1 classifier
mood_sentiment = analyze_sentiment(mood_in)

# Attach a sentiment column to each result set
for scored in (faiss_results, hnsw_results):
    scored['sentiment'] = scored['content'].apply(analyze_sentiment)

# Decide which articles to show: a mood scored 1 keeps every article,
# any other mood keeps only the articles scored 1.
show_everything = mood_sentiment == 1
faiss_results_filtered = (
    faiss_results if show_everything else faiss_results[faiss_results['sentiment'] == 1]
)
hnsw_results_filtered = (
    hnsw_results if show_everything else hnsw_results[hnsw_results["sentiment"] == 1]
)

## Summarizer

In [15]:
# METHOD for SUMMARIZING ALL RELEVANT ARTICLES

def summarize_all_articles(df, max_articles):
    """
    Creates a single unified summary of multiple articles using Mistral.

    Only the 'content' column is used; rows whose content is missing or blank
    are skipped so they do not end up in the prompt.

    Args:
        df: DataFrame containing articles (with a 'content' column)
        max_articles: Maximum number of rows, taken from the top of df, to consider
    Returns:
        str: A single summary combining all articles, or a short notice when
        there is no article text to summarize (the LLM is not called then).
    """
    # Gather the usable article bodies (limited to max_articles rows)
    contents = [
        text
        for text in df.head(max_articles).get("content", ())
        if isinstance(text, str) and text.strip()
    ]
    if not contents:
        return "No articles available to summarize."

    combined_text = "".join(f"{text} " for text in contents)

    # Create prompt for single summary; the article count reflects what is actually included
    prompt = (
        "Please provide ONE concise summary that captures the overall narrative \n"
        "    across all these articles as a single cohesive story. Do not summarize articles \n"
        "    individually. Focus on synthesizing the key themes and insights from all \n"
        f"    {len(contents)} articles together:\n"
        f"    {combined_text}\n"
        "    "
    )

    # Get summary from Mistral
    response = ollama.chat(
        model="mistral", messages=[{"role": "user", "content": prompt}]
    )

    return response["message"]["content"]


In [16]:
# Preview the first 10 rows of the filtered HNSW results
hnsw_results_filtered.head(10)

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content,topic,sentiment
52,"{'id': None, 'name': 'Overcomingbias.com'}",Robin Hanson,No Meta Status Orgs,"In health and medicine, we have many governmen...",https://www.overcomingbias.com/p/no-meta-statu...,"https://substackcdn.com/image/fetch/w_1200,h_6...",2024-11-19T18:51:28Z,"In health and medicine, we have many governmen...",Health,1
1,"{'id': None, 'name': 'NPR'}",Rob Stein,"With Trump coming into power, the NIH is in th...","The National Institutes of Health, the crown j...",https://www.npr.org/2024/11/12/nx-s1-5183014/t...,https://npr.brightspotcdn.com/dims3/default/st...,2024-11-12T18:16:08Z,As the next Trump administration overhauls the...,Health,1
77,"{'id': None, 'name': 'Inside Higher Ed'}",colleen.flaherty,"Survey: Supporting Students Postelection, in 4...","Survey: Supporting Students Postelection, in 4...",https://www.insidehighered.com/news/student-su...,https://www.insidehighered.com/sites/default/f...,2024-11-22T08:00:00Z,The University of Virginia ramped up mental he...,Health,1
15,"{'id': None, 'name': 'Phys.Org'}",Brian Consiglio,Q&A: Researcher discusses protecting the healt...,The plague is one of the deadliest bacterial i...,https://phys.org/news/2024-12-qa-discusses-hea...,https://scx2.b-cdn.net/gfx/news/2024/protectin...,2024-12-04T21:10:59Z,The plague is one of the deadliest bacterial i...,Health,1
78,"{'id': None, 'name': 'Inside Higher Ed'}",Elizabeth Redden,Imagining More Sustainable Higher Ed Careers,Imagining More Sustainable Higher Ed Careers\n...,https://www.insidehighered.com/opinion/career-...,https://www.insidehighered.com/sites/default/f...,2024-12-04T08:00:00Z,We begin this essay with two key questions: Ar...,Health,1
65,"{'id': None, 'name': 'Inside Higher Ed'}",Katherine Knott,Trump Taps Bhattacharya to Lead NIH,President-elect Donald Trump nominated Dr. Jay...,https://www.insidehighered.com/news/quick-take...,https://www.insidehighered.com/modules/custom/...,2024-11-27T08:00:00Z,President-elect Donald Trump nominated Dr. Jay...,Health,1
22,"{'id': None, 'name': 'Psychology Today'}","Loriann Oberlin MS, LCPC",Word Puzzles and Board Games Boost Brain Health,"Board games, per academic and medical studies,...",https://www.psychologytoday.com/intl/blog/the-...,https://cdn2.psychologytoday.com/assets/styles...,2024-12-03T14:31:20Z,"In a WordsRated.com survey, 65% of US adults e...",Health,1
29,"{'id': None, 'name': 'Forbes'}","Marybeth Gasman, Contributor, \n Marybeth Gasm...",How To Support Faculty And Staff Who Support S...,38% of faculty and staff are often or always e...,https://www.forbes.com/sites/marybethgasman/20...,https://imageio.forbes.com/specials-images/ima...,2024-11-25T18:33:33Z,Diversity of mental health\r\ngetty\r\nOn coll...,Health,1
93,"{'id': 'rt', 'name': 'RT'}",RT,Alumni of Russian universities gather in Cairo...,Cairo is hosting a forum to promote the Russia...,https://www.rt.com/africa/608827-russias-alumn...,https://mf.b37mrtl.ru/files/2024.12/article/67...,2024-12-06T10:42:01Z,A forum bringing together graduates of Soviet ...,Universities,1
27,"{'id': None, 'name': 'Forbes'}","Michael T. Nietzel, Senior Contributor, \n Mic...",Johns Hopkins Again Tops NSF’s List Of Schools...,Total R&D expenditures at American colleges & ...,https://www.forbes.com/sites/michaeltnietzel/2...,https://imageio.forbes.com/specials-images/ima...,2024-11-25T21:47:24Z,Johns Hopkins University once again ranks firs...,Universities,1


In [17]:
# Call summarizer on the filtered HNSW results, limited to the first 10 rows,
# and print the single combined summary it returns.
summary = summarize_all_articles(hnsw_results_filtered, max_articles=10)
print(summary)

 Amidst the ongoing changes in government administration and the increasing focus on mental health, particularly on college campuses, various organizations are exploring ways to improve healthcare and research. The National Institutes of Health (NIH) could face restructuring under the next Trump administration, with Dr. Jay Bhattacharya nominated as potential director. Mental health supports have become a priority for universities, such as the University of Virginia offering additional resources during election season. Board games are identified as an effective way to boost mood and reduce anxiety in adults. The plague, a deadly bacterial infection in human history, is compared to current concerns over sustainability in higher education and the need for changes to make it more resilient. International cooperation is also emphasized through a forum bringing together graduates of Soviet and Russian universities. Johns Hopkins University continues to lead in R&D funding in American colleg