In [1]:
from dotenv import load_dotenv
import pandas as pd
load_dotenv()
import os
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from datetime import date, timedelta
import requests
from bs4 import BeautifulSoup
import time
import re
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the API credentials from the process environment (load_dotenv() was
# called in the import cell). Either value is None when the variable is unset.
NEWSAPI_KEY = os.environ.get('NEWSAPI_KEY')
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')

### Keep only credible news providers

In [8]:
# Extract news provider list

def news_source(timeout=10):
    """Fetch the News API source catalogue, filtered to US English outlets.

    Only sources whose language is 'en', country is 'us' and category is
    'general' or 'technology' are kept.

    Args:
        timeout: Seconds to wait for the News API before giving up.

    Returns:
        pandas.DataFrame with one row per matching entry of the response's
        'sources' list.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an HTTP
            error status (for example a rejected API key).
    """
    source_url = "https://newsapi.org/v2/sources"
    params = {"apiKey": NEWSAPI_KEY}

    # Without a timeout a stalled connection would block the notebook forever.
    response = requests.get(source_url, params=params, timeout=timeout)
    # Surface the HTTP error here rather than as a KeyError on 'sources' below.
    response.raise_for_status()

    wanted_categories = ('general', 'technology')
    matching_sources = [
        source for source in response.json()['sources']
        if source['language'] == 'en'
        and source['country'] == 'us'
        and source['category'] in wanted_categories
    ]
    return pd.json_normalize(matching_sources)

# Fetch the filtered provider table once; later cells read its 'url' column
# and merge on it. The CSV export below is intentionally left disabled.
source_list = news_source()
# source_list.to_csv('data/source_list.csv', index=False) 
source_list.head()

Unnamed: 0,id,name,description,url,category,language,country
0,abc-news,ABC News,"Your trusted source for breaking news, analysi...",https://abcnews.go.com,general,en,us
1,al-jazeera-english,Al Jazeera English,"News, analysis from the Middle East and worldw...",https://www.aljazeera.com,general,en,us
2,ars-technica,Ars Technica,The PC enthusiast's resource. Power users and ...,https://arstechnica.com,technology,en,us
3,associated-press,Associated Press,The AP delivers in-depth coverage on the inter...,https://apnews.com/,general,en,us
4,axios,Axios,Axios are a new media company delivering vital...,https://www.axios.com,general,en,us


In [None]:

def _parse_mbfc_ratings(paragraph_text):
    """Extract the three rating fields from one paragraph; "NA" where a label is absent."""
    patterns = {
        "factual_reporting": r"Factual Reporting:\s*([^(\n]+)",
        "traffic": r"Traffic/Popularity:\s*([^\n]+)",
        "credibility": r"MBFC Credibility Rating:\s*([^\n]+)",
    }
    ratings = {}
    for field, pattern in patterns.items():
        match = re.search(pattern, paragraph_text)
        # Strip so trailing whitespace captured by the pattern does not leak out.
        ratings[field] = match.group(1).strip() if match else "NA"
    return ratings


def check_news_source(news_url, timeout=10, delay=3):
    """Fetch factual reporting, traffic and credibility ratings from Media Bias/Fact Check.

    Searches the site for ``news_url``, follows the first search hit, and reads
    the ratings from the first paragraph containing "Factual Reporting:".

    Args:
        news_url: URL of the news provider, used as the search term.
        timeout: Seconds to wait for each HTTP request.
        delay: Seconds to pause after a completed lookup, to throttle
            successive calls.

    Returns:
        dict with the keys "source", "factual_reporting", "traffic" and
        "credibility". A rating that cannot be found is "NA". When a request
        fails the ratings are "NA" and an extra "error" key holds the message.
    """
    not_rated = {"factual_reporting": "NA", "traffic": "NA", "credibility": "NA"}
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        # Step 1: find the provider's page via the site search. Passing the
        # term through `params` lets requests URL-encode it; interpolating it
        # by hand would corrupt the query for URLs containing '&' or '#'.
        response = requests.get(
            "https://mediabiasfactcheck.com/",
            params={"s": news_url},
            headers=headers,
            timeout=timeout,
        )
        response.raise_for_status()  # Raise an error for bad responses
        soup = BeautifulSoup(response.text, "html.parser")

        # The title of the first search hit links to the provider's page.
        result = soup.find("h3", class_="entry-title")
        if not result or not result.a:
            return {"source": news_url, **not_rated}

        # Step 2: read the ratings from the provider's page.
        response = requests.get(result.a["href"], headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        ratings = dict(not_rated)
        for para in soup.find_all("p"):
            if "Factual Reporting:" in para.text:
                ratings = _parse_mbfc_ratings(para.text)
                break

        time.sleep(delay)

        return {"source": news_url, **ratings}

    except requests.RequestException as e:
        # Keep the rating keys so every result has the same shape; a DataFrame
        # built from mixed results then has no NaN in the rating columns.
        return {"source": news_url, **not_rated, "error": str(e)}

In [17]:
# Score every provider (one lookup per URL; each completed lookup sleeps, so this cell is slow)
news_scores = [check_news_source(u) for u in source_list['url']]
news_scores_df = pd.DataFrame(news_scores)

# Manual override: the ratings for CBS News were not extracted, but the outlet
# is treated as highly credible. Assigning through .loc with explicit columns
# keeps working when the frame carries extra columns such as 'error'.
rating_cols = ['factual_reporting', 'traffic', 'credibility']
is_cbs = news_scores_df['source'] == 'http://www.cbsnews.com'
news_scores_df.loc[is_cbs, rating_cols] = ['HIGH', 'High Traffic', 'HIGH CREDIBILITY']

# news_scores_df.to_csv('data/news_credibility.csv', index=False)
# Keep only providers with HIGH/MOSTLY factual reporting, High traffic and
# HIGH credibility. na=False counts a missing rating as "no match" instead of
# putting NaN into the boolean mask, which pandas refuses to index with.
is_credible = (
    news_scores_df['factual_reporting'].str.contains('HIGH|MOSTLY', na=False)
    & news_scores_df['traffic'].str.contains('High', na=False)
    & news_scores_df['credibility'].str.contains('HIGH', na=False)
)

# Right join: one output row per credible provider, enriched with its News API metadata.
news_scores_df_shortlist = pd.merge(
    source_list,
    news_scores_df[is_credible],
    how='right',
    left_on='url',
    right_on='source',
)
news_scores_df_shortlist.head()


Unnamed: 0,id,name,description,url,category,language,country,source,factual_reporting,traffic,credibility
0,abc-news,ABC News,"Your trusted source for breaking news, analysi...",https://abcnews.go.com,general,en,us,https://abcnews.go.com,HIGH,High Traffic,HIGH CREDIBILITY
1,ars-technica,Ars Technica,The PC enthusiast's resource. Power users and ...,https://arstechnica.com,technology,en,us,https://arstechnica.com,HIGH,High Traffic,HIGH CREDIBILITY
2,associated-press,Associated Press,The AP delivers in-depth coverage on the inter...,https://apnews.com/,general,en,us,https://apnews.com/,HIGH,High Traffic,HIGH CREDIBILITY
3,axios,Axios,Axios are a new media company delivering vital...,https://www.axios.com,general,en,us,https://www.axios.com,HIGH,High Traffic,HIGH CREDIBILITY
4,cbs-news,CBS News,CBS News: dedicated to providing the best in j...,http://www.cbsnews.com,general,en,us,http://www.cbsnews.com,HIGH,High Traffic,HIGH CREDIBILITY


In [21]:
# Comma-separated provider ids, later sent as the News API "sources" parameter.
# A list already saved on disk takes precedence (as before); it is only
# computed from the shortlist and written when the file does not exist yet,
# so the cell also runs on a clean checkout and never overwrites a saved list.
SOURCES_PATH = 'data/sources_final.txt'

if os.path.exists(SOURCES_PATH):
    with open(SOURCES_PATH, 'r') as f:
        sources_final = f.read()
else:
    sources_final = ','.join(news_scores_df_shortlist['id'])
    os.makedirs(os.path.dirname(SOURCES_PATH), exist_ok=True)
    with open(SOURCES_PATH, 'w') as f:
        f.write(sources_final)

sources_final

'abc-news,ars-technica,associated-press,axios,cbs-news,engadget,google-news,hacker-news,national-review,nbc-news,newsweek,politico,reuters,techcrunch,techradar,the-hill,the-washington-post,time,vice-news,wired'

In [None]:
def ai_news(timeout=10):
    """Query News API for the past day's AI/tech articles from the shortlisted sources.

    The search window runs from yesterday 05:00:01 to today 05:00:01; the
    timestamps are sent without a UTC offset.
    NOTE(review): 05:00 looks like midnight US Eastern expressed in UTC - confirm.

    Args:
        timeout: Seconds to wait for the News API before giving up.

    Returns:
        requests.Response from the /v2/everything endpoint; callers read the
        articles from ``response.json()['articles']``.

    Raises:
        requests.RequestException: On a network failure, a timeout, or an HTTP
            error status.
    """
    # Read the clock once so both ends of the window derive from the same day,
    # even if the call straddles midnight.
    end_date = date.today()
    start_date = end_date - timedelta(days=1)

    BASE_URL = "https://newsapi.org/v2/everything"

    # News API matches exact phrases only when they are wrapped in double quotes.
    query = 'AI OR "artificial intelligence" OR "machine learning" OR "data science" OR tech'

    # Set up request parameters
    params = {
        "q": query,
        "language": "en",
        # The documented sortBy values are lowercase: relevancy, popularity, publishedAt.
        "sortBy": "popularity",
        "from": str(start_date) + 'T05:00:01',
        "to": str(end_date) + 'T05:00:01',
        "sources": sources_final,
        # "searchIn": "title,description"
        "apiKey": NEWSAPI_KEY
    }

    # Make the request
    response = requests.get(BASE_URL, params=params, timeout=timeout)
    # Fail with the HTTP error instead of a later KeyError on 'articles'.
    response.raise_for_status()
    return response


response = ai_news()
# Parse the payload once rather than on every loop iteration, then preview
# the provider name and headline of the first five articles.
articles = response.json()['articles']
for article in articles[:5]:
    print(article['source']['name'], article['title'])

Wired US Government Websites Are Disappearing in Real Time
Wired Moon or Mars? The US Might Face a Tough Choice for Future Missions
Wired Foreign Hackers Are Using Google’s Gemini in Attacks on the US
ABC News Multiple health agency websites on HIV, contraception taken down
ABC News Rubio to focus on curbing immigration, countering China in Latin America


In [28]:
# Reduce each article to the fields handed to the LLM; a field that an
# article does not carry is simply left out of its record.
llm_fields = ['title', 'description', 'content', 'url']
llm_input = [
    {field: entry[field] for field in llm_fields if field in entry}
    for entry in response.json()['articles']
]
llm_input[:3]

[{'title': 'US Government Websites Are Disappearing in Real Time',
  'description': 'A growing number of US government websites have gone offline as of Saturday, including several related to USAID and others focused on youth programs, Africa, and more.',
  'content': 'Several government websites have been taken down, including the usaid.gov, foreignassistance.gov, neglecteddiseases.gov, and childreninadversity.gov. A WIRED analysis of more than 1,000 federal .gov … [+3280 chars]',
  'url': 'https://www.wired.com/story/us-government-websites-are-disappearing-in-real-time/'},
 {'title': 'Moon or Mars? The US Might Face a Tough Choice for Future Missions',
  'description': 'Continuing the Artemis program and using its planned lunar space station as a staging post would be a more energy efficient but slower way to reach Mars, and it’s unlikely to be Elon Musk’s preference.',
  'content': 'THIS ARTICLE IS republished fromThe Conversationunder aCreative Commons license.\r\nThe Artemis progra

In [30]:
# temperature=0 keeps the summarization as repeatable as the model allows.
# NOTE(review): no API key is passed explicitly; presumably the client reads
# GOOGLE_API_KEY from the environment - confirm.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=0
)

# The "Output Format" section pins the JSON shape that the parsing cell relies
# on (top-level "news_summary" mapping topic -> summary/representative_url).
# It is described in words because literal braces would be read as template
# variables by PromptTemplate.
prompt= """You are an AI news summarization assistant. 
Your task is to analyze a collection of news articles, identify the most frequently mentioned news topics, summarize them, and select the best representative article for each.

- Introduction
1. Group Similar News Articles: Identify common themes among the articles and group them by topic.  
2. Filter for Tech-Related News: Only include topics related to AI, machine learning, data science, cloud computing, automation, and major tech breakthroughs. Ignore irrelevant news.  
3. Summarize Each Topic: Provide a concise summary of each grouped topic.  
4. Select the Best Representative URL: Choose one URL that best represents the topic (preferably from a reputable source).  
5. Format Output as JSON.

- Output Format
Return a single JSON object with one top-level key "news_summary". Its value is an object that maps each topic title to an object with exactly two keys: "summary" and "representative_url".

- Input Data
{articles}"""

summary_prompt = PromptTemplate(
    input_variables=["articles"],
    template=prompt
)

# The article list is interpolated via its Python string representation.
result = llm.invoke(summary_prompt.format(articles=llm_input))

In [52]:
# The reply text may be wrapped in a Markdown ```json fence. str.strip() with a
# multi-character argument removes a *set of characters* from both ends, not
# that prefix/suffix, so it is fragile; slice out the outermost JSON object instead.
raw_reply = result.to_json()['kwargs']['content']
json_start, json_end = raw_reply.find('{'), raw_reply.rfind('}')
if json_start == -1 or json_end < json_start:
    raise ValueError("LLM reply does not contain a JSON object")
result_json = raw_reply[json_start:json_end + 1]
news_summary = json.loads(result_json)['news_summary']
news_summary

{'AI Technology and DeepSeek': {'summary': 'The Chinese AI platform DeepSeek is rapidly gaining popularity, surpassing ChatGPT in growth and raising concerns about US AI dominance.  Its rapid adoption has prompted discussions about the global AI race and potential national security implications.  OpenAI has responded to the competition by releasing its o3-mini reasoning model for all ChatGPT users.',
  'representative_url': 'https://www.techradar.com/pro/security/only-two-weeks-in-and-ai-phenomenon-deepseek-is-officially-growing-faster-than-chatgpt'},
 'Nvidia and AI Hardware': {'summary': 'Nvidia is offering free AI courses covering generative AI, deep learning, and accelerated computing.  Performance tests of the Nvidia GeForce RTX 5090 show it significantly outperforming competitors in creative software and AI tasks.',
  'representative_url': 'https://www.techradar.com/pro/nvidia-giving-away-free-ai-courses-worth-up-to-usd90-and-no-it-has-absolutely-nothing-to-do-with-deepseeks-asce

In [82]:
# Flatten {topic: {field: value}} into a list of records, storing the topic
# under 'title' ahead of the topic's own fields. A comprehension keeps the loop
# temporaries out of the notebook namespace.
news_summary_ls = [
    {'title': topic, **details}
    for topic, details in news_summary.items()
]
# news_summary_ls = pd.DataFrame(news_summary_ls)
news_summary_ls

[{'title': 'AI Technology and DeepSeek',
  'summary': 'The Chinese AI platform DeepSeek is rapidly gaining popularity, surpassing ChatGPT in growth and raising concerns about US AI dominance.  Its rapid adoption has prompted discussions about the global AI race and potential national security implications.  OpenAI has responded to the competition by releasing its o3-mini reasoning model for all ChatGPT users.',
  'representative_url': 'https://www.techradar.com/pro/security/only-two-weeks-in-and-ai-phenomenon-deepseek-is-officially-growing-faster-than-chatgpt'},
 {'title': 'Nvidia and AI Hardware',
  'summary': 'Nvidia is offering free AI courses covering generative AI, deep learning, and accelerated computing.  Performance tests of the Nvidia GeForce RTX 5090 show it significantly outperforming competitors in creative software and AI tasks.',
  'representative_url': 'https://www.techradar.com/pro/nvidia-giving-away-free-ai-courses-worth-up-to-usd90-and-no-it-has-absolutely-nothing-to-