In [6]:
from dotenv import load_dotenv
import pandas as pd
load_dotenv()
import os
from langchain_google_genai import ChatGoogleGenerativeAI
from datetime import date, timedelta
import requests
from bs4 import BeautifulSoup
import time
import re


In [2]:
# Access the api keys
NEWSAPI_KEY = os.getenv('NEWSAPI_KEY')
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')

### Filter out only credible news providers

In [4]:
# Extract news provider list

BASE_URL = "https://newsapi.org/v2/sources"

params = {
    "apiKey": NEWSAPI_KEY, 
}

source_list = requests.get(BASE_URL, params=params)
source_list = pd.json_normalize([news for news in source_list.json()['sources'] if (news['language'] == 'en') & 
                                 (news['country']=='us') & (news['category'] in ['general', 'technology'])])
print('- number of news providers:', len(source_list))
source_list.head()

- number of news providers: 35


Unnamed: 0,id,name,description,url,category,language,country
0,abc-news,ABC News,"Your trusted source for breaking news, analysi...",https://abcnews.go.com,general,en,us
1,al-jazeera-english,Al Jazeera English,"News, analysis from the Middle East and worldw...",https://www.aljazeera.com,general,en,us
2,ars-technica,Ars Technica,The PC enthusiast's resource. Power users and ...,https://arstechnica.com,technology,en,us
3,associated-press,Associated Press,The AP delivers in-depth coverage on the inter...,https://apnews.com/,general,en,us
4,axios,Axios,Axios are a new media company delivering vital...,https://www.axios.com,general,en,us


In [8]:
def check_news_source(news_url):
    """Fetches factual reporting, traffic, and credibility score from Media Bias/Fact Check."""
    
    base_search_url = "https://mediabiasfactcheck.com/?s={}".format(news_url)
    headers = {"User-Agent": "Mozilla/5.0"}
    
    try:
        # Step 1: Search for the news provider
        response = requests.get(base_search_url, headers=headers)
        response.raise_for_status()  # Raise an error for bad responses
        soup = BeautifulSoup(response.text, "html.parser")
        
        result = soup.find("h3", class_="entry-title")
        if not result or not result.a:
            return {"source": news_url, "factual_reporting": "NA", "traffic": "NA", "credibility": "NA"}

        # Step 2: Navigate to the news source page
        source_page_url = result.a["href"]
        response = requests.get(source_page_url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        factual_reporting, traffic, credibility = "NA", "NA", "NA"

        for para in soup.find_all("p"):
            if "Factual Reporting:" in para.text:
                factual_match = re.search(r"Factual Reporting:\s*([^(\n]+)", para.text)
                traffic_match = re.search(r"Traffic/Popularity:\s*([^\n]+)", para.text)
                credibility_match = re.search(r"MBFC Credibility Rating:\s*([^\n]+)", para.text)
                
                factual_reporting = factual_match.group(1) if factual_match else "NA"
                traffic = traffic_match.group(1) if traffic_match else "NA"
                credibility = credibility_match.group(1) if credibility_match else "NA"
                
                break  # Stop searching once found
        
        time.sleep(3)  # Rate-limiting to avoid being blocked
        
        return {
            "source": news_url,
            "factual_reporting": factual_reporting,
            "traffic": traffic,
            "credibility": credibility
        }
    
    except requests.RequestException as e:
        return {"source": news_url, "error": str(e)}

news_scores = [check_news_source(u) for u in source_list['url']]

news_scores_df = pd.DataFrame(news_scores)
news_scores_df

Unnamed: 0,source,factual_reporting,traffic,credibility
0,https://abcnews.go.com,HIGH,High Traffic,HIGH CREDIBILITY
1,https://www.aljazeera.com,MIXED,High Traffic,MEDIUM CREDIBILITY
2,https://arstechnica.com,HIGH,High Traffic,HIGH CREDIBILITY
3,https://apnews.com/,HIGH,High Traffic,HIGH CREDIBILITY
4,https://www.axios.com,HIGH,High Traffic,HIGH CREDIBILITY
5,http://www.breitbart.com,MIXED,,
6,http://www.cbsnews.com,,,
7,http://us.cnn.com,,,
8,https://www.ccn.com,HIGH,Medium Traffic,HIGH CREDIBILITY
9,https://www.engadget.com,HIGH,High Traffic,HIGH CREDIBILITY


In [9]:
# Fill out elements not extracted but highly credible
news_scores_df[news_scores_df['source']=='http://www.cbsnews.com'] = ('http://www.cbsnews.com', 'HIGH', 'High Traffic', 'HIGH CREDIBILITY') 

# Only high fact, traffic, credibility score hold news providers remained
news_scores_df_shortlist = news_scores_df[((news_scores_df['factual_reporting'].str.contains('HIGH')) |
                (news_scores_df['factual_reporting'].str.contains('MOSTLY'))) &
                (news_scores_df['traffic'].str.contains('High')) &
                (news_scores_df['credibility'].str.contains('HIGH'))]

news_scores_df_shortlist = pd.merge(source_list[['id','url']], news_scores_df_shortlist, how='right', left_on='url', right_on='source')

# Use list of news as a parameter
sources_final = ','.join(news_scores_df_shortlist['id'])

In [10]:
start_date = date.today() - timedelta(days=1)
end_date = date.today()

BASE_URL = "https://newsapi.org/v2/everything"

# Define your search query with OR
query = "AI OR 'artificial intelligence' OR 'machine learning' OR 'data science' OR tech"

# Set up request parameters
params = {
    "q": query, 
    "language": "en",
    "sortBy": "Popularity",
    "from": str(start_date) + 'T05:00:01',
    "to": str(end_date) + 'T05:00:01',
    "sources": sources_final,
    # "searchIn": "title,description"
    "apiKey": NEWSAPI_KEY 
}

# Make the request
response = requests.get(BASE_URL, params=params)

for i in range(len(response.json()['articles'])):
    print(response.json()['articles'][i]['source']['name'], response.json()['articles'][i]['title'])

Time DeepSeek Is Not a Good Reason for Big Tech to Become More Powerful
Time Is the DeepSeek Panic Overblown?
Time Gabbard’s Refusal to Call Snowden a Traitor Draws Pushback at Hearing to Be Intel Chief
Time Breaking Down the Action-Packed Ending of The Recruit Season 2
Time Gabriel Basso on the ‘Morally Ambiguous’ Season 2 of The Night Agent—And What Comes Next
Time D.C. Plane Crash Raises Questions About Trump Aviation Personnel Changes
Time What to Know About the Passenger Jet, Army Helicopter Collision Near Washington, D.C.
ABC News Gabbard avoids condemning government secrets leaker Snowden in confirmation hearing
ABC News Kash Patel hearing live updates: Staunch Trump supporter to face confirmation
ABC News Trump's missing the point on DEI and meritocracy, experts say
ABC News Kash Patel, Trump's controversial pick for FBI director, set to face Senate grilling
TechRadar AI agents are proving remarkably popular - but firms still face many challenges
TechRadar Microsoft reveals mas

In [11]:
llm_input = [{key: article[key] for key in ['title','description','content','url'] if key in article} for article in response.json()['articles']]
llm_input

[{'title': 'DeepSeek Is Not a Good Reason for Big Tech to Become More Powerful',
  'description': 'DeepSeek’s disruption is a sign that the U.S. needs more competition, not Big Tech dominance, in order to continue its global AI leadership.',
  'content': 'The Chinese artificial intelligence (AI) lab DeepSeek grabbed headlines and tanked the stock market with its announcement of a new AI model nearly equivalent to the United States most recent reasonin… [+8065 chars]',
  'url': 'https://time.com/7211610/deepseek-not-reason-big-tech-more-powerful/'},
 {'title': 'Is the DeepSeek Panic Overblown?',
  'description': 'AI scientists contend that the outsize reaction to the rise of the Chinese AI company DeepSeek is misguided.',
  'content': 'This week, leaders across Silicon Valley, Washington D.C., Wall Street, and beyond have been thrown into disarray due to the unexpected rise of the Chinese AI company DeepSeek. DeepSeek recently rele… [+8226 chars]',
  'url': 'https://time.com/7211646/is-

In [12]:
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=0
)


In [13]:
prompt= """You are an AI news summarization assistant. Your task is to analyze a collection of news articles, identify the most frequently mentioned news topics, summarize them, and select the best representative article for each.

### **Instructions:**
1. **Group Similar News Articles**: Identify common themes among the articles and group them by topic.  
2. **Filter for Tech-Related News**: Only include topics related to AI, machine learning, data science, cloud computing, automation, and major tech breakthroughs. Ignore irrelevant news.  
3. **Summarize Each Topic**: Provide a concise summary of each grouped topic.  
4. **Select the Best Representative URL**: Choose one URL that best represents the topic (preferably from a reputable source).  
5. **Format Output as JSON**.

---

### **Input Data Format:**
You will receive a list of articles with the following structure:

[
  {"title": "--", "description": "--", "content": "--", "url": "--"},
  {"title": "--", "description": "--", "content": "--", "url": "--"}
]"""
prompt


'You are an AI news summarization assistant. Your task is to analyze a collection of news articles, identify the most frequently mentioned news topics, summarize them, and select the best representative article for each.\n\n### **Instructions:**\n1. **Group Similar News Articles**: Identify common themes among the articles and group them by topic.  \n2. **Filter for Tech-Related News**: Only include topics related to AI, machine learning, data science, cloud computing, automation, and major tech breakthroughs. Ignore irrelevant news.  \n3. **Summarize Each Topic**: Provide a concise summary of each grouped topic.  \n4. **Select the Best Representative URL**: Choose one URL that best represents the topic (preferably from a reputable source).  \n5. **Format Output as JSON**.\n\n---\n\n### **Input Data Format:**\nYou will receive a list of articles with the following structure:\n\n[\n  {"title": "--", "description": "--", "content": "--", "url": "--"},\n  {"title": "--", "description": "-

In [14]:
summary_prompt = PromptTemplate(
    input_variables=["articles"],
    template=prompt
)

llm.predict(summary_prompt.format(articles=llm_input))

NameError: name 'PromptTemplate' is not defined

In [None]:
import streamlit as st

# List of headlines and corresponding URLs
news_headlines = [
    ("Breaking News: Example Headline 1", "https://example.com/news1"),
    ("Breaking News: Example Headline 2", "https://example.com/news2"),
    ("Breaking News: Example Headline 3", "https://example.com/news3")
]

# Display headlines with hyperlinks
for headline, url in news_headlines:
    st.markdown(f"[{headline}]({url})")
