In [2]:
# %pip install feedparser transformers newspaper3k

### Notebook to extract gnews 

In [2]:
import feedparser
import datetime
from newspaper import Article
import pandas as pd
from collections import Counter
import re, requests, os
from IPython.display import display, Markdown, HTML
from bs4 import BeautifulSoup


In [3]:
# function to count keyword occurances in a text
def count_keywords(text, keywords):
    words = re.findall(r'\w+', text.lower())
    word_count = Counter(words)
    return sum(word_count[keyword] for keyword in keywords)

# function to make the link text clicable
def make_clickable(val):
    return f'<a target="_blank" href="{val}">{val}</a>'

# topics that you want to search 
topics = ["Copper", "Nickle"]
days_ago = 1 # number of days
time_range = f"{days_ago}"

#define num of pages to process
num_pages = 2

base_url = "https://news.google.com/rss/search?q="

# set desired language, location, and edition
hl = "en-US" # language
gl = "US" # location
ceid = "US:en" # edition

In [4]:
def get_gnews():
    # fetch the news using feedparser and process the defined number of pages
    news = []
    
    for page in range(1, num_pages+1):
#         gnews_url = f'https://news.google.com/rss/search?q={topic}&tbs=qdr:{time_range}$hl=en-US&gl=US&ceid=US:en&start={10*(page-1)}'
        # combine the search terms
        query = "+".join(element.replace(" ","+") for element in topics)
        gnews_url = f"{base_url}{query}&tbs=qdr:{time_range}$hl={hl}&gl={gl}&ceid={ceid}&start={10*(page-1)}"
        
        news_feed = feedparser.parse(gnews_url)
        
        for entry in news_feed.entries:
            headers ={"User-Agent":"Mozilla/5.0 (Windows NT 10.0;Win64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"}
            response = requests.get(entry.link, headers=headers)
            
            if response.status_code != 200:
                continue
            
            # parse content use newspaper3k
            try:
                article = Article('')
                article.set_html(response.text)
                article.parse()
            except:
                continue
                
            content = article.text
            
            # count the key words occurances in the article content
            kw = ['price']
            
            kw_count = count_keywords(content, kw)
            
            published_time = datetime.datetime.strptime(entry.published, '%a, %d %b %Y %H:%M:%S %Z')
            
            time_diff = datetime.datetime.now()-published_time
            time_ago = f"{time_diff.seconds//3600} hours ago" if time_diff.days==0 else f"{time_diff.days} days ago"
            
            news.append({
                "News Title": entry.title,
                "News Source": entry.source.title,
                "Date": time_ago,
                "Published Datetime": entry.published,
                "News Link": entry.link,
                "NumKeyWord": kw_count
            })
            
            sort_news = sorted(news, key = lambda x: x['Published Datetime'], reverse=True)

            news_df = pd.DataFrame(sort_news)

            df = news_df.copy()
            df['News Link'] = df['News Link'].apply(make_clickable)

            html_tbl = df.to_html(escape=False)
            df = pd.read_html(html_tbl, index_col=0)[0]
            
        return html_tbl, df

In [35]:
html_tbl, df = get_gnews()

In [36]:
display(HTML(html_tbl))

Unnamed: 0,News Title,News Source,Date,Published Datetime,News Link,NumKeyWord
0,‘Ocean Is at Stake’ at International Seabed Authority Negotiations ... - EcoWatch,EcoWatch,-1 days ago,"Wed, 29 Mar 2023 19:24:56 GMT",https://news.google.com/rss/articles/CBMiPmh0dHBzOi8vd3d3LmVjb3dhdGNoLmNvbS9pc2EtbmVnb3RpYXRpb25zLWRlZXAtc2VhLW1pbmluZy5odG1s0gEA?oc=5,0
1,$90m contract to build West Musgrave accommodation village - Kalgoorlie Miner,Kalgoorlie Miner,-1 days ago,"Wed, 29 Mar 2023 18:03:27 GMT",https://news.google.com/rss/articles/CBMilQFodHRwczovL3d3dy5rYWxtaW5lci5jb20uYXUvbmV3cy9yZWdpb25hbC9rZXJtYW4tY29udHJhY3RpbmctYXdhcmRlZC05MG0tY29udHJhY3QtdG8tYnVpbGQtb3otbWluZXJhbHMtd2VzdC1tdXNncmF2ZS1hY2NvbW1vZGF0aW9uLXZpbGxhZ2UtYy0xMDE4NzIyN9IBAA?oc=5,0
2,Fitch Upgrades Anglo American to 'BBB+'; Outlook Stable - Fitch Ratings,Fitch Ratings,-1 days ago,"Wed, 29 Mar 2023 17:53:00 GMT",https://news.google.com/rss/articles/CBMidmh0dHBzOi8vd3d3LmZpdGNocmF0aW5ncy5jb20vcmVzZWFyY2gvY29ycG9yYXRlLWZpbmFuY2UvZml0Y2gtdXBncmFkZXMtYW5nbG8tYW1lcmljYW4tdG8tYmJiLW91dGxvb2stc3RhYmxlLTI5LTAzLTIwMjPSAQA?oc=5,0
3,Playing the long game in commodities | UBS United States of America - UBS,UBS,-1 days ago,"Wed, 29 Mar 2023 17:42:33 GMT",https://news.google.com/rss/articles/CBMiVWh0dHBzOi8vd3d3LnVicy5jb20vdXMvZW4vd2VhbHRoLW1hbmFnZW1lbnQvaW5zaWdodHMvbWFya2V0LW5ld3MvYXJ0aWNsZS4xNTg4MjM2Lmh0bWzSAQA?oc=5,0
4,Analysis | Republicans grill Interior chief on mining of critical minerals - The Washington Post,The Washington Post,1 hours ago,"Wed, 29 Mar 2023 14:54:00 GMT",https://news.google.com/rss/articles/CBMibWh0dHBzOi8vd3d3Lndhc2hpbmd0b25wb3N0LmNvbS9wb2xpdGljcy8yMDIzLzAzLzI5L3JlcHVibGljYW5zLWdyaWxsLWludGVyaW9yLWNoaWVmLW1pbmluZy1jcml0aWNhbC1taW5lcmFscy_SAQA?oc=5,0
5,Kendrick Resources touts possibility of vanadium at Sweden asset - Marketscreener.com,Marketscreener.com,2 hours ago,"Wed, 29 Mar 2023 13:54:06 GMT",https://news.google.com/rss/articles/CBMimwFodHRwczovL3d3dy5tYXJrZXRzY3JlZW5lci5jb20vcXVvdGUvc3RvY2svS0VORFJJQ0stUkVTT1VSQ0VTLVBMQy0xMzc1Mzc5NDcvbmV3cy9LZW5kcmljay1SZXNvdXJjZXMtdG91dHMtcG9zc2liaWxpdHktb2YtdmFuYWRpdW0tYXQtU3dlZGVuLWFzc2V0LTQzMzcwMDUyL9IBnwFodHRwczovL3d3dy5tYXJrZXRzY3JlZW5lci5jb20vYW1wL3F1b3RlL3N0b2NrL0tFTkRSSUNLLVJFU09VUkNFUy1QTEMtMTM3NTM3OTQ3L25ld3MvS2VuZHJpY2stUmVzb3VyY2VzLXRvdXRzLXBvc3NpYmlsaXR5LW9mLXZhbmFkaXVtLWF0LVN3ZWRlbi1hc3NldC00MzM3MDA1Mi8?oc=5,0
6,Steetz Copper Craft ltd. Supplies High-Quality European-Made ... - Digital Journal,Digital Journal,2 hours ago,"Wed, 29 Mar 2023 13:53:51 GMT",https://news.google.com/rss/articles/CBMiiAFodHRwczovL3d3dy5kaWdpdGFsam91cm5hbC5jb20vcHIvbmV3cy9zdGVldHotY29wcGVyLWNyYWZ0LWx0ZC1zdXBwbGllcy1oaWdoLXF1YWxpdHktZXVyb3BlYW4tbWFkZS10b29scy1hbmQtd29yay13ZWFyLWZvci1wcm9mZXNzaW9uYWxz0gEA?oc=5,0
7,Sea Moss: Things to Consider Before Introducing It to Your Diet - One Green Planet,One Green Planet,3 hours ago,"Wed, 29 Mar 2023 13:43:43 GMT",https://news.google.com/rss/articles/CBMibWh0dHBzOi8vd3d3Lm9uZWdyZWVucGxhbmV0Lm9yZy9uYXR1cmFsLWhlYWx0aC9zZWEtbW9zcy10aGluZ3MtdG8tY29uc2lkZXItYmVmb3JlLWludHJvZHVjaW5nLWl0LXRvLXlvdXItZGlldC_SAQA?oc=5,0
8,Interview: Rosotics Wants to Revolutionize Space Manufacturing with New Metal 3D Printer - 3DPrint.com,3DPrint.com,3 hours ago,"Wed, 29 Mar 2023 13:01:16 GMT",https://news.google.com/rss/articles/CBMic2h0dHBzOi8vM2RwcmludC5jb20vMjk4NTgwL2ludGVydmlldy1yb3NvdGljcy13YW50cy10by1yZXZvbHV0aW9uaXplLXNwYWNlLW1hbnVmYWN0dXJpbmctd2l0aC1uZXctbWV0YWwtM2QtcHJpbnRlci_SAXdodHRwczovLzNkcHJpbnQuY29tLzI5ODU4MC9pbnRlcnZpZXctcm9zb3RpY3Mtd2FudHMtdG8tcmV2b2x1dGlvbml6ZS1zcGFjZS1tYW51ZmFjdHVyaW5nLXdpdGgtbmV3LW1ldGFsLTNkLXByaW50ZXIvYW1wLw?oc=5,0
9,Granite Creek Copper Adds to Board of Directors and Advisory Board - Yahoo Finance,Yahoo Finance,4 hours ago,"Wed, 29 Mar 2023 12:00:00 GMT",https://news.google.com/rss/articles/CBMiTWh0dHBzOi8vZmluYW5jZS55YWhvby5jb20vbmV3cy9ncmFuaXRlLWNyZWVrLWNvcHBlci1hZGRzLWJvYXJkLTEyMDAwMDEyNS5odG1s0gFVaHR0cHM6Ly9maW5hbmNlLnlhaG9vLmNvbS9hbXBodG1sL25ld3MvZ3Jhbml0ZS1jcmVlay1jb3BwZXItYWRkcy1ib2FyZC0xMjAwMDAxMjUuaHRtbA?oc=5,0
