Little enhancement #26

Open · wants to merge 5 commits into base: master
15 changes: 15 additions & 0 deletions README.md
@@ -113,13 +113,28 @@ business = gn.topic_headlines('business')
 
 ```
 
+### **Stories by Multiple Topics**
+
+```python
+business = gn.topic_multiple_headlines(['business', 'world', 'nation'])
+
+```
+
+### **Geolocation-Specific Stories**
+
+```python
+headquarters = gn.geo_headlines('San Fran')
+
+```
+
+### **Stories for Multiple Geolocations**
+
+```python
+headquarters = gn.geo_multiple_headlines(['Rome', 'Milan', 'Turin'])
+
+```
+
 ### **Stories by a Query Search**
 
 ```python
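The README additions above use only the default signatures, but the code changes below also add `time_span` and `sort_by_publish_date` keyword arguments to these methods. A usage sketch based on the signatures in this PR (variable names are illustrative):

```python
from datetime import timedelta
from pygooglenews import GoogleNews

gn = GoogleNews(lang='en', country='US')

# Only stories published within the last 12 hours, newest first
business = gn.topic_multiple_headlines(['business', 'world', 'nation'],
                                       time_span=timedelta(hours=12),
                                       sort_by_publish_date=True)

# Per-city feeds are merged, sorted, and deduplicated by title
headquarters = gn.geo_multiple_headlines(['Rome', 'Milan', 'Turin'])

for entry in business['entries'][:5]:
    print(entry['published'], entry['title'])
```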
150 changes: 129 additions & 21 deletions pygooglenews/__init__.py
@@ -1,13 +1,19 @@
 import time
 import feedparser
 from bs4 import BeautifulSoup
 import urllib
 from dateparser import parse as parse_date
 import requests
+from datetime import datetime
+from datetime import timedelta
+from time import mktime
+import types
+
+from typing import List
 
 
 class GoogleNews:
-    def __init__(self, lang = 'en', country = 'US'):
+    def __init__(self, lang='en', country='US'):
         self.lang = lang.lower()
         self.country = country.upper()
         self.BASE_URL = 'https://news.google.com/rss'
@@ -32,7 +38,7 @@ def __top_news_parser(self, text):
 
     def __ceid(self):
         """Compile correct country-lang parameters for Google News RSS URL"""
-        return '?ceid={}:{}&hl={}&gl={}'.format(self.country,self.lang,self.lang,self.country)
+        return '?ceid={}:{}&hl={}&gl={}'.format(self.country, self.lang, self.lang, self.country)
 
     def __add_sub_articles(self, entries):
         for i, val in enumerate(entries):
@@ -54,26 +60,25 @@ def __scaping_bee_request(self, api_key, url):
         if response.status_code == 200:
             return response
         if response.status_code != 200:
             raise Exception("ScrapingBee status_code: " + str(response.status_code) + " " + response.text)
 
-    def __parse_feed(self, feed_url, proxies=None, scraping_bee = None):
+    def __parse_feed(self, feed_url, proxies=None, scraping_bee=None):
 
         if scraping_bee and proxies:
             raise Exception("Pick either ScrapingBee or proxies. Not both!")
 
         if proxies:
-            r = requests.get(feed_url, proxies = proxies)
+            r = requests.get(feed_url, proxies=proxies)
         else:
             r = requests.get(feed_url)
 
         if scraping_bee:
-            r = self.__scaping_bee_request(url = feed_url, api_key = scraping_bee)
+            r = self.__scaping_bee_request(url=feed_url, api_key=scraping_bee)
         else:
             r = requests.get(feed_url)
 
-
         if 'https://news.google.com/rss/unsupported' in r.url:
-            raise Exception('This feed is not available')
+            raise Exception('This feed is not available: ' + r.url)
 
         d = feedparser.parse(r.text)
 
@@ -92,40 +97,142 @@ def __from_to_helper(self, validate=None):
         except:
             raise Exception('Could not parse your date')
 
-
-    def top_news(self, proxies=None, scraping_bee = None):
+    def top_news(self, proxies=None, scraping_bee=None):
         """Return a list of all articles from the main page of Google News
         given a country and a language"""
         d = self.__parse_feed(self.BASE_URL + self.__ceid(), proxies=proxies, scraping_bee=scraping_bee)
         d['entries'] = self.__add_sub_articles(d['entries'])
         return d
 
-    def topic_headlines(self, topic: str, proxies=None, scraping_bee=None):
+    def topic_headlines(self, topic: str, proxies=None, time_span: timedelta = None, sort_by_publish_date: bool = True,
+                        scraping_bee=None):
         """Return a list of all articles from the topic page of Google News
         given a country and a language"""
-        #topic = topic.upper()
-        if topic.upper() in ['WORLD', 'NATION', 'BUSINESS', 'TECHNOLOGY', 'ENTERTAINMENT', 'SCIENCE', 'SPORTS', 'HEALTH']:
-            d = self.__parse_feed(self.BASE_URL + '/headlines/section/topic/{}'.format(topic.upper()) + self.__ceid(), proxies = proxies, scraping_bee=scraping_bee)
+        # topic = topic.upper()
+        d = {'entries': []}
+        if topic.upper() in ['WORLD', 'NATION', 'BUSINESS', 'TECHNOLOGY', 'ENTERTAINMENT', 'SCIENCE', 'SPORTS',
+                             'HEALTH']:
+            t = self.__parse_feed(
+                self.BASE_URL + '/headlines/section/topic/{}'.format(topic.upper()) + self.__ceid(),
+                proxies=proxies, scraping_bee=scraping_bee)
         else:
-            d = self.__parse_feed(self.BASE_URL + '/topics/{}'.format(topic) + self.__ceid(), proxies = proxies, scraping_bee=scraping_bee)
+            t = self.__parse_feed(self.BASE_URL + '/topics/{}'.format(topic) + self.__ceid(), proxies=proxies,
+                                  scraping_bee=scraping_bee)
+        d['feed'] = t['feed']
+        if time_span is not None:
+            d['entries'] += [ta for ta in t['entries'] if
+                             datetime.now() - time_span <= datetime.fromtimestamp(
+                                 mktime(ta['published_parsed']))]
+        else:
+            d['entries'] += t['entries']
 
         d['entries'] = self.__add_sub_articles(d['entries'])
-        return d
+
+        if sort_by_publish_date:
+            d['entries'] = sorted(d['entries'],
+                                  key=lambda p: datetime.fromtimestamp(mktime(p['published_parsed'])).timestamp(),
+                                  reverse=True)
+
+        if len(d['entries']) > 0:
+            return d
+        else:
+            raise Exception('unsupported topic')
+
+    def topic_multiple_headlines(self, topic_list: List[str], time_span: timedelta = None,
+                                 sort_by_publish_date: bool = True, proxies=None, scraping_bee=None):
+        """Return a combined list of articles from several Google News topic pages,
+        given a country and a language"""
+
+        d = {'entries': []}
+        for topic in topic_list:
+            try:
+                if topic.upper() in ['WORLD', 'NATION', 'BUSINESS', 'TECHNOLOGY', 'ENTERTAINMENT', 'SCIENCE',
+                                     'SPORTS', 'HEALTH']:
+                    t = self.__parse_feed(
+                        self.BASE_URL + '/headlines/section/topic/{}'.format(topic.upper()) + self.__ceid(),
+                        proxies=proxies, scraping_bee=scraping_bee)
+                else:
+                    t = self.__parse_feed(self.BASE_URL + '/topics/{}'.format(topic) + self.__ceid(), proxies=proxies,
+                                          scraping_bee=scraping_bee)
+                d['feed'] = t['feed']
+                if time_span is not None:
+                    d['entries'] += [ta for ta in t['entries'] if
+                                     datetime.now() - time_span <= datetime.fromtimestamp(
+                                         mktime(ta['published_parsed']))]
+                else:
+                    d['entries'] += t['entries']
+            except Exception:
+                # Skip topics whose feed cannot be fetched or parsed
+                pass
+
+        d['entries'] = self.__add_sub_articles(d['entries'])
+        if sort_by_publish_date:
+            d['entries'] = sorted(d['entries'],
+                                  key=lambda p: datetime.fromtimestamp(mktime(p['published_parsed'])).timestamp(),
+                                  reverse=True)
+
+        if len(d['entries']) > 0:
+            return d
+        else:
+            raise Exception('unsupported topic')
 
-    def geo_headlines(self, geo: str, proxies=None, scraping_bee=None):
+    def geo_headlines(self, geo: str, proxies=None, scraping_bee=None, time_span: timedelta = None,
+                      sort_by_publish_date: bool = True):
         """Return a list of all articles about a specific geolocation
         given a country and a language"""
-        d = self.__parse_feed(self.BASE_URL + '/headlines/section/geo/{}'.format(geo) + self.__ceid(), proxies = proxies, scraping_bee=scraping_bee)
+        d = {'entries': []}
+        t = self.__parse_feed(self.BASE_URL + '/headlines/section/geo/{}'.format(geo) + self.__ceid(),
+                              proxies=proxies, scraping_bee=scraping_bee)
+        d['feed'] = t['feed']
+        if time_span is not None:
+            d['entries'] += [ta for ta in t['entries'] if
+                             datetime.now() - time_span <= datetime.fromtimestamp(
+                                 mktime(ta['published_parsed']))]
+        else:
+            d['entries'] += t['entries']
 
         d['entries'] = self.__add_sub_articles(d['entries'])
+        if sort_by_publish_date:
+            d['entries'] = sorted(d['entries'],
+                                  key=lambda p: datetime.fromtimestamp(mktime(p['published_parsed'])).timestamp(),
+                                  reverse=True)
         return d
 
+    def geo_multiple_headlines(self, geo: List[str], time_span: timedelta = None, sort_by_publish_date: bool = True,
+                               proxies=None, scraping_bee=None):
+        """Return a combined list of articles about several geolocations,
+        given a country and a language"""
+
+        d = {'entries': []}
+        for n in geo:
+            try:
+                t = self.__parse_feed(self.BASE_URL + '/headlines/section/geo/{}'.format(n) + self.__ceid(),
+                                      proxies=proxies, scraping_bee=scraping_bee)
+                d['feed'] = t['feed']
+                if time_span is not None:
+                    d['entries'] += [ta for ta in t['entries'] if
+                                     datetime.now() - time_span <= datetime.fromtimestamp(
+                                         mktime(ta['published_parsed']))]
+                else:
+                    d['entries'] += t['entries']
+            except Exception:
+                # Skip locations whose feed cannot be fetched or parsed
+                pass
+
+        d['entries'] = self.__add_sub_articles(d['entries'])
+
+        if sort_by_publish_date:
+            d['entries'] = sorted(d['entries'],
+                                  key=lambda p: datetime.fromtimestamp(mktime(p['published_parsed'])).timestamp(),
+                                  reverse=True)
+
+        # Drop stories that appear under more than one location, keyed by title
+        res = []
+        for da in d['entries']:
+            if da['title'] not in [r['title'] for r in res]:
+                res += [da]
+        d['entries'] = res
+        return d
+
-    def search(self, query: str, helper = True, when = None, from_ = None, to_ = None, proxies=None, scraping_bee=None):
+    def search(self, query: str, helper=True, when=None, from_=None, to_=None, proxies=None, scraping_bee=None):
         """
         Return a list of all articles given a full-text search parameter,
         a country and a language
@@ -151,7 +258,8 @@ def search(self, query: str, helper = True, when = None, from_ = None, to_ = Non
         search_ceid = self.__ceid()
         search_ceid = search_ceid.replace('?', '&')
 
-        d = self.__parse_feed(self.BASE_URL + '/search?q={}'.format(query) + search_ceid, proxies = proxies, scraping_bee=scraping_bee)
+        d = self.__parse_feed(self.BASE_URL + '/search?q={}'.format(query) + search_ceid, proxies=proxies,
+                              scraping_bee=scraping_bee)
 
         d['entries'] = self.__add_sub_articles(d['entries'])
         return d
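Every new method in this diff applies the same post-processing to a parsed feed: an optional recency filter over `published_parsed`, then a newest-first sort. A minimal, self-contained sketch of that pass (the `filter_and_sort` helper and the sample entries are illustrative, not part of the library):

```python
import time
from datetime import datetime, timedelta
from time import mktime

def filter_and_sort(entries, time_span=None, sort_by_publish_date=True):
    """Keep entries newer than time_span, then sort newest-first."""
    if time_span is not None:
        entries = [e for e in entries
                   if datetime.now() - time_span <= datetime.fromtimestamp(mktime(e['published_parsed']))]
    if sort_by_publish_date:
        # mktime(...) alone is equivalent to the PR's
        # datetime.fromtimestamp(mktime(...)).timestamp() sort key
        entries = sorted(entries, key=lambda e: mktime(e['published_parsed']), reverse=True)
    return entries

# feedparser exposes published_parsed as a time.struct_time;
# localtime is used here so mktime() round-trips exactly in this demo
sample = [
    {'title': 'old story', 'published_parsed': time.localtime(time.time() - 86400)},
    {'title': 'new story', 'published_parsed': time.localtime(time.time() - 600)},
]
print([e['title'] for e in filter_and_sort(sample, time_span=timedelta(hours=2))])
# prints: ['new story']
```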
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pygooglenews"
-version = "0.1.2"
+version = "0.1.3"
 description = "If Google News had a Python library"
 authors = ["kotartemiy <bugara.artem@gmail.com>"]
 license = "MIT"