### Ingest Google News

#### Initialise

In [29]:
!pip install beautifulsoup4


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [30]:
# import libraries
import re
import warnings
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [218]:
# define functions
# extract urls from google news search query
def search_google_news_urls(search_query, max_pages=None):
    """Collect the href of every anchor on the Google News result pages of a query.

    Result pages are requested one after another, with the ``start`` offset
    advancing by 10 per page, until a page no longer contains the text
    ``Next`` or ``max_pages`` pages have been fetched.

    Parameters
    ----------
    search_query : str
        Text inserted verbatim into the ``q=`` URL parameter. It is not
        URL-encoded here, so it must already be in URL form.
    max_pages : int or None, optional
        Upper bound on the number of pages to request. ``None`` (default)
        keeps paging until no ``Next`` marker is found.

    Returns
    -------
    list of str
        The ``href`` values of all ``<a>`` tags on the fetched pages, in page
        order. Every anchor on the page is included, not only result links.
    """
    hrefs = []
    page = 1
    while max_pages is None or page <= max_pages:
        search_url = f'https://www.google.com/search?q={search_query}&source=lnms&tbm=nws&start={(page-1)*10}&tbs=sbd:1&safe=active&ssui=on'
        # A timeout keeps one stalled connection from hanging the whole run.
        r = requests.get(search_url, timeout=30)
        soup = BeautifulSoup(r.text, 'html.parser')
        # href=True skips anchors that have no href attribute; indexing those
        # with link['href'] would raise KeyError.
        hrefs += [link['href'] for link in soup.find_all('a', href=True)]
        # The marker is searched in the whole page markup, not only the pager.
        if re.search('Next', str(soup)) is None:
            break
        page += 1
    return hrefs
# exclude urls if they contain an exclusion term
def exclude_urls(urls, exclude_list):
    """Return the unique embedded URLs from ``urls`` that match no exclusion term.

    An entry is kept only if it contains ``https://`` and none of the strings
    in ``exclude_list`` occurs anywhere in it. For each kept entry the first
    ``http(s)://`` URL found inside it is extracted and cut at the first ``&``.

    Parameters
    ----------
    urls : iterable of str
        Raw href values.
    exclude_list : iterable of str
        Substrings; an entry containing any of them is dropped.

    Returns
    -------
    list of str
        Extracted URLs without duplicates, in order of first appearance.
    """
    kept = []
    for url in urls:
        if 'https://' not in url:
            continue
        if any(exclude_word in url for exclude_word in exclude_list):
            continue
        # First embedded http(s) URL, truncated at the first '&'.
        kept.append(re.findall(r'(https?://\S+)', url)[0].split('&')[0])
    # dict.fromkeys de-duplicates while keeping first-seen order; the order of
    # list(set(...)) over strings can change from one interpreter run to the next.
    return list(dict.fromkeys(kept))
# extract full article text
def extract_articles(URLs):
    """Download each URL and return the joined text of its ``<p>`` elements.

    Parameters
    ----------
    URLs : iterable of str
        Page addresses to fetch.

    Returns
    -------
    list of str
        One string per input URL, in input order: the text of every ``<p>``
        tag on the page joined with single spaces. A URL whose request fails
        contributes an empty string, so the result stays aligned with ``URLs``.

    Warns
    -----
    UserWarning
        For every URL whose request raises ``requests.RequestException``.
    """
    articles = []
    for url in URLs:
        try:
            # A timeout keeps one unresponsive site from stalling the batch.
            r = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            # One unreachable site should not discard the articles already
            # collected; report it and keep the slot for this URL.
            warnings.warn(f'Skipping {url}: {exc}')
            articles.append('')
            continue
        soup = BeautifulSoup(r.text, 'html.parser')
        articles.append(' '.join(paragraph.text for paragraph in soup.find_all('p')))
    return articles

In [198]:
# define parameters

# topic terms; one query is built per term
topic_terms = [
    'quantum',
    'hydrogen',
    'robotics',
    'nanomaterials',
]

# date window substituted into the after:/before: filters below
start_date = '2023-08-01'
end_date = '2023-08-02'

# search conditions appended to every topic term
search_conditions = f'+AND+(investment|start-up|invent|development|market|funding|research)+after:{start_date}+before:{end_date}'

# topic -> topic followed by the shared search conditions
queries = {topic: f'{topic}{search_conditions}' for topic in topic_terms}

# urls containing any of these terms will be excluded
exclude_list = [
    'maps',
    'policies',
    'preferences',
    'accounts',
    'support',
    'www.google.com',
]

#### Data ingestion

In [199]:
# extract URLs from search pages
# Iterating over `queries` directly yields only its keys (the bare topic terms),
# so the search conditions and date window would never reach the search.
# Search with the full query string and keep the topic as the result key.
raw_urls = {topic: search_google_news_urls(query) for topic, query in queries.items()}

In [200]:
# keep only the search-result URLs: drop hrefs matching exclude_list and de-duplicate
# (the keys of `queries` are the topic terms, so results stay keyed by topic)
cleaned_urls = {
    topic: exclude_urls(raw_urls[topic], exclude_list)
    for topic in queries
}

In [None]:
# download every cleaned URL and collect the article text, keyed by topic
articles = {
    topic: extract_articles(cleaned_urls[topic])
    for topic in queries
}

#### Store result

In [None]:
# convert to dataframe
# One column per topic. Topics can yield different numbers of articles, and
# pd.DataFrame raises ValueError for a dict of lists with unequal lengths, so
# each list is wrapped in a Series; shorter columns are then padded with NaN.
df = pd.DataFrame({topic: pd.Series(texts, dtype='object') for topic, texts in articles.items()})

In [None]:
# save as CSV
# to_csv does not create missing folders, so make sure the target directory exists first
Path('data/raw').mkdir(parents=True, exist_ok=True)
df.to_csv('data/raw/google_news.csv')