In [None]:
%pip freeze

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
def setup_driver():
    """Start and return a headless Chrome WebDriver.

    Chrome is launched with the ``--headless``, ``--disable-gpu`` and
    ``--window-size=1920,1080`` arguments. The driver executable path comes
    from ``ChromeDriverManager().install()`` and is passed to Selenium
    through a ``Service`` object.

    Returns:
        The ``webdriver.Chrome`` instance that was created.
    """
    chrome_options = Options()
    # Flags are added in the same order as before.
    for flag in ("--headless", "--disable-gpu", "--window-size=1920,1080"):
        chrome_options.add_argument(flag)

    driver_path = ChromeDriverManager().install()
    return webdriver.Chrome(service=Service(driver_path), options=chrome_options)

In [72]:
# Google News search endpoint; get_news_articles appends the search text
# directly after `q=`.
BASE_URL = 'https://news.google.com/search?q='

# One browser session, created once and read as a module-level global by
# get_news_articles.
# NOTE(review): nothing visible in this notebook calls driver.quit() — confirm
# the browser process is shut down somewhere.
driver = setup_driver()

In [75]:
def get_news_articles(query, limit=10):
    """Scrape up to ``limit`` results from the Google News search page for ``query``.

    Relies on the module-level ``driver`` (a Selenium WebDriver) and
    ``BASE_URL``.

    Args:
        query: Free-text search phrase. It is URL-encoded before being
            appended to ``BASE_URL`` so that reserved characters such as
            ``&`` remain part of the search term instead of terminating the
            ``q`` parameter.
        limit: Maximum number of ``<article>`` elements to inspect. Fewer
            records are returned when the page holds fewer articles or when
            an article contains no link.

    Returns:
        list[dict]: One dict per article, with the keys ``query_param``,
        ``article_name``, ``article_source``, ``article_url`` and
        ``article_img_src`` (in that order). ``article_source`` and
        ``article_img_src`` are ``None`` when the article has no matching
        element.

    Raises:
        Whatever ``WebDriverWait.until`` raises when no ``<article>`` element
        shows up within 10 seconds (Selenium documents this as
        ``TimeoutException``).
    """
    # Imported locally so this cell does not depend on another import cell.
    from urllib.parse import quote_plus

    # quote_plus turns spaces into '+' (as before) and additionally escapes
    # '&', '=', '#', ... which would otherwise corrupt the query string.
    driver.get(BASE_URL + quote_plus(query))
    print('Searching for : ', query)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, 'article'))
    )

    records = []
    articles = driver.find_elements(By.TAG_NAME, 'article')

    # Slicing (instead of indexing up to `limit`) copes with pages that
    # contain fewer articles than requested; max() keeps a negative limit
    # equivalent to "no results", as range() did.
    for article in articles[:max(limit, 0)]:
        links = article.find_elements(By.TAG_NAME, 'a')
        if not links:
            # Without an anchor there is neither a headline nor a URL to record.
            continue
        # The last anchor inside the article carries the headline text and URL.
        headline_link = links[-1]

        # 'div.vr1PYe' is a generated class name; presumably the publisher
        # label — verify against the live page, it may change without notice.
        source_nodes = article.find_elements(By.CSS_SELECTOR, 'div.vr1PYe')
        images = article.find_elements(By.CSS_SELECTOR, 'img')

        records.append({
            'query_param': query,
            'article_name': headline_link.text,
            'article_source': source_nodes[0].text if source_nodes else None,
            'article_url': headline_link.get_attribute('href'),
            'article_img_src': images[-1].get_attribute('src') if images else None,
        })

    return records

In [76]:
# Search phrases to scrape; each one is passed unchanged to get_news_articles.
queries = [
    'science & technology',
    'environment & nature',
    'comedy',
    'sports & entertainment',
    'education & career'
]

# Flat list accumulating the article dicts returned for every query.
# It is re-created here, so re-running this cell starts from an empty list.
article_data = []

for query in queries:
    # extend (rather than append) keeps article_data a single flat list of dicts.
    article_data.extend(get_news_articles(query, limit=10))

Searching for :  science & technology
Searching for :  environment & nature
Searching for :  comedy
Searching for :  sports & entertainment
Searching for :  education & career


In [77]:
import json

# Destination of the JSON export (a relative path).
JSON_FILE_NAME = 'gnews_articles.json'

# Write the collected article dicts as JSON indented by two spaces.
# Mode 'w' replaces the file's contents if it already exists.
with open(JSON_FILE_NAME, 'w') as file:
    json.dump(article_data, file, indent=2)

print('JSON file saved successfully')

JSON file saved successfully


In [78]:
import pandas as pd

# Destination of the CSV export (a relative path).
CSV_FILE_NAME = 'gnews_articles.csv'

# Build a DataFrame from the list of article dicts; the dict keys become the columns.
df = pd.DataFrame(article_data)

# index=False leaves the DataFrame's row index out of the written file.
df.to_csv(CSV_FILE_NAME, index=False)

print("CSV file saved successfully")

CSV file saved successfully
