In [None]:
# Record the installed package versions (handy when reproducing this environment).
%pip freeze

In [None]:
from urllib.parse import quote_plus

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
def setup_driver():
    """Build and return a headless Chrome WebDriver.

    The browser is started with the GPU disabled and a fixed 1920x1080
    window. The chromedriver binary is resolved through
    ``ChromeDriverManager().install()``.

    Returns:
        The ``webdriver.Chrome`` instance; the caller is responsible for
        calling ``quit()`` on it.
    """
    chrome_options = Options()
    for flag in ("--headless", "--disable-gpu", "--window-size=1920,1080"):
        chrome_options.add_argument(flag)

    chromedriver_path = ChromeDriverManager().install()
    return webdriver.Chrome(
        service=Service(chromedriver_path),
        options=chrome_options,
    )

In [None]:
# Google News search URL prefix; the search terms are appended directly after `q=`.
BASE_URL = 'https://news.google.com/search?q='

In [47]:
def get_news_articles(query, limit=10):
    """Scrape Google News search results for ``query``.

    A headless browser is opened on ``BASE_URL`` plus the URL-encoded query,
    the function waits up to 10 seconds for the first ``<article>`` element,
    and then reads at most ``limit`` article cards from the page.

    Args:
        query: Free-text search phrase. It is URL-encoded before being
            appended to ``BASE_URL``, so spaces, ``&``, ``#`` and non-ASCII
            characters are safe.
        limit: Maximum number of records to return. Fewer are returned when
            the page holds fewer usable cards; ``limit <= 0`` yields ``[]``.

    Returns:
        A list of dicts with the keys ``article_name``, ``article_source``,
        ``article_url`` and ``article_img_src``. ``article_source`` and
        ``article_img_src`` are ``None`` when the card has no matching node.

    Raises:
        Any exception raised by Selenium (for example the wait timing out
        when no article appears) propagates to the caller, but only after
        the browser has been shut down.
    """
    driver = setup_driver()
    try:
        driver.get(BASE_URL + quote_plus(query))
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, 'article'))
        )

        records = []
        for card in driver.find_elements(By.TAG_NAME, 'article'):
            # Stop once enough records are collected; this also covers
            # limit <= 0 and pages with fewer cards than requested.
            if len(records) >= limit:
                break

            links = card.find_elements(By.TAG_NAME, 'a')
            if not links:
                # Without an anchor there is neither a headline nor a URL.
                continue
            # The last anchor in the card carries the headline text and link.
            headline_link = links[-1]

            # find_elements returns [] instead of raising, so a card that
            # lacks a source label or thumbnail does not abort the scrape.
            # NOTE(review): 'div.vr1PYe' looks like a generated class name
            # for the publisher label - confirm it is still current.
            source_nodes = card.find_elements(By.CSS_SELECTOR, 'div.vr1PYe')
            image_nodes = card.find_elements(By.CSS_SELECTOR, 'figure img')

            records.append({
                'article_name': headline_link.text,
                'article_source': source_nodes[0].text if source_nodes else None,
                'article_url': headline_link.get_attribute('href'),
                'article_img_src': (
                    image_nodes[0].get_attribute('src') if image_nodes else None
                ),
            })

        return records
    finally:
        # Always release the browser process, even when scraping fails.
        driver.quit()

In [48]:
# Search phrase handed to the scraper.
query_param = 'artificial intelligence'

# Runs a live scrape (starts a browser) with the default limit of 10 articles;
# the result is a list of dicts, one per article.
article_data = get_news_articles(query_param)

In [49]:
# Print the raw list of article dicts to eyeball the scrape result.
print(article_data)

[{'article_name': 'Army eyes artificial intelligence to enhance future Golden Dome', 'article_source': 'Defense News', 'article_url': 'https://news.google.com/read/CBMisgFBVV95cUxNSmZfMWZKeU1XXzVYZGJBbE83Zm8wbnNKV0MzNjlmUUxzM0RjRnZSUGFRMk5fRlFwVjk2OFB6S1JYa0dGVm5pbl9reHlpUlptdG1fUjB2MTlHTnh3TDhXa1FvaUxJVmR4c0QzamxaYUw5OWRsVG80aE5wdEVRM1ppaHFxZUhvaHl3MDNUQnJPbXo5WjdIdDZYeFo5TWZqVE1uOEJIYXc5X3pOajRCOTZxN0Nn?hl=en-IN&gl=IN&ceid=IN%3Aen', 'article_img_src': 'https://news.google.com/api/attachments/CC8iK0NnNVdUMDVXZUdsamVucFRkbkZRVFJERUF4aW1CU2dLTWdZWk5Jck1uUVU=-w200-h112-p-df-rw'}, {'article_name': 'Cloudflare Using Devilish Trick to Trap AI Scrapers in Infinite Maze of AI-Generated Content', 'article_source': 'Futurism', 'article_url': 'https://news.google.com/read/CBMiY0FVX3lxTFBUN2gxcEg1cGRZNVM3bGJJa3BQOWNiWlAzSXRVS1oyVHZPWEtKTU9tWFBLMHhlU0xJZE9OeUhxc2xra3JtUmJPWFcwMmxXa0Q2b2NCU3duR0pWLWZoek0wNFUyTQ?hl=en-IN&gl=IN&ceid=IN%3Aen', 'article_img_src': 'https://news.google.com/api/attachment

In [50]:
import json

JSON_FILE_NAME = 'gnews_articles.json'

# Serialise the scraped records to a 2-space-indented JSON string first,
# then write it to disk in a single call.
serialized_articles = json.dumps(article_data, indent=2)
with open(JSON_FILE_NAME, 'w') as file:
    file.write(serialized_articles)

print('JSON file saved successfully')

JSON file saved successfully


In [51]:
import pandas as pd

# Output path for the CSV export (relative to the working directory).
CSV_FILE_NAME = 'gnews_articles.csv'

# Build a DataFrame from the list of article dicts.
df = pd.DataFrame(article_data)

# index=False keeps the DataFrame's row index out of the file.
df.to_csv(CSV_FILE_NAME, index=False)

print("CSV file saved successfully")

CSV file saved successfully
