In [None]:
pip install requests beautifulsoup4 selenium webdriver-manager python-docx

Collecting selenium
  Downloading selenium-4.34.2-py3-none-any.whl.metadata (7.5 kB)
Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Collecting trio~=0.30.0 (from selenium)
  Downloading trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Downloading trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting python-dotenv (from webdriver-manager)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.12.2->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.34.2-py3-none-any.whl (9.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m

In [None]:
# Install necessary packages for Selenium in Google Colab
!pip install selenium webdriver-manager beautifulsoup4 python-docx

# Install Chromium browser and WebDriver
!apt-get update # Update apt-get
!apt install chromium-browser # Install Chromium
!apt-get install chromium-chromedriver # Install the Chromium WebDriver

# Set up Selenium to use the installed Chromium and ChromeDriver
import os
import shutil

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Configure Chrome options for headless execution in Colab
chrome_options = Options()
chrome_options.add_argument('--headless')  # Run in headless mode (no visible browser UI)
chrome_options.add_argument('--no-sandbox')  # Required for running in Colab's environment
chrome_options.add_argument('--disable-dev-shm-usage')  # Prevents issues with shared memory in Docker/Colab

# Path at which this setup expects the apt-installed ChromeDriver binary.
_APT_CHROMEDRIVER_PATH = '/usr/lib/chromium-browser/chromedriver'

# Prefer the expected apt location. If the package placed the binary elsewhere,
# fall back to whichever `chromedriver` is on PATH instead of handing Selenium a
# path that does not exist. The original path is kept as a last resort so any
# failure message still names the expected location.
chrome_driver_path = _APT_CHROMEDRIVER_PATH
if not os.path.exists(chrome_driver_path):
    chrome_driver_path = shutil.which('chromedriver') or _APT_CHROMEDRIVER_PATH
service = Service(chrome_driver_path)  # Use the resolved path for the service

# Factory that builds a WebDriver from the service and options configured above:
def get_webdriver():
    """Build a new Chrome WebDriver for use in Colab.

    The driver is created from the module-level ``service`` and
    ``chrome_options`` objects.

    Returns:
        The WebDriver instance produced by ``webdriver.Chrome``.
    """
    colab_driver = webdriver.Chrome(options=chrome_options, service=service)
    return colab_driver


Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:8 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,851 kB]
Get:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:11 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [3,159 kB]
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:13 http://archive.ubuntu.com

In [5]:
# --- SECTION 1: INSTALLATION AND SETUP FOR GOOGLE COLAB ---
# Install necessary packages and Chrome browser for Colab environment

print("Installing Python packages...")
!pip install selenium webdriver-manager beautifulsoup4 > /dev/null
print("Python packages installed.")

print("Setting up Google Chrome stable browser...")
!apt-get update > /dev/null
!wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - > /dev/null
!echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list
!apt-get update > /dev/null
!apt-get install google-chrome-stable -y > /dev/null
print("Google Chrome stable browser installed.")

# --- SECTION 2: IMPORTS AND WEBDRIVER SETUP ---
import csv
import re
import time
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Configure Chrome options for headless execution
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--start-maximized')
# A headless browser has no desktop window to maximise, so '--start-maximized'
# alone may leave a small default viewport. Request an explicit desktop-sized
# window as well so pages render their full layout.
chrome_options.add_argument('--window-size=1920,1080')
chrome_options.binary_location = '/usr/bin/google-chrome-stable'

# Initialize ChromeDriver service.
# ChromeDriverManager().install() returns the path of a driver binary, which is
# handed to Selenium's Service wrapper.
try:
    service = Service(ChromeDriverManager().install())
    print("ChromeDriver service initialized successfully.")
except Exception as e:
    print(f"Error initializing ChromeDriver service: {e}")
    # Chain the original exception so the root cause stays visible in the traceback.
    raise SystemExit("Failed to initialize ChromeDriver service. Exiting.") from e

def get_webdriver():
    """Create a Chrome WebDriver from the module-level ``service`` and ``chrome_options``.

    Returns:
        The WebDriver instance produced by ``webdriver.Chrome``.
    """
    chrome_driver = webdriver.Chrome(options=chrome_options, service=service)
    return chrome_driver

# --- SECTION 3: WEB SCRAPING LOGIC ---
# Scrapes the Sky News front page: accepts the cookie banner, scrolls to load
# tiles, collects unique article links, visits each article for its title, body,
# date and author, and finally writes everything to a CSV file.
target_url = "https://news.sky.com"
all_articles_data = []
scraped_urls = set()
driver = None

# Tags whose text is collected as article body content.
CONTENT_TAGS = ['p', 'h2', 'h3', 'ul', 'ol', 'blockquote']


def _collapse_whitespace(text):
    """Collapse every run of whitespace in ``text`` to a single space and trim the ends."""
    return " ".join(text.split())


def _resolve_article_url(base_url, href):
    """Turn an ``href`` into an absolute http(s) URL.

    Args:
        base_url: Site root that relative links are resolved against.
        href: Raw value of an anchor's ``href`` attribute.

    Returns:
        The absolute URL, or ``None`` when ``href`` is empty or does not resolve
        to an http/https address (for example ``mailto:`` or ``javascript:``).
    """
    href = href.strip()
    if not href:
        return None
    # urljoin handles root-relative, path-relative and protocol-relative
    # ("//host/path") links, which plain string concatenation gets wrong.
    absolute = urljoin(base_url.rstrip('/') + '/', href)
    if urlparse(absolute).scheme not in ('http', 'https'):
        return None
    return absolute


def _element_text(element):
    """Return readable text for one content element.

    Lists are rendered one item per line. Everything else is returned as a
    single whitespace-normalised string, so words separated by inline tags are
    not glued together.
    """
    if element.name in ('ul', 'ol'):
        items = [
            _collapse_whitespace(item.get_text())
            for item in element.find_all('li', recursive=False)
        ]
        items = [item for item in items if item]
        if items:
            return "\n".join(items)
    return _collapse_whitespace(element.get_text())


def _extract_body_text(article_body):
    """Join the text of the content elements inside ``article_body``.

    Elements nested inside another matched element (such as a ``<p>`` within a
    ``<blockquote>`` or ``<li>``) are skipped, because their text is already
    part of the enclosing element and would otherwise appear twice.

    Returns:
        The fragments separated by blank lines; an empty string if none were found.
    """
    content_elements = article_body.find_all(CONTENT_TAGS)
    matched_ids = {id(elem) for elem in content_elements}
    fragments = []
    for elem in content_elements:
        if any(id(parent) in matched_ids for parent in elem.parents):
            continue
        fragment = _element_text(elem)
        if fragment:
            fragments.append(fragment)
    return "\n\n".join(fragments)


try:
    print("\n--- Starting Sky News Scraping ---")
    driver = get_webdriver()
    print(f"Navigating to {target_url}...")
    driver.get(target_url)
    time.sleep(3)  # Initial page load

    # Handle Cookie Consent Pop-up
    try:
        print("Attempting to handle cookie consent...")
        cookie_accept_button = None
        # Try multiple selectors for robustness
        selectors = [
            (By.ID, "onetrust-accept-btn-handler"),
            (By.XPATH, "//button[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'accept')]"),
            (By.XPATH, "//button[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'agree')]"),
            (By.CSS_SELECTOR, 'button[class*="cookie"][class*="accept"], button[class*="consent"][class*="accept"]'),
            (By.CSS_SELECTOR, 'button[aria-label*="accept"], button[aria-label*="consent"]')
        ]
        for by, value in selectors:
            try:
                cookie_accept_button = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((by, value))
                )
                break
            except TimeoutException:
                continue
        if cookie_accept_button:
            cookie_accept_button.click()
            print("Successfully clicked cookie consent button.")
            time.sleep(2)
        else:
            print("Cookie consent button not found. Proceeding without clicking.")
    except Exception as e:
        # Best effort: a failed consent click should not abort the scrape.
        print(f"Error handling cookie consent: {e}")

    # Dynamic Scrolling to Load Articles
    print("Scrolling to load articles...")
    last_height = driver.execute_script("return document.body.scrollHeight")
    scroll_pause_time = 3
    max_scroll_attempts = 20  # Prevent infinite loops
    scroll_attempts = 0

    while scroll_attempts < max_scroll_attempts:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)
        new_height = driver.execute_script("return document.body.scrollHeight")
        scroll_attempts += 1
        print(f"  Scrolled {scroll_attempts}/{max_scroll_attempts}. Page height: {new_height}")
        # An unchanged page height after scrolling means nothing new was loaded.
        if new_height == last_height:
            print("No more articles to load.")
            break
        last_height = new_height

    print("Finished scrolling. Extracting links...")
    html_content = driver.page_source
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract Article Links and Headlines
    # Use flexible selectors for article containers, most specific first
    article_selectors = [
        'div.sdc-site-tiles__item',  # Common for Sky News tiles
        'article',  # Fallback for semantic HTML
        'div[class*="tile"], div[class*="card"]'  # Broad match for tile/card classes
    ]
    article_links_to_visit = []
    for selector in article_selectors:
        articles = soup.select(selector)
        print(f"Found {len(articles)} elements with selector '{selector}'")
        for article in articles:
            link_tag = article.find('a', href=True)
            if link_tag is None:
                continue
            article_url = _resolve_article_url(target_url, link_tag['href'])
            if article_url is None or article_url in scraped_urls:
                continue
            # Extract headline; fall back to the link text, and to a placeholder
            # when even that is empty (e.g. image-only links).
            headline_element = article.find(['h3', 'h2', 'h1']) or link_tag
            headline_text = _collapse_whitespace(headline_element.get_text()) or "No Headline Found"
            article_links_to_visit.append({'headline': headline_text, 'url': article_url})
            scraped_urls.add(article_url)
        if article_links_to_visit:  # Stop if we found articles
            break

    print(f"Collected {len(article_links_to_visit)} unique article links.")

    # Scrape Individual Articles
    article_scrape_delay = 1
    for i, article_info in enumerate(article_links_to_visit):
        article_url = article_info['url']
        initial_headline = article_info['headline']
        print(f"\nScraping article {i+1}/{len(article_links_to_visit)}: '{initial_headline}' ({article_url})...")

        try:
            driver.get(article_url)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class*="article-body"], div[class*="content"]'))
            )
            article_soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Extract full article title
            title_element = article_soup.find(['h1', 'h2'], class_=re.compile('headline|title', re.I))
            full_article_title = title_element.text.strip() if title_element else initial_headline

            # Extract article content, trying the most specific container first
            body_selectors = [
                'div.sdc-article-body',
                'div[class*="article-body"]',
                'article',
                'div[class*="content"]'
            ]
            article_content = ""
            for sel in body_selectors:
                article_body = article_soup.select_one(sel)
                if article_body is None:
                    continue
                article_content = _extract_body_text(article_body)
                # Keep trying broader selectors if this container held no text.
                if article_content:
                    break
            if not article_content:
                article_content = "Content not found."

            # Clean content
            article_content = re.sub(r'\n\s*\n', '\n\n', article_content).strip()

            # Extract publication date
            date_element = article_soup.find(['time', 'p', 'span'], class_=re.compile('date|time', re.I))
            publication_date = date_element.text.strip() if date_element else "Date not found"

            # Extract author
            author_element = article_soup.find(['p', 'span'], class_=re.compile('author|byline', re.I))
            author = author_element.text.strip() if author_element else "Author not found"

            all_articles_data.append({
                'Headline': full_article_title,
                'URL': article_url,
                'Publication Date': publication_date,
                'Author': author,
                'Content': article_content
            })
            print(f"  Successfully scraped: '{full_article_title}'")
        except Exception as e:
            # One failing article must not stop the remaining ones.
            print(f"  Error scraping {article_url}: {e}")
        time.sleep(article_scrape_delay)

    # Save to CSV
    if all_articles_data:
        csv_filename = 'sky_news_articles.csv'
        keys = ['Headline', 'URL', 'Publication Date', 'Author', 'Content']
        with open(csv_filename, 'w', newline='', encoding='utf-8') as output_file:
            dict_writer = csv.DictWriter(output_file, fieldnames=keys)
            dict_writer.writeheader()
            dict_writer.writerows(all_articles_data)
        print(f"\n--- Scraping Complete! Saved {len(all_articles_data)} articles to {csv_filename} ---")
    else:
        print("\nNo articles scraped.")

except Exception as e:
    print(f"Critical error: {e}")
finally:
    # Always release the browser process, even after a failure.
    if driver is not None:
        driver.quit()
        print("Browser closed.")

Installing Python packages...
Python packages installed.
Setting up Google Chrome stable browser...
W: Target Packages (main/binary-amd64/Packages) is configured multiple times in /etc/apt/sources.list.d/google-chrome.list:3 and /etc/apt/sources.list.d/google-chrome.list:4
W: Target Packages (main/binary-all/Packages) is configured multiple times in /etc/apt/sources.list.d/google-chrome.list:3 and /etc/apt/sources.list.d/google-chrome.list:4
W: Target Packages (main/binary-amd64/Packages) is configured multiple times in /etc/apt/sources.list.d/google-chrome.list:3 and /etc/apt/sources.list.d/google-chrome.list:5
W: Target Packages (main/binary-all/Packages) is configured multiple times in /etc/apt/sources.list.d/google-chrome.list:3 and /etc/apt/sources.list.d/google-chrome.list:5
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
W: Target Packages 



Browser closed.
