In [7]:
# --- SECTION 1: INSTALLATION AND SETUP FOR GOOGLE COLAB ---
print("Installing Python packages...")
!pip install selenium webdriver-manager beautifulsoup4 > /dev/null
print("Python packages installed.")

print("Setting up Google Chrome stable browser...")
!apt-get update > /dev/null
!wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - > /dev/null
!echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list
!apt-get update > /dev/null
!apt-get install google-chrome-stable -y > /dev/null
print("Google Chrome stable browser installed.")

# --- SECTION 2: MAIN CODE ---
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import csv

# Configure Chrome options for headless execution
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--start-maximized')
chrome_options.binary_location = '/usr/bin/google-chrome-stable'

# Initialize ChromeDriver service
try:
    service = Service(ChromeDriverManager().install())
except Exception as e:
    raise SystemExit(f"Error initializing ChromeDriver service: {e}")

def get_webdriver():
    return webdriver.Chrome(service=service, options=chrome_options)

# Web scraping logic
base_url = "https://trt.global/world/politics"
all_articles_data = []
scraped_urls = set()
driver = None

try:
    driver = get_webdriver()
    driver.get(base_url)
    time.sleep(3)

    # Handle Cookie Consent Pop-up
    try:
        cookie_accept_button = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'accept')]"))
        )
        cookie_accept_button.click()
        time.sleep(2)
    except TimeoutException:
        pass

    # Dynamic Scrolling to Load Articles
    last_height = driver.execute_script("return document.body.scrollHeight")
    scroll_pause_time = 3
    max_scroll_attempts = 20
    scroll_attempts = 0

    while scroll_attempts < max_scroll_attempts:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause_time)
        new_height = driver.execute_script("return document.body.scrollHeight")
        scroll_attempts += 1
        if new_height == last_height:
            break
        last_height = new_height

    # Handle Pagination (if available)
    while True:
        html_content = driver.page_source
        soup = BeautifulSoup(html_content, 'html.parser')

        # Extract Article Links and Data
        article_selectors = ['article', 'div[class*="news-item"], div[class*="article"]', 'li[class*="item"]']
        article_links_to_visit = []
        for selector in article_selectors:
            articles = soup.select(selector)
            for article in articles:
                link_tag = article.find('a', href=True)
                if link_tag and link_tag['href']:
                    article_url = link_tag['href']
                    # Ensure full URL and handle redirects
                    if not article_url.startswith('http'):
                        article_url = 'https://trt.global' + article_url if article_url.startswith('/') else base_url.rstrip('/') + '/' + article_url.lstrip('/')
                    try:
                        driver.get(article_url)
                        time.sleep(1)
                        article_url = driver.current_url  # Get the resolved URL after redirect
                    except WebDriverException:
                        continue
                    if article_url in scraped_urls:
                        continue
                    title_element = article.find(['h1', 'h2', 'h3', 'div'], class_=re.compile('title|headline|article-title|heading', re.I))
                    title_text = title_element.text.strip() if title_element else "No Title Found"
                    article_links_to_visit.append({'title': title_text, 'url': article_url})
                    scraped_urls.add(article_url)
            if article_links_to_visit:
                break

        # Check for next page link
        next_page = soup.find('a', class_=re.compile('next|pagination-next|load-more', re.I))
        if next_page and 'href' in next_page.attrs:
            next_url = 'https://trt.global' + next_page['href'] if next_page['href'].startswith('/') else next_page['href']
            driver.get(next_url)
            time.sleep(3)
        else:
            break

    # Scrape Individual Articles with Retry
    max_retries = 3
    for i, article_info in enumerate(article_links_to_visit):
        article_url = article_info['url']
        initial_title = article_info['title']
        retries = 0
        while retries < max_retries:
            try:
                driver.get(article_url)
                WebDriverWait(driver, 15).until(  # Increased wait time to 15 seconds
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class*="article-body"], div[class*="content"], article, main'))
                )
                article_soup = BeautifulSoup(driver.page_source, 'html.parser')

                # Extract title with fallback
                title_element = article_soup.find(['h1', 'h2', 'h3', 'div'], class_=re.compile('title|headline|article-title|heading', re.I))
                full_title = title_element.text.strip() if title_element else initial_title

                # Extract date with fallback
                date_element = article_soup.find(['time', 'p', 'span', 'div'], class_=re.compile('date|time|published|post-date', re.I))
                date = date_element.text.strip() if date_element else "Date not found"

                # Extract section with fallback
                section_element = article_soup.find('a', class_=re.compile('category|section', re.I)) or article_soup.find('div', class_=re.compile('category|section', re.I))
                section = section_element.text.strip() if section_element and section_element.text.strip() else "Politics"

                # Extract uri
                uri = article_url

                # Extract content in paragraph format
                body_selectors = ['div[class*="article-body"]', 'div[class*="content"]', 'article', 'main']
                article_content = ""
                for sel in body_selectors:
                    article_body = article_soup.select_one(sel)
                    if article_body:
                        paragraphs = article_body.find_all(['p', 'div'], recursive=False)
                        article_content = "\n\n".join(p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True))
                        break
                if not article_content:
                    article_content = "Content not found."

                all_articles_data.append({
                    'date': date,
                    'section': section,
                    'uri': uri,
                    'title': full_title,
                    'description': article_content
                })
                print(f"Successfully scraped: '{full_title}' from {article_url}")
                break
            except (TimeoutException, NoSuchElementException, WebDriverException) as e:
                retries += 1
                print(f"Error scraping {article_url} (Attempt {retries}/{max_retries}): {str(e)}")
                if retries == max_retries:
                    print(f"Failed to scrape {article_url} after {max_retries} attempts")
                time.sleep(2 ** retries)  # Exponential backoff
            except Exception as e:
                print(f"Unexpected error scraping {article_url}: {str(e)}")
                break
        time.sleep(1)

    # Save to CSV
    if all_articles_data:
        csv_filename = 'trt_politics_news.csv'
        keys = ['date', 'section', 'uri', 'title', 'description']
        with open(csv_filename, 'w', newline='', encoding='utf-8') as output_file:
            dict_writer = csv.DictWriter(output_file, fieldnames=keys)
            dict_writer.writeheader()
            dict_writer.writerows(all_articles_data)
        print(f"Saved {len(all_articles_data)} articles to {csv_filename}")
    else:
        print("No articles scraped.")

except Exception as e:
    print(f"Critical error: {e}")
finally:
    if driver:
        driver.quit()

if platform.system() == "Emscripten":
    pass
else:
    if __name__ == "__main__":
        pass

Installing Python packages...
Python packages installed.
Setting up Google Chrome stable browser...
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Google Chrome stable browser installed.
Successfully scraped: 'No Title Found' from https://trt.global/world/article/5bc24073797c
Successfully scraped: 'No Title Found' from https://trt.global/world/article/114c2add2514
Successfully scraped: 'No Title Found' from https://trt.global/world/article/e980eed3e9d1
Successfully scraped: 'No Title Found' from https://trt.global/world/article/3263d7078994
Successfully scraped: 'No Title Found' from https://trt.global/world/article/f47ccb6ccf14
Successfully scraped: 'No Title 