In [1]:
# --- SECTION 1: INSTALLATION AND SETUP FOR GOOGLE COLAB ---
print("Installing Python packages...")
!pip install selenium webdriver-manager beautifulsoup4 dateparser > /dev/null
print("Python packages installed.")

print("Setting up Google Chrome stable browser...")
!apt-get update > /dev/null
!wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - > /dev/null
!echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list
!apt-get update > /dev/null
!apt-get install google-chrome-stable -y > /dev/null
print("Google Chrome stable browser installed.")

# --- SECTION 2: MAIN CODE ---
import re
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import csv
import platform
import dateparser

# Configure Chrome options for headless execution
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--start-maximized')
chrome_options.binary_location = '/usr/bin/google-chrome-stable'

# Initialize ChromeDriver service
try:
    service = Service(ChromeDriverManager().install())
except Exception as e:
    raise SystemExit(f"Error initializing ChromeDriver service: {e}")

def get_webdriver():
    return webdriver.Chrome(service=service, options=chrome_options)

def clean_text(text):
    """Clean and normalize text"""
    if not text:
        return ""
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with single space
    text = text.strip()
    return text

def extract_date(soup):
    """Improved date extraction with multiple fallbacks"""
    # Try to find date in common elements
    date_selectors = [
        ('time', {'datetime': True}),
        ('time', {'class': re.compile('date|time|published', re.I)}),
        ('span', {'class': re.compile('date|time|published', re.I)}),
        ('div', {'class': re.compile('date|time|published', re.I)}),
        ('p', {'class': re.compile('date|time|published', re.I)}),
        ('meta', {'property': 'article:published_time'}),
        ('meta', {'name': 'date'}),
        ('meta', {'name': 'DC.date.issued'}),
    ]

    for tag, attrs in date_selectors:
        element = soup.find(tag, attrs)
        if element:
            if tag == 'meta':
                date_text = element.get('content', '')
            elif 'datetime' in element.attrs:
                date_text = element['datetime']
            else:
                date_text = element.get_text()

            date_text = clean_text(date_text)
            if date_text:
                # Parse the date string to a consistent format
                parsed_date = dateparser.parse(date_text)
                if parsed_date:
                    return parsed_date.strftime('%Y-%m-%d %H:%M:%S')

    # If no date found in standard locations, try to find it in the article body
    body_text = soup.get_text()
    date_patterns = [
        r'\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\s+\d{4}',
        r'\d{4}-\d{2}-\d{2}',
        r'\d{1,2}/\d{1,2}/\d{2,4}',
        r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}'
    ]

    for pattern in date_patterns:
        match = re.search(pattern, body_text)
        if match:
            parsed_date = dateparser.parse(match.group())
            if parsed_date:
                return parsed_date.strftime('%Y-%m-%d %H:%M:%S')

    return "Date not found"

def extract_content(soup):
    """Improved content extraction that separates metadata from content"""
    # First try to find the main article body
    body_selectors = [
        'div[class*="article-body"]',
        'div[class*="content"]',
        'article',
        'main',
        'div[class*="story"]',
        'div[class*="post-content"]'
    ]

    for selector in body_selectors:
        article_body = soup.select_one(selector)
        if article_body:
            # Remove unwanted elements that might contain metadata
            for unwanted in article_body.select('.date, .author, .category, .tags, .social-share, .comments'):
                unwanted.decompose()

            # Get clean paragraphs
            paragraphs = []
            for p in article_body.find_all(['p', 'h2', 'h3'], recursive=True):
                text = clean_text(p.get_text())
                if text and len(text.split()) > 3:  # Skip very short paragraphs
                    paragraphs.append(text)

            if paragraphs:
                return "\n\n".join(paragraphs)

    # Fallback to more generic approach if specific selectors don't work
    body_text = soup.get_text()
    body_text = re.sub(r'\s+', ' ', body_text)
    return clean_text(body_text)

# Web scraping logic
base_url = "https://trt.global/world/politics"
all_articles_data = []
scraped_urls = set()
driver = None

try:
    driver = get_webdriver()
    driver.get(base_url)
    time.sleep(3)

    # Handle Cookie Consent Pop-up
    try:
        cookie_accept_button = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(translate(text(), 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'accept')]"))
        )
        cookie_accept_button.click()
        time.sleep(2)
    except TimeoutException:
        pass

    # Click "Load More" button until no more articles are available
    load_more_attempts = 0
    max_load_more_attempts = 50

    while load_more_attempts < max_load_more_attempts:
        try:
            # Scroll to bottom to ensure the button is in view
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)

            # Find and click the "Load More" button
            load_more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(translate(., 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz'), 'load more')]"))
            )
            driver.execute_script("arguments[0].click();", load_more_button)
            print("Clicked 'Load More' button")
            time.sleep(3)
            load_more_attempts += 1
        except TimeoutException:
            print("No more 'Load More' button found or reached maximum attempts")
            break
        except Exception as e:
            print(f"Error clicking 'Load More': {str(e)}")
            break

    # After loading all articles, parse the page
    html_content = driver.page_source
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract Article Links and Data
    article_selectors = [
        'article',
        'div[class*="news-item"]',
        'div[class*="article"]',
        'li[class*="item"]',
        'div[class*="card"]'
    ]

    article_links_to_visit = []

    for selector in article_selectors:
        articles = soup.select(selector)
        for article in articles:
            link_tag = article.find('a', href=True)
            if link_tag and link_tag['href']:
                article_url = link_tag['href']
                # Ensure full URL and handle redirects
                if not article_url.startswith('http'):
                    article_url = 'https://trt.global' + article_url if article_url.startswith('/') else base_url.rstrip('/') + '/' + article_url.lstrip('/')

                if article_url in scraped_urls:
                    continue

                # Try to get title from multiple possible elements and classes
                title_element = None
                title_selectors = [
                    ('h1', re.compile('title|headline', re.I)),
                    ('h2', re.compile('title|headline', re.I)),
                    ('h3', re.compile('title|headline', re.I)),
                    ('div', re.compile('title|headline', re.I)),
                    ('a', re.compile('title|headline', re.I)),
                    ('span', re.compile('title|headline', re.I))
                ]

                for tag, class_pattern in title_selectors:
                    if not title_element:
                        title_element = article.find(tag, class_=class_pattern)
                        if not title_element:
                            title_element = article.find(tag)

                title_text = clean_text(title_element.get_text()) if title_element else "No Title Found"

                article_links_to_visit.append({
                    'title': title_text,
                    'url': article_url
                })
                scraped_urls.add(article_url)

        if article_links_to_visit:
            break

    print(f"Found {len(article_links_to_visit)} articles to scrape")

    # Scrape Individual Articles with Retry
    max_retries = 3
    for i, article_info in enumerate(article_links_to_visit):
        article_url = article_info['url']
        initial_title = article_info['title']
        retries = 0

        print(f"\nScraping article {i+1}/{len(article_links_to_visit)}: {article_url}")

        while retries < max_retries:
            try:
                driver.get(article_url)
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, 'div[class*="article-body"], div[class*="content"], article, main'))
                )
                article_soup = BeautifulSoup(driver.page_source, 'html.parser')

                # Extract title with multiple fallbacks
                title_element = None
                title_selectors = [
                    ('h1', re.compile('title|headline|article-title|heading', re.I)),
                    ('h2', re.compile('title|headline|article-title|heading', re.I)),
                    ('h3', re.compile('title|headline|article-title|heading', re.I)),
                    ('div', re.compile('title|headline|article-title|heading', re.I))
                ]

                for tag, class_pattern in title_selectors:
                    if not title_element:
                        title_element = article_soup.find(tag, class_=class_pattern)

                full_title = clean_text(title_element.get_text()) if title_element else initial_title

                # Extract date with improved function
                date = extract_date(article_soup)

                # Extract section with fallback
                section_element = article_soup.find('a', class_=re.compile('category|section', re.I)) or \
                                article_soup.find('div', class_=re.compile('category|section', re.I))
                section = clean_text(section_element.get_text()) if section_element and section_element.get_text() else "Politics"

                # Extract uri
                uri = article_url

                # Extract content with improved function
                article_content = extract_content(article_soup)

                all_articles_data.append({
                    'date': date,
                    'section': section,
                    'uri': uri,
                    'title': full_title,
                    'content': article_content
                })

                print(f"Successfully scraped: '{full_title}'")
                print(f"Date: {date}")
                print(f"Section: {section}")
                print(f"Content length: {len(article_content)} characters\n")
                break

            except (TimeoutException, NoSuchElementException, WebDriverException) as e:
                retries += 1
                print(f"Error scraping {article_url} (Attempt {retries}/{max_retries}): {str(e)}")
                if retries == max_retries:
                    print(f"Failed to scrape {article_url} after {max_retries} attempts")
                time.sleep(2 ** retries)
            except Exception as e:
                print(f"Unexpected error scraping {article_url}: {str(e)}")
                break

        time.sleep(1)

    # Save to CSV with proper formatting
    if all_articles_data:
        csv_filename = 'trt_politics_news_improved.csv'
        keys = ['date', 'section', 'uri', 'title', 'content']

        # Write CSV with proper escaping
        with open(csv_filename, 'w', newline='', encoding='utf-8') as output_file:
            writer = csv.DictWriter(output_file, fieldnames=keys, quoting=csv.QUOTE_ALL)
            writer.writeheader()
            for article in all_articles_data:
                # Ensure all fields are properly escaped
                cleaned_article = {
                    'date': article['date'].replace('\n', ' ').replace('\r', ''),
                    'section': article['section'].replace('\n', ' ').replace('\r', ''),
                    'uri': article['uri'],
                    'title': article['title'].replace('\n', ' ').replace('\r', ''),
                    'content': article['content'].replace('\r', '')  # Keep \n for paragraphs
                }
                writer.writerow(cleaned_article)

        print(f"\nSaved {len(all_articles_data)} articles to {csv_filename}")
    else:
        print("No articles scraped.")

except Exception as e:
    print(f"Critical error: {e}")
    if driver:
        driver.quit()
    raise

finally:
    if driver:
        driver.quit()

if platform.system() == "Emscripten":
    pass
else:
    if __name__ == "__main__":
        pass

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Date: 2025-06-27 00:00:00
Section: Politics
Content length: 4450 characters


Scraping article 213/518: https://trt.global/world/article/c7727273db5d
Successfully scraped: 'Trump says Gaza ceasefire is possible, 'even next week'US President Trump voices optimism about new ceasefire in Gaza, saying an agreement involving Israel and Hamas could come as early as next week.'
Date: 2025-06-27 00:00:00
Section: Politics
Content length: 2200 characters


Scraping article 214/518: https://trt.global/world/article/a09a26678157
Successfully scraped: 'Fidan: Europe now shifting to Erdogan's tough stance on Gaza, UkraineIsrael's recent actions revealed its inability to dismantle Iran’s nuclear programme on its own, Turkish top diplomat says.'
Date: 2025-06-27 00:00:00
Section: Politics
Content length: 1453 characters


Scraping article 215/518: https://trt.global/world/article/f9b8831be82e
Successfully scraped: 'Was Armenia coup plot