In [None]:
pip install -U selenium

In [None]:
from datetime import datetime
import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException
from bs4 import BeautifulSoup
import pandas as pd

# Function to scrape CNN articles based on specified parameters
def scrape_cnn_articles(stop_date, search_term, start_page):
    """Scrape CNN search results page by page until an article older than stop_date.

    The search is requested with ``sort=newest``; every result card on each
    page is read and pagination continues through the "next page" arrow until
    a card dated before ``stop_date`` is seen, or until no further results
    can be loaded. Whatever was collected is then written to
    ``{search_term}_scraped_data.csv`` in the working directory.

    Args:
        stop_date: Cut-off date as a string in ``'%b %d, %Y'`` format
            (for example ``'Oct 07, 2023'``).
        search_term: Search query; also used as the prefix of the CSV name.
        start_page: Results page to start from (int or numeric string); it is
            interpolated into the ``page`` URL parameter.

    Returns:
        A list of dicts with the keys ``'Headline'``, ``'Link'`` and ``'Date'``
        (a ``datetime``). The first article found that is older than
        ``stop_date`` is included as the last entry.

    Raises:
        ValueError: If ``stop_date`` does not match ``'%b %d, %Y'``.
    """
    # Function-scope imports: these names are not part of the notebook's
    # import cell, and importing here keeps the function usable on its own.
    from urllib.parse import quote_plus
    from selenium.common.exceptions import TimeoutException

    date_format = '%b %d, %Y'

    # Parse the stop date BEFORE launching Chrome so that a malformed value
    # fails fast and cannot leave an orphaned browser process behind.
    stop_date_object = datetime.strptime(stop_date, date_format)

    def card_links(parsed_page):
        """Return the first link of every result card (used to detect page changes)."""
        links = []
        for card in parsed_page.find_all('div', class_='card'):
            anchor = card.find('a')
            links.append(anchor.get('href') if anchor else None)
        return links

    # Set up options for the Chrome WebDriver
    options = Options()
    options.headless = False  # Set to True if you want to run in headless mode (no browser window)

    # Initialize the WebDriver
    driver = webdriver.Chrome(options=options)

    results = []          # scraped rows, in the order they were encountered
    seen = set()          # (headline, link, date) keys already stored
    reached_stop_date = False
    output_path = f'{search_term}_scraped_data.csv'

    try:
        # URL-encode the query so multi-word or non-ASCII terms form a valid URL.
        driver.get(
            'https://edition.cnn.com/search'
            f'?q={quote_plus(str(search_term))}&from=0&size=10&page={start_page}'
            '&sort=newest&types=article&section='
        )

        # Main scraping loop
        while True:
            # Wait until the articles are present on the page
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located((By.CLASS_NAME, 'card'))
                )
            except TimeoutException:
                # No cards at all: keep what was collected instead of crashing.
                print('No articles found on the current page. Stopping scraping.')
                break

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            page_links = card_links(soup)

            for article in soup.find_all('div', class_='card'):
                date_element = article.find('div', class_='container__date')
                date_string = date_element.get_text(strip=True) if date_element else 'Date not available'

                headline_element = article.find('span', class_='container__headline-text')
                headline = headline_element.get_text(strip=True) if headline_element else 'Headline not available'

                link_element = article.find('a')
                link = link_element.get('href', 'Link not available') if link_element else 'Link not available'

                # A card without a parseable date cannot be compared with the
                # stop date; skip it rather than abort the whole scrape.
                try:
                    date_object = datetime.strptime(date_string, date_format)
                except ValueError:
                    print(f'Skipping "{headline}": unparseable date "{date_string}".')
                    continue

                # The same page can be parsed more than once (stale-element
                # retry, slow page switch), so store each article only once.
                entry_key = (headline, link, date_object)
                if entry_key not in seen:
                    seen.add(entry_key)
                    results.append({
                        'Headline': headline,
                        'Link': link,
                        'Date': date_object
                    })

                # Check if the article date is before the stop_date
                if date_object < stop_date_object:
                    print(f'Stopping scraping as an article with date {date_string} has been reached.')
                    reached_stop_date = True
                    break

            if reached_stop_date:
                break

            # Try to find and click the next page button
            try:
                next_page_button = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, 'pagination-arrow-right'))
                )

                # Scroll to the next page button and click it
                driver.execute_script("arguments[0].scrollIntoView(true);", next_page_button)
                time.sleep(7)  # Pause before clicking (also paces the requests)
                next_page_button.click()

            except StaleElementReferenceException:
                # The button was re-rendered between lookup and click; loop
                # again to re-read the page and look the button up afresh.
                print('Stale element reference. Retrying to find the next page button.')
                continue

            except TimeoutException:
                # No next-page arrow: treat it as the end of the results.
                print('No next page button found. Stopping scraping.')
                break

            # Wait for the result cards to actually change, so the next
            # iteration does not re-read the old page and a later click does
            # not skip the page that is still loading.
            try:
                WebDriverWait(driver, 10).until(
                    lambda d: card_links(BeautifulSoup(d.page_source, 'html.parser')) != page_links
                )
            except TimeoutException:
                print('Search results did not change after clicking next; re-reading the current page.')

    finally:
        # Quit the WebDriver when done
        driver.quit()

    # Write the scraped data to a CSV file (also when the stop date was not
    # reached, so partial results are never lost).
    with open(output_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=['Headline', 'Link', 'Date'])
        writer.writeheader()
        writer.writerows(results)
    print(f'Scraped data has been saved to {output_path}')
    return results

# Example usage:
# scrape_cnn_articles('Jan 01, 2024', 'example_search', 1)



In [None]:
# Israel: scrape from results page 1 until an article dated before Oct 07, 2023 appears
scraped_articles = scrape_cnn_articles(
    stop_date='Oct 07, 2023',
    search_term='Israel',
    start_page='1',
)

Stopping scraping as an article with date Oct 06, 2023 has been reached.
Scraped data has been saved to Israel_scraped_data.csv


In [None]:
# Palestine: scrape from results page 1 until an article dated before Oct 07, 2023 appears
scraped_articles = scrape_cnn_articles(
    stop_date='Oct 07, 2023',
    search_term='Palestine',
    start_page='1',
)

Stale element reference. Retrying to find the next page button.
Stale element reference. Retrying to find the next page button.
Stale element reference. Retrying to find the next page button.
Stale element reference. Retrying to find the next page button.
Stale element reference. Retrying to find the next page button.
Stopping scraping as an article with date Oct 04, 2023 has been reached.
Scraped data has been saved to Palestine_scraped_data.csv


In [None]:
# Hamas: scrape from results page 1 until an article dated before Oct 07, 2023 appears
scraped_articles = scrape_cnn_articles(
    stop_date='Oct 07, 2023',
    search_term='Hamas',
    start_page='1',
)

Stale element reference. Retrying to find the next page button.
Stale element reference. Retrying to find the next page button.
Stale element reference. Retrying to find the next page button.
Stale element reference. Retrying to find the next page button.
Stale element reference. Retrying to find the next page button.
Stale element reference. Retrying to find the next page button.
Stale element reference. Retrying to find the next page button.
Stale element reference. Retrying to find the next page button.
Stopping scraping as an article with date Aug 22, 2023 has been reached.
Scraped data has been saved to Hamas_scraped_data.csv


In [None]:
# Gaza: scrape from results page 1 until an article dated before Oct 07, 2023 appears
scraped_articles = scrape_cnn_articles(
    stop_date='Oct 07, 2023',
    search_term='Gaza',
    start_page='1',
)

Stale element reference. Retrying to find the next page button.
Stale element reference. Retrying to find the next page button.
Stale element reference. Retrying to find the next page button.
Stale element reference. Retrying to find the next page button.
Stale element reference. Retrying to find the next page button.
Stopping scraping as an article with date Oct 06, 2023 has been reached.
Scraped data has been saved to Gaza_scraped_data.csv


In [None]:
# IDF: scrape from results page 1 until an article dated before Oct 07, 2023 appears
scraped_articles = scrape_cnn_articles(
    stop_date='Oct 07, 2023',
    search_term='IDF',
    start_page='1',
)

Stopping scraping as an article with date Oct 06, 2023 has been reached.
Scraped data has been saved to IDF_scraped_data.csv


Combining CSV files and filtering on date

In [None]:
# Per-keyword scrape outputs that make up the Israel/Palestine data set
csv_files = [
    'Gaza_scraped_data.csv',
    'Hamas_scraped_data.csv',
    'Israel_scraped_data.csv',
    'Palestine_scraped_data.csv',
    'IDF_scraped_data.csv',
]

# Load every file into its own DataFrame
dfs = [pd.read_csv(csv_path) for csv_path in csv_files]

# Stack them, drop rows that are exact duplicates across keywords, and save
combined_df = pd.concat(dfs, ignore_index=True).drop_duplicates()
combined_df.to_csv('israel_palestine_conflict.csv', index=False)



In [None]:
# Keep only the articles published inside the analysis window and overwrite
# the combined CSV with that subset.
df = pd.read_csv('israel_palestine_conflict.csv')

df['Date'] = pd.to_datetime(df['Date'])

start_date = '2023-10-07'
end_date = '2023-12-07'

# Use a boolean mask rather than string-slicing a DatetimeIndex: the combined
# file is not sorted by date, and label slicing on a non-monotonic
# DatetimeIndex raises KeyError on pandas >= 2.0. Both bounds are inclusive
# whole days, as they were with the slice.
publication_day = df['Date'].dt.normalize()
in_window = publication_day.between(pd.Timestamp(start_date), pd.Timestamp(end_date))

# Put 'Date' first to keep the column layout the set_index/reset_index
# round trip used to produce.
column_order = ['Date'] + [column for column in df.columns if column != 'Date']
filtered_df = df.loc[in_window, column_order].reset_index(drop=True)

filtered_df.to_csv('israel_palestine_conflict.csv', index=False)


RUSSO-UKRAINIAN WAR

In [None]:
# Ukraine: scrape from results page 207 until an article dated before Feb 24, 2022 appears
scraped_articles = scrape_cnn_articles(
    stop_date='Feb 24, 2022',
    search_term='Ukraine',
    start_page='207',
)

Stale element reference. Retrying to find the next page button.
Stale element reference. Retrying to find the next page button.
Stale element reference. Retrying to find the next page button.
Stopping scraping as an article with date Feb 23, 2022 has been reached.
Scraped data has been saved to Ukraine_scraped_data.csv


In [None]:
# Russia: scrape from results page 176 until an article dated before Feb 24, 2022 appears
scraped_articles = scrape_cnn_articles(
    stop_date='Feb 24, 2022',
    search_term='Russia',
    start_page='176',
)

Stopping scraping as an article with date Feb 23, 2022 has been reached.
Scraped data has been saved to Russia_scraped_data.csv


In [None]:
# Putin: scrape from results page 181 until an article dated before Feb 24, 2022 appears
scraped_articles = scrape_cnn_articles(
    stop_date='Feb 24, 2022',
    search_term='Putin',
    start_page='181',
)

Stale element reference. Retrying to find the next page button.
Stale element reference. Retrying to find the next page button.
Stale element reference. Retrying to find the next page button.
Stopping scraping as an article with date Feb 23, 2022 has been reached.
Scraped data has been saved to Putin_scraped_data.csv


In [None]:
# Zelensky: scrape from results page 14 until an article dated before Feb 24, 2022 appears
scraped_articles = scrape_cnn_articles(
    stop_date='Feb 24, 2022',
    search_term='Zelensky',
    start_page='14',
)

Stopping scraping as an article with date Feb 19, 2022 has been reached.
Scraped data has been saved to Zelensky_scraped_data.csv


In [None]:
# Per-keyword scrape outputs that make up the Russia/Ukraine data set
csv_files = [
    'Russia_scraped_data.csv',
    'Zelensky_scraped_data.csv',
    'Ukraine_scraped_data.csv',
    'Putin_scraped_data.csv',
]

# One DataFrame per CSV file, in the order listed above
dfs = [pd.read_csv(csv_path) for csv_path in csv_files]

# Concatenate with a fresh 0..n-1 index, then remove exact duplicate rows
combined_df = pd.concat(dfs, ignore_index=True).drop_duplicates()

# Persist the merged, de-duplicated table
combined_df.to_csv('russia_ukraine_conflict.csv', index=False)

In [None]:
# Keep only the articles published inside the analysis window and overwrite
# the combined CSV with that subset.
df = pd.read_csv('russia_ukraine_conflict.csv')

#filter on date
df['Date'] = pd.to_datetime(df['Date'])

start_date = '2022-02-24'
end_date = '2022-04-24'

# Use a boolean mask rather than string-slicing a DatetimeIndex: the combined
# file is not sorted by date, and label slicing on a non-monotonic
# DatetimeIndex raises KeyError on pandas >= 2.0. Both bounds are inclusive
# whole days, as they were with the slice.
publication_day = df['Date'].dt.normalize()
in_window = publication_day.between(pd.Timestamp(start_date), pd.Timestamp(end_date))

# Put 'Date' first to keep the column layout the set_index/reset_index
# round trip used to produce.
column_order = ['Date'] + [column for column in df.columns if column != 'Date']
filtered_df = df.loc[in_window, column_order].reset_index(drop=True)

filtered_df.to_csv('russia_ukraine_conflict.csv', index=False)
