In [None]:
!pip install pandas requests bs4 selenium webdriver-manager

In [1]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd

# Set up Selenium with Chrome WebDriver
service = Service(ChromeDriverManager().install())
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run in headless mode (no browser UI)
driver = webdriver.Chrome(service=service, options=options)


# Define the range of pages to scrape for Palestine-related articles
first_page = 155
last_page = 348

# Define the range of pages to scrape for Russia Ukraine-related articles
# first_page = 95
# last_page = 270

all_urls = []

try:
    # Loop over each page
    for page in range(first_page, last_page):
        r_page = page * 10
        # URL of the main search page
        # Palestine Israel Gaza search
        url = f"https://edition.cnn.com/search?q=Palestine+or+Israel+or+Gaza&from={r_page}&size=10&page=1&sort=newest&types=all&section="
        
        # Russia Ukraine search
        # url = f"https://edition.cnn.com/search?q=+Russia+or+Ukraine&from={r_page}&size=10&page=1&sort=newest&types=all&section="
        print(url)

        # Open the URL with Selenium
        driver.get(url)
        
        # Wait until the page content (targeted span element) is loaded
        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "container__headline-text"))
            )
        except:
            print(f"Timeout or missing elements on page {page}")
            continue

        # Parse the fully loaded page with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Find all span elements with the specified class and extract URLs
        elements = soup.find_all("span", class_="container__headline-text")
        urls = [element.get('data-zjs-href') for element in elements if element.get('data-zjs-href')]
        all_urls.extend(urls)  # Add URLs directly to the main list

finally:
    # Ensure the browser is closed after scraping
    driver.quit()

# Display all collected URLs
print("Found URLs:")
for url in all_urls:
    print(url)

pd.DataFrame({'url':all_urls}).to_csv('CNN_Pal_Articles.csv', index=False)

https://edition.cnn.com/search?q=Palestine+or+Israel+or+Gaza&from=1550&size=10&page=1&sort=newest&types=all&section=
https://edition.cnn.com/search?q=Palestine+or+Israel+or+Gaza&from=1560&size=10&page=1&sort=newest&types=all&section=
https://edition.cnn.com/search?q=Palestine+or+Israel+or+Gaza&from=1570&size=10&page=1&sort=newest&types=all&section=
https://edition.cnn.com/search?q=Palestine+or+Israel+or+Gaza&from=1580&size=10&page=1&sort=newest&types=all&section=
https://edition.cnn.com/search?q=Palestine+or+Israel+or+Gaza&from=1590&size=10&page=1&sort=newest&types=all&section=
https://edition.cnn.com/search?q=Palestine+or+Israel+or+Gaza&from=1600&size=10&page=1&sort=newest&types=all&section=
https://edition.cnn.com/search?q=Palestine+or+Israel+or+Gaza&from=1610&size=10&page=1&sort=newest&types=all&section=
https://edition.cnn.com/search?q=Palestine+or+Israel+or+Gaza&from=1620&size=10&page=1&sort=newest&types=all&section=
https://edition.cnn.com/search?q=Palestine+or+Israel+or+Gaza&fro

In [2]:
df = pd.DataFrame({'url':all_urls})

# Drop rows with 'video' in the URL
df = df[~df['url'].str.contains('video')]
df = df[~df['url'].str.contains('opinion')]

df.reset_index(drop=True, inplace=True)

In [4]:
import pandas as pd
df = pd.read_csv('CNN_pal_Articles.csv')
# Drop rows with 'video' in the URL
df = df[~df['url'].str.contains('video')]
df = df[~df['url'].str.contains('opinion')]

df.reset_index(drop=True, inplace=True)

In [5]:
df

Unnamed: 0,url
0,https://www.cnn.com/2024/02/16/middleeast/isra...
1,https://www.cnn.com/2024/02/13/middleeast/egyp...
2,https://www.cnn.com/middleeast/live-news/israe...
3,https://www.cnn.com/2024/02/16/middleeast/egyp...
4,https://www.cnn.com/2024/02/15/middleeast/nass...
...,...
693,https://www.cnn.com/2023/06/15/middleeast/huwa...
694,https://www.cnn.com/2023/06/04/middleeast/isra...
695,https://www.cnn.com/2023/05/27/middleeast/sett...
696,https://www.cnn.com/2023/05/18/middleeast/jeru...


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Function to parse a CNN article page and extract title, author, content, and timestamp
def parse(html):
    soup = BeautifulSoup(html, features="html.parser")
    
    # Extract title
    title = return_text_if_not_none(soup.find('h1', {'class': 'headline__text'}))
    
    # Extract author
    author = soup.find('span', {'class': 'byline__name'})
    if not author:
        author = soup.find('span', {'class': 'byline__names'})
    author = return_text_if_not_none(author)
    
    # Extract content
    article_content = return_text_if_not_none(soup.find('div', {'class': 'article__content'}))
    
    # Extract timestamp
    timestamp = return_text_if_not_none(soup.find('div', {'class': 'timestamp'}))
    
    return {
        'title': title,
        'author': author,
        'content': article_content,
        'timestamp': timestamp
    }

# Helper function to return text if the element exists, otherwise an empty string
def return_text_if_not_none(element):
    return element.text.strip() if element else ''

# Initialize an empty list to collect all parsed article data
articles_data = []

# Loop through each URL to get and parse its content
for url in df['url']:
    print(url)
    response = requests.get(url)
    if response.status_code == 200:
        article_data = parse(response.text)
        article_data['url'] = url  # Add URL to the article data
        articles_data.append(article_data)
    else:
        print(f"Failed to retrieve {url}")

# Convert the list of article data to a DataFrame
a_df = pd.DataFrame(articles_data)

# Display or save the DataFrame
print(a_df)
# Optionally, save to a CSV file
a_df.to_csv("cnn_pal_articles_content.csv", index=False)


In [7]:
a_df['clean_content'] = a_df['content'].str.replace(r'\s+', ' ', regex=True)

In [8]:
a_df['clean_content']

0      CNN — Israel released more details on Friday a...
1      Editor’s Note: A version of this story appears...
2                                                       
3      CNN — Egypt is building a massive miles-wide b...
4      CNN — Israeli special forces raided Nasser Hos...
                             ...                        
693    CNN — When hundreds of Israeli settlers rampag...
694    Jerusalem/London CNN — Three Israeli soldiers ...
695    Jerusalem CNN — Israeli settler attacks on Pal...
696    Jerusalem CNN — Thousands of Israelis waving S...
697    Editor’s Note: A version of this story appears...
Name: clean_content, Length: 698, dtype: object

In [10]:
cnn_pal = pd.read_csv("cnn_pal_articles_content.csv")
cnn_ukr = pd.read_csv("cnn_ukr_articles_content.csv")

In [28]:
cnn_ukr['clean_content'] = cnn_ukr['content'].str.replace(r'\s+', ' ', regex=True)

In [29]:
cnn_ukr

Unnamed: 0,title,author,content,timestamp,url,clean_content
0,"Ukraine says it has sunk another warship, disa...",Victoria Butenko,CNN\n — \n \n\n\nUkraine claims it ...,"Updated\n 2:08 PM EST, Wed February 14,...",https://www.cnn.com/2024/02/14/europe/ukraine-...,CNN — Ukraine claims it has now disabled a thi...
1,How Ukraine seized the upper hand against Russ...,,CNN\n — \n \n\n\n Ukrain...,"Updated\n 11:39 AM EST, Wed February 14...",https://www.cnn.com/2024/02/14/europe/ukraine-...,CNN — Ukrainians have had little to celebrate ...
2,Russia can sustain war effort ‘for another two...,Christian Edwards,CNN\n — \n \n\n\nRussia can sustain...,"Published\n 11:00 AM EST, Wed February ...",https://www.cnn.com/2024/02/14/europe/russia-s...,CNN — Russia can sustain its war effort in Ukr...
3,‘Relief from everyday life’: How soccer is hel...,,CNN\n — \n \n\n\n Breath...,"Updated\n 9:52 AM EST, Wed February 14,...",https://www.cnn.com/2024/02/14/sport/ukraine-s...,"CNN — Breathing heavily, Oleksandr Malchevsky ..."
4,Russia places Estonia’s prime minister on want...,Sebastian Shukla,CNN\n — \n \n\n\n Russia...,"Updated\n 5:31 AM EST, Wed February 14,...",https://www.cnn.com/2024/02/13/europe/russia-e...,CNN — Russian authorities on Tuesday launched ...
...,...,...,...,...,...,...
779,How Ukraine turned the tables on Russia’s aeri...,Brad Lendon,CNN\n — \n \n\n\n It’s t...,"Updated\n 8:40 AM EDT, Wed May 17, 2023",https://www.cnn.com/2023/05/17/europe/ukraine-...,CNN — It’s the big question that has Russian m...
780,Ukraine’s first lady asks South Korea for air ...,Gawon Bae,"Seoul, South Korea\nCNN\n — \n \n\n...","Updated\n 4:45 AM EDT, Wed May 17, 2023",https://www.cnn.com/2023/05/17/asia/ukraine-ze...,"Seoul, South Korea CNN — Ukraine’s first lady ..."
781,"UK, Netherlands are working to procure F-16 fi...",Lauren Kent,London\nCNN\n — \n \n\n\n ...,"Updated\n 12:52 AM EDT, Wed May 17, 2023",https://www.cnn.com/2023/05/16/europe/uk-nethe...,London CNN — Britain and the Netherlands are w...
782,,Tara Subramaniam,,"Updated\n 12:04 AM EDT, Wed May 17, 2023",https://www.cnn.com/europe/live-news/russia-uk...,


In [30]:
# drop NaN values in title
cnn_ukr = cnn_ukr.dropna(subset=['title'])

In [31]:
cnn_ukr

Unnamed: 0,title,author,content,timestamp,url,clean_content
0,"Ukraine says it has sunk another warship, disa...",Victoria Butenko,CNN\n — \n \n\n\nUkraine claims it ...,"Updated\n 2:08 PM EST, Wed February 14,...",https://www.cnn.com/2024/02/14/europe/ukraine-...,CNN — Ukraine claims it has now disabled a thi...
1,How Ukraine seized the upper hand against Russ...,,CNN\n — \n \n\n\n Ukrain...,"Updated\n 11:39 AM EST, Wed February 14...",https://www.cnn.com/2024/02/14/europe/ukraine-...,CNN — Ukrainians have had little to celebrate ...
2,Russia can sustain war effort ‘for another two...,Christian Edwards,CNN\n — \n \n\n\nRussia can sustain...,"Published\n 11:00 AM EST, Wed February ...",https://www.cnn.com/2024/02/14/europe/russia-s...,CNN — Russia can sustain its war effort in Ukr...
3,‘Relief from everyday life’: How soccer is hel...,,CNN\n — \n \n\n\n Breath...,"Updated\n 9:52 AM EST, Wed February 14,...",https://www.cnn.com/2024/02/14/sport/ukraine-s...,"CNN — Breathing heavily, Oleksandr Malchevsky ..."
4,Russia places Estonia’s prime minister on want...,Sebastian Shukla,CNN\n — \n \n\n\n Russia...,"Updated\n 5:31 AM EST, Wed February 14,...",https://www.cnn.com/2024/02/13/europe/russia-e...,CNN — Russian authorities on Tuesday launched ...
...,...,...,...,...,...,...
778,White House not planning to ask for extra Ukra...,Natasha Bertrand,CNN\n — \n \n\n\n The Wh...,"Updated\n 9:19 AM EDT, Wed May 17, 2023",https://www.cnn.com/2023/05/16/politics/white-...,CNN — The White House says it is not currently...
779,How Ukraine turned the tables on Russia’s aeri...,Brad Lendon,CNN\n — \n \n\n\n It’s t...,"Updated\n 8:40 AM EDT, Wed May 17, 2023",https://www.cnn.com/2023/05/17/europe/ukraine-...,CNN — It’s the big question that has Russian m...
780,Ukraine’s first lady asks South Korea for air ...,Gawon Bae,"Seoul, South Korea\nCNN\n — \n \n\n...","Updated\n 4:45 AM EDT, Wed May 17, 2023",https://www.cnn.com/2023/05/17/asia/ukraine-ze...,"Seoul, South Korea CNN — Ukraine’s first lady ..."
781,"UK, Netherlands are working to procure F-16 fi...",Lauren Kent,London\nCNN\n — \n \n\n\n ...,"Updated\n 12:52 AM EDT, Wed May 17, 2023",https://www.cnn.com/2023/05/16/europe/uk-nethe...,London CNN — Britain and the Netherlands are w...


In [32]:
# extract date from url in format /yyyy/mm/dd/
cnn_ukr['date'] = cnn_ukr['url'].str.extract(r'(\d{4}/\d{2}/\d{2})')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cnn_ukr['date'] = cnn_ukr['url'].str.extract(r'(\d{4}/\d{2}/\d{2})')


In [33]:
cnn_ukr.rename(columns={'date':'Timestamp', 'title':'Title', 'clean_content':'Text'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cnn_ukr.rename(columns={'date':'Timestamp', 'title':'Title', 'clean_content':'Text'}, inplace=True)


In [34]:
cnn_ukr['location'] = 'Ukraine'
cnn_ukr['press'] = "CNN"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cnn_ukr['location'] = 'Ukraine'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cnn_ukr['press'] = "CNN"


In [None]:
cnn_ukr[['Timestamp','Title','Text','location','press']].to_csv('CNN_Ukr_clean.csv', index=False)

In [38]:
pal = pd.read_csv("CNN_Pal_clean.csv")
ukr = pd.read_csv("CNN_Ukr_clean.csv")

In [42]:
# Combine the two dataframes
CNNMerged = pd.concat([pal, ukr])

In [43]:
CNNMerged.to_csv('CNNMerged.csv', index=False)