# Exercise 4: Scrape and Categorize News Articles from a JavaScript-Enabled News Site

In [None]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pprint  # To tidy up
import pandas as pd

In [None]:
# Headless Chrome session; `driver` is the module-level browser handle
options = webdriver.ChromeOptions()
for _chrome_flag in (
    '--headless',               # Run Chrome without a visible window
    '--no-sandbox',             # Bypass OS security model
    '--disable-dev-shm-usage',  # Overcome limited resource problems
):
    options.add_argument(_chrome_flag)
driver = webdriver.Chrome(options=options)

In [None]:
# Function to scrape archive data from iranintl.com
def scrape_page(url):
    """Scrape article headlines and timestamps from one archive page.

    Loads ``url`` in the module-level Selenium ``driver``, parses the page
    source with BeautifulSoup and builds one record per ``<article>`` element.

    Args:
        url: Address of the archive page to load.

    Returns:
        A list of dicts with the keys ``'Article_name'`` (stripped text of the
        headline ``<h3>``) and ``'Article_date'`` (raw ``datetime`` attribute
        of the ``<time>`` element). A value is ``None`` when the article has
        no matching element or attribute.
    """
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html')

    data = []
    for article in soup.find_all('article'):
        # Match on the `card__headline` class alone. The `jsx-...` token used
        # previously looks like a build-generated hash (assumption), and an
        # exact match on the full class string breaks as soon as it changes.
        headline = article.find('h3', class_='card__headline')
        time_element = article.find('time')

        # Resolve both fields afresh for every article so a card without a
        # headline or timestamp yields None instead of raising on the first
        # article or silently reusing the previous article's values.
        data.append({
            'Article_name': headline.get_text(strip=True) if headline else None,
            'Article_date': time_element.get('datetime') if time_element else None,
        })

    return data

In [None]:
# Function to scrape data from multiple pages
def scrape_multiple_pages(base_url, max_pages):
    """Collect the records of archive pages 1 through ``max_pages``.

    Each page is fetched from ``{base_url}/page/{n}`` via ``scrape_page`` and
    the per-page lists are concatenated in page order.

    Args:
        base_url: Archive URL that the ``/page/<n>`` suffix is appended to.
        max_pages: Number of the last page to fetch (inclusive).

    Returns:
        A single flat list holding every record returned by ``scrape_page``.
    """
    page_urls = (f"{base_url}/page/{n}" for n in range(1, max_pages + 1))
    return [record for page_url in page_urls for record in scrape_page(page_url)]

In [None]:
# Crawl configuration: how many numbered archive pages to fetch, and the
# archive address the page number gets appended to
max_pages = 100
base_url = "https://www.iranintl.com/en/archive/iran-en"

In [None]:
# Crawl every configured archive page and gather the article records
all_data = scrape_multiple_pages(base_url=base_url, max_pages=max_pages)

In [None]:
# Tabulate the scraped records: one row per record, one column per dict key
df = pd.DataFrame(data=all_data)

In [None]:
# Preview the first rows of the scraped table
df.head()

In [None]:
# Print a summary of the table (columns, dtypes, non-null counts)
df.info()

In [None]:
# Work on an independent copy so later edits leave `df` untouched
df1 = df.copy(deep=True)

In [None]:
# Convert 'Article_date' to datetime format, truncated to the calendar day.
# The column holds the raw strings taken from the <time> elements, so it must
# be parsed BEFORE the `.dt` accessor can be used (calling `.dt` on a string
# column raises AttributeError). `utc=True` gives one consistent dtype even if
# the strings carry differing UTC offsets; the timezone is then dropped and the
# time-of-day zeroed, leaving naive midnight timestamps (dates are UTC dates).
parsed_dates = pd.to_datetime(df1['Article_date'], utc=True)
df1['Article_date'] = parsed_dates.dt.tz_localize(None).dt.normalize()

# Create a new column with month names (missing dates stay missing)
df1['Month'] = df1['Article_date'].dt.strftime('%B')

In [None]:
# Preview the first rows of df1
df1.head()