# Exercise 4 : Scrape And Categorize News Articles From A JavaScript-Enabled News Site

In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pprint  # To tidy up
import pandas as pd

In [2]:
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # Run Chrome in headless mode
options.add_argument("--no-sandbox")  # Bypass OS security model
options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
driver = webdriver.Chrome(options=options)

In [3]:
# Function to scrape archive data from iranintl.com
def scrape_page(url):
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html')
    articles = soup.find_all('article')

    data = []
    for article in articles:
        # Find the h3 element containing the article name
        article_name = article.find('h3', class_='jsx-268a7bdb02dd195b card__headline')
        if article_name:
            article_name_text = article_name.get_text(strip=True)

        # Find the time element containing the datetime
        time_element = article.find('time')
        if time_element:
            datetime = time_element['datetime']

        data.append({'Article_name': article_name_text, 'Article_date': datetime})

    return data

In [4]:
# Function to scrape data from multiple pages
def scrape_multiple_pages(base_url, max_pages):
    all_data = []
    for page_number in range(1, max_pages + 1):
        url = f"{base_url}/page/{page_number}"
        data = scrape_page(url)
        all_data.extend(data)
    return all_data

In [5]:
# Define the base URL and the maximum number of pages to scrape
base_url = "https://www.iranintl.com/en/archive/iran-en"
max_pages = 100

In [6]:
# Scrape data from multiple pages
all_data = scrape_multiple_pages(base_url, max_pages)

In [7]:
# Create a DataFrame
df = pd.DataFrame(all_data)

In [8]:
df.head()

Unnamed: 0,Article_name,Article_date
0,Iran's Crackdown On Women Intensifies Under Co...,2024-04-15T17:31:34.573Z
1,Iran FM Warns Of Harsh Response If Israel Reta...,2024-04-15T16:32:48.941Z
2,Iran Ramps Up Oil Exports By Relying On Floati...,2024-04-15T14:47:06.637Z
3,"US Tried To Stop Tehran Attacking Israel, Says...",2024-04-15T14:37:21.120Z
4,Iranians Wary Of War With Israel,2024-04-15T12:43:13.689Z


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Article_name  3000 non-null   object
 1   Article_date  3000 non-null   object
dtypes: object(2)
memory usage: 47.0+ KB


In [10]:
df1 = df.copy()

In [24]:
# Convert 'Article_date' to datetime format
df1['Article_date'] = pd.to_datetime(df1['Article_date'])

In [25]:
df1.head()

Unnamed: 0,Article_name,Article_date
0,Iran's Crackdown On Women Intensifies Under Co...,2024-04-15
1,Iran FM Warns Of Harsh Response If Israel Reta...,2024-04-15
2,Iran Ramps Up Oil Exports By Relying On Floati...,2024-04-15
3,"US Tried To Stop Tehran Attacking Israel, Says...",2024-04-15
4,Iranians Wary Of War With Israel,2024-04-15


In [26]:
print(df1['Article_date'].dtype)

datetime64[ns]


In [27]:
df2 = df1.copy()
df2['Months'] = df2['Article_date'].dt.month_name()

In [28]:
df2.head()

Unnamed: 0,Article_name,Article_date,Months
0,Iran's Crackdown On Women Intensifies Under Co...,2024-04-15,April
1,Iran FM Warns Of Harsh Response If Israel Reta...,2024-04-15,April
2,Iran Ramps Up Oil Exports By Relying On Floati...,2024-04-15,April
3,"US Tried To Stop Tehran Attacking Israel, Says...",2024-04-15,April
4,Iranians Wary Of War With Israel,2024-04-15,April


In [35]:
article_counts_by_month = df2.groupby('Months').size().reset_index(name='Article_Count')
print(article_counts_by_month)

      Months  Article_Count
0      April            209
1     August            232
2   December            325
3   February            333
4    January            359
5       July            230
6      March            368
7   November            328
8    October            370
9  September            246
