<center>
<img src="https://laelgelcpublic.s3.sa-east-1.amazonaws.com/lael_50_years_narrow_white.png.no_years.400px_96dpi.png" width="300" alt="LAEL 50 years logo">
<h3>APPLIED LINGUISTICS GRADUATE PROGRAMME (LAEL)</h3>
</center>
<hr>

# Corpus Linguistics - Study 2 - Phase 1 - Arianne

## Required Python packages

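- `requests`
- `beautifulsoup4` (with the `lxml` parser)
- `pandas`
- `tqdm`
- `selenium` (with a matching Microsoft Edge WebDriver, `msedgedriver`)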

## Importing the required libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
import sys
import time
import logging
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.edge.options import Options

## Define input variables

In [2]:
# This phase reads from and writes to the same study folder.
input_directory = output_directory = 'cl_st2_ph1_arianne'

## Create output directory

In [3]:
# Make sure the output directory is available before anything is written to it:
# create it when missing, otherwise leave the existing one untouched.
if not os.path.exists(output_directory):
    try:
        os.makedirs(output_directory)
    except OSError as error:
        # Nothing downstream can work without the directory, so stop here.
        print('Failed to create the directory:', error)
        sys.exit(1)
    else:
        print('Output directory successfully created.')
else:
    print('Output directory already exists.')

Output directory already exists.


## Set up logging

In [4]:
# The log file is placed inside the output directory and named after it.
log_filename = f"{output_directory}/{output_directory}.log"

In [5]:
# Route INFO-and-above records to the log file, one line per record in the
# form "<timestamp> - <level> - <message>".
logging.basicConfig(
    filename=log_filename,
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

## Functions

### Create output subdirectories

In [6]:
def create_directory(path):
    """Ensure that ``path`` exists, reporting the outcome on standard output.

    Args:
        path: Directory to create; missing parent directories are created too.

    Prints a confirmation when the directory is created or already present.
    If creation fails with ``OSError`` the error is printed and the
    interpreter is asked to exit with status 1.
    """
    if os.path.exists(path):
        print(f"Directory already exists: {path}")
        return

    try:
        os.makedirs(path)
    except OSError as error:
        print(f"Failed to create the {path} directory: {error}")
        sys.exit(1)
    print(f"Successfully created the directory: {path}")

### Scrape web pages

In [7]:
def scrape_html(url, driver_path=None, max_wait_time=30, poll_interval=2):
    """Load a web page in headless Microsoft Edge and return its rendered HTML.

    A fresh WebDriver session is started for every call and is always closed
    before returning. After navigation the page source is polled until two
    consecutive snapshots are identical or ``max_wait_time`` seconds have
    passed, so that content injected by JavaScript has a chance to appear.

    Args:
        url: Address of the page to load.
        driver_path: Path to the ``msedgedriver`` executable. When omitted,
            the ``MSEDGEDRIVER_PATH`` environment variable is used if it is
            set; otherwise the macOS location this notebook was written
            against is used.
        max_wait_time: Upper bound, in seconds, on the time spent waiting for
            the page source to stop changing.
        poll_interval: Pause, in seconds, between two page-source snapshots.

    Returns:
        The page source as a string, or ``None`` when loading the page raised
        an exception (the error is written to the log).

    Raises:
        Whatever ``webdriver.Edge`` raises if the browser session cannot be
        started; that failure is not caught here.
    """
    # Resolve the driver location without editing the code on each machine
    # (Windows, macOS and Linux installations each keep it somewhere else).
    if driver_path is None:
        driver_path = os.environ.get(
            'MSEDGEDRIVER_PATH',
            '/Users/eyamrog/msedgedriver/edgedriver_mac64/msedgedriver',
        )
    service = Service(driver_path)

    # Configure Edge to run headless
    options = Options()
    # For modern Edge/Chromium; if incompatible with your version, try "--headless"
    options.add_argument('--headless=new')
    options.add_argument('--disable-gpu')
    options.add_argument('--window-size=1920,1080')

    driver = webdriver.Edge(service=service, options=options)
    html = None
    try:
        driver.get(url)

        # Poll until the DOM stops changing; a monotonic clock keeps the
        # timeout immune to system clock adjustments.
        deadline = time.monotonic() + max_wait_time
        previous_html = ''

        while True:
            current_html = driver.page_source
            if current_html == previous_html or time.monotonic() > deadline:
                break
            previous_html = current_html
            time.sleep(poll_interval)

        html = driver.page_source  # Capture the settled page source
    except Exception as e:
        logging.error(f"Error scraping {url}: {e}")
    finally:
        # Always close the WebDriver session, even after an error
        driver.quit()

    return html

In [8]:
def scrape_html_docs2(df, path):
    """Download every page listed in ``df`` and store it as ``<ID>.html`` under ``path``.

    Args:
        df: DataFrame providing a 'URL' and an 'ID' column; one page is
            fetched per row through ``scrape_html``.
        path: Destination directory, created first when it does not exist.

    Rows for which ``scrape_html`` returns nothing are skipped without
    writing a file. If the destination directory cannot be created the error
    is logged and the interpreter is asked to exit with status 1.
    """
    if not os.path.exists(path):
        try:
            os.makedirs(path)
        except OSError as error:
            logging.error(f"Failed to create the {path} directory: {error}")
            sys.exit(1)

    progress = tqdm(df.iterrows(), total=len(df), desc="Scraping HTML documents")
    for _, record in progress:
        page_url = record['URL']
        target_file = os.path.join(path, f"{record['ID']}.html")

        html = scrape_html(page_url)
        if not html:
            # Nothing usable came back for this row; move on to the next one.
            continue

        with open(target_file, 'w', encoding='utf-8') as handle:
            handle.write(html)
        logging.info(f"Saved: {target_file}")

## Scraping [Greenpeace Stories](https://www.greenpeace.org/international/story/)

### Define local variables

In [9]:
# Identifier used as the name of the output subdirectory for this section.
# NOTE(review): `id` shadows Python's built-in id() for the rest of the notebook.
id = 'grp'
path = os.path.join(output_directory, id)

### Create output subdirectory

In [10]:
# Make sure the subdirectory exists before the sample pages are written into it.
create_directory(path)

Directory already exists: cl_st2_ph1_arianne/grp


### Capture a few document pages for inspection

In [11]:
# Each sample pairs the local file name with the URL it is captured from.
# Samples 1, 2 and 3 are paginated story index pages; samples 11, 21 and 31
# are individual story pages.
filename_sample_1, url_sample_1 = (
    'greenpeace_stories_sample1.html',
    'https://www.greenpeace.org/international/story/page/1/',
)
filename_sample_11, url_sample_11 = (
    'greenpeace_stories_sample11.html',
    'https://www.greenpeace.org/international/story/77736/from-hiroshima-to-gaza-defending-peace/',
)
filename_sample_2, url_sample_2 = (
    'greenpeace_stories_sample2.html',
    'https://www.greenpeace.org/international/story/page/2/',
)
filename_sample_21, url_sample_21 = (
    'greenpeace_stories_sample21.html',
    'https://www.greenpeace.org/international/story/77406/boots-to-boost-justice-standing-in-solidarity-with-indonesian-migrant-fishers/',
)
filename_sample_3, url_sample_3 = (
    'greenpeace_stories_sample3.html',
    'https://www.greenpeace.org/international/story/page/3/',
)
filename_sample_31, url_sample_31 = (
    'greenpeace_stories_sample31.html',
    'https://www.greenpeace.org/international/story/76810/vanishing-millet-fields-endangered-sparrows-the-climate-crisis-and-taiwans-forgotten-guardians/',
)

In [12]:
document_page_sample_1 = scrape_html(url_sample_1)

# scrape_html returns None when the page could not be loaded; writing None would
# raise a TypeError and leave an empty file behind, so only real content is saved.
if document_page_sample_1 is not None:
    with open(f'{path}/{filename_sample_1}', 'w', encoding='utf8', newline='\n') as file:
        file.write(document_page_sample_1)
else:
    print(f'No HTML captured for {url_sample_1}; sample file not written.')

In [13]:
document_page_sample_11 = scrape_html(url_sample_11)

# scrape_html returns None when the page could not be loaded; writing None would
# raise a TypeError and leave an empty file behind, so only real content is saved.
if document_page_sample_11 is not None:
    with open(f'{path}/{filename_sample_11}', 'w', encoding='utf8', newline='\n') as file:
        file.write(document_page_sample_11)
else:
    print(f'No HTML captured for {url_sample_11}; sample file not written.')

In [14]:
document_page_sample_2 = scrape_html(url_sample_2)

# scrape_html returns None when the page could not be loaded; writing None would
# raise a TypeError and leave an empty file behind, so only real content is saved.
if document_page_sample_2 is not None:
    with open(f'{path}/{filename_sample_2}', 'w', encoding='utf8', newline='\n') as file:
        file.write(document_page_sample_2)
else:
    print(f'No HTML captured for {url_sample_2}; sample file not written.')

In [15]:
document_page_sample_21 = scrape_html(url_sample_21)

# scrape_html returns None when the page could not be loaded; writing None would
# raise a TypeError and leave an empty file behind, so only real content is saved.
if document_page_sample_21 is not None:
    with open(f'{path}/{filename_sample_21}', 'w', encoding='utf8', newline='\n') as file:
        file.write(document_page_sample_21)
else:
    print(f'No HTML captured for {url_sample_21}; sample file not written.')

In [16]:
document_page_sample_3 = scrape_html(url_sample_3)

# scrape_html returns None when the page could not be loaded; writing None would
# raise a TypeError and leave an empty file behind, so only real content is saved.
if document_page_sample_3 is not None:
    with open(f'{path}/{filename_sample_3}', 'w', encoding='utf8', newline='\n') as file:
        file.write(document_page_sample_3)
else:
    print(f'No HTML captured for {url_sample_3}; sample file not written.')

In [17]:
document_page_sample_31 = scrape_html(url_sample_31)

# scrape_html returns None when the page could not be loaded; writing None would
# raise a TypeError and leave an empty file behind, so only real content is saved.
if document_page_sample_31 is not None:
    with open(f'{path}/{filename_sample_31}', 'w', encoding='utf8', newline='\n') as file:
        file.write(document_page_sample_31)
else:
    print(f'No HTML captured for {url_sample_31}; sample file not written.')

### Scraping the articles

In [14]:
# Label for the material collected in this section
source = 'Greenpeace'

# An index page URL is assembled as index_page_url_1 + <page number> + index_page_url_2
index_page_url_1, index_page_url_2 = 'https://www.greenpeace.org/international/story/page/', '/'

# First and last index page to visit (both inclusive)
start_page, end_page = 1, 136

In [None]:
# Collect the metadata of every story listed on the selected index pages.
# NOTE(review): in the visible cell order scrape_articles is defined only after this
# call, so this cell would fail on a fresh kernel unless the definition is run first.
df_grp = scrape_articles(source, index_page_url_1, index_page_url_2, start_page, end_page)

In [6]:
def scrape_articles(source, index_page_url_1, index_page_url_2, start_page, end_page):
    """Collect article metadata from a range of paginated index pages.

    Each index page URL is built as ``index_page_url_1 + <page number> +
    index_page_url_2`` and fetched with ``scrape_html``. Within the element
    whose id is ``listing-page-content`` the ``wp-block-post-template`` list
    is located, and every ``<li>`` that carries a ``query-list-item-body``
    block contributes one row.

    Args:
        source: Label stored in the 'Source' column of every row.
        index_page_url_1: URL fragment placed before the page number.
        index_page_url_2: URL fragment placed after the page number.
        start_page: First index page to visit (inclusive).
        end_page: Last index page to visit (inclusive).

    Returns:
        A pandas DataFrame with the columns 'Source', 'Post Term',
        'Post Tags', 'Title' and 'URL'. A field that is missing from the
        markup is stored as an empty string. Index pages that cannot be
        loaded, or that contain no article list, are logged and skipped.
    """
    data = []

    for page_number in tqdm(range(start_page, end_page + 1)):
        index_url = f"{index_page_url_1}{page_number}{index_page_url_2}"

        index_page = scrape_html(index_url)
        if not index_page:
            # scrape_html has already logged the underlying error
            logging.warning(f"Skipping index page (no HTML returned): {index_url}")
            continue

        # Parse page source with BeautifulSoup
        soup = BeautifulSoup(index_page, 'lxml')

        # Locate the article list inside the listing page content
        listing_page_content = soup.find('div', id='listing-page-content')
        post_list = (
            listing_page_content.find('ul', class_='wp-block-post-template')
            if listing_page_content else None
        )
        if post_list is None:
            logging.warning(f"No article list found on index page: {index_url}")
            continue

        for item in post_list.find_all('li'):
            # Entries without an item body (e.g. nested <li> elements) hold no
            # article data; skipping them also prevents values from a previous
            # item being reused.
            body = item.find('div', class_='query-list-item-body')
            if body is None:
                continue

            # Extract the post term
            post_term = body.find('div', class_='wp-block-post-terms')
            post_term_text = (
                ' '.join(post_term.get_text(' ', strip=True).split()) if post_term else ''
            )

            # Extract the post tags
            post_tags = body.find('div', class_='taxonomy-post_tag wp-block-post-terms')
            post_tags_text = (
                ", ".join(a.get_text(strip=True) for a in post_tags.select('a[rel="tag"]'))
                if post_tags else ''
            )

            # Extract the title
            headline = body.find('h4', class_='query-list-item-headline wp-block-post-title')
            title_text = (
                ' '.join(headline.get_text(' ', strip=True).split()) if headline else ''
            )

            # Extract the article URL from the headline link
            anchor = headline.find('a') if headline else None
            article_url = anchor.get('href', '') if anchor else ''

            # Append extracted data
            data.append({
                'Source': source,
                'Post Term': post_term_text,
                'Post Tags': post_tags_text,
                'Title': title_text,
                'URL': article_url
            })

    return pd.DataFrame(data)

In [20]:
# NOTE(review): this cell appears to have been carried over from a different
# scraper (its result is named df_nature_food). It relies on article_list_url,
# root_url and area_of_knowledge, none of which are defined in the visible part
# of this notebook -- confirm whether the cell still belongs here.

# Initialize an empty list to store the data
data = []

# Iterate through the URLs and using 'tqdm' for progress tracking in the range loop
for i in tqdm(range(start_page, end_page + 1)):
    url = f"{article_list_url}{i}"
    # A timeout keeps one unresponsive request from stalling the whole loop
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.content, 'lxml')

    # Find all <article> elements with the class 'u-full-height c-card c-card--flush'
    articles = soup.find_all('article', class_='u-full-height c-card c-card--flush')

    for article in articles:
        # Extracting the title; a card without the expected heading yields
        # empty fields instead of raising AttributeError
        heading = article.find('h3', class_='c-card__title')
        title_tag = heading.find('a') if heading else None
        title = title_tag.get_text(strip=True) if title_tag else ''
        title_url = f"{root_url}{title_tag.get('href')}" if title_tag else ''

        # Extracting the authors
        author_tags = article.find_all('li', itemprop='creator')
        authors = ', '.join(author.get_text(strip=True) for author in author_tags)

        # Extracting the published date
        date_tag = article.find('time')
        date_published = date_tag['datetime'] if date_tag else ''

        # Extracting the PDF URL
        pdf_url = f"{title_url}.pdf" if title_url else ''

        # Extracting the 'Open Access' label
        open_access_tag = article.find('span', class_='u-color-open-access')
        open_access = open_access_tag.get_text(strip=True) if open_access_tag else ''

        # Appending the data to the list
        data.append({
            'Title': title,
            'URL': title_url,
            'Authors (compact list)': authors,
            'Published': date_published,
            'PDF URL': pdf_url,
            'Open Access': open_access,
            'Area of Knowledge': area_of_knowledge
        })

# Creating a DataFrame from the data
df_nature_food = pd.DataFrame(data)

100%|██████████| 15/15 [00:18<00:00,  1.22s/it]
