<center>
<img src="https://laelgelcpublic.s3.sa-east-1.amazonaws.com/lael_50_years_narrow_white.png.no_years.400px_96dpi.png" width="300" alt="LAEL 50 years logo">
<h3>APPLIED LINGUISTICS GRADUATE PROGRAMME (LAEL)</h3>
</center>
<hr>

# Corpus Linguistics - Study 2 - Phase 2 - Elaine

## [TED Talks](https://www.ted.com/talks) data extraction

The data cover the period 2020 to 2025 (raw data extracted on 14/03/2025 at 11:38 am, Brasília time).

## Required Python packages

- beautifulsoup4
- lxml
- pandas
- requests

## Importing the required libraries

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import json
import os
import requests
import logging
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

## Defining input variables

In [2]:
# Output directories: the downloaded HTML pages and, presumably, the
# plain-text files derived from them (TODO confirm the use of `txt_dir`)
html_talks_dir, txt_dir = 'html_talks', 'txt'

# Tab-separated list of file IDs and TED Talks URLs to process
input_file = 'valid_urls'

# File that collects the log records written by this notebook
log_filename = 'cl_st2_elaine.log'

## Setting up logging

In [3]:
# Route every record of level INFO or above to the log file; each line
# carries the timestamp, the level name and the message.
logging.basicConfig(
    filename=log_filename,
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

## Creating output directories

In [5]:
# Make sure every output directory exists before anything is written to it.
# A directory that is already present is left untouched; a failure to create
# one (including the path being occupied by a regular file) is logged and
# stops the run with exit status 1.
for output_dir in (html_talks_dir, txt_dir):
    if os.path.isdir(output_dir):
        logging.info(f"Output directory {output_dir} already exists.")
        continue
    try:
        # exist_ok=True tolerates the directory appearing between the check
        # above and this call.
        os.makedirs(output_dir, exist_ok=True)
        logging.info(f"Output directory {output_dir} successfully created.")
    except OSError as e:
        # The exception goes through a %s placeholder so that it is actually
        # rendered in the log record.
        logging.error("Failed to create the %s directory: %s", output_dir, e)
        # SystemExit is raised directly so that no `sys` import is required.
        raise SystemExit(1)

## Importing the data into a DataFrame

In [6]:
# Load the tab-separated URL list (the file has no header row) and label
# its two columns in the same expression.
df_tedtalks_urls3 = (
    pd.read_csv(input_file, sep='\t', header=None)
    .set_axis(['File ID', 'TED Talks URL'], axis='columns')
)

In [7]:
# Store the file IDs as text so that they can be zero-padded afterwards
df_tedtalks_urls3['File ID'] = df_tedtalks_urls3['File ID'].astype(str)

In [8]:
# Left-pad every file ID with zeros up to a width of 6 characters
df_tedtalks_urls3['File ID'] = df_tedtalks_urls3['File ID'].str.zfill(width=6)

In [9]:
# Inspect the column dtypes after the string conversion
df_tedtalks_urls3.dtypes

File ID          object
TED Talks URL    object
dtype: object

In [10]:
# Display the table of file IDs and TED Talks URLs
df_tedtalks_urls3

Unnamed: 0,File ID,TED Talks URL
0,000001,https://www.ted.com/talks/alex_gendler_a_brief...
1,000002,https://www.ted.com/talks/alex_gendler_a_day_i...
2,000003,https://www.ted.com/talks/alex_gendler_can_you...
3,000004,https://www.ted.com/talks/andrew_marantz_insid...
4,000005,https://www.ted.com/talks/anne_f_broadbridge_t...
...,...,...
3974,003975,https://www.ted.com/talks/steve_truglia_a_leap...
3975,003976,https://www.ted.com/talks/stewart_brand_procla...
3976,003977,https://www.ted.com/talks/tom_wujec_on_3_ways_...
3977,003978,https://www.ted.com/talks/vishal_vaid_s_hypnot...


## Getting the `TED Talks` URLs

In [None]:
# Retry mechanism setup
retry_strategy = Retry(
    total=3,  # Retry up to 3 times
    backoff_factor=2,  # Exponential backoff between attempts (exact delays depend on the urllib3 version)
    status_forcelist=[429, 500, 502, 503, 504],  # Retry on these HTTP error codes
)
adapter = HTTPAdapter(max_retries=retry_strategy)

# (connect, read) timeouts in seconds. requests does not time out on its own,
# so without them a single stalled connection would block the whole run.
request_timeout = (10, 60)

# The context manager releases the pooled connections once the loop is done.
with requests.Session() as http:
    http.mount("https://", adapter)

    # Walk through the file IDs and their URLs in parallel
    for file_id, url in zip(df_tedtalks_urls3['File ID'], df_tedtalks_urls3['TED Talks URL']):
        try:
            # Log the start of the request
            logging.info(f"Fetching HTML for File ID: {file_id} from URL: {url}")

            # Fetch the HTML content from the URL
            response = http.get(url, timeout=request_timeout)
            response.raise_for_status()  # Raise an exception for HTTP errors

            # Save the HTML content to a file in the 'html_talks' directory
            file_path = os.path.join(html_talks_dir, f"{file_id}.html")
            with open(file_path, 'w', encoding='utf-8') as html_file:
                html_file.write(response.text)

            # Log success
            logging.info(f"Successfully saved HTML for File ID: {file_id} to {file_path}")

        except requests.exceptions.RequestException as e:
            # Timeouts, exhausted retries and HTTP errors all end up here;
            # the failure is logged and the loop continues with the next URL.
            logging.error(f"Failed to fetch HTML for File ID: {file_id} from URL: {url}: {e}")

### Adapting the programme for the command line

The programme was named `geturls.py`.

In [None]:
import pandas as pd
import os
import requests
import logging
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

def main():
    """Download the HTML page of every TED Talk listed in the input file.

    Reads the tab-separated file 'valid_urls' (no header row; the two columns
    are a file ID and a URL), zero-pads each file ID to 6 characters and
    saves the page fetched from each URL as 'html_talks/<file ID>.html'.
    Progress and failures are written to 'cl_st2_elaine.log'. A failed
    request is logged and the loop moves on to the next URL.

    Raises:
        SystemExit: with status 1 if the output directory cannot be created.
    """
    # Defining input variables
    html_talks_dir = 'html_talks'
    input_file = 'valid_urls'
    log_filename = 'cl_st2_elaine.log'
    # (connect, read) timeouts in seconds. requests does not time out on its
    # own, so without them a single stalled connection would block the run.
    request_timeout = (10, 60)

    # Setting up logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        filename=log_filename
    )

    # Creating output directories
    if os.path.isdir(html_talks_dir):
        logging.info(f"Output directory {html_talks_dir} already exists.")
    else:
        try:
            # exist_ok=True tolerates the directory appearing between the
            # check above and this call.
            os.makedirs(html_talks_dir, exist_ok=True)
            logging.info(f"Output directory {html_talks_dir} successfully created.")
        except OSError as e:
            # The exception goes through a %s placeholder so that it is
            # actually rendered in the log record.
            logging.error("Failed to create the %s directory: %s", html_talks_dir, e)
            # SystemExit is raised directly so that no `sys` import is required.
            raise SystemExit(1)

    # Importing the data into a DataFrame
    df_tedtalks_urls3 = pd.read_csv(input_file, sep='\t', header=None)
    df_tedtalks_urls3.columns = ['File ID', 'TED Talks URL']

    # Ensure the 'File ID' column is treated as strings
    df_tedtalks_urls3['File ID'] = df_tedtalks_urls3['File ID'].astype('str')

    # Pad the values in the 'File ID' column with leading zeros to make them 6 digits
    df_tedtalks_urls3['File ID'] = df_tedtalks_urls3['File ID'].str.zfill(6)

    # Getting the `TED Talks` URLs
    # Retry mechanism setup
    retry_strategy = Retry(
        total=3,  # Retry up to 3 times
        backoff_factor=2,  # Exponential backoff between attempts (exact delays depend on the urllib3 version)
        status_forcelist=[429, 500, 502, 503, 504],  # Retry on these HTTP error codes
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)

    # The context manager releases the pooled connections once the loop is done.
    with requests.Session() as http:
        http.mount("https://", adapter)

        # Walk through the file IDs and their URLs in parallel
        for file_id, url in zip(df_tedtalks_urls3['File ID'], df_tedtalks_urls3['TED Talks URL']):
            try:
                # Log the start of the request
                logging.info(f"Fetching HTML for File ID: {file_id} from URL: {url}")

                # Fetch the HTML content from the URL
                response = http.get(url, timeout=request_timeout)
                response.raise_for_status()  # Raise an exception for HTTP errors

                # Save the HTML content to a file in the 'html_talks' directory
                file_path = os.path.join(html_talks_dir, f"{file_id}.html")
                with open(file_path, 'w', encoding='utf-8') as html_file:
                    html_file.write(response.text)

                # Log success
                logging.info(f"Successfully saved HTML for File ID: {file_id} to {file_path}")

            except requests.exceptions.RequestException as e:
                # Timeouts, exhausted retries and HTTP errors all end up
                # here; the failure is logged and the loop continues.
                logging.error(f"Failed to fetch HTML for File ID: {file_id} from URL: {url}: {e}")

# Run the download only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()