<center>
<img src="https://laelgelcpublic.s3.sa-east-1.amazonaws.com/lael_50_years_narrow_white.png.no_years.400px_96dpi.png" width="300" alt="LAEL 50 years logo">
<h3>APPLIED LINGUISTICS GRADUATE PROGRAMME (LAEL)</h3>
</center>
<hr>

# Corpus Linguistics - Study 2 - Phase 2 - Elaine

## [TED Talks](https://www.ted.com/talks) data extraction

Covers talks from the period 2020 to 2025 (raw data extracted on 14/03/2025 at 11:38 am, Brasília time).

## Required Python packages

- beautifulsoup4
- lxml
- pandas
- requests

## Importing the required libraries

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import json
import os
import requests
import logging
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

## Defining input variables

In [2]:
# Input: tab-separated list of (File ID, TED Talks URL) pairs
input_file = 'valid_urls'

# Outputs: directories created further down, and the log file
html_talks_dir, txt_dir = 'html_talks', 'txt'
log_filename = 'cl_st2_elaine.log'

## Setting up logging

In [3]:
# Send every message of level INFO or above to the log file,
# each one prefixed with a timestamp and its severity.
log_format = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(filename=log_filename, format=log_format, level=logging.INFO)

## Creating output directories

In [5]:
# Make sure every output directory exists before anything is written to it.
# An existing directory is left untouched; a creation failure is logged and stops the run.
for output_dir in (html_talks_dir, txt_dir):
    if os.path.exists(output_dir):
        logging.info(f"Output directory {output_dir} already exists.")
        continue

    try:
        os.makedirs(output_dir)
        logging.info(f"Output directory {output_dir} successfully created.")
    except OSError as e:
        # The exception is interpolated into the message itself: passing it as an
        # extra positional argument makes `logging` treat it as a %-format argument,
        # and with no placeholder in the message the record fails to format.
        logging.error(f"Failed to create the {output_dir} directory: {e}")
        # Equivalent to sys.exit(1), without needing `sys` (which this notebook never imports).
        raise SystemExit(1)

## Importing the data into a DataFrame

In [6]:
# Load the header-less, tab-separated URL list and label its two columns
url_table_columns = ['File ID', 'TED Talks URL']
df_tedtalks_urls3 = pd.read_csv(input_file, header=None, sep='\t')
df_tedtalks_urls3.columns = url_table_columns

In [7]:
# Store the file identifiers as text rather than as numbers
file_ids = df_tedtalks_urls3['File ID']
df_tedtalks_urls3['File ID'] = file_ids.astype('str')

In [8]:
# Left-pad every identifier with zeros up to six characters (e.g. '1' -> '000001')
padded_file_ids = df_tedtalks_urls3['File ID'].str.zfill(width=6)
df_tedtalks_urls3['File ID'] = padded_file_ids

In [9]:
# Inspect the column dtypes; after the string conversion both columns are `object`.
df_tedtalks_urls3.dtypes

File ID          object
TED Talks URL    object
dtype: object

In [10]:
# Preview the loaded URL table (pandas abbreviates the display of long frames).
df_tedtalks_urls3

Unnamed: 0,File ID,TED Talks URL
0,000001,https://www.ted.com/talks/alex_gendler_a_brief...
1,000002,https://www.ted.com/talks/alex_gendler_a_day_i...
2,000003,https://www.ted.com/talks/alex_gendler_can_you...
3,000004,https://www.ted.com/talks/andrew_marantz_insid...
4,000005,https://www.ted.com/talks/anne_f_broadbridge_t...
...,...,...
3974,003975,https://www.ted.com/talks/steve_truglia_a_leap...
3975,003976,https://www.ted.com/talks/stewart_brand_procla...
3976,003977,https://www.ted.com/talks/tom_wujec_on_3_ways_...
3977,003978,https://www.ted.com/talks/vishal_vaid_s_hypnot...


## Getting the `TED Talks` URLs

In [None]:
# Retry mechanism setup
retry_strategy = Retry(
    total=3,  # Retry up to 3 times
    backoff_factor=2,  # Exponential backoff: the wait doubles after each failed attempt
    status_forcelist=[429, 500, 502, 503, 504],  # Retry on these HTTP error codes
)
adapter = HTTPAdapter(max_retries=retry_strategy)

# (connect, read) timeouts in seconds. Without a timeout a stalled connection
# would block the whole download run indefinitely.
request_timeout = (10, 60)

# The session is used as a context manager so its connections are released
# once every URL has been processed.
with requests.Session() as http:
    http.mount("https://", adapter)

    # Loop through the DataFrame rows
    for _, row in df_tedtalks_urls3.iterrows():
        file_id = row['File ID']  # Get the File ID
        url = row['TED Talks URL']  # Get the URL

        try:
            # Log the start of the request
            logging.info(f"Fetching HTML for File ID: {file_id} from URL: {url}")

            # Fetch the HTML content from the URL
            response = http.get(url, timeout=request_timeout)
            response.raise_for_status()  # Raise an exception for HTTP errors

            # Save the HTML content to a file in the 'html_talks' directory
            file_path = os.path.join(html_talks_dir, f"{file_id}.html")
            with open(file_path, 'w', encoding='utf-8') as html_file:
                html_file.write(response.text)

            # Log success
            logging.info(f"Successfully saved HTML for File ID: {file_id} to {file_path}")

        except requests.exceptions.RequestException as e:
            # A failed download (including a timeout) is logged and the loop
            # moves on to the next URL.
            logging.error(f"Failed to fetch HTML for File ID: {file_id} from URL: {url}: {e}")

### Adapting the programme for command line

The programme was named `geturls.py`.

In [None]:
import pandas as pd
import os
import requests
import logging
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

def main():
    """Download the HTML page of every TED Talks URL listed in `valid_urls`.

    Reads the header-less, tab-separated file `valid_urls` (File ID, URL),
    zero-pads each File ID to six characters and saves each page as
    `html_talks/<File ID>.html`. Progress and failures are written to
    `cl_st2_elaine.log`.

    A failed download is logged and skipped.

    Raises:
        SystemExit: with status 1 when the output directory cannot be created.
    """
    # Defining input variables
    html_talks_dir = 'html_talks'
    input_file = 'valid_urls'
    log_filename = 'cl_st2_elaine.log'
    # (connect, read) timeouts in seconds; without a timeout a stalled
    # connection would block the whole run indefinitely.
    request_timeout = (10, 60)

    # Setting up logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        filename=log_filename
    )

    # Creating output directories
    if os.path.exists(html_talks_dir):
        logging.info(f"Output directory {html_talks_dir} already exists.")
    else:
        try:
            os.makedirs(html_talks_dir)
            logging.info(f"Output directory {html_talks_dir} successfully created.")
        except OSError as e:
            # The exception goes inside the message: passed as an extra positional
            # argument it would be treated as a %-format argument and the record
            # would fail to format.
            logging.error(f"Failed to create the {html_talks_dir} directory: {e}")
            # Equivalent to sys.exit(1), without needing `sys` (not imported by this script).
            raise SystemExit(1)

    # Importing the data into a DataFrame
    df_tedtalks_urls3 = pd.read_csv(input_file, sep='\t', header=None)
    df_tedtalks_urls3.columns = ['File ID', 'TED Talks URL']

    # Ensure the 'File ID' column is treated as strings
    df_tedtalks_urls3['File ID'] = df_tedtalks_urls3['File ID'].astype('str')

    # Pad the values in the 'File ID' column with leading zeros to make them 6 digits
    df_tedtalks_urls3['File ID'] = df_tedtalks_urls3['File ID'].str.zfill(6)

    # Getting the `TED Talks` URLs
    # Retry mechanism setup
    retry_strategy = Retry(
        total=3,  # Retry up to 3 times
        backoff_factor=2,  # Exponential backoff: the wait doubles after each failed attempt
        status_forcelist=[429, 500, 502, 503, 504],  # Retry on these HTTP error codes
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)

    # The session is closed automatically once all URLs have been processed.
    with requests.Session() as http:
        http.mount("https://", adapter)

        # Loop through the DataFrame rows
        for _, row in df_tedtalks_urls3.iterrows():
            file_id = row['File ID']  # Get the File ID
            url = row['TED Talks URL']  # Get the URL

            try:
                # Log the start of the request
                logging.info(f"Fetching HTML for File ID: {file_id} from URL: {url}")

                # Fetch the HTML content from the URL
                response = http.get(url, timeout=request_timeout)
                response.raise_for_status()  # Raise an exception for HTTP errors

                # Save the HTML content to a file in the 'html_talks' directory
                file_path = os.path.join(html_talks_dir, f"{file_id}.html")
                with open(file_path, 'w', encoding='utf-8') as html_file:
                    html_file.write(response.text)

                # Log success
                logging.info(f"Successfully saved HTML for File ID: {file_id} to {file_path}")

            except requests.exceptions.RequestException as e:
                # A failed download (including a timeout) is logged and skipped
                logging.error(f"Failed to fetch HTML for File ID: {file_id} from URL: {url}: {e}")

if __name__ == "__main__":
    main()

## Scraping the `TED Talks` URLs

In [None]:
# Directory containing the HTML files
html_dir = 'html_talks'

# Initialize new columns
new_columns = ['Speaker', 'Title', 'Duration', 'Tags', 'Views', 'Year', 'Talk', 'Video', 'Event', 'TED_ID']
for col in new_columns:
    df_tedtalks_urls3[col] = None

# Process each row
for idx, row in df_tedtalks_urls3.iterrows():
    file_id = row['File ID']
    file_path = f"{html_dir}/{file_id}.html"

    # Open and parse the HTML file
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file, 'lxml')
    except FileNotFoundError:
        print(f"File {file_path} not found. Skipping...")
        continue

    # Extract JSON content from <script> tags
    script_tag = soup.find('script', {'type': 'application/json'})
    if not script_tag:
        print(f"No suitable <script> tag found in {file_path}")
        continue

    try:
        # `.string` is None when the tag does not hold exactly one string; the
        # empty-string fallback turns that case into a JSONDecodeError handled below.
        json_content = json.loads(script_tag.string or '')

        # Navigate to videoData
        video_data = json_content['props']['pageProps']['videoData']

        # Parse the playerData string into a dictionary.
        # `or` also covers keys that are present but hold null.
        player_data = json.loads(video_data.get('playerData') or '{}')
        targeting = video_data.get('targeting') or {}
        resources = player_data.get('resources') or {}

        # Handle Video (first h264 resource, when there is one)
        h264_resources = resources.get('h264') or []
        if isinstance(h264_resources, list) and len(h264_resources) > 0:
            video_file = h264_resources[0].get('file')
        else:
            video_file = None

        # Collect every value before touching the DataFrame, so a malformed
        # page never leaves a row half-filled.
        extracted = {
            'Speaker': player_data.get('speaker'),
            'Title': player_data.get('title'),
            'Duration': player_data.get('duration'),
            'Tags': targeting.get('tag'),
            'Views': video_data.get('viewedCount'),
            'Year': targeting.get('year'),
            'Talk': targeting.get('talk'),
            'Video': video_file,
            'Event': targeting.get('event'),
            'TED_ID': targeting.get('id'),
        }
    except (KeyError, TypeError, AttributeError, json.JSONDecodeError) as e:
        # TypeError/AttributeError cover JSON whose structure is not the
        # expected nesting of objects (e.g. a list or null where a dict is read).
        print(f"Error parsing JSON in file {file_path}: {e}")
        continue

    # Fill the new columns with the extracted data
    for col, value in extracted.items():
        df_tedtalks_urls3.at[idx, col] = value

# Display the updated DataFrame
df_tedtalks_urls3


### Scraping the input HTML file

In [None]:
# Path of the saved TED Talks index page. It is spelled out here because
# `input_directory` is not defined in any notebook cell and `input_file` holds
# 'valid_urls' at this point; the values are the ones used by `buildurls.py`.
index_html_path = os.path.join('indexes', 'TED_7061_of_7061_20250314.html')

with open(index_html_path, 'r', encoding='utf8', newline='\n') as html_doc:
    soup = BeautifulSoup(html_doc, 'lxml')

In [None]:
# Collect every <a> element that carries the CSS class 'relative'
tedtalks_urls = soup.find_all(name='a', attrs={'class': 'relative'})

In [None]:
# Keep the 'href' value of each anchor, dropping anchors without one (or with an empty one)
tedtalks_urls_list = [
    href
    for href in (anchor.get('href') for anchor in tedtalks_urls)
    if href
]

### Exporting the data into a DataFrame

In [None]:
# Create a Pandas DataFrame from the list of links
df_tedtalks_urls1 = pd.DataFrame(tedtalks_urls_list, columns=['TED Talks URL'])

# Keep only rows whose 'TED Talks URL' contains 'https://www.ted.com/talks/'.
# regex=False makes this a literal substring test; as a regular expression each
# '.' in the prefix would match any character.
ted_talk_prefix = 'https://www.ted.com/talks/'
is_talk_url = df_tedtalks_urls1['TED Talks URL'].str.contains(ted_talk_prefix, regex=False)
df_tedtalks_urls1 = df_tedtalks_urls1[is_talk_url].reset_index(drop=True)

In [None]:
# Append '/transcript?language=en' to each 'TED Talks URL'.
# URLs that already end with the suffix are left unchanged, so re-running this
# cell does not append the suffix a second time.
transcript_suffix = '/transcript?language=en'
df_tedtalks_urls1['TED Talks URL'] = [
    url if url.endswith(transcript_suffix) else url + transcript_suffix
    for url in df_tedtalks_urls1['TED Talks URL']
]

In [None]:
# Preview the transcript URL table built in the cells above.
df_tedtalks_urls1

### Inspecting a few rows

In [None]:
# Spot-check one complete URL (row label 1000); raises KeyError if the frame has no such row.
df_tedtalks_urls1.loc[1000, 'TED Talks URL']

## Importing the previous study's `TED Talks` URLs

In [None]:
# Load the previous study's header-less, TAB-delimited URL list
previous_study_columns = ['File ID', 'TED Talks URL']
df_tedtalks_urls2 = pd.read_csv('previous_study_valid_urls', header=None, sep='\t')

# Label the two columns
df_tedtalks_urls2.columns = previous_study_columns

In [None]:
# Preview the URLs imported from the previous study.
df_tedtalks_urls2

## Creating `df_tedtalks_urls3` by excluding rows from `df_tedtalks_urls1` that are present in `df_tedtalks_urls2`

The reason for this is to exclude any videos that were considered in the previous study.

In [None]:
# Flag the URLs that already appear in the previous study, then keep the rest
# and renumber the rows from 0.
already_studied = df_tedtalks_urls1['TED Talks URL'].isin(df_tedtalks_urls2['TED Talks URL'])
df_tedtalks_urls3 = df_tedtalks_urls1.loc[~already_studied].reset_index(drop=True)

In [None]:
# Preview the URLs that remain after excluding the previous study's talks.
df_tedtalks_urls3

### Creating the column 'File ID'

In [None]:
# Number the rows from '000001' onwards: row index + 1, zero-padded to six characters
df_tedtalks_urls3['File ID'] = [
    str(position).zfill(6) for position in df_tedtalks_urls3.index + 1
]

In [None]:
# Preview the table with the new 'File ID' column.
df_tedtalks_urls3

## Creating the file `valid_urls`

In [None]:
# Write 'File ID' and 'TED Talks URL' (in that order) to a header-less,
# tab-separated UTF-8 file with '\n' line endings.
valid_urls_table = df_tedtalks_urls3[['File ID', 'TED Talks URL']]
valid_urls_table.to_csv(
    'valid_urls',
    sep='\t',
    header=False,
    index=False,
    encoding='utf-8',
    lineterminator='\n',
)

## Adapting the programme for command line

The programme was named `buildurls.py`.

In [None]:
from bs4 import BeautifulSoup
import pandas as pd

def main(
    input_directory='indexes',
    input_file='TED_7061_of_7061_20250314.html',
    previous_urls_file='previous_study_valid_urls',
    output_file='valid_urls',
):
    """Build the list of TED Talks transcript URLs not covered by the previous study.

    Args:
        input_directory: Directory holding the saved TED Talks index page.
        input_file: File name of the saved index page inside `input_directory`.
        previous_urls_file: Header-less, tab-separated file (File ID, URL) with
            the URLs used in the previous study.
        output_file: Destination of the header-less, tab-separated result
            (File ID, URL), where File ID runs from '000001' upwards.

    The defaults reproduce the original hard-coded paths, so calling `main()`
    without arguments behaves as before.
    """
    # Scraping the 'TED Talks' URLs
    with open(f'{input_directory}/{input_file}', 'r', encoding='utf8', newline='\n') as html_doc:
        soup = BeautifulSoup(html_doc, 'lxml')

    # Find all 'a' tags with the class 'relative'
    tedtalks_urls = soup.find_all('a', class_='relative')

    # Extract the 'href' attribute of each link and store them in a list
    tedtalks_urls_list = [tedtalks_url.get('href') for tedtalks_url in tedtalks_urls if tedtalks_url.get('href')]

    # Create a Pandas DataFrame from the list of links
    df_tedtalks_urls1 = pd.DataFrame(tedtalks_urls_list, columns=['TED Talks URL'])

    # Keep only rows whose 'TED Talks URL' contains 'https://www.ted.com/talks/'.
    # regex=False makes this a literal substring test; as a regular expression
    # each '.' in the prefix would match any character.
    is_talk_url = df_tedtalks_urls1['TED Talks URL'].str.contains('https://www.ted.com/talks/', regex=False)
    df_tedtalks_urls1 = df_tedtalks_urls1[is_talk_url].reset_index(drop=True)

    # Append '/transcript?language=en' to each 'TED Talks URL'
    df_tedtalks_urls1['TED Talks URL'] = df_tedtalks_urls1['TED Talks URL'] + '/transcript?language=en'

    # Importing the previous study's TED Talks URLs
    df_tedtalks_urls2 = pd.read_csv(previous_urls_file, delimiter='\t', header=None)
    df_tedtalks_urls2.columns = ['File ID', 'TED Talks URL']

    # Creating df_tedtalks_urls3 by excluding rows from df_tedtalks_urls1 that are present in df_tedtalks_urls2
    df_tedtalks_urls3 = df_tedtalks_urls1[~df_tedtalks_urls1['TED Talks URL'].isin(df_tedtalks_urls2['TED Talks URL'])]
    df_tedtalks_urls3 = df_tedtalks_urls3.reset_index(drop=True)

    # Create the 'File ID' column starting from '000001'
    df_tedtalks_urls3['File ID'] = (df_tedtalks_urls3.index + 1).astype(str).str.zfill(6)

    # Creating the output file (by default 'valid_urls')
    df_tedtalks_urls3[['File ID', 'TED Talks URL']].to_csv(output_file, sep='\t', index=False, header=False, encoding='utf-8', lineterminator='\n')

if __name__ == "__main__":
    main()