In [1]:
# Install the third-party packages this notebook relies on: requests (HTTP client),
# beautifulsoup4 (imported as bs4) and lxml (the parser name passed to BeautifulSoup).
%pip install requests beautifulsoup4 lxml 

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import requests
from bs4 import BeautifulSoup
import re
import json

In [3]:
def scrape_url(url, timeout=10):
    """
    Download a web page and return the text of its <p> elements.

    Args:
        url (str): Address of the page to fetch.
        timeout (float or tuple): Seconds to wait for the server before
            giving up; forwarded to ``requests.get``. Without it a stalled
            server would block the kernel indefinitely.

    Returns:
        str: The stripped text of every <p> tag joined by newlines. If the
        page has no paragraph text, a fixed placeholder message is returned.
        If the request fails (connection error, timeout, HTTP error status),
        a message describing the error is returned instead of raising.
    """
    try:
        # Present a browser-like User-Agent with the request.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into HTTPError

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'lxml')

        # Keep only paragraph text; strip=True trims whitespace around each text fragment
        paragraphs = soup.find_all('p')
        text = '\n'.join(paragraph.get_text(strip=True) for paragraph in paragraphs)

        return text if text else "No meaningful text found on this page."

    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they are reported here too.
        return f"An error occurred while fetching the URL: {e}"

In [4]:
def clean_text(text, stop_line):
    """
    Remove a specific line and everything after it from the text.

    The text is scanned line by line; the first line that contains
    ``stop_line`` as a substring, and every line after it, is dropped.
    If no line contains ``stop_line`` the text is returned unchanged.

    Args:
        text (str): The full extracted text, with lines separated by '\\n'.
        stop_line (str): Marker text indicating where to stop. An empty
            string is contained in every line, so it yields an empty result.

    Returns:
        str: The lines before the stop line, rejoined with '\\n'.
    """
    cleaned_lines = []

    for line in text.split('\n'):
        if stop_line in line:
            break  # Stop processing once the stop line is found
        cleaned_lines.append(line)

    return '\n'.join(cleaned_lines)

In [7]:
def process_text(input_text):
    """
    Extract lines that start with a four-digit year and collect them in a dictionary.

    A line is kept when it begins with four digits followed by one or two
    dash characters (em dash, en dash or hyphen), with optional whitespace
    around the dashes, and at least one more character. All other lines
    are ignored.

    Args:
        input_text (str): Raw text input, one entry per line.

    Returns:
        dict: Year strings mapped to the text after the separator. When
        several lines share the same year their texts are joined with a
        single space, in order of appearance, so no entry is lost.
    """
    result = {}

    # Unified regex: year, optional spaces, 1-2 dashes, optional spaces, text
    year_pattern = re.compile(r"^(\d{4})\s*[—–-]{1,2}\s*(.+)$")

    for line in input_text.splitlines():
        match = year_pattern.match(line)
        if not match:
            continue

        year = match.group(1)
        text = match.group(2).strip()

        if year in result:
            # A plain assignment here would discard the earlier entry for this year.
            result[year] = f"{result[year]} {text}"
        else:
            result[year] = text

    return result

In [5]:
def filter_years(data, selected_years):
    """
    Build a new dictionary containing only the requested years.

    Args:
        data (dict): Mapping of year to text.
        selected_years (list): Years to keep; any that are missing from
            ``data`` are skipped.

    Returns:
        dict: Entries of ``data`` whose key appears in ``selected_years``,
        ordered as they are listed in ``selected_years``.
    """
    kept = {}
    for wanted in selected_years:
        if wanted not in data:
            continue
        kept[wanted] = data[wanted]
    return kept

In [15]:
# Page to scrape for this run.
url = 'https://triblive.com/sports/this-date-in-sports-history-feb-17/'

# Fetch the page's paragraph text. scrape_url returns a message string rather
# than raising when the request fails, so a failed fetch yields an empty
# dictionary below instead of an exception.
extracted_text = scrape_url(url)
# Debug aid: uncomment to inspect the raw scraped text.
# print(extracted_text)

# Drop the line containing this marker text and everything after it.
clean_text_ = clean_text(extracted_text, "Remove the adsfrom your TribLIVE reading experience")

# Turn the remaining "<year> — <text>" lines into a {year: text} dictionary.
clean_dict = process_text(clean_text_)

print(f"COPY: {url}")
display(clean_dict)

COPY: https://triblive.com/sports/this-date-in-sports-history-feb-17/


{'1923': 'Cy Denneny of the Ottawa Senators becomes the NHL’s career scoring leader. He scores his 143rd goal to surpass Joe Malone in a 2-0 win over the Montreal Canadiens.',
 '1924': 'Johnny Weissmuller sets a world record in the 100-yard freestyle swim with a time of 52.4 seconds.',
 '1926': 'Suzanne Lenglen beats Helen Wills, 6-3, 8-6, in Cannes, France, in their only tennis match against each other.',
 '1928': 'Sweden’s Gillis Grafstrom successfully defends his 1920 and ‘24 Olympic figure skating title, with Austrian Willy Bockl finishing in second place as he did four years earlier.',
 '1941': 'Joe Louis knocks out Gus Dorazio in the second round in Philadelphia to defend his world heavyweight title.',
 '1955': 'Mike Souchak establishes the PGA Tour 72-hole scoring record with a 257 at the Texas Open. Souchak starts with a record-tying 60 at San Antonio’s Brackenridge Park course and ends with a 27-under-par, beating the previous low for a 72-hole event by two shots.',
 '1968': '

In [16]:
# Years to keep from the scraped entries; edit this list for each page.
selected_years = ['1968', '2013', '2018']
final_dict = filter_years(clean_dict, selected_years)

# Replace typographic punctuation with plain ASCII so json.dumps (which escapes
# non-ASCII characters by default) prints readable text instead of \uXXXX codes.
# U+2018 is included alongside U+2019 so both single curly quotes are handled.
punctuation_map = str.maketrans({
    "\u201c": "'",   # left double quotation mark
    "\u201d": "'",   # right double quotation mark
    "\u2018": "'",   # left single quotation mark
    "\u2019": "'",   # right single quotation mark / apostrophe
    "\u2014": "--",  # em dash
})
final_dict = {year: text.translate(punctuation_map) for year, text in final_dict.items()}

formatted_output = json.dumps(final_dict, indent=4)

print(formatted_output)

{
    "1968": "The Basketball Hall of Fame opens in Springfield, Mass.",
    "2013": "Danica Patrick wins the Daytona 500 pole, becoming the first woman to secure the top spot for any Sprint Cup race.",
    "2018": "Japan's Yuzuru Hanyu becomes the first man to successfully defend his Olympic figure skating title since Dick Button in 1952."
}
