In [2]:
# Install the third-party packages this notebook relies on: requests and
# beautifulsoup4 are imported in the next cell, and lxml is the parser name
# passed to BeautifulSoup when the page is parsed.
# NOTE(review): versions are unpinned, so a re-run may pick up different releases.
%pip install requests beautifulsoup4 lxml 





[notice] A new release of pip available: 22.3.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import requests
from bs4 import BeautifulSoup
import re
import json

In [4]:
def scrape_url(url, timeout=10):
    """
    Download a page and return the text of its <p> elements, one per line.

    Args:
        url (str): Address of the page to fetch.
        timeout (float | tuple): Seconds to wait for the server, forwarded to
            ``requests.get``. Without it a stalled connection would block the
            kernel indefinitely.

    Returns:
        str: The paragraph texts joined with newlines. If the page has no
        paragraph text, or the request fails (connection error, timeout,
        HTTP error status), a human-readable message is returned instead of
        raising, so callers always receive a string.
    """
    try:
        # A browser-like User-Agent is sent with the request.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into HTTPError

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'lxml')

        # strip=True trims whitespace around each text fragment; fragments
        # inside one <p> are concatenated with no separator.
        paragraphs = soup.find_all('p')
        text = '\n'.join(paragraph.get_text(strip=True) for paragraph in paragraphs)

        return text if text else "No meaningful text found on this page."

    except requests.exceptions.RequestException as e:
        # Timeouts are RequestException subclasses, so they land here too.
        return f"An error occurred while fetching the URL: {e}"

In [5]:
def clean_text(text, stop_line):
    """
    Keep only the lines that come before the first line containing ``stop_line``.

    The line containing the marker and everything after it are dropped. If the
    marker never appears, the text is returned unchanged.

    Args:
        text (str): The full extracted text, with lines separated by '\\n'.
        stop_line (str): Substring marking where to stop. The match is a
            substring test, so an empty string matches the very first line
            and the result is an empty string.

    Returns:
        str: The retained lines joined with '\\n'.
    """
    cleaned_lines = []

    for line in text.split('\n'):
        if stop_line in line:
            break  # Marker reached: discard this line and the rest
        cleaned_lines.append(line)

    return '\n'.join(cleaned_lines)

In [6]:
def process_text(input_text):
    """
    Collect the lines that start with a four-digit year and a dash separator.

    A line is accepted when it begins with four digits, optional whitespace,
    one or two dash characters (em dash, en dash or hyphen), optional
    whitespace, and then at least one more character. All other lines are
    ignored.

    Args:
        input_text (str): Raw text input, one candidate entry per line.

    Returns:
        dict: Maps each year (as a 4-character string) to the text after the
        separator, stripped of surrounding whitespace. When the same year
        appears on several lines, their texts are joined with a single space
        in order of appearance rather than the later line replacing the
        earlier one. Keys are ordered by first appearance.
    """
    # Unified regex to handle multiple separator styles
    year_pattern = re.compile(r"^(\d{4})\s*[—–-]{1,2}\s*(.+)$")

    # Gather every entry per year first so repeated years are not lost.
    entries_by_year = {}
    for line in input_text.splitlines():
        match = year_pattern.match(line)
        if match:
            year = match.group(1)
            text = match.group(2).strip()
            entries_by_year.setdefault(year, []).append(text)

    # Skip empty fragments when joining so no stray spaces are introduced.
    return {
        year: " ".join(part for part in parts if part)
        for year, parts in entries_by_year.items()
    }

In [7]:
def filter_years(data, selected_years):
    """
    Pick out the entries of ``data`` whose keys are listed in ``selected_years``.

    Args:
        data (dict): Mapping of years to their text.
        selected_years (list): Years to keep; entries absent from ``data``
            are skipped without error.

    Returns:
        dict: A new dictionary holding only the requested years that exist in
        ``data``, in the order they are listed in ``selected_years``.
    """
    kept = {}
    for wanted in selected_years:
        if wanted not in data:
            continue
        kept[wanted] = data[wanted]
    return kept

In [46]:
url = 'https://triblive.com/sports/this-date-in-sports-history-feb-13/'

# Pipeline: fetch paragraph text -> cut at the footer marker -> parse "year — text" lines.
extracted_text = scrape_url(url)

# The missing space in "adsfrom" is deliberate: it has to match the scraped
# text exactly as scrape_url produces it.
clean_text_ = clean_text(
    extracted_text,
    stop_line="Remove the adsfrom your TribLIVE reading experience",
)

clean_dict = process_text(clean_text_)

print(f"COPY: {url}")
display(clean_dict)

COPY: https://triblive.com/sports/this-date-in-sports-history-feb-13/


{'1923': 'The New York Renaissance, the first all-black pro basketball team, is organized. The Rens become one of the dominant basketball team of the 1920s and 1930s.',
 '1937': 'The NFL Redskins move from Boston to Washington.',
 '1948': 'Dick Button, the Olympic gold medalist, beats Hans Gerschwiler again to win the men’s World Figure Skating championship in Davos, Switzerland.',
 '1954': 'Furman’s Frank Selvey scores 100 points in a 149-95 victory over Newberry. Selvey breaks the record of 73 points — set by Temple’s Bill Mlkvy in 1951 — with 41 field goals and 18 free throws.',
 '1973': 'Frank Mahovlich of the Montreal Canadiens scores his 1,000th career point with an assist in a 7-6 loss to the Philadelphia Flyers.',
 '1975': 'Boston’s Bobby Orr gets an assist in the Bruins’ in a 3-1 loss to the Buffalo Sabres to become the first player in NHL history to reach 100 points in six consecutive seasons. It is the final 100-point season of his career.',
 '1977': 'Julius Erving, playing 

In [47]:
# Years to keep; these must be strings because the keys of clean_dict are strings.
selected_years = ['1994', '1995', '2003']

# Typographic punctuation -> plain ASCII. json.dumps escapes non-ASCII
# characters by default, so without this the printed JSON would contain
# \uXXXX sequences. Curly double quotes become a single quote, which avoids
# backslash-escaped double quotes in the JSON output.
ASCII_PUNCTUATION = str.maketrans({
    "\u201c": "'",   # left double quotation mark
    "\u201d": "'",   # right double quotation mark
    "\u2018": "'",   # left single quotation mark (partner of U+2019)
    "\u2019": "'",   # right single quotation mark / apostrophe
    "\u2014": "--",  # em dash
})

# Build the cleaned dictionary in one step instead of mutating it while iterating.
final_dict = {
    year: text.translate(ASCII_PUNCTUATION)
    for year, text in filter_years(clean_dict, selected_years).items()
}

formatted_output = json.dumps(final_dict, indent=4)

print(formatted_output)

{
    "1994": "Tommy Moe wins the men's downhill over local hero Kjetil Andre Aamodt at the Winter Olympics in Lillehammer, Norway. Moe won by .04 seconds, the closest Alpine race in Olympic history. Norwegian speed skater Johann Olav Koss has a world record-setting gold medal performance in the 5,000 meters in 6 minutes, 34.96 seconds.",
    "1995": "Connecticut is voted No. 1 in the Associated Press Top 25 and joins the school's women's team at the top. It is the first time teams from one school were ranked No. 1 in the men's and women's college basketball polls.",
    "2003": "Teresa Phillips becomes the first woman to coach a men's Division I team, but her presence couldn't stop Tennessee State from losing for the 17th straight time, 71-56 at Austin Peay."
}
