# scraping

> Scraping dance competition results from results.o2cm.com

In [6]:
#| default_exp scoring

In [7]:
#| hide
import requests
from bs4 import BeautifulSoup
from typing import List, Tuple, NamedTuple

In [8]:
#| export

class Event(NamedTuple):
    """One competition event, as parsed from an o2cm event title.

    The tuple layout is (division, level, event, number).
    """
    # e.g. "Amateur Collegiate"
    division: str
    # e.g. "Silver" or "Bronze 1"
    level: str
    # e.g. "Standard" or "Intl. Tango"
    event: str

    # the numeric prefix of the event title
    number: int

    def __str__(self):
        # Human-readable title; `number` is not part of it.
        return "{0.division} {0.level} {0.event}".format(self)

def parse_event_name(event_name):
    """Parse an o2cm event title such as ``"15) Amateur Adult Bronze 1 Latin"``.

    The title is split on whitespace into:

    * ``number``   - the leading ``"N)"`` token with its last character removed
    * ``division`` - the next two words (``"Amateur Adult"``)
    * ``level``    - one word (``"Silver"``), or two words when the level
      carries a numeric sub-level (``"Bronze 1"``) or is ``"All Syllabus"``
    * ``event``    - whatever is left (``"Intl. V. Waltz"``); empty when the
      title ends right after the level

    Raises ``ValueError`` when the leading token is not an integer followed by
    one character, and ``IndexError`` when the title has fewer than four words.
    """
    words = event_name.split()
    number = int(words[0][:-1])  # strip the trailing ")"
    division = ' '.join(words[1:3])

    rest = words[3:]
    # Checking the length first avoids an IndexError on titles that stop at
    # the level; "All Syllabus" is a two-word level with no numeric suffix.
    two_word_level = len(rest) >= 2 and (
        rest[1].isdigit() or rest[:2] == ['All', 'Syllabus']
    )
    if two_word_level:
        level = ' '.join(rest[:2])
        event = ' '.join(rest[2:])
    else:
        level = rest[0]
        event = ' '.join(rest[1:])
    return Event(number=number, division=division, level=level, event=event)

# Sample event titles used to eyeball the parser's output.
event_names = [
    '9) Amateur Collegiate Gold Standard',
    '19) Amateur Collegiate Silver Rhythm',
    '4) Amateur Collegiate Gold Rhythm',
    '10) Amateur Adult Silver Rhythm',
    '2) Amateur Collegiate Silver Standard',
    '3) Amateur Collegiate Silver Intl. Tango',
    '4) Amateur Collegiate All Syllabus Standard',
    '10) Amateur Adult Silver Intl. V. Waltz',
    '3) Amateur Adult Silver Standard',
    '3) Amateur Adult Silver Intl. Tango',
    '11) Amateur Collegiate Silver Smooth',
    '2) Amateur Adult Gold Smooth',
    '1) Amateur Collegiate Gold Smooth',
    '6) Amateur Adult Novice Smooth',
    '23) Amateur Adult Silver Latin',
    '9) Amateur Collegiate Silver Latin',
    '8) Amateur Collegiate Bronze Latin',
    '15) Amateur Adult Bronze 1 Latin',
    '8) Amateur Adult Bronze Latin',
]
# Printing an Event goes through Event.__str__.
for name in event_names:
    parsed = parse_event_name(name)
    print(parsed)

Amateur Collegiate Gold Standard
Amateur Collegiate Silver Rhythm
Amateur Collegiate Gold Rhythm
Amateur Adult Silver Rhythm
Amateur Collegiate Silver Standard
Amateur Collegiate Silver Intl. Tango
Amateur Collegiate All Syllabus Standard
Amateur Adult Silver Intl. V. Waltz
Amateur Adult Silver Standard
Amateur Adult Silver Intl. Tango
Amateur Collegiate Silver Smooth
Amateur Adult Gold Smooth
Amateur Collegiate Gold Smooth
Amateur Adult Novice Smooth
Amateur Adult Silver Latin
Amateur Collegiate Silver Latin
Amateur Collegiate Bronze Latin
Amateur Adult Bronze 1 Latin
Amateur Adult Bronze Latin


In [9]:
#| export

# Default HTTP headers: a desktop-browser User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/58.0.3029.110 Safari/537.3'
    ),
}

def get(url, **kwargs):
    """Issue a GET request for `url` using the module-level default `headers`.

    Any extra keyword arguments are forwarded to `requests.get`. Two keywords
    get special treatment:

    * ``headers`` - merged over the defaults (caller values win) instead of
      colliding with the fixed ``headers=`` argument and raising ``TypeError``.
    * ``timeout`` - defaults to 30 seconds when not supplied, so a stalled
      server cannot block the caller indefinitely.

    Returns the `requests.Response`.
    """
    merged_headers = {**headers, **(kwargs.pop("headers", None) or {})}
    kwargs.setdefault("timeout", 30)
    return requests.get(url, headers=merged_headers, **kwargs)

def get_event_list_html(name: str) -> BeautifulSoup:
    """Fetch and parse the o2cm "individual" results page for a dancer.

    `name` is split at its first space into first name and last name
    (everything after the first space counts as the last name).

    Raises ``ValueError`` when `name` contains no space.
    """
    first, last = name.split(" ", 1)
    # Passing the names as `params` lets requests URL-encode them, so
    # characters such as "&", "#" or "+" cannot corrupt the query string.
    page: requests.Response = get(
        "https://results.o2cm.com/individual.asp",
        params={"szLast": last, "szFirst": first},
    )
    return BeautifulSoup(page.content, "html.parser")

# Example fetch; this performs a live HTTP request when the cell runs.
soup = get_event_list_html("Sasha Hydrie")

In [10]:
#| export

def extract_events_from_html(
    soup: BeautifulSoup,
    competition: str = '03-22-24 - USA DANCE National DanceSport Championships',
) -> List[Tuple[Event, str | None]]:
    """Collect the events listed under one competition heading.

    The page is scanned for ``<b>`` and ``<a>`` tags in document order. Links
    are collected from the ``<b>`` whose text equals `competition` up to the
    next ``<b>`` tag.

    Args:
        soup: parsed page, e.g. the result of `get_event_list_html`.
        competition: exact text of the ``<b>`` heading to collect under.
            Defaults to the heading that was previously hard-coded.

    Returns:
        A list of ``(Event, href)`` pairs, where the `Event` is parsed from
        the link text and `href` is the link's ``href`` attribute (``None``
        when the attribute is absent). Empty when the heading is not found.
    """
    events = []
    in_competition = False

    for tag in soup.find_all(['b', 'a']):
        if tag.name == 'b':
            if tag.text == competition:
                in_competition = True
            elif in_competition:
                # The next bold tag ends the section being collected.
                break
        elif in_competition:
            events.append((parse_event_name(tag.text), tag.get('href')))

    return events

# Parse the page fetched above; the bare name displays the list in the notebook.
events = extract_events_from_html(soup)
events

[(Event(division='Amateur Adult', level='Silver', event='Intl. Tango', number=15),
  'http://Results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40423019'),
 (Event(division='Amateur Adult', level='Bronze 1', event='Latin', number=21),
  'http://Results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40422B20'),
 (Event(division='Amateur Collegiate', level='Silver', event='Smooth', number=30),
  'http://Results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40323030'),
 (Event(division='Amateur Collegiate', level='Silver', event='Rhythm', number=15),
  'http://Results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40323040'),
 (Event(division='Amateur Collegiate', level='Silver', event='Amer. Bolero', number=15),
  'http://Results.o2cm.com/scoresheet3.asp?event=usa24&heatid=4032304B'),
 (Event(division='Amateur Adult', level='Silver', event='Rhythm', number=7),
  'http://Results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40423040'),
 (Event(division='Amateur Adult', level='Silver', event='Amer.

In [11]:
#| export 

class Result(NamedTuple):
    """Outcome for one dancer in one event, as built by `get_event_result`."""
    # callback count as computed by get_event_result
    callbacks: int
    # value from extract_placement when the dancer is on the initial page, else None
    placement: float | None

In [12]:
#| export

def extract_max_callbacks(soup: BeautifulSoup) -> int:
    """Return the number of ``<option>`` entries in ``<select id="selCount">`` minus one.

    Returns 0 when the page has no such ``<select>`` element.
    NOTE(review): presumably each option is one round of the event - confirm.
    """
    round_picker = soup.find('select', {'id': 'selCount'})
    if not round_picker:
        return 0
    option_tags = round_picker.find_all('option')
    return len(option_tags) - 1

def extract_placement(soup: BeautifulSoup, name: str, verbose=False) -> float | None: 
    """Look up a dancer's placement on a scoresheet page.

    Assumes that `soup` is a finals page.

    Step 1: find an ``<a>`` whose text equals `name`, then take the text of the
    nearest preceding sibling ``<td class="t1b">`` of the link's enclosing
    ``<td>`` as the couple number.
    Step 2: in the first ``<table class="t1n">``, find the row whose first cell
    equals that couple number and return its second-to-last cell as a float.

    Returns None when no couple number could be determined for `name`, or when
    no table row matches it. With `verbose` set, progress is printed.
    """
    couple_number = None
    for anchor in soup.find_all('a'):
        if anchor.text != name:
            continue
        name_cell = anchor.find_parent('td')
        if not name_cell:
            continue
        number_cell = name_cell.find_previous_sibling("td", class_="t1b")
        if not number_cell:
            continue
        couple_number = number_cell.text.strip()
        if verbose:
            print(f"Found {couple_number} associated with {name}")
        break
    else:
        # Reached only when the loop never hit `break`.
        if verbose:
            print(f"Dancer {name} didn't final")
        return None

    for table_row in soup.find('table', class_='t1n').find_all('tr'):
        row_cells = table_row.find_all('td')
        if not row_cells:
            continue
        if row_cells[0].get_text(strip=True) != couple_number:
            continue
        # NOTE(review): the second-to-last column is presumably the averaged place - confirm.
        return float(row_cells[-2].get_text(strip=True))

    return None

In [13]:

# Manual check of extract_placement against one live scoresheet page.
url = "https://results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40423019"
name = "Khalid Ali"

# Fetch and parse the page (performs a live HTTP request).
initial_res = get(url)
soup = BeautifulSoup(initial_res.content, "html.parser")

# The returned value is displayed as the cell output.
extract_placement(soup, name)


3.5

In [20]:
#| export

def get_event_result(name: str, url: str) -> Result:
    """Work out how `name` did in the event whose scoresheet lives at `url`.

    The scoresheet at `url` is fetched first. If `extract_placement` finds the
    dancer there, the result carries that placement and
    ``callbacks = extract_max_callbacks(page)``.

    Otherwise the scoresheet form is re-posted with ``selCount`` set to
    1, 2, ... up to that maximum, reusing the query parameters of `url` as form
    data. For the first ``selCount`` whose page text contains `name`, the
    result is ``callbacks = maximum - selCount`` with ``placement=None``.
    NOTE(review): presumably increasing selCount walks back through earlier
    rounds - confirm against the site.

    Raises:
        IndexError: `url` has no ``?`` query part.
        ValueError: a query pair has no ``=``, or `name` appears on none of
            the pages.
    """
    query_string = url.split("?", 1)[1]
    # maxsplit=1 keeps values that themselves contain "=" in one piece.
    data: dict[str, str | int] = dict(
        pair.split("=", 1) for pair in query_string.split("&")
    )

    # Kept separate from `url` so the error message below can still report the
    # original address, including its heat id.
    post_url = "https://results.o2cm.com/scoresheet3.asp"

    # CR shy: factor session creation out
    # The context manager closes the session on every exit path.
    with requests.Session() as session:
        session.headers.update(headers)

        initial_res = session.get(url)
        soup = BeautifulSoup(initial_res.content, "html.parser")

        possible_callbacks = extract_max_callbacks(soup)

        if (place := extract_placement(soup, name)) is not None:
            return Result(callbacks=possible_callbacks, placement=place)

        # CR shy: binary search eventually
        for selector in range(1, possible_callbacks + 1):
            data["selCount"] = selector
            response = session.post(post_url, data=data)

            soup = BeautifulSoup(response.content, 'html.parser')

            if name in soup.get_text():
                return Result(callbacks=possible_callbacks - selector, placement=None)

    raise ValueError(f"Couldn't find {name} in {url}")

# Manual check against one live scoresheet; the Result is shown as the cell output.
get_event_result("Khalid Ali", "https://results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40423019")
    


Result(callbacks=2, placement=3.5)

In [None]:
#| hide
# Run nbdev's export step for this notebook.
import nbdev; nbdev.nbdev_export()

In [23]:
# Fetch one dancer's results page and extract the (Event, link) pairs from it.
soup = get_event_list_html("Khalid Ali")
events = extract_events_from_html(soup)

In [26]:
# Print a one-line summary per event; a failure on one event must not stop the rest.
for event_name, event_url in events:
    try:
        result = get_event_result("Khalid Ali", event_url)
    except Exception:
        # Best-effort reporting. Catching Exception rather than using a bare
        # `except:` lets KeyboardInterrupt and SystemExit still stop the loop.
        print(f"No result for {event_name}")
        continue

    # `is not None` matches how get_event_result signals "no placement".
    if result.placement is not None:
        print(f"In {event_name}, Khalid got {result.callbacks} callbacks and placed {result.placement}")
    else:
        print(f"In {event_name}, Khalid got {result.callbacks} callbacks")

In Amateur Collegiate Gold Standard, Khalid got 0 callbacks
In Amateur Collegiate Silver Rhythm, Khalid got 0 callbacks
In Amateur Collegiate Gold Rhythm, Khalid got 0 callbacks and placed 6.0
In Amateur Adult Silver Rhythm, Khalid got 1 callbacks
In Amateur Collegiate Silver Standard, Khalid got 2 callbacks and placed 2.0
In Amateur Collegiate Silver Intl. Tango, Khalid got 2 callbacks and placed 3.0
In Amateur Collegiate All Syllabus Standard, Khalid got 2 callbacks and placed 4.0
In Amateur Adult Silver Intl. V. Waltz, Khalid got 1 callbacks
In Amateur Adult Silver Standard, Khalid got 2 callbacks and placed 3.0
In Amateur Adult Silver Intl. Tango, Khalid got 2 callbacks and placed 3.5
In Amateur Collegiate Silver Smooth, Khalid got 2 callbacks
In Amateur Adult Gold Smooth, Khalid got 2 callbacks and placed 2.0
In Amateur Collegiate Gold Smooth, Khalid got 1 callbacks and placed 1.0
In Amateur Adult Novice Smooth, Khalid got 2 callbacks and placed 3.0
In Amateur Adult Silver Latin, 