# scraping

> Scraping from o2cm

In [2]:
#| default_exp scoring

In [88]:
#| hide
import requests
from bs4 import BeautifulSoup
from typing import List, Tuple, NamedTuple

In [128]:
#| export

class Event(NamedTuple):
    division: str
    level: str
    event: str

    number: int

    def __str__(self):
        return f"{self.division} {self.level} {self.event}"

def parse_event_name(event_name):
    words = event_name.split()
    number = int(words[0][:-1])
    division = ' '.join(words[1:3])
    level = ' '.join(words[3:5]) if words[4].isdigit() else words[3]
    event = ' '.join(words[5:]) if words[4].isdigit() else ' '.join(words[4:])
    return Event(number=number, division=division, level=level, event=event)

event_names = ['9) Amateur Collegiate Gold Standard', '19) Amateur Collegiate Silver Rhythm', '4) Amateur Collegiate Gold Rhythm', '10) Amateur Adult Silver Rhythm', '2) Amateur Collegiate Silver Standard', '3) Amateur Collegiate Silver Intl. Tango', '4) Amateur Collegiate All Syllabus Standard', '10) Amateur Adult Silver Intl. V. Waltz', '3) Amateur Adult Silver Standard', '3) Amateur Adult Silver Intl. Tango', '11) Amateur Collegiate Silver Smooth', '2) Amateur Adult Gold Smooth', '1) Amateur Collegiate Gold Smooth', '6) Amateur Adult Novice Smooth', '23) Amateur Adult Silver Latin', '9) Amateur Collegiate Silver Latin', '8) Amateur Collegiate Bronze Latin', '15) Amateur Adult Bronze 1 Latin', '8) Amateur Adult Bronze Latin']
for name in event_names:
    print(parse_event_name(name))

Amateur Collegiate Gold Standard
Amateur Collegiate Silver Rhythm
Amateur Collegiate Gold Rhythm
Amateur Adult Silver Rhythm
Amateur Collegiate Silver Standard
Amateur Collegiate Silver Intl. Tango
Amateur Collegiate All Syllabus Standard
Amateur Adult Silver Intl. V. Waltz
Amateur Adult Silver Standard
Amateur Adult Silver Intl. Tango
Amateur Collegiate Silver Smooth
Amateur Adult Gold Smooth
Amateur Collegiate Gold Smooth
Amateur Adult Novice Smooth
Amateur Adult Silver Latin
Amateur Collegiate Silver Latin
Amateur Collegiate Bronze Latin
Amateur Adult Bronze 1 Latin
Amateur Adult Bronze Latin


In [130]:
#| export

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

def get(url, **kwargs):
    return requests.get(url, headers=headers, **kwargs)

def get_event_html(name: str) -> BeautifulSoup:
    first, last = name.split(" ", 1)
    url = f"https://results.o2cm.com/individual.asp?szLast={last}&szFirst={first}"
    page: requests.Response = get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    return soup

soup = get_event_html("Sasha Hydrie")

In [15]:
#| hide

soup


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">

<html xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
<meta content="noindex, nofollow" name="robots"/>
<head>
<link href="compdb.css" rel="stylesheet"/>
</head>
<body alink="#000000" bgcolor="#FFFFFF" class="t1n" link="#000000" text="#000000" vlink="#0000cc">
<div align="center" class="vs">
<table width="320">
<tr>
<td align="center">
<span class="h2">Sasha Hydrie</span></td></tr><tr><td class="t1n"><br/><b>04-01-22 - USA DANCE National DanceSport Championships</b></td></tr><tr><td class="t1n">   <a href="http://Results.o2cm.com/scoresheet3.asp?event=usa22&amp;heatid=4042284A">66) Amateur Adult Bronze Amer. Swing</a></td></tr><tr><td class="t1n">   <a href="http://Results.o2cm.com/scoresheet3.asp?event=usa22&amp;heatid=40422840">66) Amateur Adult Bronze Rhythm</a></td></tr><tr><td class="t1n">   <a href="http://Results.o2cm.com/scoresheet3.asp?event=usa22&amp;heatid=4032

In [127]:
#| export

def extract_events_from_html(soup: BeautifulSoup) -> List[Tuple[str, str]]:
    events = []
    include_events = False

    for tag in soup.find_all(['b', 'a']):  
        if tag.name == 'b' and tag.text == '03-22-24 - USA DANCE National DanceSport Championships':
            include_events = True
        elif tag.name == 'b' and include_events:
            break 
        elif tag.name == 'a' and include_events:
            events.append((parse_event_name(tag.text), tag.get('href')))

    return events

events = extract_events_from_html(soup)
events

[(Event(division='Amateur Collegiate', level='Gold', event='Standard', number=9),
  'http://Results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40323810'),
 (Event(division='Amateur Collegiate', level='Silver', event='Rhythm', number=19),
  'http://Results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40323040'),
 (Event(division='Amateur Collegiate', level='Gold', event='Rhythm', number=4),
  'http://Results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40323840'),
 (Event(division='Amateur Adult', level='Silver', event='Rhythm', number=10),
  'http://Results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40423040'),
 (Event(division='Amateur Collegiate', level='Silver', event='Standard', number=2),
  'http://Results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40323010'),
 (Event(division='Amateur Collegiate', level='Silver', event='Intl. Tango', number=3),
  'http://Results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40323019'),
 (Event(division='Amateur Collegiate', level='All', event='Syl

In [39]:
#| export 

class Result(NamedTuple):
    callbacks: int
    placement: float | None

In [98]:
#| export

def extract_max_callbacks(soup: BeautifulSoup) -> int:
    select_element = soup.find('select', {'id': 'selCount'})
    possible_callbacks = len(select_element.find_all('option')) - 1 if select_element else 0
    return possible_callbacks

def extract_placement(soup: BeautifulSoup, name: str, verbose=False) -> float | None: 
    """assumes that soup is a finals page"""
    couple_number = None
    for link in soup.find_all('a'):
        if link.text == name:
            parent_td = link.find_parent('td')  # Parent <td> which should have sibling with couple number
            if parent_td:
                # The immediate previous sibling <td> of `parent_td` contains the couple number
                prev_td = parent_td.find_previous_sibling("td", class_="t1b")
                if prev_td:
                    couple_number = prev_td.text.strip()
                    if verbose:
                        print(f"Found {couple_number} associated with {name}")
                    break
    else:
        if verbose:
            print(f"Dancer {name} didn't final")
        return None

    results_table = soup.find('table', class_='t1n') 
    for row in results_table.find_all('tr'):
        cells = row.find_all('td') 
        if cells and cells[0].get_text(strip=True) == couple_number:
            averaged_place = cells[-2].get_text(strip=True)  
            return float(averaged_place)

In [84]:

url = "https://results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40423019"
name = "Khalid Ali"

initial_res = get(url)
soup = BeautifulSoup(initial_res.content, "html.parser")

extract_placement(soup, name)


Found 528 associated with Khalid Ali


3.5

In [85]:
#| export

def get_event_result(name: str, url: str) -> Result:
    query_string = url.split("?")[1]
    key_value_pairs = [query.split("=") for query in query_string.split("&")]
    data = {key: value for key, value in key_value_pairs}

    # CR shy: factor session creation out
    session = requests.Session()
    session.headers.update(headers)

    initial_res = session.get(url)
    soup = BeautifulSoup(initial_res.content, "html.parser")

    if (place := extract_placement(soup, name)) is not None:
        return Result(callbacks=extract_max_callbacks(soup), placement=place)


    url = "https://results.o2cm.com/scoresheet3.asp"
    # data = {
    #     "heatid": "40423019",  
    #     "event": "usa24",      
    #     "selCount": "1"        # 0: final, 1: semi final, 2: quarter final, etc
    # }

    response = session.post(url, data=data)

    # Process the response
    soup = BeautifulSoup(response.content, 'html.parser')
    raise NotImplementedError()

get_event_result("Khalid Ali", ", headers=headershttps://results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40423019")
    


Found 528 associated with Khalid Ali


Result(callbacks=2, placement=3.5)

In [28]:

url = "https://results.o2cm.com/scoresheet3.asp"
data = {
    "heatid": "40423019",  # Hidden input for the specific heat
    "event": "usa24",      # Hidden input for the specific event
    "selCount": "1"        # Dropdown selection for Semi-Final
}

# Send the POST request
response = requests.post(url, data=data)

# Process the response with BeautifulSoup or another tool if needed
soup = BeautifulSoup(response.content, 'html.parser')


In [37]:
name = "Carmen Schultz"
if name in soup.get_text():
    print(f"{name} is in the results")
else:
    print(f"{name} is not in the results")

Carmen Schultz is in the results


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()

In [102]:
soup = get_event_html("Khalid Ali")
events = extract_events_from_html(soup)

In [104]:
for event_name, event_url in events:
    try:
        result = get_event_result("Khalid Ali", event_url)
        print(f"In `{event_name}`, Khalid got {result.callbacks} callbacks and placed {result.placement}")
    except:
        print(f"No result for {event_name}")
        continue

No result for 9) Amateur Collegiate Gold Standard
No result for 19) Amateur Collegiate Silver Rhythm
In `4) Amateur Collegiate Gold Rhythm`, Khalid got 0 callbacks and placed 6.0
No result for 10) Amateur Adult Silver Rhythm
In `2) Amateur Collegiate Silver Standard`, Khalid got 2 callbacks and placed 2.0
In `3) Amateur Collegiate Silver Intl. Tango`, Khalid got 2 callbacks and placed 3.0
In `4) Amateur Collegiate All Syllabus Standard`, Khalid got 2 callbacks and placed 4.0
No result for 10) Amateur Adult Silver Intl. V. Waltz
In `3) Amateur Adult Silver Standard`, Khalid got 2 callbacks and placed 3.0
In `3) Amateur Adult Silver Intl. Tango`, Khalid got 2 callbacks and placed 3.5
No result for 11) Amateur Collegiate Silver Smooth
In `2) Amateur Adult Gold Smooth`, Khalid got 2 callbacks and placed 2.0
In `1) Amateur Collegiate Gold Smooth`, Khalid got 1 callbacks and placed 1.0
In `6) Amateur Adult Novice Smooth`, Khalid got 2 callbacks and placed 3.0
No result for 23) Amateur Adult 