# scraping

> Scraping individual competition results and scoresheets from results.o2cm.com

In [2]:
#| default_exp scoring

In [3]:
#| export
# Exported (rather than only hidden) so the generated module also gets the names
# that the exported functions and their annotations rely on.
import requests
from bs4 import BeautifulSoup
from typing import List, Tuple, NamedTuple

In [14]:
# Request headers shared by the o2cm requests in this notebook; the User-Agent
# is a desktop Chrome browser string.
# NOTE(review): presumably the site rejects the default client User-Agent — confirm.
# NOTE(review): this cell carries no `#| export` directive, yet the exported
# `get_event_result` reads `headers` — verify the generated module defines it.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

def get_event_html(name: str) -> BeautifulSoup:
    """Fetch and parse a dancer's individual results page from o2cm.

    Args:
        name: Full name written as "First Last". Everything after the first
            run of whitespace is treated as the last name.

    Returns:
        The parsed results page.

    Raises:
        ValueError: If `name` does not contain both a first and a last name.
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    parts = name.strip().split(maxsplit=1)
    if len(parts) != 2:
        raise ValueError(f"Expected a name like 'First Last', got {name!r}")
    first, last = parts

    # Let requests build the query string so characters such as "&", "#" or
    # spaces inside a name are percent-encoded instead of corrupting the URL.
    page: requests.Response = requests.get(
        "https://results.o2cm.com/individual.asp",
        params={"szLast": last, "szFirst": first},
        headers=headers,
        timeout=30,  # requests waits indefinitely when no timeout is given
    )
    page.raise_for_status()
    return BeautifulSoup(page.content, "html.parser")

# Fetch one dancer's results page; later cells read this notebook-level `soup`.
soup = get_event_html("Sasha Hydrie")

In [15]:
# Show the parsed page as the cell output (the full HTML document).
soup


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">

<html xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
<meta content="noindex, nofollow" name="robots"/>
<head>
<link href="compdb.css" rel="stylesheet"/>
</head>
<body alink="#000000" bgcolor="#FFFFFF" class="t1n" link="#000000" text="#000000" vlink="#0000cc">
<div align="center" class="vs">
<table width="320">
<tr>
<td align="center">
<span class="h2">Sasha Hydrie</span></td></tr><tr><td class="t1n"><br/><b>04-01-22 - USA DANCE National DanceSport Championships</b></td></tr><tr><td class="t1n">   <a href="http://Results.o2cm.com/scoresheet3.asp?event=usa22&amp;heatid=4042284A">66) Amateur Adult Bronze Amer. Swing</a></td></tr><tr><td class="t1n">   <a href="http://Results.o2cm.com/scoresheet3.asp?event=usa22&amp;heatid=40422840">66) Amateur Adult Bronze Rhythm</a></td></tr><tr><td class="t1n">   <a href="http://Results.o2cm.com/scoresheet3.asp?event=usa22&amp;heatid=4032

In [20]:
#| export

def extract_events_from_html(
    soup: BeautifulSoup,
    competition: str = '03-22-24 - USA DANCE National DanceSport Championships',
) -> List[Tuple[str, str]]:
    """Collect the (event title, scoresheet URL) pairs listed under one competition.

    The page's <b> and <a> tags are scanned in document order: a <b> whose
    text equals `competition` starts the collection, every following <a> is
    recorded, and the next <b> with a different text ends it.

    Args:
        soup: Parsed individual-results page.
        competition: Heading text of the competition to extract, compared
            after stripping surrounding whitespace. Defaults to the heading
            this function was originally hard-coded to.

    Returns:
        (link text, href) tuples in page order; empty when the heading is
        not present on the page.
    """
    wanted = competition.strip()
    events: List[Tuple[str, str]] = []
    collecting = False

    for tag in soup.find_all(['b', 'a']):
        if tag.name == 'b':
            if tag.text.strip() == wanted:
                collecting = True
            elif collecting:
                # The next competition heading closes the section.
                break
        elif collecting:
            events.append((tag.text, tag.get('href')))

    return events

# Demo: list the events found on the notebook-level `soup`.
# NOTE(review): this call sits in an `#| export` cell, so nbdev will likely copy it
# into the generated module, where `soup` comes from a non-exported cell — consider
# moving the demo into its own cell.
events = extract_events_from_html(soup)
events

[('15) Amateur Adult Silver Intl. Tango',
  'http://Results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40423019'),
 ('21) Amateur Adult Bronze 1 Latin',
  'http://Results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40422B20'),
 ('30) Amateur Collegiate Silver Smooth',
  'http://Results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40323030'),
 ('15) Amateur Collegiate Silver Rhythm',
  'http://Results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40323040'),
 ('15) Amateur Collegiate Silver Amer. Bolero',
  'http://Results.o2cm.com/scoresheet3.asp?event=usa24&heatid=4032304B'),
 ('7) Amateur Adult Silver Rhythm',
  'http://Results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40423040'),
 ('5) Amateur Adult Silver Amer. Bolero',
  'http://Results.o2cm.com/scoresheet3.asp?event=usa24&heatid=4042304B'),
 ('13) Amateur Collegiate Silver Standard',
  'http://Results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40323010'),
 ('11) Amateur Adult Silver Standard',
  'http://Results.o2cm.com/scoreshe

In [39]:
#| export 

class Result(NamedTuple):
    """Outcome of one dancer in one event, as returned by `get_event_result`."""
    # Filled from `extract_max_callbacks` when the dancer is found on the final's page.
    # NOTE(review): presumably the number of rounds the dancer was called back through — confirm.
    callbacks: int
    # Averaged placement read from the final's results table; the type also allows None.
    placement: float | None

In [83]:
#| export

def extract_max_callbacks(soup: BeautifulSoup) -> int:
    """Return the number of <option> entries in the page's round selector, minus one.

    The selector is the <select> element with id "selCount". When the page
    has no such element the result is 0.
    """
    round_selector = soup.find('select', {'id': 'selCount'})
    if not round_selector:
        return 0
    round_options = round_selector.find_all('option')
    return len(round_options) - 1

def extract_placement(soup: BeautifulSoup, name: str) -> float | None: 
    """assumes that soup is a finals page"""
    couple_number = None
    for link in soup.find_all('a'):
        if link.text == name:
            parent_td = link.find_parent('td')  # Parent <td> which should have sibling with couple number
            if parent_td:
                # The immediate previous sibling <td> of `parent_td` contains the couple number
                prev_td = parent_td.find_previous_sibling("td", class_="t1b")
                if prev_td:
                    couple_number = prev_td.text.strip()
                    print(f"Found {couple_number} associated with {name}")
                    break
    else:
        print(f"Dancer {name} didn't final: {soup}")
        return None

    results_table = soup.find('table', class_='t1n') 
    for row in results_table.find_all('tr'):
        cells = row.find_all('td') 
        if cells and cells[0].get_text(strip=True) == couple_number:
            averaged_place = cells[-2].get_text(strip=True)  
            return float(averaged_place)

In [84]:

# Manual check of `extract_placement` against one heat's scoresheet page.
url = "https://results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40423019"
name = "Khalid Ali"

# Note: this rebinds the notebook-level `soup` that earlier cells also use.
initial_res = requests.get(url, headers=headers)
soup = BeautifulSoup(initial_res.content, "html.parser")

# Last expression of the cell, so its return value is shown as the output.
extract_placement(soup, name)


Found 528 associated with Khalid Ali


3.5

In [85]:
#| export

def get_event_result(name: str, url: str) -> Result:
    """Look up one dancer's result for the heat behind a scoresheet URL.

    Downloads the page at `url` and, when the dancer is listed there with a
    placement, returns that placement together with the value from
    `extract_max_callbacks` for the same page.

    Args:
        name: Dancer name as it appears in the scoresheet's link text.
        url: Scoresheet URL, including its `event` and `heatid` query parameters.

    Returns:
        The `Result` for a dancer found on the page at `url`.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within the timeout.
        NotImplementedError: If no placement is found on that page; looking
            the dancer up in earlier rounds is not implemented yet.
    """
    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        session.headers.update(headers)
        initial_res = session.get(url, timeout=30)
        initial_res.raise_for_status()
        soup = BeautifulSoup(initial_res.content, "html.parser")

    if (place := extract_placement(soup, name)) is not None:
        return Result(callbacks=extract_max_callbacks(soup), placement=place)

    # TODO: search the earlier rounds. The planned approach is to POST the URL's
    # `event` and `heatid` values plus a `selCount` field to
    # https://results.o2cm.com/scoresheet3.asp, e.g.
    #     {"heatid": "40423019", "event": "usa24", "selCount": "1"}
    # where selCount is 0: final, 1: semi final, 2: quarter final, etc.
    # No request is sent here until its response is actually used.
    raise NotImplementedError(
        f"{name!r} has no placement on the final's page; "
        "looking up earlier rounds is not implemented yet"
    )

# Demo call against a live scoresheet.
# NOTE(review): this sits in an `#| export` cell, so nbdev will likely copy it into
# the generated module and run a network request on import — consider moving the
# demo into its own cell.
get_event_result("Khalid Ali", "https://results.o2cm.com/scoresheet3.asp?event=usa24&heatid=40423019")
    


Found 528 associated with Khalid Ali


Result(callbacks=2, placement=3.5)

In [28]:

# Experiment: request a non-final round of a heat by POSTing the scoresheet form.
# This rebinds the notebook-level `url`, `data` and `soup`.
url = "https://results.o2cm.com/scoresheet3.asp"
data = {
    "heatid": "40423019",  # Hidden input for the specific heat
    "event": "usa24",      # Hidden input for the specific event
    "selCount": "1"        # Dropdown selection for Semi-Final
}

# Send the POST request (unlike the other requests in this notebook, no `headers` are passed)
response = requests.post(url, data=data)

# Process the response with BeautifulSoup or another tool if needed
soup = BeautifulSoup(response.content, 'html.parser')


In [34]:
# Print the whole parsed response page as plain text.
print(soup)


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
	"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">

<html xml:lang="en" xmlns="http://www.w3.org/1999/xhtml">
<meta content="noindex, nofollow" name="robots"/>
<head>
<link href="compdb.css" rel="stylesheet"/>
<style type="text/css">
	    .h1 {color:#000000}
	    .h2 {color:#000000}
	    .h4 {color:#000000}
	    .h5n {color:#000000}
	    .h5b {color:#000000}
	    .t1n {color:#000000}
	    .t1b {color:#000000}
	    .t2n {color:#000000}
	    .t2b {color:#000000}
    </style>
<script type="text/javascript">
			function MM_findObj(name) { 
				var i,obj=document[name];
				if (!obj && document.all)
					obj=document.all[name];
				if (!obj && document.getElementById)
					obj = document.getElementById(name);
				if (!obj && document.layers)
					obj=document.layers[name];
				if (!obj && document.forms)
					for (i=0;!obj&&i<document.forms.length;i++) 
						obj=document.forms[i][name];
				return obj;
			}
			
			function MyWind

In [37]:
# Check whether a dancer's name appears anywhere in the text of the current `soup`.
name = "Carmen Schultz"
if name not in soup.get_text():
    print(f"{name} is not in the results")
else:
    print(f"{name} is in the results")

Carmen Schultz is in the results


In [None]:
#| hide
import nbdev; nbdev.nbdev_export()