In [None]:

from helpers import retrieve_benchmark_info
from tqdm import tqdm
import pandas as pd

In [None]:
import wod_predictor
from wod_predictor.data_loader import DataLoader


# Resolve the "Data" directory by swapping the package folder name in the
# installed package path. Only the LAST "wod_predictor" component is swapped:
# str.replace would rewrite every occurrence, so a checkout living at
# ".../wod_predictor/wod_predictor" would wrongly resolve to ".../Data/Data".
package_dir = wod_predictor.__path__[0]
data_path = "Data".join(package_dir.rsplit("wod_predictor", 1))

# Load the four datasets this notebook relies on.
loader = DataLoader(
    root_path=data_path,
    objects=['open_results', 'descriptions', 'benchmark_stats', 'athlete_info'],
)
data = loader.load()

In [3]:
# Every athlete present in the athlete_info table is a scraping candidate.
all_ids = data['athlete_info'].index

# Progress stored on disk: rows scraped so far and the IDs recorded as failures.
newly_scraped = pd.read_csv("benchmark_stats_scraped.csv")
old_errors = pd.read_csv("benchmark_stats_errors.csv")['0']

# An ID counts as done when it appears either in the loaded benchmark_stats
# table or in the scraped CSV; whatever remains still has to be fetched.
ids_in_benchmark_stats = data['benchmark_stats'].index
already_scraped = ids_in_benchmark_stats.union(newly_scraped['athlete_id'])
id_not_scraped = all_ids.difference(already_scraped)

In [4]:
# Prioritise among the IDs not scraped yet: leave out those already recorded
# as errors so they are not retried.
less_errors = id_not_scraped.difference(old_errors)

# Disabled alternative ordering: rows with the most non-null open_results
# entries first.
# ids_to_scrape = data['open_results'].loc[id_not_scraped].notna().sum(axis=1).sort_values(ascending=False).index

# Look the remaining IDs up in open_results and keep the resulting index.
open_results_to_scrape = data['open_results'].loc[less_errors]
ids_to_scrape = open_results_to_scrape.index

In [None]:
# Temporary inspection cell (manual testing only): display the IDs queued
# for scraping.
ids_to_scrape

In [6]:
import concurrent.futures

def scrape_athletes_in_parallel(athlete_ids):
    """Scrape benchmark info for several athletes concurrently.

    Each ID is submitted to a thread pool that calls
    ``retrieve_benchmark_info``.

    Args:
        athlete_ids: Iterable of athlete IDs to fetch.

    Returns:
        A list of ``(athlete_id, outcome)`` tuples in completion order (not
        input order). ``outcome`` is the value returned by
        ``retrieve_benchmark_info`` or, if the call raised, the exception
        instance, mirroring ``scrape_results_serial``. Recording failures
        lets ``save_results`` persist the failed IDs instead of silently
        losing them.
    """
    results = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Map each future back to the ID it was submitted for.
        future_to_id = {
            executor.submit(retrieve_benchmark_info, athlete_id): athlete_id
            for athlete_id in athlete_ids
        }

        for future in concurrent.futures.as_completed(future_to_id):
            athlete_id = future_to_id[future]
            try:
                result = future.result()
            except Exception as e:
                print(f"Error fetching data for athlete {athlete_id}: {e}")
                # Keep the failure so the caller can record the ID as an error.
                results.append((athlete_id, e))
            else:
                results.append((athlete_id, result))

    return results

def scrape_results_serial(athlete_ids):
    """Scrape benchmark info for the given athletes one at a time.

    Args:
        athlete_ids: Iterable of athlete IDs to fetch.

    Returns:
        A list of ``(athlete_id, outcome)`` tuples in input order, where
        ``outcome`` is the value returned by ``retrieve_benchmark_info`` or
        the exception it raised.
    """
    def _fetch_or_error(athlete_id):
        # A failure is reported and returned as a value so that one bad ID
        # does not stop the remaining ones.
        try:
            return retrieve_benchmark_info(athlete_id)
        except Exception as e:
            print(f"Error fetching data for athlete {athlete_id}: {e}")
            return e

    return [(athlete_id, _fetch_or_error(athlete_id)) for athlete_id in athlete_ids]

def save_results(results, scraped_path="benchmark_stats_scraped.csv", errors_path="benchmark_stats_errors.csv"):
    """Merge a batch of scrape outcomes into the on-disk CSV files.

    Args:
        results: List of ``(athlete_id, outcome)`` tuples. A ``dict`` outcome
            is a successfully scraped row; anything else marks the athlete
            ID as an error.
        scraped_path: CSV of scraped rows. It must already exist and contain
            an ``athlete_id`` column.
        errors_path: CSV of failed athlete IDs. It must already exist with a
            single column named ``0``.

    Side effects:
        Rewrites both CSV files and prints a short summary. When an athlete
        appears in both the new and the stored rows, the new row wins.
    """
    already_scraped = pd.read_csv(scraped_path)
    new_rows = [item[1] for item in results if isinstance(item[1], dict)]
    newly_scraped = pd.DataFrame(new_rows)

    # Skip the concat when nothing new arrived: concatenating an empty,
    # column-less frame is pointless and can alter dtypes or emit warnings.
    if new_rows:
        all_scraped = pd.concat([newly_scraped, already_scraped])
    else:
        all_scraped = already_scraped
    if all_scraped.duplicated(subset=['athlete_id']).any():
        print("Warning: Duplicates found in scraped data")
        # New rows come first in the concat, so keep="first" (the default)
        # prefers freshly scraped data over the stored copy.
        all_scraped = all_scraped.drop_duplicates(subset=['athlete_id'])

    old_errors = pd.read_csv(errors_path)['0']
    new_error_ids = [item[0] for item in results if not isinstance(item[1], dict)]
    if new_error_ids:
        errors = pd.concat([pd.Series(new_error_ids), old_errors], ignore_index=True)
    else:
        errors = old_errors
    # De-duplicate so that saving the same failures twice does not grow the
    # file, and name the series '0' explicitly because that is the header the
    # next read of this file looks up.
    errors = errors.drop_duplicates().rename('0')

    # print how many errors and how many successes we have
    print(f"Errors: {len(new_error_ids)}")
    print(f"Successes: {newly_scraped.shape[0]}")
    print(f"Total scraped: {all_scraped.shape[0]}")

    all_scraped.to_csv(scraped_path, index=False)
    errors.to_csv(errors_path, index=False)

In [None]:
BATCH_SIZE = 4
# Checkpoint cadence counted in batches, so it no longer depends on
# BATCH_SIZE dividing 100. 25 batches of 4 IDs keeps the previous
# "every 100 IDs" rhythm, including a save right after the first batch.
SAVE_EVERY_N_BATCHES = 25

batch_results = []
for batch_number, i in enumerate(range(0, len(ids_to_scrape), BATCH_SIZE)):
    batch = ids_to_scrape[i:i+BATCH_SIZE]

    batch_results += scrape_athletes_in_parallel(batch)
    if batch_number % SAVE_EVERY_N_BATCHES == 0:
        save_results(batch_results)
        batch_results = []

# Flush whatever was gathered after the last checkpoint; without this the
# tail of the run would be scraped but never written to disk.
if batch_results:
    save_results(batch_results)
    batch_results = []
        