In [30]:

from helpers import retrieve_benchmark_info
from tqdm import tqdm
import pandas as pd


In [None]:
import wod_predictor
from wod_predictor.data_loader import DataLoader


# Derive the data directory from the installed package location by swapping the
# package directory name for "Data" (presumably the data lives in a sibling
# "Data" folder -- confirm against the repository layout).
# Only the LAST occurrence of "wod_predictor" is swapped: a plain str.replace would
# also rewrite a parent directory that happens to carry the same name
# (e.g. ".../wod_predictor/wod_predictor" -> ".../Data/Data").
package_dir = wod_predictor.__path__[0]
path_prefix, matched_name, path_suffix = package_dir.rpartition("wod_predictor")
# When the name is not found at all, leave the path unchanged, as str.replace would.
data_path = path_prefix + "Data" + path_suffix if matched_name else package_dir

loader = DataLoader(
    root_path=data_path,
    objects=['open_results', 'descriptions', 'benchmark_stats', 'athlete_info'],
)
data = loader.load()

In [32]:
# Rows saved by earlier runs of this notebook, and the ids recorded as errors
# (stored under the column "0" of the errors file).
newly_scraped = pd.read_csv("benchmark_stats_scraped.csv")
old_errors = pd.read_csv("benchmark_stats_errors.csv")['0']

# Every known athlete id, taken from the index of the athlete_info table.
all_ids = data['athlete_info'].index

# An id counts as scraped when it is in the loaded benchmark_stats index or in the
# previously saved CSV; whatever remains still has to be fetched.
previously_saved_ids = newly_scraped['athlete_id']
already_scraped = data['benchmark_stats'].index.union(previously_saved_ids)
id_not_scraped = all_ids.difference(already_scraped)

In [33]:
# Find some priorities between the ones that have not been scraped.
# Ids that never produced an error in an earlier run.
less_errors = id_not_scraped.difference(old_errors)

# Rank every unscraped id by how many non-null entries it has in open_results
# (most entries first).
open_results_count = data['open_results'].loc[id_not_scraped].notna().sum(axis=1)
ranked_ids = open_results_count.sort_values(ascending=False).index

# Put ids without a recorded error ahead of the known failures, keeping the ranking
# inside each group. Previously `less_errors` was computed but never used, so ids
# that keep failing stayed at the top and were picked again on every run.
never_failed = ranked_ids.isin(less_errors)
ids_to_scrape = ranked_ids[never_failed].append(ranked_ids[~never_failed])

In [34]:
import concurrent.futures

# Function to scrape multiple athlete IDs in parallel
def scrape_athletes_in_parallel(athlete_ids, max_workers=None):
    """Call ``retrieve_benchmark_info`` for every athlete id using a thread pool.

    Parameters
    ----------
    athlete_ids : iterable
        Ids to fetch; each one is passed as the single argument of
        ``retrieve_benchmark_info``.
    max_workers : int or None, optional
        Upper bound on the number of worker threads. ``None`` (the default)
        keeps the ``ThreadPoolExecutor`` default, i.e. the previous behaviour.

    Returns
    -------
    list of tuple
        ``(athlete_id, result)`` pairs in completion order, which is not
        necessarily the input order. Ids whose call raised an exception are
        reported with ``print`` and left out of the list.
    """
    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to the id it was submitted for, so the id is still
        # known when futures complete out of order.
        future_to_id = {
            executor.submit(retrieve_benchmark_info, athlete_id): athlete_id
            for athlete_id in athlete_ids
        }

        for future in concurrent.futures.as_completed(future_to_id):
            athlete_id = future_to_id[future]
            try:
                # future.result() re-raises any exception raised inside the worker.
                results.append((athlete_id, future.result()))
            except Exception as e:
                print(f"Error fetching data for athlete {athlete_id}: {e}")

    return results

In [None]:
# Scrape one batch per run: the first 100 entries of ids_to_scrape.
batch_ids = ids_to_scrape[:100]
benchmark_stats = scrape_athletes_in_parallel(batch_ids)

In [None]:
# Inspect the raw (athlete_id, result) pairs returned by the scraper.
benchmark_stats

In [41]:
# Keep only the results that came back as dicts; every other result is treated
# as an error by the cell that writes benchmark_stats_errors.csv.
scraped_records = [result for _id, result in benchmark_stats if isinstance(result, dict)]
fresh_rows = pd.DataFrame(scraped_records)

# Previously saved rows go first: drop_duplicates keeps the first row per
# athlete_id, so an already stored row wins over a freshly scraped one.
benchmark_stats_df = pd.concat([newly_scraped, fresh_rows]).drop_duplicates(subset=['athlete_id'])
benchmark_stats_df.to_csv("benchmark_stats_scraped.csv", index=False)

In [42]:
# Ids whose scrape returned something other than a dict are recorded as errors.
new_error_ids = [athlete_id for athlete_id, result in benchmark_stats if not isinstance(result, dict)]

# Merge with the ids recorded by earlier runs (new ones first, as before) and drop
# repeats, so retrying the same failing id does not grow the file on every run.
# Building the Series from one combined list also avoids concatenating an empty,
# differently typed Series when this batch produced no errors.
# The name '0' keeps the CSV header that the loading cell reads back.
errors = pd.Series(new_error_ids + old_errors.tolist(), name='0').drop_duplicates()
errors.to_csv("benchmark_stats_errors.csv", index=False)

In [None]:
# Show the combined table that was written to benchmark_stats_scraped.csv.
benchmark_stats_df