# Scraping workout description from the website
This notebook shows how to scrape and process a workout description from the website, and extract some useful features from it using the openAI API. 
## Demo using one workout

In [1]:
import requests
import json
from bs4 import BeautifulSoup
import sys
import os

sys.path.append('..')
from utils import scrape_workout_description, clean_description


In [2]:
# Add open AI API key in some parent folder and add to environment variable
with open('../../../openai-api-key.txt', 'r') as f:
    open_ai_api_key = f.read().strip()
os.environ['OPENAI_API_KEY'] = open_ai_api_key

In [3]:
# choose workout
name = 17.1

# get year and number
year = 2000 + int(name)
workout = round((name - int(name)) * 10)
print(year, workout)


2017 1


In [4]:
# scrape workout description from crossfit.com
description = scrape_workout_description(year, workout)
print(description)


Workout 17.1
For time:
10 dumbbell snatches
15 burpee box jump-overs
20 dumbbell snatches
15 burpee box jump-overs
30 dumbbell snatches
15 burpee box jump-overs
40 dumbbell snatches
15 burpee box jump-overs
50 dumbbell snatches
15 burpee box jump-overs
Men use 50-lb. dumbbell and 24-in. box
Time cap: 20 minutes



In [5]:
# clean description using openai
if description:
    cleaned_description = clean_description(name, description)
    cleaned_description_dic = eval(cleaned_description.replace('null','None'))
cleaned_description_dic

AttributeError: 'function' object has no attribute 'replace'

## Loop through mutliple workouts
Loop though multiple workouts and store details in 
`Data/workout_descriptions/open_parsed_descriptions.json` file. It may also be useful to store the raw (uncleaned descriptions) in a separate file.


In [13]:
with open("../../Data/workout_descriptions/open_parsed_descriptions.json", "r") as f:
    open_parsed_descriptions = json.load(f)
open_parsed_descriptions.keys()

dict_keys(['17.1'])

In [21]:
overwrite = False
for name in [17.1,17.2,17.3,17.4,17.5]:
    # prevent overwriting for the ones already saved
    if (not overwrite) and str(name) in open_parsed_descriptions:
        continue
    
    # get year and number
    year = 2000 + int(name)
    workout = round((name - int(name)) * 10)
    print(year, workout)

    # scrape workout description from crossfit.com
    description = scrape_workout_description(year, workout)

    # clean description using openai
    if description:
        cleaned_description = clean_description(name, description)
        cleaned_description_json = json.loads(cleaned_description)
        for key, value in cleaned_description_json.items():
            cleaned_description_json[key] = value

        print(f"Successfully parsed workout {workout}!")
    else:
        print(f"Failed to extract workout {workout}!")
        
    
    


2017 2
Successfully parsed workout 2!
2017 3
Successfully parsed workout 3!
2017 4
Successfully parsed workout 4!
2017 5
Successfully parsed workout 5!


{'17.5': {'goal': 'for time',
  'time_cap': None,
  'total_reps': None,
  'description': '10 rounds for time of: 9 thrusters (95 lb), 35 double-unders'}}

In [23]:
# save to file
with open("../../Data/workout_descriptions/open_parsed_descriptions.json", "w") as f:
    json.dump(open_parsed_descriptions, f)