In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Load the input CSV and collect the page URLs to scrape.
input_csv_path = 'All_Occupations.csv'
df_input = pd.read_csv(input_csv_path)
# Blank cells in the URL column are read as NaN (a float); drop them so that
# only real URL values are handed to the scraper.
urls = df_input['URL'].dropna().tolist()

In [None]:
# Helpers and the single entry point used to scrape one section of a page per call.

# Table-shaped sections that are requested through ``special_scrape``. Each entry is
# (CSS selector of the table, data-title of the value cell,
#  data-title of the title/description cell, output column names).
_SPECIAL_TABLE_SECTIONS = {
    'work_activities': (
        '#WorkActivities > div > table',
        'Importance',
        'Work Activity',
        ['Importance', 'Work Activity', 'Work Activity Description'],
    ),
    'work_context': (
        '#WorkContext > div > table',
        'Context',
        'Work Context',
        ['Context', 'Work Context', 'Work Context Description'],
    ),
}


def _first_text(soup, selector, strip_each=True):
    """Return the text of the first element matching ``selector``, or None if nothing matches.

    With ``strip_each=True`` the text is read with ``get_text(strip=True)``;
    otherwise the raw ``.text`` is taken and only its outer whitespace is trimmed.
    """
    node = soup.select_one(selector)
    if node is None:
        return None
    if strip_each:
        return node.get_text(strip=True)
    return node.text.strip()


def _remove_title(full_text, title):
    """Remove the first occurrence of ``title`` from ``full_text`` and trim the em-dash separator."""
    # count=1: the title word may legitimately appear again inside the
    # description, and those later occurrences must be kept.
    return full_text.replace(title, '', 1).strip('— ')


def _table_rows(container, url, data_title_main, data_title_desc, output_columns):
    """Build one record per ``<tr>`` under ``container`` that has both expected cells."""
    records = []
    for row in container.find_all('tr'):
        value_cell = row.find('td', {'data-title': data_title_main})
        text_cell = row.find('td', {'data-title': data_title_desc})
        if value_cell is None or text_cell is None:
            print(f"Missing elements in URL: {url}")
            continue
        bold = text_cell.find('b')
        title = bold.get_text(strip=True) if bold is not None else ''
        records.append({
            'URL': url,
            output_columns[0]: value_cell.get_text(strip=True),
            output_columns[1]: title,
            output_columns[2]: _remove_title(text_cell.get_text(strip=True), title),
        })
    return records


def scrape_onet_page(url, section_id=None, data_title_main=None, data_title_desc=None, output_columns=None, special_scrape=False, timeout=30):
    """Download ``url`` and extract the records of one section of the page.

    Two modes are supported:

    * Table mode - used when ``section_id``, ``data_title_main``,
      ``data_title_desc`` and ``output_columns`` are all given. ``section_id``
      is a CSS selector; every ``<tr>`` below the selected element yields a
      record with the keys ``'URL'`` and the three names in ``output_columns``
      (value, title, description).
    * Special mode - used otherwise when ``special_scrape`` is true.
      ``section_id`` is then one of ``'occupation_details'``,
      ``'technology_skills'``, ``'tools'``, ``'work_activities'`` or
      ``'work_context'``.

    Args:
        url: Address of the page to download.
        section_id: CSS selector (table mode) or section name (special mode).
        data_title_main: ``data-title`` of the cell holding the value.
        data_title_desc: ``data-title`` of the cell holding title and description.
        output_columns: Three column names for value, title and description.
        special_scrape: Enables the special mode.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            block the run forever.

    Returns:
        A list of dicts, one per extracted record; empty when the section is
        absent or when no mode applies. Rows or list items lacking an expected
        element are reported with a printed message and skipped.

    Note:
        Exceptions raised by ``requests.get`` are not caught here. The URL key
        is ``'URL'`` for most sections but ``'url'`` for
        ``'occupation_details'`` and ``'tools'``; that casing is kept as-is
        because it determines the column names of the records.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    data = []

    # General scraping for sections with tables
    if section_id and data_title_main and data_title_desc and output_columns:
        table_section = soup.select_one(section_id)
        if table_section is not None:
            data.extend(_table_rows(table_section, url, data_title_main, data_title_desc, output_columns))
    # Special cases like tools, work activities, work context, etc.
    elif special_scrape:
        if section_id == 'occupation_details':
            data.append({
                'url': url,
                'occupation': _first_text(soup, '#content > h1 > span:nth-child(1)'),
                'soc_code': _first_text(soup, '#content > h1 > span:nth-child(2) > div > div:nth-child(1)'),
                'occupation_description': _first_text(soup, '#content > p:nth-of-type(1)', strip_each=False),
                'sample_of_reported_job_titles': _first_text(soup, '#content > p:nth-of-type(2)', strip_each=False),
            })
        elif section_id == 'technology_skills':
            skills_list = soup.select_one('#TechnologySkills > div > ul')
            if skills_list is not None:
                for item in skills_list.find_all('li'):
                    bold = item.find('b')
                    category = bold.get_text(strip=True) if bold is not None else None
                    example_text = item.get_text(strip=True)
                    if category:
                        # Drop the leading category and its separator, the same
                        # way the table sections do, so the first example is clean.
                        example_text = _remove_title(example_text, category)
                    for example in example_text.split(';'):
                        example = example.strip()
                        if example:
                            data.append({
                                'URL': url,
                                'Category': category,
                                'Example': example
                            })
        elif section_id == 'tools':
            tools_list = soup.select_one('#ToolsUsed ul')
            if tools_list is not None:
                for item in tools_list.find_all('li'):
                    cell = item.select_one('div.order-2.flex-grow-1')
                    if cell is None:
                        print(f"Missing elements in URL: {url}")
                        continue
                    # partition() leaves the example empty when there is no em-dash.
                    category, _, example = cell.get_text(strip=True).partition("—")
                    data.append({'Category': category.strip(), 'Example': example.strip(), 'url': url})
        elif section_id in _SPECIAL_TABLE_SECTIONS:
            selector, main_title, desc_title, columns = _SPECIAL_TABLE_SECTIONS[section_id]
            table = soup.select_one(selector)
            if table is not None:
                body = table.find('tbody')
                # Fall back to the table itself when the markup has no <tbody>.
                container = body if body is not None else table
                data.extend(_table_rows(container, url, main_title, desc_title, columns))

    return data

In [None]:
# Gather the occupation-details record of every page, then save them as one CSV.
occupation_data = []
for url in urls:
    occupation_data += scrape_onet_page(url, special_scrape=True, section_id='occupation_details')
pd.DataFrame(data=occupation_data).to_csv(path_or_buf='occupation_details.csv', index=False)

In [None]:
# Gather the technology-skills records of every page, then save them as one CSV.
tech_skills_data = []
for url in urls:
    tech_skills_data += scrape_onet_page(url, special_scrape=True, section_id='technology_skills')
pd.DataFrame(data=tech_skills_data).to_csv(path_or_buf='technology_skills.csv', index=False)

In [None]:
# Gather the tools-used records of every page, then save them as one CSV.
tools_data = []
for url in urls:
    tools_data += scrape_onet_page(url, special_scrape=True, section_id='tools')
pd.DataFrame(data=tools_data).to_csv(path_or_buf='tools_used.csv', index=False)

In [None]:
# Gather the work-activity records of every page, then save them as one CSV.
work_activities_data = []
for url in urls:
    work_activities_data += scrape_onet_page(url, special_scrape=True, section_id='work_activities')
pd.DataFrame(data=work_activities_data).to_csv(path_or_buf='work_activities.csv', index=False)

In [None]:
# Gather the work-context records of every page, then save them as one CSV.
work_context_data = []
for url in urls:
    work_context_data += scrape_onet_page(url, special_scrape=True, section_id='work_context')
pd.DataFrame(data=work_context_data).to_csv(path_or_buf='work_context.csv', index=False)

In [None]:
# Table-based sections. Per entry: 'id' is the CSS selector handed to the scraper,
# 'main' / 'desc' are the data-title values of the value cell and of the
# title/description cell, 'output' names the three CSV columns, 'file' is the CSV path.
sections = [
    dict(
        id='#Abilities > div > table > tbody',
        main='Importance',
        desc='Ability',
        output=['Importance', 'Ability', 'Ability Description'],
        file='abilities.csv',
    ),
    dict(
        id='#Interests > table > tbody',
        main='Occupational Interest',
        desc='Interest',
        output=['Importance', 'Interest', 'Interest Description'],
        file='interests.csv',
    ),
    dict(
        id='#WorkValues > table > tbody',
        main='Extent',
        desc='Work Value',
        output=['Extent', 'Work Value', 'Work Value Description'],
        file='work_values.csv',
    ),
    dict(
        id='#WorkStyles > div > table > tbody',
        main='Importance',
        desc='Work Style',
        output=['Importance', 'Work Style', 'Work Style Description'],
        file='work_styles.csv',
    ),
    dict(
        id='#Knowledge > div > table > tbody',
        main='Importance',
        desc='Knowledge',
        output=['Importance', 'Knowledge', 'Knowledge Description'],
        file='knowledge.csv',
    ),
    dict(
        id='#Skills > div > table > tbody',
        main='Importance',
        desc='Skill',
        output=['Importance', 'Skill', 'Skill Description'],
        file='skills.csv',
    ),
]

In [None]:
# Run the table scraper once per configured section and write one CSV per section.
for section in sections:
    section_data = []
    for url in urls:
        section_data += scrape_onet_page(
            url,
            section_id=section['id'],
            data_title_main=section['main'],
            data_title_desc=section['desc'],
            output_columns=section['output'],
        )
    pd.DataFrame(data=section_data).to_csv(path_or_buf=section['file'], index=False)
    print(f"Data scraped and saved to '{section['file']}'")