### Imports

In [231]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

### Data Scraping (Labels)

In [232]:
# Browser-like request headers; get_soup sends these with every request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

def get_soup(url, timeout=30):
    """Download ``url`` and parse the response body into a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a stalled connection would block the notebook
        indefinitely.

    Returns
    -------
    BeautifulSoup
        The response content parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.RequestException
        If the request cannot be completed or times out.

    Notes
    -----
    The HTTP status code is not checked: an error page is parsed like any
    other page, and callers see it as "expected element not found".
    """
    # `headers` is the module-level dict of browser-like request headers.
    res = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(res.content, 'html.parser')

In [233]:
# Download and parse the index page once; `soup` is the tree that the
# CSS-selector cells below query.
url = 'https://www.bls.gov/iag/tgs/iag_index_naics.htm'
soup = get_soup(url)

The format of this website is unlikely to change, so we can simply use the default CSS selectors.

In [234]:
# Sector headings: the <li> children of the 6th and 9th child lists of #bodytext.
# NOTE(review): the saved output lists "Natural Resources and Mining" first,
# which suggests nth-child(6) is the goods-producing list and the two
# variable names are swapped -- confirm against the live page.
service_sector_names, goods_sectors_names = (
    soup.select(selector)
    for selector in (
        '#bodytext > ul:nth-child(6) > li',
        '#bodytext > ul:nth-child(9) > li',
    )
)

# One flat list of headings, in page order.
sector_names = [*service_sector_names, *goods_sectors_names]

In [235]:
# Industry lists per sector: the nested <ul> children of the same two lists
# that hold the sector headings.
service_sector_data, goods_sector_data = (
    soup.select(selector)
    for selector in (
        '#bodytext > ul:nth-child(6) > ul',
        '#bodytext > ul:nth-child(9) > ul',
    )
)

# One flat list of industry lists, in page order.
sector_data = [*service_sector_data, *goods_sector_data]

In [236]:
# Sanity-check the scrape before pairing headings with their industry lists.
# An equality check alone passes when both selectors match nothing (0 == 0),
# which would silently produce an empty dataset if the page layout changed.
assert sector_names, "No sector headings matched; the page layout may have changed"
assert len(sector_names) == len(sector_data), (
    f"Found {len(sector_names)} sector headings but {len(sector_data)} industry lists"
)

There is no particular need to further segment the data into sub-industries;<br>
we can simply use the flattened format.

In [237]:
# Empty frame that fixes the column layout of the scraped dataset.
df = pd.DataFrame(
    columns=[
        'Sector',
        'Industry',
        'Industry_URL',
        'Industry_description',
    ]
)

In [238]:
# Flatten the page into one record per industry: (sector, industry, url, '').
# Records are collected first and the frame is built in a single step instead
# of enlarging it one row at a time with df.loc[i].
records = []
for sector_name, sector_datum in zip(sector_names, sector_data):
    sector_text = sector_name.get_text().strip()

    # Get all industries from a sector
    for industry in sector_datum.select('li'):
        industry_text = industry.get_text().strip()

        # Not all industry elements contain a hyperlink. Those rows get the
        # 'None' placeholder; previously the fallback assigned `sector_url`,
        # so a link-less row silently reused the preceding row's URL (or
        # raised NameError if it was the very first row).
        link = industry.select_one('a')
        href = link.get('href') if link is not None else None
        industry_url = ('https://www.bls.gov/iag/tgs/' + href) if href else 'None'

        # The description column is filled in by the scraping cell further down.
        records.append((sector_text, industry_text, industry_url, ''))

# Rebuilding the frame from scratch keeps this cell idempotent on re-run.
df = pd.DataFrame(
    records,
    columns=['Sector', 'Industry', 'Industry_URL', 'Industry_description'],
)

df

Unnamed: 0,Sector,Industry,Industry_URL,Industry_description
0,Natural Resources and Mining,"Agriculture, Forestry, Fishing and Hunting (NA...",https://www.bls.gov/iag/tgs/iag11.htm,
1,Natural Resources and Mining,Crop Production (NAICS 111),https://www.bls.gov/iag/tgs/iag111.htm,
2,Natural Resources and Mining,Animal Production (NAICS 112),https://www.bls.gov/iag/tgs/iag112.htm,
3,Natural Resources and Mining,Forestry and Logging (NAICS 113),https://www.bls.gov/iag/tgs/iag113.htm,
4,Natural Resources and Mining,"Fishing, Hunting and Trapping (NAICS 114)",https://www.bls.gov/iag/tgs/iag114.htm,
...,...,...,...,...
102,Other Services (except Public Administration),Other Services (except Public Administration) ...,https://www.bls.gov/iag/tgs/iag81.htm,
103,Other Services (except Public Administration),Repair and Maintenance (NAICS 811),https://www.bls.gov/iag/tgs/iag811.htm,
104,Other Services (except Public Administration),Personal and Laundry Services (NAICS 812),https://www.bls.gov/iag/tgs/iag812.htm,
105,Other Services (except Public Administration),"Religious, Grantmaking, Civic, Professional, a...",https://www.bls.gov/iag/tgs/iag813.htm,


### Data Scraping (Descriptions)

In [None]:
# Footer text that trails the definition block on industry pages.
NAICS_FOOTER = 'North American Industry Classification System'

# Fetch each industry page and store its definition text in `df`.
# Values are written with df.at: the row objects yielded by iterrows() may be
# copies, so assigning to them does not reliably update the frame.
for idx, industry, industry_url in zip(df.index, df['Industry'], df['Industry_URL']):
    print(f"Scraping {industry}")

    # Rows without a hyperlink carry the 'None' placeholder: nothing to fetch.
    if industry_url == 'None':
        df.at[idx, 'Industry_description'] = 'None'
        continue

    # Scrape description
    page = get_soup(industry_url)
    definition = page.select_one('#bodytext > div.iag-def')
    if definition is None:
        # Page has no definition block; keep the same placeholder as above.
        df.at[idx, 'Industry_description'] = 'None'
        continue

    # Clean the text. The footer is removed as an exact suffix; str.rstrip
    # with that string would treat it as a *set of characters* and also eat
    # trailing letters of the real description (and turn 'None' into '').
    description = definition.get_text().strip()
    if description.endswith(NAICS_FOOTER):
        description = description[:-len(NAICS_FOOTER)].rstrip()

    # Store the cleaned description on the frame itself
    df.at[idx, 'Industry_description'] = description

In [243]:
# Persist the scraped table; the row index is written as the first, unnamed column.
df.to_csv('data.csv', index=True)