In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import urllib.parse
from copy import copy

# Scrape index


In [2]:
# Define some functions to help scrape the index

def get_osha_url(query, start_date_str, end_date_str, results_per_page=1000):
    """Build the OSHA establishment-search URL for one query.

    Parameters
    ----------
    query : str
        Establishment name to search for. Whitespace-separated words are
        joined with '+' before the parameters are URL-encoded.
    start_date_str, end_date_str : str
        Bounds of the date range, in any form ``pd.to_datetime`` accepts
        (the notebook passes 'YYYY-MM-DD' strings). Only the month, day and
        year of each are sent.
    results_per_page : int, optional
        Value sent as ``p_show``. Defaults to 1000, the value that was
        previously hard-coded.

    Returns
    -------
    str
        The base search URL followed by '?' and the encoded query string.
    """
    start_date = pd.to_datetime(start_date_str)
    end_date = pd.to_datetime(end_date_str)

    base_url = "https://www.osha.gov/ords/imis/establishment.search"
    params = {
        # NOTE(review): urlencode() percent-encodes the '+' inserted here, so
        # the request carries e.g. 'Dollar%2BTree' (a literal plus sign), not
        # an encoded space. Left unchanged because the recorded runs returned
        # results with this form; confirm against the site before changing.
        'establishment': '+'.join(query.split()),
        'state': "all",
        'officetype': "all",
        'office': "all",
        'sitezip': 100000,
        'startmonth': start_date.month,
        'startday': start_date.day,
        'startyear': start_date.year,
        'endmonth': end_date.month,
        'endday': end_date.day,
        'endyear': end_date.year,
        'p_case': "all",
        'p_sort': 12,
        'p_desc': "DESC",
        'p_direction': "Prev",
        'p_show': results_per_page,
        'p_violations_exist': "yes"
    }

    return base_url + '?' + urlencode(params)

def get_index_table(osha_url):
    """Return the third table that ``pd.read_html`` parses from ``osha_url``."""
    parsed_tables = pd.read_html(osha_url)
    # The search results are taken from position 2 of the parsed tables.
    return parsed_tables[2]
    

In [3]:
# Each entry is an (establishment name, start date, end date) tuple; the
# scraping loop unpacks the three values in that order and passes them to
# get_osha_url.
queries = [
    ('Dollar Tree', '2012-12-19', '2022-12-19'),
    ('Dollar Tree', '2010-01-01', '2012-12-18'),
    # NOTE(review): the recorded results for this query are businesses such as
    # "Family Tree Care Professionals Llc" -- possibly 'Family Dollar' was
    # intended. Confirm before relying on these rows.
    ('Family Tree', '2016-01-01', '2022-12-19')
]

In [4]:
# Row count at which a query is assumed to have filled a whole results page.
# Expected to equal the p_show value requested by get_osha_url (1000).
RESULTS_PER_PAGE = 1000

# Per-query index tables, kept in a separate list so that `activities_df`
# only ever names the combined DataFrame.
index_tables = []
for query in queries:

    # break the query into 3 variables, and pass those to get_osha_url
    establishment, start, end = query
    osha_url = get_osha_url(establishment, start, end)
    print(osha_url)

    # get the index table from the osha url, drop the junk columns
    index_df_for_this_query = get_index_table(osha_url)
    index_df_for_this_query = index_df_for_this_query.drop(columns=['Unnamed: 0', '#'])

    # tack on the query parameters to the table
    index_df_for_this_query['query'] = establishment
    index_df_for_this_query['query_start'] = start
    index_df_for_this_query['query_end'] = end

    # print the row count, and flag the query explicitly when it fills a page
    # so a truncated result set is not missed by eye
    print(f"found {len(index_df_for_this_query)} results for {query}\n")
    if len(index_df_for_this_query) >= RESULTS_PER_PAGE:
        print(f"WARNING - {query} returned {RESULTS_PER_PAGE} or more rows; results may be truncated, narrow the date range\n")

    index_tables.append(index_df_for_this_query)

# combine the queries and save the index
activities_df = pd.concat(index_tables)
activities_df.to_csv('Dec22_activities_df.csv', index=False)
activities_df

https://www.osha.gov/ords/imis/establishment.search?establishment=Dollar%2BTree&state=all&officetype=all&office=all&sitezip=100000&startmonth=12&startday=19&startyear=2012&endmonth=12&endday=19&endyear=2022&p_case=all&p_sort=12&p_desc=DESC&p_direction=Prev&p_show=1000&p_violations_exist=yes
found 382 results for ('Dollar Tree', '2012-12-19', '2022-12-19')

https://www.osha.gov/ords/imis/establishment.search?establishment=Dollar%2BTree&state=all&officetype=all&office=all&sitezip=100000&startmonth=1&startday=1&startyear=2010&endmonth=12&endday=18&endyear=2012&p_case=all&p_sort=12&p_desc=DESC&p_direction=Prev&p_show=1000&p_violations_exist=yes
found 46 results for ('Dollar Tree', '2010-01-01', '2012-12-18')

https://www.osha.gov/ords/imis/establishment.search?establishment=Family%2BTree&state=all&officetype=all&office=all&sitezip=100000&startmonth=1&startday=1&startyear=2016&endmonth=12&endday=19&endyear=2022&p_case=all&p_sort=12&p_desc=DESC&p_direction=Prev&p_show=1000&p_violations_exist

Unnamed: 0,Activity,Opened,RID,St,Type,Sc,SIC,NAICS,Vio,Establishment Name,query,query_start,query_end
0,1.616331e+06,08/22/2022,453720,NC,Complaint,Partial,5331.0,452990,6,"152579 - Dollar Tree Stores, Inc.",Dollar Tree,2012-12-19,2022-12-19
1,1.618010e+06,08/10/2022,316400,WV,Complaint,Partial,,452319,1,"Dollar Tree Stores, Inc",Dollar Tree,2012-12-19,2022-12-19
2,1.612868e+06,08/05/2022,953210,NV,Referral,Partial,,452319,1,"Dollar Tree Stores, Inc. #1224",Dollar Tree,2012-12-19,2022-12-19
3,1.611611e+06,08/01/2022,1054111,OR,Complaint,Partial,5331.0,452990,5,317731000 - Dollar Tree Stores Inc,Dollar Tree,2012-12-19,2022-12-19
4,1.610668e+06,07/27/2022,1054111,OR,Complaint,Partial,5331.0,452990,1,317730993 - Dollar Tree Stores Inc,Dollar Tree,2012-12-19,2022-12-19
...,...,...,...,...,...,...,...,...,...,...,...,...,...
44,3.114835e+08,01/29/2010,155010,VT,Complaint,Partial,5331.0,452990,1,Dollar Tree Store #1742,Dollar Tree,2010-01-01,2012-12-18
45,3.134076e+08,01/29/2010,728500,MO,Complaint,Partial,5331.0,452990,1,Dollar Tree Stores Inc Dba Deals Stores,Dollar Tree,2010-01-01,2012-12-18
0,1.606471e+06,07/01/2022,1055330,WA,Referral,Partial,,561730,1,Wa317969445 - Family Tree Care Professionals Llc,Family Tree,2016-01-01,2022-12-19
1,1.569668e+06,12/17/2021,418800,FL,Fat/Cat,Complete,,112120,3,"Family Tree Enterprises Limited Partnership, Lllp",Family Tree,2016-01-01,2022-12-19


# Scrape individual activity pages

In [5]:
# Figure out how to scrape individual activity
def get_related_activity_table(url):
    """Return the first table at ``url`` whose text matches "Related Activity".

    ``pd.read_html`` raises when no table matches, so callers that treat the
    table as optional must handle that themselves.
    """
    return pd.read_html(url, match="Related Activity")[0]

def get_violation_summary_table(url):
    """Return the first table at ``url`` whose text matches "Violation Summary"."""
    matching_tables = pd.read_html(url, match="Violation Summary")
    first_match = matching_tables[0]
    return first_match

def get_violation_items_table(url):
    """Return the first table at ``url`` whose text matches "Violation Items"."""
    return pd.read_html(url, match="Violation Items")[0]

def get_investigated_inspection_table(url):
    """Return the first table at ``url`` whose text matches "Investigated Inspection"."""
    matching_tables = pd.read_html(url, match="Investigated Inspection")
    first_match = matching_tables[0]
    return first_match

def extract_key_value(tag):
    """Split the text of a "Key: value" tag into a ``(key, value)`` pair.

    Parameters
    ----------
    tag
        An element exposing ``.text`` and ``.find`` (e.g. a BeautifulSoup
        tag). Its text must contain a colon and it must contain a
        ``<strong>`` element.

    Returns
    -------
    tuple of (str, str)
        The text before the first colon and the text after it, each stripped
        of surrounding whitespace.

    Raises
    ------
    AssertionError
        If the text has no colon or the tag has no ``<strong>`` child.
    """
    assert ':' in tag.text, "tag text has no ':' separator"
    assert tag.find('strong'), "tag has no <strong> element"

    # Split on the first colon only, so a value that itself contains a colon
    # (a time, a URL, ...) is kept whole instead of being cut at the second one.
    key, _, value = tag.text.partition(':')
    return key.strip(), value.strip()

def get_main_container(inspection_url, timeout=30):
    """Download an inspection page and return its main content element.

    Parameters
    ----------
    inspection_url : str
        URL of the inspection detail page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the scraping loop indefinitely.

    Returns
    -------
    The element with ``id="maincontain"`` (header, footer, etc. excluded),
    or ``None`` if the page has no such element.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status, rather than letting an
        error page be parsed as if it were an inspection.
    """
    # get HTML from inspection page
    response = requests.get(inspection_url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # get main container (ignore footer/header/etc...)
    return soup.find(id="maincontain")

def has_investigation_summary(html_main_container):
    """Report whether any <h4> in the container reads "Investigation Summary".

    Surrounding whitespace in the heading text is ignored.
    """
    return any(
        heading.text.strip() == "Investigation Summary"
        for heading in html_main_container.find_all('h4')
    )

def get_details(html_main_container):
    """Scrape the "Key: value" detail fields of an inspection page into a dict.

    The case status is read from the ``div.well`` elements; every other field
    is read from the ``div.span4`` elements (one field per ``<p>`` when the
    div contains paragraphs, otherwise one field for the div itself).

    Side effect: the contents of every element following an
    "Investigation Summary" heading are cleared in ``html_main_container``
    itself, so call this after anything that still needs that section.

    Returns
    -------
    dict
        Field name -> value. If the case status cannot be read, the entry
        ``'Case Status': 'ERROR'`` is stored and a message is printed.

    Raises
    ------
    AssertionError
        From ``extract_key_value`` when a ``span4`` field is not in
        "Key: value" form.
    """
    # Empty out anything after the "Investigation Summary" h4;
    # that section is scraped by a separate function.
    # NOTE(review): this comparison is exact, whereas has_investigation_summary
    # strips whitespace first -- confirm whether this branch fires on real pages.
    for h4 in html_main_container.find_all('h4'):
        if h4.text == "Investigation Summary":
            for e in h4.find_all_next():
                e.clear()

    details = {}

    # get case status: exactly two equal "well" divs are expected
    html_wells = html_main_container.find_all("div", class_="well")
    try:
        assert len(html_wells) == 2
        assert html_wells[0] == html_wells[1]
        key, value = extract_key_value(html_wells[0])
    except Exception:
        # Best effort: record a placeholder and keep going. `Exception` (not a
        # bare except) so Ctrl-C / SystemExit still stop a long scrape.
        key = 'Case Status'
        value = 'ERROR'
        print("ERROR - couldn't scrape case status")
    details[key] = value

    # get remaining details
    for span in html_main_container.find_all("div", class_="span4"):
        html_p_tags = span.find_all('p')
        if len(html_p_tags) == 0:
            key, value = extract_key_value(span)
            details[key] = value
        else:
            for p_tag in html_p_tags:
                key, value = extract_key_value(p_tag)
                details[key] = value

    return details
    
def get_investigation_summary_details(html_main_container):
    """Scrape the fields of the "Investigation Summary" section into a dict.

    Works on a copy of ``html_main_container``, so the caller's tree is left
    untouched. In the copy, every previous sibling of the
    "Investigation Summary" <h4> is removed; the ``div.span4`` and ``<p>``
    elements that remain are then read.

    Previously the pruning was applied to a throw-away copy while the
    scraping read the original container, so nothing was actually pruned.

    Returns
    -------
    dict
        "Key: value" elements (text contains a colon and a ``<strong>``
        child) are stored under their key. Any other element is stored as
        free text under ``investigation_summary_notes_<n>``, numbered from 1
        in the order encountered.
    """
    summary_container = copy(html_main_container)

    # Drop everything that precedes the "Investigation Summary" heading; that
    # part of the page is scraped by get_details. The heading is compared
    # after stripping whitespace, matching has_investigation_summary.
    for h4 in summary_container.find_all('h4'):
        if h4.text.strip() == "Investigation Summary":
            for sibling in h4.find_previous_siblings():
                sibling.decompose()

    spans = summary_container.find_all('div', class_='span4')
    p_tags = summary_container.find_all('p')

    details = {}
    notes_columns = 0
    for tag in spans + p_tags:
        if ':' in tag.text and tag.find('strong'):
            key, value = extract_key_value(tag)
        else:
            # Not a labelled field: keep the raw text as a numbered note.
            notes_columns += 1
            key = f"investigation_summary_notes_{notes_columns}"
            value = tag.text
        details[key] = value

    return details


In [6]:
# Accumulators for everything scraped from the individual activity pages;
# reset here so re-running the cell starts from scratch.
violation_summary_tables = []
violation_items_tables = []
related_activity_tables = []
details_dictionaries = []
investigated_inspections_table = []

for index, row in activities_df.reset_index(drop=True).iterrows():
    # NOTE(review): the printed URLs show whole-number ids rendered with a
    # trailing '.0' (e.g. id=316590967.0), i.e. 'Activity' holds floats here.
    # The recorded run still scraped those pages; confirm the site accepts it.
    activity_code = row['Activity']
    url = f"https://www.osha.gov/ords/imis/establishment.inspection_detail?id={activity_code}"
    print(f"{index} of {len(activities_df)} - scraping {url}")

    violation_summary = get_violation_summary_table(url)
    violation_summary['activity_code'] = activity_code
    violation_summary_tables.append(violation_summary)

    violation_items = get_violation_items_table(url)
    violation_items['activity_code'] = activity_code
    violation_items_tables.append(violation_items)

    # The related-activity table is optional: report and move on when it is
    # missing. `Exception` (not a bare except) so that interrupting the kernel
    # still stops this long-running loop.
    try:
        related_activity = get_related_activity_table(url)
        related_activity['activity_code'] = activity_code
        related_activity_tables.append(related_activity)
    except Exception:
        print("ERROR - related activity table wasn't scraped")

    html = get_main_container(url)
    details = {}

    if has_investigation_summary(html):
        print("Has investigation summary")
        investigation_summary_details = get_investigation_summary_details(html)
        details.update(investigation_summary_details)

        investigated_inspections = get_investigated_inspection_table(url)
        investigated_inspections['activity_code'] = activity_code
        investigated_inspections_table.append(investigated_inspections)

    # get_details runs last: its values overwrite any same-named keys already
    # collected from the investigation summary.
    details.update(get_details(html))
    details['activity_code'] = activity_code
    details_dictionaries.append(details)


0 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1616331.015
1 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1618010.015
2 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1612868.015
3 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1611611.015
4 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1610668.015
5 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1609488.015
6 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1608368.015
7 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1608303.015
8 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1607656.015
9 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1606852.015
10 of 431 - scraping

82 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1476681.015
83 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1476491.015
84 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1475358.015
85 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1473721.015
86 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1470036.015
87 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1467899.015
88 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1468240.015
ERROR - related activity table wasn't scraped
89 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1464979.015
90 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1464924.015
91 of 431 - scraping https://www.osha.gov/ords/imis/establishme

164 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1389326.015
165 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1389880.015
166 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1386120.015
167 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1362039.015
168 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1379729.015
Has investigation summary
169 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1378126.015
170 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1374186.015
171 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1365813.015
172 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1366312.015
173 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspect

245 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1251587.015
246 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1244037.015
247 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1242835.015
ERROR - related activity table wasn't scraped
248 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1242205.015
249 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1260736.015
ERROR - related activity table wasn't scraped
250 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1259454.015
ERROR - related activity table wasn't scraped
251 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1230177.015
ERROR - related activity table wasn't scraped
252 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1229633.015
ERROR - related 

318 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1070970.015
319 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1070357.015
320 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1068869.015
321 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1079686.015
ERROR - related activity table wasn't scraped
322 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1065780.015
323 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1066179.015
324 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1060953.015
325 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1059353.015
326 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1059472.015
327 of 431 - scraping https://www.osha.gov/ords/imis/e

385 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=316590967.0
ERROR - couldn't scrape case status
386 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=315667832.0
ERROR - related activity table wasn't scraped
ERROR - couldn't scrape case status
387 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=316509975.0
ERROR - couldn't scrape case status
388 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=316234004.0
ERROR - related activity table wasn't scraped
ERROR - couldn't scrape case status
389 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=315936179.0
ERROR - related activity table wasn't scraped
ERROR - couldn't scrape case status
390 of 431 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=315936088.0
ERROR - related activity table wasn't scraped
ERROR - couldn't scrape case status


In [9]:
def _concat_tables(tables):
    """Concatenate scraped tables, or return an empty DataFrame if there are none."""
    # pd.concat raises ValueError on an empty list, which happens when no page
    # had the optional table (e.g. no inspection with an investigation summary).
    return pd.concat(tables) if tables else pd.DataFrame()


violation_summary_df = _concat_tables(violation_summary_tables)
violation_items_df = _concat_tables(violation_items_tables)
related_activity_df = _concat_tables(related_activity_tables)
investigated_inspections_df = _concat_tables(investigated_inspections_table)
details_df = pd.DataFrame(details_dictionaries)

In [8]:
!mkdir 

mkdir: data: File exists


In [10]:
# Write each combined table to its own CSV file, without the index column.
csv_outputs = {
    'Dec22_violation_summary_df.csv': violation_summary_df,
    'Dec22_violation_items_df.csv': violation_items_df,
    'Dec22_related_activity_df.csv': related_activity_df,
    'Dec22_investigated_inspections_df.csv': investigated_inspections_df,
    'Dec22_details_df.csv': details_df,
}
for csv_path, frame in csv_outputs.items():
    frame.to_csv(csv_path, index=False)