In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import urllib.parse
from copy import copy

# Scrape index


In [2]:
# Define some functions to help scrape the index

def get_osha_url(query, start_date_str, end_date_str):
    
    results_per_page = 1000
    start_date = pd.to_datetime(start_date_str)
    end_date = pd.to_datetime(end_date_str)

    # make params dictionary
    base_url = f"https://www.osha.gov/ords/imis/establishment.search"
    params = {
        'establishment': '+'.join(query.split()),
        'state': "all",
        'officetype': "all",
        'office': "all",
        'sitezip': 100000,
        'startmonth': start_date.month,
        'startday': start_date.day,
        'startyear': start_date.year,
        'endmonth': end_date.month,
        'endday': end_date.day,
        'endyear': end_date.year,
        'p_case': "all",
        'p_sort': 12,
        'p_desc': "DESC",
        'p_direction': "Prev",
        'p_show': results_per_page,
        'p_violations_exist': "yes"
    }

    return base_url + '?' + urlencode(params)

def get_index_table(osha_url):
    return pd.read_html(osha_url)[2]
    

In [3]:
queries = [
    ('Walmart', '2012-12-31', '2022-12-31'),
    ('Walmart', '2010-01-01', '2012-12-30')
]

In [4]:
activities_df = []
for query in queries:
    
    # break the query into 3 variables, and pass those to get_osha_url
    establishment, start, end = query   
    osha_url = get_osha_url(establishment, start, end)
    print(osha_url)
    
    # get the index table from the osha url, drop a junk column    
    index_df_for_this_query = get_index_table(osha_url)
    index_df_for_this_query = index_df_for_this_query.drop(columns=['Unnamed: 0', '#'])
    
    # tack on the query parameters to the table
    index_df_for_this_query['query'] = establishment
    index_df_for_this_query['query_start'] = start
    index_df_for_this_query['query_end'] = end
    
    # print to verify that pagination isn't a problem 
    # IMPORTANT: 👀 look at these values to make sure we don't need to paginate     
    print(f"found {len(index_df_for_this_query)} results for {query}\n")

    # append to dataframe
    activities_df.append(index_df_for_this_query)
    
# combine the queries
activities_df = pd.concat(activities_df)
activities_df.to_csv('1_activities_df.csv', index=False)
activities_df

https://www.osha.gov/ords/imis/establishment.search?establishment=Walmart&state=all&officetype=all&office=all&sitezip=100000&startmonth=12&startday=31&startyear=2012&endmonth=12&endday=31&endyear=2022&p_case=all&p_sort=12&p_desc=DESC&p_direction=Prev&p_show=1000&p_violations_exist=yes
found 253 results for ('Walmart', '2012-12-31', '2022-12-31')

https://www.osha.gov/ords/imis/establishment.search?establishment=Walmart&state=all&officetype=all&office=all&sitezip=100000&startmonth=1&startday=1&startyear=2010&endmonth=12&endday=30&endyear=2012&p_case=all&p_sort=12&p_desc=DESC&p_direction=Prev&p_show=1000&p_violations_exist=yes
found 78 results for ('Walmart', '2010-01-01', '2012-12-30')



Unnamed: 0,Activity,Opened,RID,St,Type,Sc,SIC,NAICS,Vio,Establishment Name,query,query_start,query_end
0,1.617962e+06,08/29/2022,552652,MI,Complaint,Partial,,445110,1,Walmart #1754,Walmart,2012-12-31,2022-12-31
1,1.617558e+06,08/25/2022,951510,HI,Prog Other,Complete,,722513,1,Mcdonalds Of Walmart - Kapolei,Walmart,2012-12-31,2022-12-31
2,1.618144e+06,08/24/2022,452110,KY,Planned,Records,5411.0,445110,1,65308 - Walmart Inc,Walmart,2012-12-31,2022-12-31
3,1.616220e+06,08/22/2022,552652,MI,Complaint,Partial,,445110,1,Walmart Supercenter # 2700,Walmart,2012-12-31,2022-12-31
4,1.608665e+06,07/18/2022,950624,CA,Complaint,Partial,,452210,1,Walmart Inc.,Walmart,2012-12-31,2022-12-31
...,...,...,...,...,...,...,...,...,...,...,...,...,...
73,3.129146e+08,04/06/2010,950625,CA,Complaint,Partial,5311.0,452112,3,Walmart Stores Inc,Walmart,2010-01-01,2012-12-30
74,3.134090e+08,03/10/2010,728500,MO,Planned,Complete,5311.0,452111,1,Walmart #29,Walmart,2010-01-01,2012-12-30
75,3.134539e+08,02/16/2010,257220,PR,Referral,Partial,5311.0,452111,1,Walmart Puerto Rico Inc,Walmart,2010-01-01,2012-12-30
76,3.139923e+08,01/11/2010,214200,NJ,Complaint,Partial,5399.0,452990,1,Walmart Inc-#3520,Walmart,2010-01-01,2012-12-30


# Scrape individual activity pages

In [5]:
# Figure out how to scrape individual activity
def get_related_activity_table(url):
    tables = pd.read_html(url, match="Related Activity")
    return tables[0]

def get_violation_summary_table(url):
    tables = pd.read_html(url, match="Violation Summary")
    return tables[0]

def get_violation_items_table(url):
    tables = pd.read_html(url, match="Violation Items")
    return tables[0]

def get_investigated_inspection_table(url):
    tables = pd.read_html(url, match="Investigated Inspection")
    return tables[0]

def extract_key_value(tag):
    assert ':' in tag.text
    assert tag.find('strong')
    
    key = tag.text.split(':')[0].strip()
    value = tag.text.split(':')[1].strip()
    return key, value

def get_main_container(inspection_url):
    # get HTML from inspection page
    response = requests.get(inspection_url)
    html_doc = response.text
    soup = BeautifulSoup(html_doc, 'html.parser')

    # get main container (ignore footer/header/etc...)
    html_main_container = soup.find(id="maincontain")
    
    return html_main_container

def has_investigation_summary(html_main_container):
    h4s = html_main_container.find_all('h4')
    for h4 in h4s:
        if h4.text.strip() == "Investigation Summary":
            return True
    return False

def get_details(html_main_container):

    # Delete anything after "Investigation Summary H4"
    # we will scrape that stuff in a separate function
    h4s = html_main_container.find_all('h4')
    for h4 in h4s:
        if h4.text == "Investigation Summary":
            for e in h4.find_all_next():
                e.clear()
    
    # get details
    details = {}
    
    # get case status
    html_wells = html_main_container.find_all("div", class_="well")
    try:
        assert len(html_wells) == 2
        assert html_wells[0] == html_wells[1]
        case_status = html_wells[0]
        key, value = extract_key_value(case_status)                
    except:
        key = 'Case Status'
        value = 'ERROR'
        print("ERROR - couldn't scrape case status")
    details[key] = value
    
    # get remaining details
    html_spans = html_main_container.find_all("div", class_="span4")
    for span in html_spans:
        html_p_tags = span.find_all('p')
        if len(html_p_tags) == 0:
            columns_without_colon = 0
            key, value = extract_key_value(span)                
            details[key] = value
        else:
            for p_tag in html_p_tags:
                key, value = extract_key_value(p_tag)
                details[key] = value
    
    return details
    
def get_investigation_summary_details(html_main_container):
    # Delete anything after "Investigation Summary H4"
    # we will scrape that stuff in a separate function
    h4s = copy(html_main_container).find_all('h4')
    for h4 in h4s:
        if h4.text == "Investigation Summary":
            for e in h4.find_previous_siblings():
                e.decompose()

    details = {}

    spans = html_main_container.find_all('div', class_='span4')
    p_tags = html_main_container.find_all('p')
    spans_and_ptags = spans + p_tags
    
    details = {}
    notes_columns = 0
    for tag in spans_and_ptags:
        if ':' in tag.text and tag.find('strong'):
            key, value = extract_key_value(tag)
        else:
            notes_columns += 1
            key = f"investigation_summary_notes_{notes_columns}"
            value = tag.text
        details[key] = value
    
    return(details)


In [6]:
violation_summary_tables = []
violation_items_tables = []
related_activity_tables = []
details_dictionaries = []
investigated_inspections_table = []

for index, row in activities_df.reset_index(drop=True).iterrows():
    activity_code = row['Activity']
    url = f"https://www.osha.gov/ords/imis/establishment.inspection_detail?id={activity_code}"
    print(f"{index} of {len(activities_df)} - scraping {url}")

    violation_summary = get_violation_summary_table(url)
    violation_summary['activity_code'] = activity_code
    violation_summary_tables.append(violation_summary)
    
    violation_items = get_violation_items_table(url)
    violation_items['activity_code'] = activity_code
    violation_items_tables.append(violation_items)

    try:
        related_activity = get_related_activity_table(url)
        related_activity['activity_code'] = activity_code
        related_activity_tables.append(related_activity)
    except:
        print("ERROR - related activity table wasn't scraped")
        
    
    html = get_main_container(url)
    details = {}
    
    if has_investigation_summary(html):
        print("Has investigation summary")
        investigation_summary_details = get_investigation_summary_details(html)
        details.update(investigation_summary_details)
        
        investigated_inspections = get_investigated_inspection_table(url)
        investigated_inspections['activity_code'] = activity_code
        investigated_inspections_table.append(investigated_inspections)
        

    details.update(get_details(html))
    details['activity_code'] = activity_code    
    details_dictionaries.append(details)


0 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1617962.015
1 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1617558.015
ERROR - related activity table wasn't scraped
2 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1618144.015
ERROR - related activity table wasn't scraped
3 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1616220.015
4 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1608665.015
5 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1606171.015
6 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1605577.015
7 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1605398.015
8 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1604505.015
9 of 331 - scraping https:

77 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1485107.015
78 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1485188.015
79 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1484241.015
80 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1483167.015
81 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1483151.015
82 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1483019.015
83 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1483129.015
84 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1469714.015
85 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1463894.015
86 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1463070.015
87 of 331 

Has investigation summary
151 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1259424.015
152 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1253084.015
153 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1251641.015
154 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1241743.015
ERROR - related activity table wasn't scraped
155 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1236205.015
Has investigation summary
156 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1233977.015
157 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1228790.015
158 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1224611.015
159 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=1218260.015
16

ERROR - related activity table wasn't scraped
ERROR - couldn't scrape case status
227 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=957918.015
228 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=317528206.0
ERROR - couldn't scrape case status
229 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=317428811.0
ERROR - couldn't scrape case status
230 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=952351.015
231 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=313751091.0
ERROR - related activity table wasn't scraped
ERROR - couldn't scrape case status
232 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=313751083.0
ERROR - related activity table wasn't scraped
ERROR - couldn't scrape case status
233 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail

ERROR - couldn't scrape case status
285 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=315918748.0
ERROR - couldn't scrape case status
286 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=314726423.0
ERROR - couldn't scrape case status
287 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=98356.015
288 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=315502880.0
ERROR - couldn't scrape case status
289 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=315502476.0
ERROR - couldn't scrape case status
290 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=315477539.0
ERROR - related activity table wasn't scraped
ERROR - couldn't scrape case status
291 of 331 - scraping https://www.osha.gov/ords/imis/establishment.inspection_detail?id=310099916.0
ERROR - couldn't scrape case status
292 

In [7]:
violation_summary_df = pd.concat(violation_summary_tables)
violation_items_df = pd.concat(violation_items_tables)
related_activity_df = pd.concat(related_activity_tables)
investigated_inspections_df = pd.concat(investigated_inspections_table)
details_df = pd.DataFrame(details_dictionaries)

In [8]:
!mkdir 

usage: mkdir [-pv] [-m mode] directory_name ...


In [9]:
violation_summary_df.to_csv('1_violation_summary_df.csv', index=False)
violation_items_df.to_csv('1_violation_items_df.csv', index=False)
related_activity_df.to_csv('1_related_activity_df.csv', index=False)
investigated_inspections_df.to_csv('1_investigated_inspections_df.csv', index=False)
details_df.to_csv('1_details_df.csv', index=False)