# Libraries

In [1]:
import os
import datetime
import re
import requests
import time
from bs4 import BeautifulSoup
from collections import defaultdict, Counter
import random
import shutil
from pathlib import Path

# Scraping Data

## Get Links

In [16]:
years = {'year07':"https://arlweb.msha.gov/fatals/indices/FABM2007.asp",
         'year06':"https://arlweb.msha.gov/fatals/indices/FABM2006.asp",
         'year05':"https://arlweb.msha.gov/fatals/indices/FABM2005.asp",
         'year04':"https://arlweb.msha.gov/fatals/indices/FABM2004.HTM",
         'year03':"https://arlweb.msha.gov/fatals/indices/FABM2003.HTM",
         'year02':"https://arlweb.msha.gov/fatals/indices/FABM2002.HTM",
         'year01':"https://arlweb.msha.gov/fatals/indices/FABM2001.HTM",
         'year00':"https://arlweb.msha.gov/fatals/indices/FABM2000.HTM",
         'year99':"https://arlweb.msha.gov/fatals/indices/FABM99.HTM",
         'year98':"https://arlweb.msha.gov/fatals/indices/FABM98.HTM",
         'year97':"https://arlweb.msha.gov/fatals/indices/FABM97.HTM",
         'year96':"https://arlweb.msha.gov/fatals/indices/FABM96.HTM",
         'year95':"https://arlweb.msha.gov/fatals/indices/FABM95.HTM",
        } 

In [42]:
report_pages = defaultdict(list)
urls = []

for year, year_page in years.items() :
    r = requests.get(year_page)
    time.sleep(5 + 10*random.random())
    soup = BeautifulSoup(r.text, 'html.parser')
    links = soup.find_all('a', href=True)
    for link in links:
        if '/FATALS/' and '/FTL' in link['href']:
            urls.append(link.get('href'))
            report_pages[year].append(link.get('href'))
            

In [43]:
for year, links in report_pages.items() : 
    print(f"For {year} we have {len(links)}.")
    print(f"The full pull will take for this year will take {round(len(links)*10/3600,2)} hours.")

For year07 we have 32.
The full pull will take for this artist will take 0.09 hours.
For year06 we have 26.
The full pull will take for this artist will take 0.07 hours.
For year05 we have 35.
The full pull will take for this artist will take 0.1 hours.
For year04 we have 26.
The full pull will take for this artist will take 0.07 hours.
For year03 we have 26.
The full pull will take for this artist will take 0.07 hours.
For year02 we have 40.
The full pull will take for this artist will take 0.11 hours.
For year01 we have 28.
The full pull will take for this artist will take 0.08 hours.
For year00 we have 41.
The full pull will take for this artist will take 0.11 hours.
For year99 we have 53.
The full pull will take for this artist will take 0.15 hours.
For year98 we have 51.
The full pull will take for this artist will take 0.14 hours.
For year97 we have 61.
The full pull will take for this artist will take 0.17 hours.
For year96 we have 46.
The full pull will take for this artist wil

In [44]:
urls

['/FATALS/2007/FTL07m01.asp',
 '/FATALS/2007/FTL07m02.asp',
 '/FATALS/2007/FTL07m28.asp',
 '/FATALS/2007/FTL07m03.asp',
 '/FATALS/2007/FTL07m04.asp',
 '/FATALS/2007/FTL07m05.asp',
 '/FATALS/2007/FTL07m06.asp',
 '/FATALS/2007/FTL07m07.asp',
 '/FATALS/2007/FTL07m08.asp',
 '/FATALS/2007/FTL07m09.asp',
 '/FATALS/2007/FTL07m10.asp',
 '/FATALS/2007/FTL07m11.asp',
 '/FATALS/2007/FTL07m12.asp',
 '/FATALS/2007/FTL07m13.asp',
 '/FATALS/2007/FTL07m14.asp',
 '/FATALS/2007/FTL07m15.asp',
 '/FATALS/2007/FTL07m1617.asp',
 '/FATALS/2007/FTL07m18.asp',
 '/FATALS/2007/FTL07m19.asp',
 '/FATALS/2007/FTL07m20.asp',
 '/FATALS/2007/FTL07m21.asp',
 '/FATALS/2007/FTL07m22.asp',
 '/FATALS/2007/FTL07m23.asp',
 '/FATALS/2007/FTL07m33.asp',
 '/FATALS/2007/FTL07m24.asp',
 '/FATALS/2007/FTL07m25.asp',
 '/FATALS/2007/FTL07m26.asp',
 '/FATALS/2007/FTL07m31.asp',
 '/FATALS/2007/FTL07m27.asp',
 '/FATALS/2007/FTL07m30.asp',
 '/FATALS/2007/FTL07m29.asp',
 '/FATALS/2007/FTL07m32.asp',
 '/FATALS/2006/FTL06m01.asp',
 '/FATAL

## Get Data

In [19]:
def generate_filename_from_link(link) :
    
    if not link :
        return None
    
    # drop the http or https and the html
    name = link.replace("https","").replace("http","")
    name = link.replace(".html","")

    name = name.replace("/reports/","")
    
    # Replace useless chareacters with UNDERSCORE
    name = name.replace("://","").replace(".","_").replace("/","_")
    
    # tack on .txt
    name = name + ".txt"
    
    return(name)

In [20]:
if os.path.isdir("reports") : 
    shutil.rmtree("reports/")

os.mkdir("reports")

In [39]:
soup = BeautifulSoup(requests.get("https://arlweb.msha.gov/FATALS/1999/FTL99M01.HTM").content, "html.parser")
soup.find(id="content").get_text(
        strip=True, separator="\n"
    )

<div id="content">
<center><b>UNITED STATES
DEPARTMENT OF LABOR<br/>
MINE SAFETY AND HEALTH ADMINISTRATION<br/>
<br/>

Western District <br/>
Metal and Nonmetal Mine Safety and Health <br/>
<br/>

Accident Investigation Report <br/>
Underground Metal Mine <br/>
(Gold) <br/>
<br/>
Fatal Powered Haulage Accident <br/>
<br/>

Barrick Goldstrike Mines, Incorporated <br/>
Meikle Mine <br/>
Carlin, Elko County, Nevada <br/>
ID No. 26-02246 <br/>
<br/>
January 10, 1999<br/>
<br/>
<br/>
By <br/>
<br/>
John Widows <br/>
Supervisory Mine Safety and Health Inspector <br/>
<br/>
Bobby Caples <br/>
Mine Safety and Health Inspector <br/>
<br/>
Dennis Ferlich <br/>
Mechanical Engineer <br/>
<br/>
Originating Office: <br/>
Western District Office <br/>
Mine Safety and Health Administration <br/>
2060 Peabody Road, Suite 610 <br/>
Vacaville, CA 95687 <br/>
<br/>
James M. Salois <br/>
District Manager</b></center>
<hr noshade="" size="3"/>
<b><div align="center">GENERAL INFORMATION</div></b>
<br/><br/>


In [41]:
soup

 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">

<html lang="en" xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<!-- ****************************************** Begin META TAGS ********************************************* -->
<meta content="Mine Safety and Health Administration (MSHA)" name="agency"/>
<meta content="workers: miners, safety, all" name="audience"/>
<meta content="Mining, mining industry, mine safety, mine health, mining safety, mining health, mine safety rules, mine safety laws, government regulations mining, mining news, news mining, Assistant Secretary of Labor for Mine Safety and Health, Joseph A. Main, Assistant Secretary Main" name="keywords"/>
<meta content="Federal enforcement agency responsible for the health and safety of the nation's miners." name="description"/>
<!-- ***********

In [23]:
url_stub = "https://arlweb.msha.gov/" 
start = time.time()

total_pages = 0 

os.chdir('C:\\Users\\halle.davis\\Downloads\\reports')

path = os.getcwd()

for x in urls :
    #if total_pages == 22:
    #    break
    name0 = x.replace('/', '')
    name = name0.replace('.html', '')
    filename = name+".txt"
    if os.path.exists(filename):
        file = open(filename, "r+")
    else:
        file = open(filename, "w+")
    soup = BeautifulSoup(requests.get(url_stub+x).content, "html.parser")
    time.sleep(5 + 10*random.random())
    report = soup.select_one(".ringtone ~ div").get_text(strip=True, separator="\n")
    title = name.replace('lyricslildicky', '')
    file.writelines(title+'\n'+'\n'+lyrics)
    time.sleep(5 + 10*random.random())
    total_pages += 1

KeyboardInterrupt: 