In [1]:
# use the import keyword to import pandas, requests, and bs4 modules
import pandas as pd
import requests
from bs4 import BeautifulSoup
import numpy as np

In [2]:
# assign the NY WARN notice listing url to a variable
# keep the trailing slash: each notice's relative link is appended directly to this string later on
url = "https://labor.ny.gov/app/warn/"

In [3]:
# request headers sent with every call: ask for deflate encoding and
# present a desktop Chrome user agent string
headers = {
    'accept-encoding': 'deflate',
    'user-agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/81.0.4044.129 Safari/537.36'
    ),
}

In [4]:
# make a get request to the url using the requests library and assign the response to a variable called 'response'
# the timeout (in seconds) makes an unresponsive server raise an error instead of hanging the notebook
response = requests.get(url, headers=headers, timeout=30)

In [5]:
# display the status code of the response to confirm that the request worked (200 means OK)
response.status_code

200

In [6]:
# check what kind of object the response body is before parsing it;
# this text is what gets handed to Beautiful Soup in the next cell
type(response.text)

str

In [7]:
# parse the response text using Beautiful Soup's html parser and assign output to a variable called 'soup'
soup = BeautifulSoup(response.text, 'html.parser')

In [8]:
# grab the first table on the page and assign it to a variable called 'table'
# NOTE(review): 'table' is not referenced by the other cells visible here -- confirm whether it is still needed
table = soup.find("table")

In [9]:
# grab every row (<tr> tag) in the parsed page and assign to a variable called 'rows'
# NOTE(review): this searches the whole document rather than just 'table' -- confirm that is intended
rows = soup.find_all("tr")

In [10]:
# display the number of rows found on the listing page
len(rows)

1350

In [11]:
# make an array called 'results'
results = []
# event numbers already stored, used to skip notices that are listed more than once
event_numbers = set()

# every result carries exactly these keys, in this order
NOTICE_FIELDS = (
    'notice_date', 'event_number', 'reason', 'company', 'address', 'county',
    'phone', 'business_type', 'affected', 'total_employees', 'layoff_date',
    'dislocation', 'union', 'classification',
)
# placeholder strings found on the detail pages where a head-count would normally be
BLANK_COUNTS = ['------', '-----', '----']

# loop through the rows using a for loop. each row here is a company
for row in rows:
    # grab the anchor tag (the link tag) in the row and then grab the href attribute from the tag
    # rows without a link (for example a header row) have no detail page, so skip them
    link = row.find("a", href=True)
    if link is None:
        continue
    a = link['href']

    # concatenate the root url from above with this href attribute and assign to a variable called 'company_url'
    company_url = f'{url}{a}'

    # make a get request to the company url assign the response to a variable called 'company_response'
    # the timeout keeps one unresponsive page from hanging the whole scrape
    company_response = requests.get(company_url, headers=headers, timeout=30)
    if not company_response.ok:
        print('request failed')
        print(company_url)
        continue

    # parse the response text and assign output to a variable called 'company_soup'
    company_soup = BeautifulSoup(company_response.text, 'html.parser')

    # grab the first table on the page
    company_table = company_soup.find("table")
    if company_table is None:
        print('no table found')
        print(company_url)
        continue

    # start every notice from a clean slate so a field that is missing on this page
    # stays empty instead of silently inheriting the previous notice's value
    result = dict.fromkeys(NOTICE_FIELDS)
    skip = False

    # loop through all of the p tags
    for p in company_table.find_all("p"):
        # grab all of the values we want
        text = p.get_text('\n').replace('\xa0', '')
        pieces = text.split(":")
        # the text between the first and second colon, which is the value for most labels
        after_colon = pieces[1].strip() if len(pieces) > 1 else ''

        # NOTE: the order of these checks is significant and mirrors the original chain
        if 'Date of Notice:' in text:
            if len(pieces) == 3:
                # e.g. 'Date of Notice: 1/24/2020 \nAmendment: 5/7/2020' -- keep the date after the last colon
                print(pieces)
                date_tokens = pieces[2].split()
                notice_date = date_tokens[0] if date_tokens else None
            else:
                date_tokens = after_colon.split()
                notice_date = date_tokens[0].replace(',', '').replace(';', '') if date_tokens else None
            print(notice_date)
            result['notice_date'] = notice_date
        elif 'Event Number:' in text:
            event_number = after_colon
            if event_number in event_numbers:
                # this notice was already stored; drop the whole page
                print('repeated event number')
                print(company_url)
                skip = True
                break
            event_numbers.add(event_number)
            result['event_number'] = event_number
        elif 'Reason Stated for Filing:' in text:
            result['reason'] = after_colon
        elif 'Company:' in text:
            # first line is the label, second the company name, the rest the address
            split_company = [x.strip() for x in text.split('\n')]
            if len(split_company) > 1:
                result['company'] = split_company[1]
            result['address'] = ' '.join(split_company[2:])
        elif 'County:' in text:
            result['county'] = f'{after_colon.split("|")[0].strip()} County'
        elif 'Phone:' in text:
            result['phone'] = after_colon
        elif 'Business Type:' in text:
            result['business_type'] = after_colon.replace('Restaurants', 'Restaurant')
        elif 'Number Affected:' in text:
            affected = after_colon.split(" ")[0].strip().split('\n')[0].strip().replace(',', '').replace('(', '')
            if affected in BLANK_COUNTS:
                # kept as the int 0 (not '0') to match the values this cell has always produced
                affected = 0
            result['affected'] = affected
        elif 'Total Employees:' in text:
            total_employees = after_colon.split(" ")[0].strip().replace(',', '')
            if total_employees in BLANK_COUNTS:
                total_employees = 0
            result['total_employees'] = total_employees
        elif 'Layoff Date:' in text:
            result['layoff_date'] = after_colon.split(" ")[0].strip()
        elif 'Reason for Dislocation:' in text:
            result['dislocation'] = after_colon
        elif 'Union:' in text:
            result['union'] = after_colon
        elif 'Classification:' in text:
            result['classification'] = after_colon

    # append result object to results unless the notice was a repeat
    if not skip:
        results.append(result)

['Date of Notice', ' 1/24/2020 \nAmendment', ' 5/7/2020']
5/7/2020
['Date of Notice', ' 1/15/2020 \nAmendment', ' 4/29/2020']
4/29/2020
5/8/2020
5/7/2020
5/8/2020
['Date of Notice', ' 4/9/2020 \nAmendment', ' 5/8/2020']
5/8/2020
['Date of Notice', ' 4/9/2020\nAmendment', ' 5/8/2020']
5/8/2020
['Date of Notice', ' 3/20/2020 \nAmendment', ' 5/11/2020']
5/11/2020
['Date of Notice', ' 4/3/2020 \nAmendment', ' 5/6/2020']
5/6/2020
['Date of Notice', ' 3/22/2020\nAmendment', ' 5/2/2020']
5/2/2020
['Date of Notice', ' 2/19/2020\nAmendment', ' 5/4/2020']
5/4/2020
['Date of Notice', ' 4/2/2020\nAmendment', ' 5/5/2020']
5/5/2020
['Date of Notice', ' 4/28/2020\nAmendment', ' 5/6/2020']
5/6/2020
4/27/2020
5/7/2020
3/30/2020
5/4/2020
4/28/2020
5/7/2020
5/7/2020
5/6/2020
5/6/2020
5/7/2020
['Date of Notice', ' 3/17/2020 \nAmendment', ' 3/27/2020']
3/27/2020
['Date of Notice', ' 2/13/2020 \nAmendment', ' 5/6/2020']
5/6/2020
['Date of Notice', ' 3/25/2020 \nAmendment', ' 5/1/2020']
5/1/2020
3/30/2020
3/

In [13]:
# wrap results in a dataframe: one row per stored notice, one column per scraped field
df = pd.DataFrame(results)
# display (number of rows, number of columns)
df.shape

(1259, 14)

In [14]:
# list the distinct 'affected' values to spot entries that did not parse as plain number strings
df['affected'].unique()

array(['165', '450', '84', '25', '243', '244', '17', '416', '177', '170',
       '47', '492', '37', '366', '200', '71', '1191', '82', '890', '52',
       '396', '95', '98', '83', '53', '81', '166', '40', '24', '50', '54',
       '77', '102', '65', '97', '123', '46', '79', '33', '349', '49',
       '352', '36', '14', '5', '4', '174', '1', '316', '26', '125', '550',
       '109', '180', '435', '78', '407', 'To', '260', '239', '289', '154',
       '447', '393', '696', '274', '93', '18', '169', '192', '38', '8',
       '21', '317', '39', '29', '196', '22', '140', '113', '218', '131',
       '142', '35', '149', '359', '132', '209', '814', '222', '146',
       '1004', '689', '44', '64', '32', '178', '6', '107', '28', '70',
       '68', '85', '99', '124', '120', '60', '157', '75', 0, '91', '58',
       '15', '41', '210', '34', '74', '16', '61', '62', '48', '456',
       '133', '2', '116', '27', '23', '678', '459', '19', '66', '429',
       '104', '20', '397', '55', '115', '357', '67', '76', '

In [15]:
# output dataframe to a csv, leaving out the dataframe's row index
df.to_csv('../data/warn.csv', index=False)