In [1]:
# use the import keyword to import pandas, requests, and bs4 modules
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# assign the NY WARN notice url to a variable
url = "https://labor.ny.gov/app/warn/default.asp"

In [3]:
# define headers
headers = {'accept-encoding': 'deflate', 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'}

In [4]:
# make a get request to the url using the requests library and assign the response to a variable called 'response'
response = requests.get(url, headers=headers)

In [5]:
# print out status code of response to confirm that your request worked
response.status_code

200

In [6]:
# parse the response text using Beautiful Soup's html parser and assign output to a variable called 'soup'
# response.text
type(response.text)

str

In [7]:
# scrape the first table on the page and assign it to a variable called 'table'
soup = BeautifulSoup(response.text, 'html.parser')

In [8]:
# get the dropdown menu
select = soup.find("select", id="warnYr")
select

<select id="warnYr" name="warnYr" onchange="pageRefresh()">
<option selected="" value="2020">2020</option>
<option value="2019">2019</option>
<option value="2018">2018</option>
<option value="2017">2017</option>
<option value="2016">2016</option>
<option value="2015">2015</option>
<option value="2014">2014</option>
<option value="2013">2013</option>
<option value="2012">2012</option>
</select>

In [9]:
years = select.find_all("option")
years

[<option selected="" value="2020">2020</option>,
 <option value="2019">2019</option>,
 <option value="2018">2018</option>,
 <option value="2017">2017</option>,
 <option value="2016">2016</option>,
 <option value="2015">2015</option>,
 <option value="2014">2014</option>,
 <option value="2013">2013</option>,
 <option value="2012">2012</option>]

In [10]:
years = [option.text for option in select.find_all("option")]
years

['2020', '2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012']

In [None]:
# make an array called 'results'
results = []
control_numbers = set()
#loop through all years
for year in years:
    # form url for a specific year, eg https://labor.ny.gov/app/warn/default.asp?warnYr=2019
    year_url = f'https://labor.ny.gov/app/warn/default.asp?warnYr={year}'
    print(year_url)
    # make get request to url
    response = requests.get(year_url, headers=headers)
    # scrape the first table on the page and assign it to a variable called 'table'
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find("table")

    # get all the rows in the table — this is how many WARN notices there were in a specific year
    rows = table.find_all("tr")
    print(len(rows))

    # grab all rows from the table and assign to a variable called 'rows'
    # loop through the rows using a for loop. each row here is a company
    for row in rows:
        # grab the anchor tag (the link tag) in the row and then grab the href attribute from the tag
        a = row.find("a")['href']

        # concatenate the root url from above with this href attribute and assign to a variable called 'company_url'
        company_url = "https://labor.ny.gov/app/warn/"+a
        print(company_url)

        # make a get request to the company url assign the response to a variable called 'company_response'
        company_response = requests.get(company_url, headers=headers)

        # parse the response text and assign output to a variable called 'company_soup'
        company_soup = BeautifulSoup(company_response.text, 'html.parser')

        # grab the first table on the page
        company_table = company_soup.find("table")

        # loop through all of the p tags
        paragraphs = company_table.find_all("p")
        paragraphs_iter = iter(paragraphs)
        skip = False
        for p in paragraphs_iter:
            # grab all of the values we want
            text = p.get_text('\n').replace('\xa0', '')
            if 'Date of Notice:' in text:
                notice_date = text.split(":")[1].strip().split('\n')[0].strip().replace(',', '').replace(';', '')
            elif 'Control Number:' in text:
                control_number = text.split(":")[1].strip()
                if control_number in control_numbers:
                    print('repeated control number')
                    skip = True
                    break
                else:
                    control_numbers.add(control_number)
            elif 'Reason Stated for Filing:' in text:
                reason = text.split(":")[1].strip()
                #print(reason)
            elif 'Company:' in text:
                split_company = [x.strip() for x in text.split('\n')]
                if len(split_company) == 1:
                    next_text = next(paragraphs_iter)
                else: 
                    company = split_company[1]
                    address = ' '.join(split_company[2:])
            elif 'County:' in text:
                county = f'{text.split(":")[1].strip().split("|")[0].strip()} County'
                #print(county)
            elif 'Phone:' in text:
                phone = text.split(":")[1].strip()
                #print(phone)
            elif 'Business Type:' in text:
                business_type = text.split(":")[1].strip().replace('Restaurants', 'Restaurant')
                #print(business_type)
            elif 'Number Affected:' in text:
                if '-----' in text:
                    affected = ''
                else:
                    affected = text.split(":")[1].strip().split(" ")[0].strip().split('\n')[0].strip()
                #print(affected)
            elif 'Total Employees:' in text:
                if '-----' in text:
                    total_employees = ''
                else:
                    total_employees = text.split(":")[1].strip().split(" ")[0].strip().replace(',', '')
                    #print(total_employees)
            elif 'Layoff Date:' in text:
                #print(text)
                layoff_date = text.split(":")[1].strip().split(" ")[0].strip().split(" ")[0].strip()
                #print(layoff_date)
            elif ('Reason for Dislocation:' in text):
                dislocation = text.split(":")[1].strip()
                #print(dislocation)
            elif ('Union:' in text):
                union = text.split(":")[1].strip()
                #print(union)
            elif ('Classification:' in text):
                classification = text.split(":")[1].strip()
                #print(classification)

        # store values in a result object
        if not skip:
            result = {
                'notice_date': notice_date,
                'reason': reason,
                'company': company,
                'address': address,
                'county': county,
                'phone': phone,
                'business_type': business_type,
                'affected': affected,
                'total_employees': total_employees,
                'layoff_date': layoff_date,
                'dislocation': dislocation,
                'union': union,
                'classification': classification
             }
            # append result object to results
            results.append(result)

print(len(results))

https://labor.ny.gov/app/warn/default.asp?warnYr=2020
1430
https://labor.ny.gov/app/warn/details.asp?id=8480
https://labor.ny.gov/app/warn/details.asp?id=8481
https://labor.ny.gov/app/warn/details.asp?id=8482
https://labor.ny.gov/app/warn/details.asp?id=8483
https://labor.ny.gov/app/warn/details.asp?id=8484
https://labor.ny.gov/app/warn/details.asp?id=8485
https://labor.ny.gov/app/warn/details.asp?id=8486
https://labor.ny.gov/app/warn/details.asp?id=8487
https://labor.ny.gov/app/warn/details.asp?id=8488
https://labor.ny.gov/app/warn/details.asp?id=8489
https://labor.ny.gov/app/warn/details.asp?id=8490
https://labor.ny.gov/app/warn/details.asp?id=8491
https://labor.ny.gov/app/warn/details.asp?id=8492
https://labor.ny.gov/app/warn/details.asp?id=8493
https://labor.ny.gov/app/warn/details.asp?id=8494
https://labor.ny.gov/app/warn/details.asp?id=8467
https://labor.ny.gov/app/warn/details.asp?id=8468
https://labor.ny.gov/app/warn/details.asp?id=8469
https://labor.ny.gov/app/warn/details.asp

In [55]:
results

[{'notice_date': '12/24/2019',
  'reason': 'Plant Layoff',
  'company': 'Timeless Décor, LLC (LCO Destiny DBA Timeless Frames)',
  'address': '22419 Fisher Road Watertown, NY 13601',
  'county': 'Jefferson County',
  'phone': '(315) 782-5759',
  'business_type': 'A supplier of Picture Frames',
  'affected': '45',
  'total_employees': '81',
  'layoff_date': 'The',
  'dislocation': 'Economic',
  'union': 'The employees are not represented by a union.',
  'classification': 'Plant layoff'},
 {'notice_date': '10/16/2019',
  'reason': 'Plant Closing',
  'company': 'Barneys New York',
  'address': '575 5th Avenue New York, NY 10017',
  'county': 'New York County',
  'phone': '(212) 450-8606',
  'business_type': 'Retail',
  'affected': '165',
  'total_employees': '165',
  'layoff_date': 'Employment',
  'dislocation': 'Economic',
  'union': 'New York New Jersey Regional Joint Board Workers United A/W SEIU',
  'classification': 'Plant Closing'},
 {'notice_date': '9/18/2019',
  'reason': 'Plant C

In [56]:
df = pd.DataFrame.from_dict(results)

In [57]:
df

Unnamed: 0,notice_date,reason,company,address,county,phone,business_type,affected,total_employees,layoff_date,dislocation,union,classification
0,12/24/2019,Plant Layoff,"Timeless Décor, LLC (LCO Destiny DBA Timeless ...","22419 Fisher Road Watertown, NY 13601",Jefferson County,(315) 782-5759,A supplier of Picture Frames,45,81,The,Economic,The employees are not represented by a union.,Plant layoff
1,10/16/2019,Plant Closing,Barneys New York,"575 5th Avenue New York, NY 10017",New York County,(212) 450-8606,Retail,165,165,Employment,Economic,New York New Jersey Regional Joint Board Worke...,Plant Closing
2,9/18/2019,Plant Closing,"General Parts Distribution, LLC d/b/a Carquest...","215 Business Park Drive Armonk, NY 10504",Westchester County,(860) 375-1514,Auto parts distribution,80,80,Of,Economic,Teamsters Local 202,Plant Closing
3,10/7/2019,Plant Closing,"Somos Healthcare Providers, Inc.","519 Eighth Avenue, 14th Floor New York, NY 10118",New York County,(516) 642-4068,Community care and advocate community provider,130,130,Separations,New York State contract expired,The employees are not represented by a union.,Plant Closing
4,12/20/2019,Plant Unit Closing,"Regeneron Healthcare Solutions, Inc., a wholly...","1 Rockwood Rd. Sleepy Hollow, NY 10591",Westchester County,(914) 847-7128,Biotechnology company,15,,2/18/2020,Termination of all field and supporting staff ...,The employees are not represented by a union.,Plant Unit Closing
...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,1/6/2019,Plant Closing,Morrison Healthcare Food and Nutrition Service...,"462 Grider Street Buffalo, NY 14215",Erie County,(914) 426-8240,National food and nutrition services company,172,172,4/6/2019,Contractual Loss,"CSEA Local 1000, AFSCME, AFL-CIO",Plant Closing
342,12/27/2018,Plant Closing,Kmart Store (Unit 07677),"121 Bolivar Road Wellsville, NY 14895",Allegany County,(847) 286-1427,Retail Store,88,88,Employment,Economic,The employees are not represented by a union.,Plant Closing
343,12/27/2018,Plant Closing,Kmart Store (Unit 04928),"49 Dix Avenue Ext. Queensbury, NY 12804",Warren County,(847) 286-1427,Retail Store,60,60,Employment,Economic,The employees are not represented by a union.,Plant Closing
344,12/27/2018,Plant Closing,"Sears, Roebuck and Co. Full Line Store (Unit 0...","10 Miracle Mile Dr. Rochester, NY 14623",Monroe County,(847) 286-1427,Retail Store,70,70,Employment,Economic,The employees are not represented by a union.,Plant Closing
