# Labor Condition Application Scraper
Extract compensation data from Labor Condition Application (LCA) records for the top accounting firms.

In [1]:
import csv
import requests
from datetime import datetime
from bs4 import BeautifulSoup

In [2]:
# Every search page shares the same site root and the same run of empty
# filter parameters; only the endpoint and the employer search text vary.
_SITE_ROOT = 'https://h1bsalary.online/'
_BLANK_FILTERS = '&year=&minsalary=&state=&worksite_city=&job_title='

# (endpoint, employer search text) pairs. The search text is already in
# query-string form and is used verbatim. Two entries use the 'search'
# endpoint; all others use 'index.php'.
_EMPLOYER_SEARCHES = (
    ('index.php', 'DELOITTE++TOUCHE+LLP'),
    ('search', 'GRANT+THORNTON+LLP'),
    ('search', 'PRICEWATERHOUSECOOPERS+LLP'),
    ('index.php', 'PRICEWATERHOUSECOOPERS+ADVISORY+SERVICES+LLC'),
    ('index.php', 'PRICEWATERHOUSECOOPERS+CORPORATE+FINANCE+LLC'),
    ('index.php', 'DELOITTE+CONSULTING+LLP'),
    ('index.php', 'DELOITTE+TAX+LLP'),
    ('index.php', 'DELOITTE+AMP%3B+TOUCHE+LLP'),
    ('index.php', 'DELOITTE+TRANSACTIONS+AND+BUSINESS+ANALYTICS+LLP'),
    ('index.php', 'DELOITTE+SERVICES+LP'),
    ('index.php', 'KPMG+LLP'),
    ('index.php', 'BDO+USA+LLP'),
    ('index.php', 'RSM+US+LLP'),
    ('index.php', 'ERNST++YOUNG+US+LLP'),
    ('index.php', 'ERNST++YOUNG+LLP'),
    ('index.php', 'ERNST+AMP%3B+YOUNG+US+LLP'),
    ('index.php', 'CLIFTONLARSONALLEN+LLP'),
    ('index.php', 'COHNREZNICK+LLP'),
    ('index.php', 'COHNREZNICK+CAPITAL+MARKETS'),
    ('index.php', 'COHNREZNICK+CAPITAL+MARKETS+SECURITIES+LLC'),
    ('index.php', 'BAKER+TILLY+VIRCHOW+KRAUSE+LLP'),
    ('index.php', 'BKD+LLP'),
    ('index.php', 'CROWE+LLP'),
    ('index.php', 'CBIZ+ACCOUNTING+TAX++ADVISORY+LLC'),
    ('index.php', 'MOSS+ADAMS+LLP'),
    ('index.php', 'DIXON+HUGHES+GOODMAN+LLP'),
)

# Full search-result URLs, one per employer name, in the order listed above.
urls = tuple(
    _SITE_ROOT + endpoint + '?searchtext=' + employer + _BLANK_FILTERS
    for endpoint, employer in _EMPLOYER_SEARCHES
)


In [3]:
# Scraped field labels that mean the same thing are folded onto one key.
_KEY_ALIASES = {'WorksiteCity': 'WorkSite', 'Status': 'CaseStatus'}


def extract_data(url, timeout=30):
    """Fetch one search-results page and return its LCA records.

    The page is searched for ``<div class="panel-body">`` elements. The
    first four are skipped (the original code set them aside as headers);
    each remaining panel becomes one record whose fields are read from the
    panel's ``<p>`` elements, each treated as ``"Label: value"`` text.

    The HTTP status code is not checked; whatever body comes back is parsed.

    Args:
        url: Address of the search-results page to download.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            server cannot hang the run indefinitely.

    Returns:
        A ``(data, field_names)`` tuple. ``data`` is a list of dicts, one
        per record that has a non-empty ``Index`` field. ``field_names``
        lists every key seen across those records, in first-seen order.
        Every dict contains every key in ``field_names``; fields a record
        did not have are set to ``''``.

    Raises:
        Any exception raised by ``requests.get`` (including on timeout)
        propagates to the caller.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    panels = soup.find_all('div', 'panel-body')

    data = []
    for panel in panels[4:]:
        record = {}
        for p in panel.find_all('p'):
            label, *values = p.text.split(': ')
            # "Work Site" style labels become "WorkSite" style keys.
            key = label.title().replace(' ', '')
            key = _KEY_ALIASES.get(key, key)
            # A value that itself contained ': ' is rejoined with ' - '.
            val = ' - '.join(values)

            if key == 'MoreInfo':
                try:
                    # Replace the link text with the link target: the href
                    # minus its first character, appended to the site root.
                    val = 'https://h1bsalary.online/' + p.a['href'][1:]
                except TypeError:
                    # p.a is None when the paragraph holds no link, which
                    # makes the subscript fail; skip just this field.
                    print('there was a type err')
                    continue

            record[key] = val.strip()

        # Drop panels that have no Index field or an empty one.
        if record.get('Index'):
            data.append(record)

    # Union of keys in first-seen order (dicts preserve insertion order).
    field_names = list(dict.fromkeys(key for row in data for key in row))

    # Give every record the full key set so all rows share the same columns.
    for row in data:
        for key in field_names:
            row.setdefault(key, '')
    return data, field_names

In [4]:
def save_data(data, fieldnames):
    """Append records to today's ``LCA_YYYYMMDD.csv`` in the working directory.

    The file is opened in append mode, so earlier contents are kept. Each
    call writes its own header row followed by its rows.

    NOTE(review): because a header is written on every call, a file built
    from several calls contains several header rows. That is left as is:
    ``fieldnames`` may differ between calls, and the repeated header is the
    only marker of which columns the rows that follow use.

    Args:
        data: Iterable of dicts, one per CSV row.
        fieldnames: Column names, in output order, used for the header row
            and for laying out each row.

    Raises:
        ValueError: If a row contains a key that is not in ``fieldnames``
            (the ``csv.DictWriter`` default).
    """
    today = datetime.today().strftime('%Y%m%d')
    filename = 'LCA_' + today + '.csv'
    with open(filename, 'a', newline='', encoding='utf-8') as f:
        # Use the argument, not a same-named global, so the function works
        # wherever it is called from.
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)

### Extract and process the data

In [5]:
# Scrape each firm's results page in turn and append its records to the CSV.
for url in urls:
    data, field_names = extract_data(url)
    if not data:
        # The page produced no records, so nothing is written for it.
        continue
    save_data(data, field_names)