In [137]:
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [160]:
def parse_data(company, timeout=30):
    """
    Parse company info from Glassdoor. Input: company name, type: string

    Searches Glassdoor reviews for the company, follows the first result link
    (an <a class="tightAll h2"> element) and returns the text of the profile
    page's <body class="main"> element.

    timeout: seconds to wait for each HTTP request (passed to requests.get)

    Returns the page text, or the string "not found" when the search page has
    no result link or the profile page has no matching <body> element.
    Network errors and timeouts raised by requests are not caught.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}

    #Percent-encode the name so characters such as ' ', '/', '?' or '#'
    #stay inside the keyword instead of changing the URL structure
    #NOTE(review): the "KE0,4" suffix is hard-coded; confirm whether it should
    #depend on the company name
    search_url = ("https://www.glassdoor.com/Reviews/" + quote(company, safe="")
                  + "-reviews-SRCH_KE0,4.htm")

    #Parse page info
    page = requests.get(search_url, headers=headers, timeout=timeout)

    #Convert into usable format
    soup = BeautifulSoup(page.content, 'html.parser')

    #First search result link (None when the search found nothing)
    link = soup.find("a", {"class": "tightAll h2"}, href=True)
    if link is None:
        return "not found"

    #Go to company profile
    company_page = requests.get("https://www.glassdoor.com" + link['href'],
                                headers=headers, timeout=timeout)

    #Convert into usable format
    company_info = BeautifulSoup(company_page.content, 'html.parser')

    #Company info to be further processed; a page without the expected body
    #is reported like a failed search instead of raising AttributeError
    body = company_info.find("body", {"class": "main"})
    if body is None:
        return "not found"

    return body.text
    

In [114]:
def employee_size(info):
    """
    Extract size of employer based on the number of employees

    Input: text of a company profile page, type: string
    Returns: the first label from emp_sizes found right after the
    '"employees":' marker, or "not found" when the marker or a known
    label is missing.
    """
    marker = '"employees":'
    #Options for sizes; compound labels come first so that e.g.
    #'SMALL_TO_MEDIUM' is reported rather than its substrings 'SMALL'/'MEDIUM'
    emp_sizes = ['SMALL_TO_MEDIUM', 'MEDIUM_TO_LARGE', 'LARGE_TO_GIANT', 'UNKNOWN', 'MEDIUM', 'SMALL', 'LARGE', 'GIANT']

    #Find part of the text containing employee number
    start = info.find(marker)
    #-1 if marker not found
    if start == -1:
        return "not found"

    #Window spans the marker and the longest label, plus a little slack for
    #the surrounding quotes/whitespace
    window = len(marker) + max(len(label) for label in emp_sizes) + 4
    size = info[start:start + window]

    for count in emp_sizes:
        if count in size:
            return count
    #If nothing found
    return "not found"
        


In [116]:
def revenue_size(info):
    """
    Extract revenue size of employer

    Input: text of a company profile page, type: string
    Returns: the first label from revenue_sizes found right after the
    '"annualRevenue":' marker, or "not found" when the marker or a known
    label is missing.
    """
    marker = '"annualRevenue":'
    #Options for sizes
    revenue_sizes = ["LESS_THAN_ONE_M","ONE_TO_FIVE_M","FIVE_TO_TEN_M","TEN_TO_TWENTYFIVE_M",
                 "TWENTYFIVE_TO_FIFTY_M","FIFTY_TO_ONEHUNDRED_M","ONEHUNDRED_TO_FIVEHUNDRED_M",
                 "FIVEHUNDRED_M_TO_ONE_B","ONE_TO_TWO_B","TWO_TO_FIVE_B","FIVE_TO_TEN_B","MORE_THAN_TEN_B","UNKNOWN"]

    #Find part of the text containing the annual revenue
    start = info.find(marker)
    #-1 if marker not found
    if start == -1:
        return "not found"

    #A fixed 30-character window cuts off most labels (the marker alone is
    #16 characters), so size it from the longest label plus slack for the
    #surrounding quotes/whitespace
    window = len(marker) + max(len(label) for label in revenue_sizes) + 4
    size = info[start:start + window]

    for count in revenue_sizes:
        if count in size:
            return count
    return "not found"
        

In [169]:
def obtain_info(company_list):
    """
    Look up employee and revenue size for every company in company_list.

    Input: iterable of company names
    Returns: list of [company, employee size, revenue size] rows; both size
    fields are "not found" when parse_data reports "not found".
    """
    rows = []
    for name in company_list:
        print("Processing: ", name)
        page_text = parse_data(name)
        if page_text != "not found":
            row = [name, employee_size(page_text), revenue_size(page_text)]
        else:
            #Lookup failed, so there is nothing to extract from
            row = [name, "not found", "not found"]
        rows.append(row)

    return rows

In [130]:
# Load the distributor list from the local CSV, decoding it as ISO-8859-1
data = pd.read_csv('distributors.csv', encoding = "ISO-8859-1")

In [154]:
# Unique company names from the "Company" column
companies = set(data["Company"])

In [170]:
# Scrape every company in `companies`: parse_data issues up to two HTTP
# requests per company and obtain_info prints each name as it starts
results = obtain_info(companies)

Processing:  VERITABLE VEGETABLE
Processing:  ALEX'S MEAT DISTRIBUTORS
Processing:  F.A.P. DISTRIBUTORS
Processing:  JAFCO FOODS, INC
Processing:  L'EPICERIE
Processing:  ACME INTERNATIONAL INC
Processing:  LA ROSA AZZURRA
Processing:  EARTHLINK PRODUCE TRADING CO
Processing:  HIALEAH PRODUCTS/NEW URBAN FARMS
Processing:  ALEX LEE INC.
Processing:  CERAMI SALES COMPANY
Processing:  HARVEST FOOD DISTRIBUTORS
Processing:  GREEN CITY ORGANICS
Processing:  DALE COX DISTRIBUTING
Processing:  SINGH & SINGH DIST.
Processing:  ONESTOP DISTRIBUTORS
Processing:  DAIRY FRESH FOODS
Processing:  SOUTHERN FOODS
Processing:  THE PETERSON COMPANY
Processing:  INTERMARKET GOURMET, LLC
Processing:  SWISS CHALET
Processing:  CAITO FOODS
Processing:  DIVINE SPECIALTIES
Processing:  SYSCO CORP.
Processing:  DOLE & BAILEY INC
Processing:  RUSSELL'S SWEET SUCCESS INC.
Processing:  NASSAU CANDY - SPECIALTY CONFECTIONS & FINE FOODS
Processing:  EASTERN SHORE PRODUCTS
Processing:  BELAIR AND WATERMARK FOODS
Pro

Processing:  NEW STAR 21, INC.
Processing:  HANA FOOD DISTRIBUTORS
Processing:  UNITED NATURAL FOODS, INC. (HONEST GREEN)
Processing:  CLASSIC FOODS
Processing:  BODEK
Processing:  GRECO AND SONS
Processing:  HALPERNS
Processing:  THAYER DISTRIBUTION
Processing:  AGCAL INC
Processing:  ALL ITALIA IMPORTS
Processing:  UBC FOOD DISTRIBUTORS
Processing:  SGS PRODUCE
Processing:  F@I FOOD DISTRIBUTION INC.
Processing:  ANCO FINE CHEESE
Processing:  KEHE DISTRIBUTORS
Processing:  QUALITY FOODS INC.
Processing:  COLVEN UNITED DISTRIBUTORS, INC.
Processing:  UNITED NATURAL FOODS, INC.
Processing:  YORKSHIRE FOOD SALES
Processing:  JJ&K DISTRIBUTORS
Processing:  AGORA FOODS INTERNATIONAL
Processing:  JULIUS SILVERT, INC.
Processing:  PEMBROKE DISTRIBUTION
Processing:  DENVER WHOLESALE
Processing:  INTERNATIONAL GOURMET FOODS
Processing:  PACIFIC COAST PRODUCTS, INC.
Processing:  SYSCO
Processing:  SYSCO SF / EUROPEAN IMPORTS
Processing:  THE CHEFS' WAREHOUSE
Processing:  ALESSI AND SONS LLC
Pr

In [156]:
# Ad-hoc check of the Glassdoor search request for a single company name,
# using the same URL pattern and User-Agent header as parse_data.
# `page` is not read by any later cell visible here.
company = "ALL THE BEST LOCAL"
page = requests.get("https://www.glassdoor.com/Reviews/" + company + "-reviews-SRCH_KE0,4.htm",
                        headers={'User-Agent': 'Mozilla/5.0'})

In [168]:
# Show the collected [company, employee size, revenue size] rows
results