In [137]:
import csv
import re

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [417]:
def parse_data(company, timeout=30):
    """
    Fetch the text of a company's Glassdoor page.

    The company name is turned into a Glassdoor review-search URL. If the
    search page contains a result link (an ``<a class="tightAll h2">`` element
    with an ``href``), that link is followed and the linked page is used;
    otherwise the search response itself is used as the company page (the
    site appears to redirect straight to the profile when there is only one
    result).

    Input:
        company: company name, type: string
        timeout: seconds to wait for each HTTP request, type: number
                 (optional, default 30)

    Returns:
        The text of the page's ``<body class="main">`` element, or the string
        "not found" when that element is missing or the page carries
        Glassdoor's apparent placeholder for missing data.

    Raises:
        Exceptions raised by ``requests.get`` (connection errors, timeouts)
        are not caught here and propagate to the caller.
    """
    base_url = "https://www.glassdoor.com"
    headers = {'User-Agent': 'Mozilla/5.0'}

    # Put data into the right format for the query: words joined by dashes,
    # followed by the length of the dashed keyword.
    query = company.replace(' ', '-')
    search_url = (base_url + "/Reviews/" + query + "-reviews-SRCH_KE0,"
                  + str(len(query)) + ".htm")

    # A timeout keeps one unresponsive request from hanging a whole batch run.
    search_page = requests.get(search_url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(search_page.content, 'html.parser')

    # First search result, if the response is a list of results
    result_link = soup.find("a", {"class": "tightAll h2"}, href=True)

    if result_link is None:
        # No result link: treat the response as the company page itself
        company_info = soup
    else:
        # Go to the company profile
        company_page = requests.get(base_url + result_link['href'],
                                    headers=headers, timeout=timeout)
        company_info = BeautifulSoup(company_page.content, 'html.parser')

    # Company info to be further processed. The element can be absent (for
    # example on an unexpected page layout); report that with the same
    # sentinel callers already check for instead of failing on ``None.text``.
    body = company_info.find("body", {"class": "main"})
    if body is None:
        return "not found"
    info = body.text

    # This is Glassdoor's encoding for missing data it seems
    if "The Fun Ones" in info:
        return "not found"

    return info
    

In [292]:
def employee_size(info):
    """
    Extract size of employer based on the number of employees.

    Looks at the first ``"employees":`` key in ``info`` and reads the complete
    upper-case token that directly follows it (optional whitespace and an
    optional opening quote are skipped).

    Input: page text as returned by parse_data, type: string

    Returns the size category (e.g. 'SMALL_TO_MEDIUM', 'GIANT', 'UNKNOWN') or
    "not found" when the key is absent or its value is not a known category.
    """
    key = '"employees":'
    # Options for sizes
    emp_sizes = {'SMALL_TO_MEDIUM', 'MEDIUM_TO_LARGE', 'LARGE_TO_GIANT',
                 'UNKNOWN', 'MEDIUM', 'SMALL', 'LARGE', 'GIANT'}

    key_pos = info.find(key)
    if key_pos == -1:
        return "not found"

    # Read the whole value token instead of substring-searching a fixed-width
    # window: a window can cut a value short (so 'SMALL_TO_ME...' would match
    # 'SMALL') or pick up a category that belongs to the next field.
    value = re.compile(r'\s*"?([A-Z_]+)').match(info, key_pos + len(key))
    if value is not None and value.group(1) in emp_sizes:
        return value.group(1)

    return "not found"
        


In [368]:
def revenue_size(info):
    """
    Extract revenue size of employer.

    Looks at the first ``"annualRevenue":`` key in ``info`` and reads the
    complete upper-case token that directly follows it (optional whitespace
    and an optional opening quote are skipped).

    Input: page text as returned by parse_data, type: string

    Returns the revenue category (e.g. 'ONE_TO_FIVE_M', 'MORE_THAN_TEN_B',
    'UNKNOWN') or "not found" when the key is absent or its value is not a
    known category.
    """
    key = '"annualRevenue":'
    # Options for sizes
    revenue_sizes = {"LESS_THAN_ONE_M", "ONE_TO_FIVE_M", "FIVE_TO_TEN_M",
                     "TEN_TO_TWENTYFIVE_M", "TWENTYFIVE_TO_FIFTY_M",
                     "FIFTY_TO_ONEHUNDRED_M", "ONEHUNDRED_TO_FIVEHUNDRED_M",
                     "FIVEHUNDRED_M_TO_ONE_B", "ONE_TO_TWO_B", "TWO_TO_FIVE_B",
                     "FIVE_TO_TEN_B", "MORE_THAN_TEN_B", "UNKNOWN"}

    key_pos = info.find(key)
    if key_pos == -1:
        return "not found"

    # Read the whole value token instead of substring-searching a fixed-width
    # window: a window can cut a long value short or pick up a category that
    # belongs to the next field.
    value = re.compile(r'\s*"?([A-Z_]+)').match(info, key_pos + len(key))
    if value is not None and value.group(1) in revenue_sizes:
        return value.group(1)

    return "not found"
        

In [430]:
def _clean_company_name(company):
    """Drop the standalone word 'inc', commas and periods, then tidy whitespace."""
    # Whole-word match only, so names that merely contain the letters
    # (e.g. 'lincoln') are left intact. Matching stays case-sensitive.
    company = re.sub(r'\binc\b', '', company)
    company = company.translate(str.maketrans('', '', ',.'))
    # Removing the suffix can leave leading/trailing or doubled spaces, which
    # would otherwise become stray dashes in the query URL.
    return ' '.join(company.split())


def obtain_info(company_list):
    '''
    Takes in a list of companies (strings) and returns a list of lists
    in the form: [company, employee_size, revenue_size]

    Each name is cleaned before the lookup (the standalone word 'inc', commas
    and periods are removed and whitespace is normalised); the cleaned name is
    what appears in the output. employee_size and revenue_size are "not found"
    when parse_data reports no data for the company.

    Prints an error and returns None if the input is not a list.
    '''
    if not isinstance(company_list, list):
        print('Error: Input must be a list')
        return None

    out = []
    for company in company_list:
        print("Processing: ", company)
        company = _clean_company_name(company)
        company_data = parse_data(company)
        if company_data == "not found":
            employee_count = "not found"
            revenue = "not found"
        else:
            employee_count = employee_size(company_data)
            revenue = revenue_size(company_data)
        out.append([company, employee_count, revenue])

    return out

#### Obtain results and write data into a csv

In [130]:
# Load the distributor list; the file is read as ISO-8859-1 (Latin-1) text.
data = pd.read_csv(
    'distributors.csv',
    encoding="ISO-8859-1",
)

In [395]:
# Unique company names in sorted order. Missing values are dropped first:
# NaN is a float and cannot be ordered against strings by sorted().
companies = sorted(data.Company.dropna().unique())

In [429]:
# Look up every company; this issues HTTP requests per company and prints
# a progress line for each one.
results = obtain_info(company_list=companies)

In [432]:
# Write one row per company: [company, employee_size, revenue_size].
# newline='' lets the csv module control line endings itself.
with open('results.csv', 'w', newline='') as f:
    csv.writer(f).writerows(results)