# _Faces of Fortune_

Determining the average face for the executive boards of each of the top 25 Fortune 500 companies.

In [452]:
import json
import os
import pickle
import time
from collections import Counter
from urllib.parse import quote_plus

import cv2
import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.common.keys import Keys
from skimage import io

# Start the Selenium browser

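The Fortune 500 page is rendered client-side with React, so fetching the raw HTML and parsing it with BeautifulSoup is not enough; we drive a real browser with Selenium instead.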

In [24]:
# Launch a fresh Chrome session and open the Fortune 500 search page
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--no-sandbox')
browser = webdriver.Chrome(
    '/usr/local/bin/chromedriver',
    chrome_options=chrome_options,
)
url = "https://fortune.com/fortune500/search/?"
browser.get(url)
time.sleep(5)  # fixed pause after the initial page load

# Press PAGE_DOWN eight times, one second apart, to scroll down the page
# (the original note questions whether this step is actually necessary)
elem = browser.find_element_by_tag_name("body")
for no_of_pagedowns in reversed(range(8)):
    elem.send_keys(Keys.PAGE_DOWN)
    time.sleep(1)

time.sleep(5)
print("Done.")

  after removing the cwd from sys.path.


Waiting for endpoints to load...


# Get the Fortune 500 page for each company 

In [141]:
# Column titles are the text of the table's header cells
name = "searchResults__columnTitle--1Brf4"
header = browser.find_elements_by_class_name(name)
columns = list(map(lambda title_cell: title_cell.text, header))
# One extra column for the per-company link collected while reading the rows
columns += ["URL_FORBES"]

In [142]:
# Get each row of the table: every element carrying the "rt-tr-group" class
# in the page currently loaded in `browser`
rows = browser.find_elements_by_class_name("rt-tr-group")

In [None]:
# Turn every table row into a {column name: cell text} record
link_name = "searchResults__cellWrapper--39MAj"
data = []
for n, row in enumerate(rows, 1):
    # Progress report on the first row and on every 20th row
    if n == 1 or n % 20 == 0:
        print(f"({n}/{len(rows)})")

    # Cell texts, in the order the grid cells appear in the row
    values = [
        cell.text
        for cell in row.find_elements_by_css_selector("div[role='gridcell']")
    ]

    # Last value: the href of the row's link element (stored under URL_FORBES)
    values.append(row.find_element_by_class_name(link_name).get_attribute("href"))
    data.append(dict(zip(columns, values)))

# Build the DataFrame indexed by rank, save it, and preview the first rows
fortune = pd.DataFrame(data).set_index("RANK")
fortune.to_csv("Fortune100.csv")
fortune.head()

# Load each company's web page

In [4]:
# Reload the saved table from disk with the RANK column as its index
fortune = pd.read_csv("./Fortune100.csv", index_col="RANK")
fortune.head()

Unnamed: 0_level_0,NAME,REVENUES ($M),REVENUE PERCENT CHANGE,PROFITS ($M),PROFITS PERCENT CHANGE,ASSETS ($M),"MARKET VALUE — AS OF MARCH 29, 2019 ($M)",CHANGE IN RANK (FULL 1000),EMPLOYEES,CHANGE IN RANK (500 ONLY),URL_FORBES
RANK,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Walmart,"$514,405.0",2.8%,"$6,670.0",-32.4%,"$219,295.0","$279,880.3",-,2200000,-,https://fortune.com/fortune500/2019/walmart
2,Exxon Mobil,"$290,212.0",18.8%,"$20,840.0",5.7%,"$346,196.0","$342,172.0",-,71000,-,https://fortune.com/fortune500/2019/exxon-mobil
3,Apple,"$265,595.0",15.9%,"$59,531.0",23.1%,"$365,725.0","$895,667.4",1,132000,1,https://fortune.com/fortune500/2019/apple
4,Berkshire Hathaway,"$247,837.0",2.4%,"$4,021.0",-91.1%,"$707,794.0","$493,870.3",-1,389000,-1,https://fortune.com/fortune500/2019/berkshire-...
5,Amazon.com,"$232,887.0",30.9%,"$10,073.0",232.1%,"$162,648.0","$874,709.5",3,647500,3,https://fortune.com/fortune500/2019/amazon-com


## Get the company website for each company

In [227]:
# Class attribute of the profile-page elements searched for the website link
name = "dataTable__value--3n5tL dataTable__valueAlignLeft--3uvNx"
company_urls = []
for num_url, url_forbes in enumerate(fortune["URL_FORBES"]):
    if num_url % 20 == 0:
        print(url_forbes)

    # A timeout keeps one stalled connection from hanging the whole scrape,
    # and raise_for_status() surfaces HTTP errors instead of parsing an error page.
    page = requests.get(url_forbes, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "lxml")

    # The company website is the first anchor found inside a matching element
    anchors = (element.find("a") for element in soup.find_all(class_=name))
    anchor = next((a for a in anchors if a), None)
    if anchor is None:
        # Fail with the offending page named, rather than a bare IndexError
        raise ValueError(f"No company website link found on {url_forbes}")
    company_urls.append(anchor.get("href"))
    time.sleep(1)  # pause between requests

# Update the DataFrame
fortune["URL_COMPANY"] = company_urls
fortune.to_csv("Fortune100.csv")

https://fortune.com/fortune500/2019/walmart
https://fortune.com/fortune500/2019/general-electric
https://fortune.com/fortune500/2019/ups
https://fortune.com/fortune500/2019/pfizer
https://fortune.com/fortune500/2019/oracle


In [367]:
# Reload the saved table (now including URL_COMPANY), indexed by RANK
fortune = pd.read_csv("./Fortune100.csv", index_col="RANK")

# Identify each company's leadership page

In [384]:
def get_google_search_url(company):
    """Build the Google search URL used to look up a company's leadership page.

    The company name is URL-encoded together with the fixed search terms, so
    reserved characters stay inside the ``q`` parameter. Without encoding, the
    ``&`` in a name such as "AT&T" would end the query after "AT".

    Args:
        company: Company name as plain text (e.g. "Exxon Mobil").

    Returns:
        The search URL as a string, with words joined by ``+``.
    """
    query = quote_plus(f"{company} corporate leadership page")
    return f"https://www.google.com/search?q={query}"

In [385]:
# One Google search URL per company, built from its name
fortune["SEARCH_URL"] = fortune["NAME"].apply(get_google_search_url)

# Two pieces derived from each company URL:
#   URL_DOMAIN         -> "." followed by whatever comes after the LAST dot
#   URL_COMPANY_SUFFIX -> everything that comes after the FIRST dot
site_urls = fortune["URL_COMPANY"]
fortune["URL_DOMAIN"] = site_urls.apply(lambda x: "." + x.rpartition(".")[2])
fortune["URL_COMPANY_SUFFIX"] = site_urls.apply(lambda x: x.partition(".")[2])
fortune.to_csv("./Fortune100.csv")

In [None]:
# Get each company's (likely) leadership page from Google searches
leadership_urls = []
cols = ["NAME", "URL_COMPANY_SUFFIX", "SEARCH_URL"]
for rank, (company, suffix, url) in fortune[cols].iterrows():
    if rank % 20 == 0 or rank == 1:
        print(f"({rank:2.0f}/{fortune.index.max()}): {company}")

    # Open the Google search page in Selenium
    browser.get(url)
    time.sleep(3)
    hits = browser.find_elements_by_class_name("bkWMgd")

    # Keep the first hit whose link contains the company's URL suffix
    leadership_url = None
    for hit in hits:
        try:
            r = hit.find_element_by_class_name("r")
            hit_url = r.find_element_by_css_selector("a").get_attribute("href")
        except Exception:
            # This hit has no "r" element or no link inside it: skip it.
            # `Exception` (not a bare except) so Ctrl-C still stops the loop.
            continue
        # The href can be missing (None); only test real URLs against the suffix
        if hit_url and suffix in hit_url:
            leadership_url = hit_url
            break
    leadership_urls.append(leadership_url)  # stays None when no hit matched

# Update the DataFrame
fortune["URL_LEADERSHIP"] = leadership_urls
fortune.to_csv("Fortune100.csv")

In [373]:
# Leadership pages filled in by hand for the ranks the search missed
col = "URL_LEADERSHIP"
manual_leadership_urls = {
    4: "https://www.berkshirehathawayhs.com/pages/about",
    9: "https://investors.att.com/corporate-governance/leadership",
    # Might need manual work
    12: "https://media.ford.com/content/fordmedia/fna/us/en/people.filter.company-officers.0.50.html",
    # Need to Google image search the names
    15: "https://abc.xyz/investor/other/board/#",
    # Need to Google image search the names
    20: "http://ir.kroger.com/management-and-directors",
    # Requires manual work
    32: "https://www.cmcsa.com/corporate-governance/executive-officers",
}
for missed_rank, missed_url in manual_leadership_urls.items():
    fortune.loc[missed_rank, col] = missed_url
fortune.to_csv("./Fortune100.csv")

In [387]:
# Determine the roots for each of the leadership URLs (for image paths later)
def get_leadership_url_root(row):
    """Return the leadership URL cut off right after its domain ending.

    The URL is split at the last occurrence of ``row["URL_DOMAIN"]`` and the
    domain ending is appended back, e.g.
    ``"https://corporate.walmart.com/our-story/leadership"`` with ``".com"``
    gives ``"https://corporate.walmart.com"``.

    Args:
        row: Mapping or pandas row with "URL_LEADERSHIP" and "URL_DOMAIN".

    Returns:
        The root URL, or None when either value is missing (None/NaN), is not
        a string, or the domain ending is empty.

    Raises:
        KeyError: If either key is absent from ``row``. A missing column is a
            setup error and is no longer silently converted to None.
    """
    url = row["URL_LEADERSHIP"]
    domain = row["URL_DOMAIN"]
    # Explicit checks replace the former bare `except:`; an empty domain is
    # rejected because str.rsplit refuses an empty separator.
    if not isinstance(url, str) or not isinstance(domain, str) or not domain:
        return None
    return url.rsplit(domain, 1)[0] + domain

In [388]:
# Compute the leadership URL root row by row, then persist the table
fortune["URL_LEADERSHIP_ROOT"] = fortune.apply(get_leadership_url_root, axis=1)
fortune.to_csv("./Fortune100.csv")

# Scrape all images from each company's leadership page

In [431]:
# https://stackoverflow.com/questions/52633697/selenium-python-how-to-capture-network-traffics-response
class ImageScraper(object):
    """Lists the image URLs a page loads, read from Chrome's performance log.

    A Chrome instance is started with performance logging enabled. After a
    page is loaded and scrolled, the logged network response events whose
    type is "Image" are collected.

    Attributes set by the methods:
        driver: The Selenium Chrome driver, or None once closed.
        browser_log: Raw performance-log entries from the last page load.
        responses: DataFrame of the image response records from the last
            call to ``get_all_image_links``.
        wait_time: Value passed to the constructor.
    """

    def __init__(self, wait_time=1):
        # Defined before Chrome starts so close()/__del__ stay safe even if
        # starting the browser raises.
        self.driver = None
        self.browser_log = None
        self.responses = None
        # NOTE(review): stored but not read by any method of this class.
        self.wait_time = wait_time # seconds

        # Work on a copy: DesiredCapabilities.CHROME is a shared class-level
        # dict, and editing it in place would change it for every later user.
        caps = DesiredCapabilities.CHROME.copy()
        caps['loggingPrefs'] = {'performance': 'ALL'}
        self.driver = webdriver.Chrome(desired_capabilities=caps)

        # Put the window at the top-left corner with a fixed 1340x900 size
        kwargs = dict(x=0, y=0, width=1340, height=900)
        self.driver.set_window_rect(**kwargs)

    def _load_page(self, url):
        """Load ``url``, scroll through it, and store the performance log."""
        # Load the page twice, clearing cookies before each load
        self.driver.delete_all_cookies()
        self.driver.get(url)
        self.driver.refresh()
        self.driver.delete_all_cookies()
        self.driver.get(url)
        time.sleep(0.2)

        # Scroll to the bottom & top of page, prompting image loads
        elem = self.driver.find_element_by_tag_name("body")
        for _press in range(10):
            elem.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.8)
        time.sleep(0.5)
        # Jump top -> bottom -> top -> bottom
        for pos in 2 * ["0", "document.body.scrollHeight"]:
            scroll = f"window.scrollTo(0, {pos});"
            self.driver.execute_script(scroll)
            time.sleep(0.2)
        time.sleep(0.5)
        self.browser_log = self.driver.get_log('performance')

    def _process_browser_log_entry(self, entry):
        """Decode one log entry's JSON payload and return its inner message."""
        return json.loads(entry['message'])['message']

    def get_all_image_links(self, url):
        """Returns the de-duplicated image URLs loaded by the page.

        Also stores the image response records in ``self.responses`` (one row
        per response, duplicates included). When the page produced no image
        responses, both results are empty and ``self.responses`` still has
        "url" and "mimeType" columns.
        """
        self._load_page(url)

        # Keep the response record of every "Network.response*" event whose
        # type is "Image"; some response events carry no type and are skipped.
        image_responses = []
        for entry in self.browser_log:
            event = self._process_browser_log_entry(entry)
            if 'Network.response' not in event.get('method', ''):
                continue
            params = event.get('params', {})
            if params.get('type') == "Image" and 'response' in params:
                image_responses.append(params['response'])

        responses = pd.DataFrame(image_responses)
        if responses.empty:
            # Keep the columns callers index into, even with nothing found
            responses = pd.DataFrame(columns=['url', 'mimeType'])
        self.responses = responses
        return responses['url'].drop_duplicates()

    def close(self):
        """Quit the browser. Does nothing when it is already closed."""
        driver = getattr(self, "driver", None)
        if driver is None:
            return
        print("Closing browser... ", end="")
        # Cleared first so a later call (e.g. from __del__) is a no-op
        self.driver = None
        driver.quit()
        print("Done.")

    def __del__(self):
        # Best-effort cleanup; a finalizer should never raise
        try:
            self.close()
        except Exception:
            pass

In [427]:
# Helper functions
def shape_or_ratio_is_good(imshape):
    """True when ``imshape`` equals the most common shape or has the most common ratio.

    Reads the module-level ``most_common_shape``, ``most_common_ratio`` and
    ``get_ratio``, which must be defined before this is called.
    """
    if imshape == most_common_shape:
        return True
    return get_ratio(imshape) == most_common_ratio

def okay_to_print(num, always_print=False):
    """Decide whether progress for item ``num`` should be printed.

    Printing is allowed for the first item, for item number ``N`` (a
    module-level value), for every multiple of 5, and otherwise whenever
    ``always_print`` is set.
    """
    if num == 1 or num == N:
        return True
    if num % 5 == 0:
        return True
    return always_print

def mimeType_is_valid(mimeType):
    """True when the part after the last "/" is png, jpg or jpeg (any letter case)."""
    subtype = mimeType.rpartition("/")[2].lower()
    return subtype in ("png", "jpg", "jpeg")

In [432]:
# Create the scraper instance, opening Chrome (the ImageScraper constructor
# starts the browser). The bare name on the last line echoes the object as
# the cell's output.
scraper = ImageScraper()
scraper

<__main__.ImageScraper at 0x134d7a8d0>

In [443]:
# Download and save every image found on the leadership page of each of the
# top N companies
N = 25
cols = ["NAME", "URL_LEADERSHIP", "URL_LEADERSHIP_ROOT"]
all_image_links = {}
skipped_links = []


def get_ratio(x):
    """Height/width of a shape tuple, scaled by 1000 and rounded to an integer."""
    return round(1000 * x[0] / x[1])


def _rgb_to_bgr(image):
    """Reorder a decoded image's channels into the BGR order cv2.imwrite expects.

    Reversing the last axis blindly is only right for 3-channel images: it
    mirrors a 2-D grayscale image left-to-right and turns RGBA into ABGR.
    """
    if image.ndim < 3:
        return image            # grayscale: no channel axis to reorder
    if image.shape[-1] < 3:
        return image[..., 0]    # gray + alpha: keep the gray plane only
    return image[..., 2::-1]    # RGB or RGBA -> BGR (alpha dropped)


for rank, (company, url, root) in fortune.head(N)[cols].iterrows():
    images = {}
    sizes = []
    try:
        company = company.replace(" ", "_")
        if okay_to_print(rank, True):
            print(f"({rank:2.0f}/{N}): {company}\n\t{url}")

        # Get a link to every image on the page
        image_links = scraper.get_all_image_links(url)
        valid_links = scraper.responses['mimeType'].apply(mimeType_is_valid)
        image_links = image_links[valid_links]
        all_image_links[company] = image_links

        # Attempt to read each image
        for n, image_url in enumerate(image_links):
            alt = f"{company}_image_{n:02.0f}"
            try:
                image = _rgb_to_bgr(io.imread(image_url))
            except Exception:
                # Unreadable image: remember the link and move on.
                # `Exception` (not a bare except) so Ctrl-C still stops the run.
                skipped_links.append(image_url)
                continue

            # Keep the image URL, image array, and image size
            images[alt] = [image_url, image]
            sizes.append(image.shape)
        if okay_to_print(rank, True):
            print(f"Downloaded {len(images)} images.")
        if len(images) == 0:
            continue

        # What's the most common image shape and ratio?
        # (module-level names: shape_or_ratio_is_good reads them)
        most_common_shape = Counter([s[:2] for s in sizes]).most_common(1)[0][0]
        most_common_ratio = Counter(list(map(get_ratio, sizes))).most_common(1)[0][0]

        # Create the company images folder, including a missing "./images" parent
        folder = f"./images/{rank:02.0f}_{company}"
        os.makedirs(folder, exist_ok=True)

        # Save all images, but note the ones with questionable ratios
        for label, (image_url, array) in images.items():
            fn = label
            if not shape_or_ratio_is_good(array.shape[:2]):
                fn += "_bad_size"
            fn += ".jpg"
            fp = os.path.join(folder, fn)
            cv2.imwrite(fp, array)

    except Exception as e:
        print(f"Error with '{company}':\n{e}")
    time.sleep(0.5)
# Close the browser
scraper.close()
del(scraper)

# Save the image links
with open(f"fortune_{N}_image_links.pickle", "wb") as outfile:
    pickle.dump(all_image_links, outfile)
print("All done.")

( 1/25): Walmart
	https://corporate.walmart.com/our-story/leadership
Downloaded 54 images.
( 2/25): Exxon_Mobil
	https://corporate.exxonmobil.com/company/who-we-are/management-committee


  "Palette images with Transparency expressed in bytes should be "


Downloaded 17 images.
( 3/25): Apple
	https://www.apple.com/leadership/
Downloaded 19 images.
( 4/25): Berkshire_Hathaway
	https://www.berkshirehathawayhs.com/pages/about
Downloaded 1 images.
( 5/25): Amazon.com
	https://ir.aboutamazon.com/board-of-directors
Downloaded 21 images.
( 6/25): UnitedHealth_Group
	https://www.unitedhealthgroup.com/about/executives.html
Downloaded 3 images.
( 7/25): McKesson
	https://www.mckesson.com/about-mckesson/our-company/executive-officers/
Downloaded 14 images.
( 8/25): CVS_Health
	https://cvshealth.com/about/leadership
Downloaded 16 images.
( 9/25): AT&T
	https://investors.att.com/corporate-governance/leadership
Downloaded 12 images.
(10/25): AmerisourceBergen
	https://www.amerisourcebergen.com/about-our-leadership
Downloaded 9 images.
(11/25): Chevron
	https://www.chevron.com/about/leadership
Downloaded 0 images.
(12/25): Ford_Motor
	https://media.ford.com/content/fordmedia/fna/us/en/people.filter.company-officers.0.50.html
Downloaded 58 images.
(13/