# _Faces of Fortune_

Determining the average face for the executive boards of each of the top 25 Fortune 500 companies.

In [186]:
import os
import time
from collections import Counter
from urllib.parse import quote_plus, urljoin, urlparse

import cv2
import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from skimage import io

# Start the Selenium browser

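The Fortune 500 page is rendered client-side with React, so the table is not present in the raw HTML that `requests` + BeautifulSoup would see. We drive a real browser with Selenium instead.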

In [24]:
# Start a new instance of Chrome.
# NOTE: the chromedriver path below is machine-specific; adjust it as needed.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--no-sandbox')
# Pass the options through `options=`; the older `chrome_options=` keyword is
# deprecated in Selenium 3.8+ and emits a DeprecationWarning.
browser = webdriver.Chrome('/usr/local/bin/chromedriver', options=chrome_options)
url = "https://fortune.com/fortune500/search/?"
browser.get(url)
time.sleep(5)  # wait for the page to finish loading

# Scroll down to list of endpoints (necessary?)
elem = browser.find_element_by_tag_name("body")
no_of_pagedowns = 8
for _page_down in range(no_of_pagedowns):
    elem.send_keys(Keys.PAGE_DOWN)
    time.sleep(1)

time.sleep(5)
print("Done.")

  after removing the cwd from sys.path.


Waiting for endpoints to load...


# Get the Fortune 500 page for each company 

In [141]:
# Use the text of the table's header cells as column names, plus one extra
# trailing column that will hold each company's profile link
name = "searchResults__columnTitle--1Brf4"
header = browser.find_elements_by_class_name(name)
columns = [*(col.text for col in header), "URL_FORBES"]

In [142]:
# Get each row of the table: collect every element carrying the "rt-tr-group"
# class (presumably one element per company row — confirm against the page).
rows = browser.find_elements_by_class_name("rt-tr-group")

In [None]:
# Turn every table row into a {column name: cell text} record
link_name = "searchResults__cellWrapper--39MAj"
data = []
for n, row in enumerate(rows, 1):
    # Progress report on the first row and on every 20th row
    if n == 1 or n % 20 == 0:
        print(f"({n}/{len(rows)})")

    values = [
        cell.text
        for cell in row.find_elements_by_css_selector("div[role='gridcell']")
    ]

    # The company's link goes last so it pairs with the final column name
    values.append(row.find_element_by_class_name(link_name).get_attribute("href"))
    data.append(dict(zip(columns, values)))

# Build a DataFrame indexed by rank and persist it to disk
df = pd.DataFrame(data).set_index("RANK")
df.to_csv("Fortune100.csv")
df.head()

# Load each company's web page

In [4]:
# Reload the scraped table from disk, using the rank column as the index
df = pd.read_csv("./Fortune100.csv", index_col="RANK")
df.head()

Unnamed: 0_level_0,NAME,REVENUES ($M),REVENUE PERCENT CHANGE,PROFITS ($M),PROFITS PERCENT CHANGE,ASSETS ($M),"MARKET VALUE — AS OF MARCH 29, 2019 ($M)",CHANGE IN RANK (FULL 1000),EMPLOYEES,CHANGE IN RANK (500 ONLY),URL_FORBES
RANK,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Walmart,"$514,405.0",2.8%,"$6,670.0",-32.4%,"$219,295.0","$279,880.3",-,2200000,-,https://fortune.com/fortune500/2019/walmart
2,Exxon Mobil,"$290,212.0",18.8%,"$20,840.0",5.7%,"$346,196.0","$342,172.0",-,71000,-,https://fortune.com/fortune500/2019/exxon-mobil
3,Apple,"$265,595.0",15.9%,"$59,531.0",23.1%,"$365,725.0","$895,667.4",1,132000,1,https://fortune.com/fortune500/2019/apple
4,Berkshire Hathaway,"$247,837.0",2.4%,"$4,021.0",-91.1%,"$707,794.0","$493,870.3",-1,389000,-1,https://fortune.com/fortune500/2019/berkshire-...
5,Amazon.com,"$232,887.0",30.9%,"$10,073.0",232.1%,"$162,648.0","$874,709.5",3,647500,3,https://fortune.com/fortune500/2019/amazon-com


## Get the company website for each company

In [227]:
# For every company profile page, take the first hyperlink found inside the
# elements carrying the CSS classes in `name` as the company's own website.
name = "dataTable__value--3n5tL dataTable__valueAlignLeft--3uvNx"
company_urls = []
for num_url, url_forbes in enumerate(df["URL_FORBES"]):
    if num_url % 20 == 0:
        print(url_forbes)
    # A timeout keeps one stalled request from hanging the whole scrape, and
    # raise_for_status() stops an HTTP error page from being parsed as data.
    page = requests.get(url_forbes, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "lxml")

    anchors = (cell.find("a") for cell in soup.find_all(class_=name))
    first_anchor = next((a for a in anchors if a), None)
    if first_anchor is None:
        # Fail with a message that names the offending page instead of an
        # anonymous IndexError.
        raise ValueError(f"No company website link found on {url_forbes}")
    company_url = first_anchor.get("href")
    company_urls.append(company_url)
    time.sleep(1)  # pause between requests

# Update the DataFrame
df["URL_COMPANY"] = company_urls
df.to_csv("Fortune100.csv")

https://fortune.com/fortune500/2019/walmart
https://fortune.com/fortune500/2019/general-electric
https://fortune.com/fortune500/2019/ups
https://fortune.com/fortune500/2019/pfizer
https://fortune.com/fortune500/2019/oracle


# Identify each company's leadership page

In [235]:
def get_google_search_url(company):
    """Build the Google search URL used to look for a company's leadership page.

    The company name is URL-encoded before being placed in the query string,
    so names containing reserved characters (e.g. the "&" in "AT&T") are sent
    as part of the search term instead of cutting the query short.

    Parameters
    ----------
    company : str
        Company name as it appears in the "NAME" column.

    Returns
    -------
    str
        A ``https://www.google.com/search?q=...`` URL.
    """
    return f"https://www.google.com/search?q={quote_plus(company)}+corporate+leadership+page"

In [268]:
# Build one Google search URL per company name and store it next to the row
df["SEARCH_URL"] = df["NAME"].map(get_google_search_url)

In [319]:
# Derive two helper columns from each company's website URL:
#   URL_DOMAIN         - the host's last label with a leading dot, e.g. ".com"
#   URL_COMPANY_SUFFIX - the host's last two labels, e.g. "walmart.com"
# The host is taken from the parsed URL rather than by splitting the raw
# string on ".", so trailing slashes, paths that contain dots and hosts
# without a "www." label no longer distort the result.
def _company_host(company_url):
    """Return the lower-cased host name of `company_url` ('' if it has none)."""
    # Without a "//" marker urlparse treats "www.example.com" as a path.
    if "//" not in company_url:
        company_url = "//" + company_url
    return urlparse(company_url).hostname or ""

df["URL_DOMAIN"] = df["URL_COMPANY"].apply(lambda x: "." + _company_host(x).rsplit(".", 1)[-1])
# Heuristic: the last two host labels identify the company's own domain.
df["URL_COMPANY_SUFFIX"] = df["URL_COMPANY"].apply(lambda x: ".".join(_company_host(x).split(".")[-2:]))

In [None]:
# Get each company's (likely) leadership page from Google searches
leadership_urls = []
cols = ["NAME", "URL_COMPANY_SUFFIX", "SEARCH_URL"]
for rank, (company, suffix, url) in df[cols].iterrows():
    if rank % 20 == 0 or rank == 1:
        print(f"({rank:2.0f}/{df.index.max()}): {company}")

    # Open the Google search page in Selenium
    browser.get(url)
    time.sleep(3)
    hits = browser.find_elements_by_class_name("bkWMgd")

    # Keep the first hit whose link contains the company's own domain suffix
    leadership_url = None
    for hit in hits:
        try:
            r = hit.find_element_by_class_name("r")
            hit_url = r.find_element_by_css_selector("a").get_attribute("href")
        except Exception:
            # This hit has no usable result link; skip it. Catching Exception
            # (not a bare except) still lets Ctrl-C interrupt the scrape.
            continue
        # get_attribute() returns None when the anchor carries no href
        if hit_url and suffix in hit_url:
            leadership_url = hit_url
            break
    leadership_urls.append(leadership_url)

# Update the DataFrame
df["URL_LEADERSHIP"] = leadership_urls
df.to_csv("Fortune100.csv")

In [348]:
# Leadership pages filled in by hand for the ranks we missed (keyed by rank)
col = "URL_LEADERSHIP"
manual_leadership_urls = {
    4: "https://www.berkshirehathawayhs.com/pages/about",
    9: "https://investors.att.com/corporate-governance/leadership",
    # Might need manual work
    12: "https://media.ford.com/content/fordmedia/fna/us/en/people.filter.company-officers.0.50.html",
    # Need to Google image search the names
    15: "https://abc.xyz/investor/other/board/#",
    # Need to Google image search the names
    20: "http://ir.kroger.com/management-and-directors",
    # Requires manual work
    32: "https://www.cmcsa.com/corporate-governance/executive-officers",
}
for missed_rank, manual_url in manual_leadership_urls.items():
    df.loc[missed_rank, col] = manual_url

# Scrape all images from each company's leadership page

In [411]:
# Determine the roots for each of the leadership URLs (for image paths later)
def get_leadership_url_root(row):
    """Return the site root ("scheme://host") of a row's leadership URL.

    Parameters
    ----------
    row : mapping
        Must provide "URL_LEADERSHIP"; "URL_DOMAIN" is only read as a
        fallback when the leadership URL has no scheme.

    Returns
    -------
    str or None
        The root of the leadership URL, or None when the row has no
        leadership URL (None / NaN / empty) or no root can be derived.

    Raises
    ------
    KeyError
        If a required column is missing from `row`; this points to a broken
        pipeline, so it is no longer silently turned into None.
    """
    url = row["URL_LEADERSHIP"]
    # Missing values arrive as None or NaN, neither of which is a string
    if not isinstance(url, str) or not url:
        return None

    # Preferred path: read the root straight from the parsed URL, so it does
    # not matter whether the company's own TLD occurs in the leadership URL.
    parsed = urlparse(url)
    if parsed.scheme and parsed.netloc:
        return f"{parsed.scheme}://{parsed.netloc}"

    # Fallback for scheme-less URLs: cut at the last occurrence of the domain
    domain = row["URL_DOMAIN"]
    if isinstance(domain, str) and domain and domain in url:
        return url.rsplit(domain, 1)[0] + domain
    return None

# Compute the site root row by row (axis=1 hands each row to the helper)
df["URL_LEADERSHIP_ROOT"] = df.apply(get_leadership_url_root, axis=1)

In [414]:
# Preview the leadership URL of the first, every 20th and the N-th company
# among the top N (nothing is fetched here; the URLs were collected earlier).
# NOTE: later cells read `url` and `root` without assigning them, so they
# depend on the values this loop leaves behind from its final iteration.
N = 25
cols = ["NAME", "URL_LEADERSHIP", "URL_LEADERSHIP_ROOT"]
for rank, (company, url, root) in df.head(N)[cols].iterrows():
    # Compare against N rather than a second hard-coded 25
    if rank % 20 == 0 or rank == 1 or rank == N:
        print(f"({rank:2.0f}/{df.index.max()}): {company}")
        print(url)

( 1/100): Walmart
https://corporate.walmart.com/our-story/leadership
(20/100): Kroger
http://ir.kroger.com/management-and-directors
(25/100): Bank of America
https://about.bankofamerica.com/en-us/who-we-are/our-leadership.html


In [357]:
# Open the company's leadership page in the Selenium browser and pause while
# it loads.
# NOTE(review): `url` is not assigned in this cell — presumably it is the value
# left over from the preview loop; confirm before running cells out of order.
browser.get(url)
time.sleep(3)

In [358]:
# Load the page's HTML for parsing.
# A timeout prevents an unresponsive server from hanging the notebook, and
# raise_for_status() avoids parsing an HTTP error page as if it were content.
page = requests.get(url, timeout=30)
page.raise_for_status()
# Name the parser explicitly (the same one used for the company profile pages)
# so the result does not depend on which parsers happen to be installed.
soup = BeautifulSoup(page.content, "lxml")

In [433]:
# Label for output files: the name of the company ranked N, spaces -> underscores
company = df.loc[N, "NAME"].replace(" ", "_")

In [419]:
# Download every <img> on the leadership page, keeping its URL, pixel array
# and array shape
images, sizes = {}, []
skipped_links = []
for n, img in enumerate(soup.find_all('img')):
    # An <img> without alt text yields None; fall back to a generated label
    # instead of crashing on None.replace()
    alt = (img.get("alt") or "").replace(" ", "_")
    alt = alt if alt else f"{company}_image_{n}"

    # Some <img> tags carry no src at all; there is nothing to download
    src = img.get('src')
    if not src:
        print(f"Couldn't read: '{alt}', skipping.")
        continue

    # Resolve src against the page URL so absolute, protocol-relative,
    # root-relative and page-relative paths all produce a valid image URL
    path_to_image = urljoin(url, src)
    try:
        # Reverse the last axis; presumably RGB -> BGR for saving with OpenCV
        image = io.imread(path_to_image)[..., ::-1]
    except Exception:
        # Best effort: log and move on, but let Ctrl-C through
        print(f"Couldn't read: '{alt}', skipping.")
        skipped_links.append(path_to_image)
        continue

    # Keep the image URL, image array, and image size
    print(alt)
    images[alt] = [path_to_image, image]
    sizes.append(image.shape)
print(f"Downloaded {len(images)} images.")

Couldn't read: 'Bank_of_America_logo', skipping.
Khan
Supporting_people_with_disabilities_
Buzzing_city
Buzzing_city
Supporting_people_with_disabilities_
Report_Center
US
US
US
Bank_of_America_resources
Supporting_people_with_disabilities_
Child_smiling
Service_member
We're_a_company_with_over_200_years_of_leadership_experience.
Delivered_$17_billion_toward_renewable_and_cleaner_energy_initiatives
Brian_T._Moynihan,_Chairman_of_the_Board,_Chief_Executive_Officer
Dean_Athanasia,_President_of_Consumer_and_Small_Business
Catherine_Bessant,_Chief_Operations_and_Technology_Officer
Sheri_B._Bronstein,_Chief_Human_Resources_Officer
Paul_M._Donofrio
Anne_Finucane,_Vice_Chairman
Geoffrey_Greener,_Chief_Risk_Officer
Christine_Katziff,_Chief_Audit_Executive
Katy_Knox,_President_of_Bank_of_America_Private_Bank
David_Leitch
Thomas_Montag,_Chief_Operating_Officer
Thong_Nguyen,_Vice_Chairman
Andy_Sieg,_President,_Merrill_Lynch_Wealth_Management
Andrea_B._Smith,_Chief_Administrative_Officer
Bruce_R._T

In [428]:
# Tally the downloaded images' shapes and aspect ratios to find the dominant ones
def get_ratio(x):
    """Return 1000 * x[0] / x[1], rounded, so ratios can be compared exactly."""
    return round(1000 * x[0] / x[1])

shape_counts = Counter(s[:2] for s in sizes)
ratio_counts = Counter(map(get_ratio, sizes))
most_common_shape = shape_counts.most_common(1)[0][0]
most_common_ratio = ratio_counts.most_common(1)[0][0]
print(f"Most common shape: {most_common_shape}")
print(f"Most common ratio: {most_common_ratio}")

Most common shape: (454, 686)
Most common ratio: 662


In [429]:
def shape_or_ratio_is_good(imshape):
    """Tell whether `imshape` matches the most common shape or the most common ratio.

    The ratio is only computed when the shape itself does not already match.
    """
    if imshape == most_common_shape:
        return True
    return get_ratio(imshape) == most_common_ratio

In [437]:
# Only save images of the most common size or ratio
# cv2.imwrite does not create missing folders, so make sure the target exists
os.makedirs("images", exist_ok=True)
num_saved = 0
for label, image in images.items():
    # Each entry is [image URL, pixel array]
    array = image[1]
    if shape_or_ratio_is_good(array.shape[:2]):
        fp = f"images/{company}_{label}.jpg"
        # imwrite reports failure through its return value rather than by
        # raising, so only count the files it actually wrote
        if cv2.imwrite(fp, array):
            num_saved += 1
        else:
            print(f"Couldn't write: '{fp}', skipping.")
print(f"Saved {num_saved} out of {len(images)} images.")

Saved 15 out of 29 images.
