In [1]:
import json
import requests
import time
import random
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

In [3]:
# URL template for a single company page of the ranking. The two `{}`
# placeholders are filled, in order, with the year and the company slug
# (see the `.format(year, company)` calls in get_company_urls).
BASE_URL_COMPANY_GET = "https://fortune.com/worlds-most-admired-companies/{}/{}/"

## Get companies: crawl the ranking pages to collect company slugs

In [4]:
def get_company_urls(base_url, year, company, known_companies=None):
    """Collect company slugs by following the "next" pagination of a ranking page.

    Opens ``base_url.format(year, company)`` in a Chrome browser and keeps
    clicking the "next" button. After the initial load and after every click,
    the second-to-last ``/``-separated piece of the browser URL is recorded as
    a company slug (the URL template ends with a trailing slash).

    Parameters
    ----------
    base_url : str
        URL template with two ``{}`` placeholders, filled with ``year`` and a
        company slug, in that order.
    year : int or str
        Year inserted into the URL.
    company : str
        Slug of the page the crawl starts from.
    known_companies : sequence of str, optional
        Slugs to restart from. Every time five clicks have landed on an
        already-collected slug, the browser is pointed at a randomly chosen
        entry of this sequence. When ``None`` or empty, no restart happens and
        the crawl simply keeps clicking.

    Returns
    -------
    list of str
        The distinct slugs, in the order they were first seen.

    Raises
    ------
    Exception
        Any error other than a missing "next" button propagates to the caller;
        the browser is closed first.
    """
    # Local import keeps this cell self-contained; selenium is already a
    # dependency of the notebook.
    from selenium.common.exceptions import NoSuchElementException

    next_button_class = 'singlePagination__next2--3D89W'

    def current_slug():
        # The page URL ends with "/", so the slug is the second-to-last piece.
        return driver.current_url.split('/')[-2]

    driver = webdriver.Chrome(ChromeDriverManager().install())
    companies = []
    double_hit = 0
    try:
        driver.get(base_url.format(year, company))
        companies.append(current_slug())
        while True:
            # Random pause between page interactions.
            time.sleep(random.randint(0, 3))
            if double_hit == 5:
                print(f'==> 5 times double hit, last company: {company}')
                double_hit = 0
                # Truthiness check: an empty sequence has nothing to pick from.
                if known_companies:
                    driver.get(base_url.format(year, random.choice(known_companies)))
                    company = current_slug()
                    if company not in companies:
                        companies.append(company)
            try:
                button = driver.find_element_by_class_name(next_button_class)
            except NoSuchElementException:
                # Only a missing button means the end of the pagination.
                print('No next page available')
                break
            button.click()
            company = current_slug()
            if company not in companies:
                companies.append(company)
            else:
                # The click led to a slug that was already collected.
                double_hit += 1
    finally:
        # Always release the browser, also when the crawl fails half-way.
        driver.quit()

    return companies

In [5]:
# Crawl the 2020 ranking, starting at the 'apple' page, and keep the slugs found.
companies2020 = get_company_urls(BASE_URL_COMPANY_GET, 2020, 'apple')
# Disabled 2019 crawl: it would pass the 2020 slugs as restart points and then
# merge the slugs of both years into one de-duplicated list.
# companies2019 = get_company_urls(BASE_URL_COMPANY_GET, 2019, 'apple', companies2020)
# merged_companies = list(set([*companies2020, *companies2019]))

[WDM] - Current google-chrome version is 89.0.4389
[WDM] - Get LATEST driver version for 89.0.4389
[WDM] - Driver [/Users/simonvreugdenhil/.wdm/drivers/chromedriver/mac64/89.0.4389.23/chromedriver] found in cache




No next page available


## Get data: scrape the metrics table of every company page

In [6]:
def get_company_data(companies, year):
    """Scrape the metrics table of every company page for one year.

    For each slug the company page is downloaded and every ``<tr>`` that has
    at least two ``<td>`` cells is read as ``(metric name, metric value)``.

    Value parsing, applied to the text of the second cell:

    * if the text contains a number, the first number found is stored
      (``float`` when it has a decimal point, otherwise ``int``; a leading
      ``+``/``-`` sign is kept);
    * the text ``'-'`` is stored as ``None``;
    * any other text is stored unchanged.

    Parameters
    ----------
    companies : iterable of str
        Company slugs to insert into the page URL.
    year : int or str
        Year to insert into the page URL.

    Returns
    -------
    dict
        Maps the page title (the part before ``' |'``) to a dict of
        ``{metric name: metric value}``. When a page has no ``<title>``
        the slug is used as key instead. Pages that share the same title
        overwrite each other.
    """
    baseurl = "https://fortune.com/worlds-most-admired-companies/{}/{}/"
    # The optional sign is grouped so it applies to integers as well as decimals.
    number_pattern = re.compile(r'[-+]?(?:\d*\.\d+|\d+)')

    companydict = {}
    for c in companies:
        response = requests.get(baseurl.format(year, c))
        soup = BeautifulSoup(response.content, 'html.parser')

        metricdict = {}
        for row in soup.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) < 2:
                # Rows without a name/value pair carry no metric; skip them
                # instead of failing on the missing cell.
                continue
            metric_name = cells[0].text
            raw_value = cells[1].text

            match = number_pattern.search(raw_value)
            if match is not None:
                number = match.group(0)
                metric_value = float(number) if '.' in number else int(number)
            elif raw_value == '-':
                metric_value = None
            else:
                metric_value = raw_value
            metricdict[metric_name] = metric_value

        title = soup.find('title')
        # Fall back to the slug so a page without <title> does not abort the run.
        company_name = title.text.split(' |')[0] if title is not None else c
        companydict[company_name] = metricdict

    return companydict

In [9]:
# Scrape the 2020 metrics for every crawled slug and cache the result as JSON,
# so the pandas conversion further down can run without scraping again.
companydict2020 = get_company_data(companies2020, 2020)

with open(f'../data/fortune/f500_reputation_2020.json', mode='w') as json_out:
    json.dump(companydict2020, json_out)

# Disabled 2019 counterpart (needs `merged_companies` from the disabled crawl):
# companydict2019 = get_company_data(merged_companies, 2019)
# with open(f'../data/fortune/f500_reputation_2019.json', 'w') as outfile:
#     json.dump(companydict2019, outfile)

## Convert the scraped JSON to a pandas DataFrame and export as CSV (2020)

In [10]:
import csv
import json
import pandas as pd

In [11]:
# Reload the cached 2020 scrape from disk.
with open(f'../data/fortune/f500_reputation_2020.json', mode='r') as cached_json:
    companydict2020 = json.load(cached_json)

In [12]:
# One single-row frame per company: a 'company' column followed by one column
# per metric. The rows are stacked afterwards; metrics a company lacks end up
# as missing values.
frames = []
for company_name, metrics in companydict2020.items():
    company_row = pd.DataFrame({'company': [company_name]})
    for metric_name, metric_value in metrics.items():
        company_row[metric_name] = metric_value
    frames.append(company_row)

# Every single-row frame carries index 0, hence the fresh 0..n-1 index.
dfcompany2020 = pd.concat(frames).reset_index(drop=True)

In [13]:
# Keep only the companies that have an 'Overall Score' value.
dfcompany2020 = dfcompany2020[~dfcompany2020['Overall Score'].isnull()]

In [14]:
# Write the 2020 table to CSV without the index column; every non-numeric
# field is wrapped in double quotes.
dfcompany2020.to_csv(
    f'../data/fortune/f500_reputation_2020.csv',
    quotechar='"',
    quoting=csv.QUOTE_NONNUMERIC,
    index=False,
)

## Convert the scraped JSON to a pandas DataFrame and export as CSV (2019)

In [17]:
import csv
import json
import pandas as pd

In [18]:
# Reload the cached 2019 scrape from disk.
# NOTE(review): the only code in this notebook that writes this file is
# commented out, so the file must already exist from an earlier run - confirm.
with open(f'../data/fortune/f500_reputation_2019.json', mode='r') as cached_json:
    companydict2019 = json.load(cached_json)

In [19]:
# One single-row frame per company: a 'company' column followed by one column
# per metric. The rows are stacked afterwards; metrics a company lacks end up
# as missing values.
frames = []
for company_name, metrics in companydict2019.items():
    company_row = pd.DataFrame({'company': [company_name]})
    for metric_name, metric_value in metrics.items():
        company_row[metric_name] = metric_value
    frames.append(company_row)

# Every single-row frame carries index 0, hence the fresh 0..n-1 index.
dfcompany2019 = pd.concat(frames).reset_index(drop=True)

In [20]:
# Keep only the companies that have an 'Overall Score' value.
dfcompany2019 = dfcompany2019[~dfcompany2019['Overall Score'].isnull()]

In [None]:
# Write the 2019 table to CSV without the index column; every non-numeric
# field is wrapped in double quotes.
dfcompany2019.to_csv(
    f'../data/fortune/f500_reputation_2019.csv',
    quotechar='"',
    quoting=csv.QUOTE_NONNUMERIC,
    index=False,
)