# Scrape data from Fortune500.com

This program reads in html files downloaded from Fortune500.com and creates a Pandas DataFrame with information on assets, revenues, and profits for the top 500 companies (ranked by total revenues for their respective fiscal years).

## Imports

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import platform

## Write out requirements file

Before moving on, generate a requirements file with details about computing environment and versions of imported dependencies

In [50]:
# Write out details about computing environment and dependencies
platform.platform()

'Windows-10-10.0.17134-SP0'

In [55]:
pd.__version__

'0.17.1'

In [56]:
re.__version__

'2.2.1'

In [48]:
import pkg_resources
import types

def get_imports():
    for name, val in globals().items():
        if isinstance(val, types.ModuleType):
            # print (name, val)
            # Split ensures you get root package, 
            # not just imported function
            name = val.__name__.split(".")[0]

        elif isinstance(val, type):
            name = val.__module__.split(".")[0]

        yield name

imports = list(set(get_imports()))

print (imports)

# The only way I found to get the version of the root package
# from only the name of the package is to cross-check the names 
# of installed packages vs. imported packages
requirements = []
for m in pkg_resources.working_set:
    # print (m)
    if m.project_name in imports:
        requirements.append((m.project_name, m.version))

for r in requirements:
    print("{} == {}".format(*r))

['__spec__', '_i47', '_15', '_ii', '_i13', 'r', '_i24', '_i31', '__doc__', 'platform', '_9', '_dh', '_i21', '_i26', '_i33', '_i', 'IPython', 'get_ipython', 'infile', '_i4', '__package__', '_i36', '_i43', '__name__', '_', '_i19', 'requests', 'm', 'i', '_i7', '_i37', '_i44', '_i22', '_i45', '_iii', '_i25', 'filename', 'get_imports', 'bs4', 'Out', '_i34', '_i5', 'types', 're', '_i42', '_i3', '_i32', 'exit', '_i15', '_i23', '_i17', '_i11', 'quit', '_i29', '_i40', '__loader__', '_13', '_i16', '_i38', '_i18', '_oh', 'requirements', '_i46', 'pkg_resources', 'In', '_i28', '_i41', '_i39', '_i14', '_i27', '_i10', '_i1', '_ih', 'html', '_i6', '___', '_i12', '__', 'builtins', '_33', '_i8', '_i20', 'os', '_i9', 'company_revenues', '_i35', 'imports', '_i30', '_28', 'year', '_i48', '_i2', 'pandas']
bs4 == 0.0.1
pandas == 0.17.1
requests == 2.9.1


### A Failed Attempt

This cell would be used if I could figure out how to get the full html file to download with the requests library. As is, the Fortune.com site only partially loads the data in the html file, and then fills it further when the page has been scrolled to the bottom.

In [None]:
#fortune_url = "http://fortune.com/fortune500/"
#year = 2015
#fortune_suffix = "/list/filtered?sortBy=profits&first500"
#page = requests.get(fortune_url + str(year) + fortune_suffix)

### Some Success...

In [None]:
vars = ['assets', 'revenues', 'profits']

In [25]:
filename = "input\\Fortune500_profits_2018.html"

with open(filename, encoding='utf8') as infile:
    html = BeautifulSoup(infile, "html.parser")

In [30]:
# Extract a BeautifulSoup ResultSet
rank = html.findAll("span", {"class": "column small-2 company-rank "}) # Note space after "company-rank"
rank

[<span class="column small-2 company-rank " data-reactid=".maircifqjs.2.0.5.1:2.0.0.0.0.4.1:$0-company-list-item-Apple.0.0" style="padding:0;">4</span>,
 <span class="column small-2 company-rank " data-reactid=".maircifqjs.2.0.5.1:2.0.0.0.0.4.1:$1-company-list-item-Berkshire Hathaway.0.0" style="padding:0;">3</span>,
 <span class="column small-2 company-rank " data-reactid=".maircifqjs.2.0.5.1:2.0.0.0.0.4.1:$2-company-list-item-Verizon.0.0" style="padding:0;">16</span>,
 <span class="column small-2 company-rank " data-reactid=".maircifqjs.2.0.5.1:2.0.0.0.0.4.1:$3-company-list-item-AT&amp;T.0.0" style="padding:0;">9</span>,
 <span class="column small-2 company-rank " data-reactid=".maircifqjs.2.0.5.1:2.0.0.0.0.4.1:$4-company-list-item-JPMorgan Chase.0.0" style="padding:0;">20</span>,
 <span class="column small-2 company-rank " data-reactid=".maircifqjs.2.0.5.1:2.0.0.0.0.4.1:$5-company-list-item-Comcast.0.0" style="padding:0;">33</span>,
 <span class="column small-2 company-rank " data-r

In [31]:
result_set = html.findAll("span", {"class": "column small-5 company-revenue"}) # Note: not necessarily revenues!
result_set

[<span class="column small-5 company-revenue" data-reactid=".maircifqjs.2.0.5.1:2.0.0.0.0.4.1:$0-company-list-item-Apple.0.2" style="padding:0;">$48,351.0</span>,
 <span class="column small-5 company-revenue" data-reactid=".maircifqjs.2.0.5.1:2.0.0.0.0.4.1:$1-company-list-item-Berkshire Hathaway.0.2" style="padding:0;">$44,940.0</span>,
 <span class="column small-5 company-revenue" data-reactid=".maircifqjs.2.0.5.1:2.0.0.0.0.4.1:$2-company-list-item-Verizon.0.2" style="padding:0;">$30,101.0</span>,
 <span class="column small-5 company-revenue" data-reactid=".maircifqjs.2.0.5.1:2.0.0.0.0.4.1:$3-company-list-item-AT&amp;T.0.2" style="padding:0;">$29,450.0</span>,
 <span class="column small-5 company-revenue" data-reactid=".maircifqjs.2.0.5.1:2.0.0.0.0.4.1:$4-company-list-item-JPMorgan Chase.0.2" style="padding:0;">$24,441.0</span>,
 <span class="column small-5 company-revenue" data-reactid=".maircifqjs.2.0.5.1:2.0.0.0.0.4.1:$5-company-list-item-Comcast.0.2" style="padding:0;">$22,714.0</

In [91]:
revenues_list = []

for tag in company_revenues:
    # Convert BeautifulSoup Tag object to a string
    str_tag = str(tag)  
    
    company = str_tag[str_tag.index('-company-list-item-')+19 : str_tag.index('.0.2"')]
    # Clean up a few company names
    company = re.sub('amp;', '', company) # '&' (AT&T, Procter & Gamble)
    company = re.sub('=1', '.', company) # '.' (Amazon.com, J.P. Morgan Chase)
    
    rank = str_tag[str_tag.index(':$')+2 : str_tag.index('-company-list-item-')] 
    # Convert rank to integer and add 1 to start from 1 instead of 0
    rank = int(rank) + 1
    
    revenues = str_tag[str_tag.index('>$')+2 : str_tag.index('</span>')]
    # Drop ',' and turn string into integer
    revenues = int(re.sub(',','', revenues))
    
    revenues_list.append([year, rank, company, revenues])

In [99]:
columns = ['year', 'rank', 'company_name', 'revenues']
revenues_df = pd.DataFrame(revenues_list, columns=columns)

In [100]:
revenues_df

Unnamed: 0,year,rank,company_name,revenues
0,2015,1,Walmart,482130
1,2015,2,Exxon Mobil,246204
2,2015,3,Apple,233715
3,2015,4,Berkshire Hathaway,210821
4,2015,5,McKesson,181241
5,2015,6,UnitedHealth Group,157107
6,2015,7,CVS Health,153290
7,2015,8,General Motors,152356
8,2015,9,Ford Motor,149558
9,2015,10,AT&T,146801


In [93]:
company_titles = html.findAll("span", {"class": "company-title"})
#company_titles

[<span class="column small-5 company-title" data-reactid=".jxp0luhqis.2.0.5.1:2.0.0.0.0.4.1:$0-company-list-item-Walmart.0.1">Walmart</span>,
 <span class="column small-5 company-title" data-reactid=".jxp0luhqis.2.0.5.1:2.0.0.0.0.4.1:$1-company-list-item-Exxon Mobil.0.1">Exxon Mobil</span>,
 <span class="column small-5 company-title" data-reactid=".jxp0luhqis.2.0.5.1:2.0.0.0.0.4.1:$2-company-list-item-Apple.0.1">Apple</span>,
 <span class="column small-5 company-title" data-reactid=".jxp0luhqis.2.0.5.1:2.0.0.0.0.4.1:$3-company-list-item-Berkshire Hathaway.0.1">Berkshire Hathaway</span>,
 <span class="column small-5 company-title" data-reactid=".jxp0luhqis.2.0.5.1:2.0.0.0.0.4.1:$4-company-list-item-McKesson.0.1">McKesson</span>,
 <span class="column small-5 company-title" data-reactid=".jxp0luhqis.2.0.5.1:2.0.0.0.0.4.1:$5-company-list-item-UnitedHealth Group.0.1">UnitedHealth Group</span>,
 <span class="column small-5 company-title" data-reactid=".jxp0luhqis.2.0.5.1:2.0.0.0.0.4.1:$6-com