In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
def update_list(df, url):
    """
    Scrape one Fortune archive page and append its company names to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Existing single-column frame to extend. Its column labels are reused
        for the newly scraped rows (get_F500 sets the label to the year).
    url : str
        Address of one page of the Fortune 500 archive
        (http://archive.fortune.com/magazines/fortune/fortune500_archive/full/).

    Returns
    -------
    pandas.DataFrame
        A new frame holding the rows of ``df`` followed by the scraped
        companies, re-indexed from 0 so the row position tracks the ranking.

    Raises
    ------
    AssertionError
        If the page does not answer with HTTP 200, or if the slice of links
        does not contain exactly 100 entries (i.e. the page layout differs
        from what the slicing below expects).

    Limitations
    -----------
    The ``[5:-11]`` slice is specific to the layout of the Fortune archive
    pages; any other site (or a layout change) will trip the length check.
    """

    # A timeout keeps one stalled page from hanging the whole scrape.
    r = requests.get(url, timeout=30)
    assert r.status_code == requests.codes.ok, \
        'unexpected HTTP status {} for {}'.format(r.status_code, url)

    # Name the parser explicitly so the parse does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(r.text, 'html.parser')

    # Text of every link on the page; includes some extraneous entries.
    entries = [wrapper.text for wrapper in soup.find_all('a')]

    # Drop the leading and trailing links that are not company names.
    companies = entries[5:-11]
    assert len(companies) == 100, \
        'expected 100 companies at {}, found {}'.format(url, len(companies))

    # Build the page's frame with the same column labels as df.
    F5 = pd.DataFrame(data=companies, columns=df.columns)
    # Index.equals gives a single bool; `==` would give an element-wise array.
    assert df.columns.equals(F5.columns), 'column labels diverged'
    assert df.shape[1] == F5.shape[1], 'column count diverged'

    # ignore_index renumbers rows 0..n-1 so the position keeps the ranking.
    df = pd.concat([df, F5], ignore_index=True)

    return df

In [3]:
def get_F500(year):
    """
    Build a one-column DataFrame of Fortune 500 company names for ``year``.

    The archive splits each year's list over five pages of 100 companies.
    This assembles the five page urls and feeds them in order to
    update_list, which scrapes each page and appends it to the frame.

    Returns a (500, 1) DataFrame whose single column is labelled with the
    year as a string; rows keep the order in which the site lists them
    (by revenue, according to the original author's note).
    """
    year_label = str(year)
    archive_root = 'http://archive.fortune.com/magazines/fortune/fortune500_archive/full/'

    # Page one lives at <root><year>; the rest at <root><year>/<first rank>.html
    page_urls = [archive_root + year_label]
    page_urls += [
        archive_root + year_label + '/' + str(first_rank) + '.html'
        for first_rank in (101, 201, 301, 401)
    ]
    assert len(page_urls) == 5

    # Start from an empty frame so update_list has something to extend.
    df = pd.DataFrame(columns=[year_label])
    for url in page_urls:
        df = update_list(df, url)

    assert df.shape == (500, 1)

    return df

In [4]:
# Scrape every archived year and place the yearly rankings side by side,
# one column per year, one row per rank position.
#
# The loop covers 1955 through 2005 inclusive: the archive's records stop
# there.
# TODO: write a new scraper for the newer Fortune website to cover later years.
FIRST_YEAR = 1955
LAST_YEAR = 2005

# Collect the per-year frames first and concatenate once, rather than
# re-copying an ever-wider frame on every iteration.
yearly_frames = [get_F500(year) for year in range(FIRST_YEAR, LAST_YEAR + 1)]
base_df = pd.concat(yearly_frames, axis=1)

In [5]:
# Preview the first five rank positions across all scraped years.
base_df.head()

Unnamed: 0,1955,1956,1957,1958,1959,1960,1961,1962,1963,1964,...,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005
0,General Motors,General Motors,General Motors,General Motors,General Motors,General Motors,General Motors,General Motors,General Motors,General Motors,...,General Motors,General Motors,General Motors,General Motors,General Motors,Exxon Mobil,Wal-Mart Stores,Wal-Mart Stores,Wal-Mart Stores,Wal-Mart Stores
1,Exxon Mobil,Exxon Mobil,Exxon Mobil,Exxon Mobil,Exxon Mobil,Exxon Mobil,Exxon Mobil,Exxon Mobil,Exxon Mobil,Exxon Mobil,...,Ford Motor,Ford Motor,Ford Motor,Ford Motor,Wal-Mart Stores,Wal-Mart Stores,Exxon Mobil,General Motors,Exxon Mobil,Exxon Mobil
2,U.S. Steel,Ford Motor,Ford Motor,Ford Motor,Ford Motor,Ford Motor,Ford Motor,Ford Motor,Ford Motor,Ford Motor,...,Exxon Mobil,Exxon Mobil,Exxon Mobil,Wal-Mart Stores,Exxon Mobil,General Motors,General Motors,Exxon Mobil,General Motors,General Motors
3,General Electric,U.S. Steel,U.S. Steel,U.S. Steel,General Electric,General Electric,General Electric,General Electric,General Electric,General Electric,...,Wal-Mart Stores,Wal-Mart Stores,Wal-Mart Stores,Exxon Mobil,Ford Motor,Ford Motor,Ford Motor,Ford Motor,Ford Motor,Ford Motor
4,Esmark,Chrysler,General Electric,General Electric,U.S. Steel,U.S. Steel,U.S. Steel,Mobil,Mobil,Mobil,...,AT&T,General Electric,General Electric,General Electric,General Electric,General Electric,Enron,General Electric,General Electric,General Electric


In [112]:
# Persist the combined table; the row index is written as the first column.
base_df.to_csv(path_or_buf='base_df.csv')