# Webscraping from Wikipedia - List of S&P 500 Companies
---

## Import libraries and modules

In [1]:
# Basic libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

## Step 1: Create a soup object from the home page

In [2]:
# Use the requests library to get the html from the home page
res = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')

# Create a soup object from the html
soup = bs(res.content, 'lxml')

## Step 2: Isolate the table that has all the S&P 500 companies

In [3]:
table = soup.find('table', {'class': 'wikitable sortable'})

## Step 3: Looping through each rows in the tbody of the table

In [4]:
# Start with empty lists
tickers = []
securities = []
sectors = []
ciks = []

for row in table.find_all('tr')[1:]:
    # We'll use almost all the <td /> tags for each row, might as well create a variable
    cells = row.find_all('td')
    
    # Add ticker symbols to our list of ticker
    tickers.append(cells[0].text)
    
    # Add name of security to our list of securities
    securities.append(cells[1].text)
    
    # Add Global Industry Classification Standard (GICS) Sector to our list of sectors
    sectors.append(cells[4].text)
    
    # Add Central Index Key (CIK) to our list of CIKs
    ciks.append(cells[7].text)

## Step 4: Get rid of the `\n` at the end of the texts by using map, lambda and strip

In [5]:
tickers

['MMM\n',
 'ABT\n',
 'ABBV\n',
 'ABMD\n',
 'ACN\n',
 'ATVI\n',
 'ADBE\n',
 'AMD\n',
 'AAP\n',
 'AES\n',
 'AFL\n',
 'A\n',
 'APD\n',
 'AKAM\n',
 'ALK\n',
 'ALB\n',
 'ARE\n',
 'ALXN\n',
 'ALGN\n',
 'ALLE\n',
 'AGN\n',
 'ADS\n',
 'LNT\n',
 'ALL\n',
 'GOOGL\n',
 'GOOG\n',
 'MO\n',
 'AMZN\n',
 'AMCR\n',
 'AEE\n',
 'AAL\n',
 'AEP\n',
 'AXP\n',
 'AIG\n',
 'AMT\n',
 'AWK\n',
 'AMP\n',
 'ABC\n',
 'AME\n',
 'AMGN\n',
 'APH\n',
 'ADI\n',
 'ANSS\n',
 'ANTM\n',
 'AON\n',
 'AOS\n',
 'APA\n',
 'AIV\n',
 'AAPL\n',
 'AMAT\n',
 'APTV\n',
 'ADM\n',
 'ARNC\n',
 'ANET\n',
 'AJG\n',
 'AIZ\n',
 'T\n',
 'ATO\n',
 'ADSK\n',
 'ADP\n',
 'AZO\n',
 'AVB\n',
 'AVY\n',
 'BKR\n',
 'BLL\n',
 'BAC\n',
 'BK\n',
 'BAX\n',
 'BDX\n',
 'BRK.B\n',
 'BBY\n',
 'BIIB\n',
 'BLK\n',
 'BA\n',
 'BKNG\n',
 'BWA\n',
 'BXP\n',
 'BSX\n',
 'BMY\n',
 'AVGO\n',
 'BR\n',
 'BF.B\n',
 'CHRW\n',
 'COG\n',
 'CDNS\n',
 'CPB\n',
 'COF\n',
 'CPRI\n',
 'CAH\n',
 'KMX\n',
 'CCL\n',
 'CAT\n',
 'CBOE\n',
 'CBRE\n',
 'CDW\n',
 'CE\n',
 'CNC\n',
 'CNP\

In [6]:
securities

['3M Company',
 'Abbott Laboratories',
 'AbbVie Inc.',
 'ABIOMED Inc',
 'Accenture plc',
 'Activision Blizzard',
 'Adobe Inc.',
 'Advanced Micro Devices Inc',
 'Advance Auto Parts',
 'AES Corp',
 'AFLAC Inc',
 'Agilent Technologies Inc',
 'Air Products & Chemicals Inc',
 'Akamai Technologies Inc',
 'Alaska Air Group Inc',
 'Albemarle Corp',
 'Alexandria Real Estate Equities',
 'Alexion Pharmaceuticals',
 'Align Technology',
 'Allegion',
 'Allergan, plc',
 'Alliance Data Systems',
 'Alliant Energy Corp',
 'Allstate Corp',
 'Alphabet Inc Class A',
 'Alphabet Inc Class C',
 'Altria Group Inc',
 'Amazon.com Inc.',
 'Amcor plc',
 'Ameren Corp',
 'American Airlines Group',
 'American Electric Power',
 'American Express Co',
 'American International Group',
 'American Tower Corp.',
 'American Water Works Company Inc',
 'Ameriprise Financial',
 'AmerisourceBergen Corp',
 'AMETEK Inc.',
 'Amgen Inc.',
 'Amphenol Corp',
 'Analog Devices, Inc.',
 'ANSYS',
 'Anthem',
 'Aon plc',
 'A.O. Smith Corp'

In [7]:
sectors

['Industrial Conglomerates',
 'Health Care Equipment',
 'Pharmaceuticals',
 'Health Care Equipment',
 'IT Consulting & Other Services',
 'Interactive Home Entertainment',
 'Application Software',
 'Semiconductors',
 'Automotive Retail',
 'Independent Power Producers & Energy Traders',
 'Life & Health Insurance',
 'Health Care Equipment',
 'Industrial Gases',
 'Internet Services & Infrastructure',
 'Airlines',
 'Specialty Chemicals',
 'Office REITs',
 'Biotechnology',
 'Health Care Supplies',
 'Building Products',
 'Pharmaceuticals',
 'Data Processing & Outsourced Services',
 'Electric Utilities',
 'Property & Casualty Insurance',
 'Interactive Media & Services',
 'Interactive Media & Services',
 'Tobacco',
 'Internet & Direct Marketing Retail',
 'Paper Packaging',
 'Multi-Utilities',
 'Airlines',
 'Electric Utilities',
 'Consumer Finance',
 'Property & Casualty Insurance',
 'Specialized REITs',
 'Water Utilities',
 'Asset Management & Custody Banks',
 'Health Care Distributors',
 'Elec

In [8]:
ciks

['0000066740',
 '0000001800',
 '0001551152',
 '0000815094',
 '0001467373',
 '0000718877',
 '0000796343',
 '0000002488',
 '0001158449',
 '0000874761',
 '0000004977',
 '0001090872',
 '0000002969',
 '0001086222',
 '0000766421',
 '0000915913',
 '0001035443',
 '0000899866\n',
 '0001097149',
 '0001579241\n',
 '0001578845\n',
 '0001101215\n',
 '0000352541\n',
 '0000899051\n',
 '0001652044\n',
 '0001652044\n',
 '0000764180',
 '0001018724\n',
 '0001748790',
 '0001002910\n',
 '0000006201\n',
 '0000004904',
 '0000004962\n',
 '0000005272\n',
 '0001053507\n',
 '0001410636\n',
 '0000820027\n',
 '0001140859\n',
 '0001037868\n',
 '0000318154\n',
 '0000820313\n',
 '0000006281\n',
 '0001013462\n',
 '0001156039\n',
 '0000315293\n',
 '0000091142\n',
 '0000006769\n',
 '0000922864\n',
 '0000320193\n',
 '0000006951\n',
 '0001521332\n',
 '0000007084\n',
 '0000004281',
 '0001596532',
 '0000354190\n',
 '0001267238\n',
 '0000732717',
 '0000731802',
 '0000769397\n',
 '0000008670\n',
 '0000866787\n',
 '0000915912\

In [9]:
tickers = list(map(lambda s: s.strip(), tickers))
securities = list(map(lambda s: s.strip(), securities))
sectors = list(map(lambda s: s.strip(), sectors))
ciks = list(map(lambda s: s.strip(), ciks))

In [10]:
tickers

['MMM',
 'ABT',
 'ABBV',
 'ABMD',
 'ACN',
 'ATVI',
 'ADBE',
 'AMD',
 'AAP',
 'AES',
 'AFL',
 'A',
 'APD',
 'AKAM',
 'ALK',
 'ALB',
 'ARE',
 'ALXN',
 'ALGN',
 'ALLE',
 'AGN',
 'ADS',
 'LNT',
 'ALL',
 'GOOGL',
 'GOOG',
 'MO',
 'AMZN',
 'AMCR',
 'AEE',
 'AAL',
 'AEP',
 'AXP',
 'AIG',
 'AMT',
 'AWK',
 'AMP',
 'ABC',
 'AME',
 'AMGN',
 'APH',
 'ADI',
 'ANSS',
 'ANTM',
 'AON',
 'AOS',
 'APA',
 'AIV',
 'AAPL',
 'AMAT',
 'APTV',
 'ADM',
 'ARNC',
 'ANET',
 'AJG',
 'AIZ',
 'T',
 'ATO',
 'ADSK',
 'ADP',
 'AZO',
 'AVB',
 'AVY',
 'BKR',
 'BLL',
 'BAC',
 'BK',
 'BAX',
 'BDX',
 'BRK.B',
 'BBY',
 'BIIB',
 'BLK',
 'BA',
 'BKNG',
 'BWA',
 'BXP',
 'BSX',
 'BMY',
 'AVGO',
 'BR',
 'BF.B',
 'CHRW',
 'COG',
 'CDNS',
 'CPB',
 'COF',
 'CPRI',
 'CAH',
 'KMX',
 'CCL',
 'CAT',
 'CBOE',
 'CBRE',
 'CDW',
 'CE',
 'CNC',
 'CNP',
 'CTL',
 'CERN',
 'CF',
 'SCHW',
 'CHTR',
 'CVX',
 'CMG',
 'CB',
 'CHD',
 'CI',
 'CINF',
 'CTAS',
 'CSCO',
 'C',
 'CFG',
 'CTXS',
 'CLX',
 'CME',
 'CMS',
 'KO',
 'CTSH',
 'CL',
 'CMCSA',
 'CMA

In [11]:
securities

['3M Company',
 'Abbott Laboratories',
 'AbbVie Inc.',
 'ABIOMED Inc',
 'Accenture plc',
 'Activision Blizzard',
 'Adobe Inc.',
 'Advanced Micro Devices Inc',
 'Advance Auto Parts',
 'AES Corp',
 'AFLAC Inc',
 'Agilent Technologies Inc',
 'Air Products & Chemicals Inc',
 'Akamai Technologies Inc',
 'Alaska Air Group Inc',
 'Albemarle Corp',
 'Alexandria Real Estate Equities',
 'Alexion Pharmaceuticals',
 'Align Technology',
 'Allegion',
 'Allergan, plc',
 'Alliance Data Systems',
 'Alliant Energy Corp',
 'Allstate Corp',
 'Alphabet Inc Class A',
 'Alphabet Inc Class C',
 'Altria Group Inc',
 'Amazon.com Inc.',
 'Amcor plc',
 'Ameren Corp',
 'American Airlines Group',
 'American Electric Power',
 'American Express Co',
 'American International Group',
 'American Tower Corp.',
 'American Water Works Company Inc',
 'Ameriprise Financial',
 'AmerisourceBergen Corp',
 'AMETEK Inc.',
 'Amgen Inc.',
 'Amphenol Corp',
 'Analog Devices, Inc.',
 'ANSYS',
 'Anthem',
 'Aon plc',
 'A.O. Smith Corp'

In [12]:
sectors

['Industrial Conglomerates',
 'Health Care Equipment',
 'Pharmaceuticals',
 'Health Care Equipment',
 'IT Consulting & Other Services',
 'Interactive Home Entertainment',
 'Application Software',
 'Semiconductors',
 'Automotive Retail',
 'Independent Power Producers & Energy Traders',
 'Life & Health Insurance',
 'Health Care Equipment',
 'Industrial Gases',
 'Internet Services & Infrastructure',
 'Airlines',
 'Specialty Chemicals',
 'Office REITs',
 'Biotechnology',
 'Health Care Supplies',
 'Building Products',
 'Pharmaceuticals',
 'Data Processing & Outsourced Services',
 'Electric Utilities',
 'Property & Casualty Insurance',
 'Interactive Media & Services',
 'Interactive Media & Services',
 'Tobacco',
 'Internet & Direct Marketing Retail',
 'Paper Packaging',
 'Multi-Utilities',
 'Airlines',
 'Electric Utilities',
 'Consumer Finance',
 'Property & Casualty Insurance',
 'Specialized REITs',
 'Water Utilities',
 'Asset Management & Custody Banks',
 'Health Care Distributors',
 'Elec

In [13]:
ciks

['0000066740',
 '0000001800',
 '0001551152',
 '0000815094',
 '0001467373',
 '0000718877',
 '0000796343',
 '0000002488',
 '0001158449',
 '0000874761',
 '0000004977',
 '0001090872',
 '0000002969',
 '0001086222',
 '0000766421',
 '0000915913',
 '0001035443',
 '0000899866',
 '0001097149',
 '0001579241',
 '0001578845',
 '0001101215',
 '0000352541',
 '0000899051',
 '0001652044',
 '0001652044',
 '0000764180',
 '0001018724',
 '0001748790',
 '0001002910',
 '0000006201',
 '0000004904',
 '0000004962',
 '0000005272',
 '0001053507',
 '0001410636',
 '0000820027',
 '0001140859',
 '0001037868',
 '0000318154',
 '0000820313',
 '0000006281',
 '0001013462',
 '0001156039',
 '0000315293',
 '0000091142',
 '0000006769',
 '0000922864',
 '0000320193',
 '0000006951',
 '0001521332',
 '0000007084',
 '0000004281',
 '0001596532',
 '0000354190',
 '0001267238',
 '0000732717',
 '0000731802',
 '0000769397',
 '0000008670',
 '0000866787',
 '0000915912',
 '0000008818',
 '0001701605',
 '0000009389',
 '0000070858',
 '00013907

## Step 5: Creating a Pandas DataFrame from our list of S&P 500 companies

In [14]:
# Create a DataFrame from our lists
df = pd.DataFrame({
    'ticker': tickers,
    'security': securities,
    'sector': sectors,
    'cik': ciks
})
df.head()

Unnamed: 0,ticker,security,sector,cik
0,MMM,3M Company,Industrial Conglomerates,66740
1,ABT,Abbott Laboratories,Health Care Equipment,1800
2,ABBV,AbbVie Inc.,Pharmaceuticals,1551152
3,ABMD,ABIOMED Inc,Health Care Equipment,815094
4,ACN,Accenture plc,IT Consulting & Other Services,1467373


## Step 6: Export to csv

**Note**: Don't export the index column from our DataFrame

In [15]:
# Export to csv
df.to_csv('../data/sp500.csv', index=False)