# Capstone Project Part 03: Webscraping from Wikipedia - List of S&P 400 Companies

## Import libraries and modules

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

## Step 1: Create a soup object from the home page

In [2]:
# Download the Wikipedia page that lists the S&P 400 companies.
# The timeout stops the cell from hanging forever on a stalled connection.
res = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_400_companies', timeout=30)

# Raise on a 4xx/5xx response instead of silently parsing an error page
res.raise_for_status()

# Create a soup object from the html
soup = bs(res.content, 'lxml')

## Step 2: Isolate the table that has all the S&P 400 companies

In [3]:
# Grab the first <table> whose class attribute is 'wikitable sortable'
table = soup.find('table', {'class': 'wikitable sortable'})

# find() returns None when nothing matches; stop here with a clear message
# instead of failing later with an AttributeError on None
if table is None:
    raise ValueError("No 'wikitable sortable' table found on the page")

## Step 3: Loop through each row in the tbody of the table

In [4]:
# Start with empty lists
tickers = []
securities = []

# The first <tr> is skipped (presumably the header row)
for row in table.find_all('tr')[1:]:
    # Every value we need lives in a <td /> tag of the row
    cells = row.find_all('td')

    # Rows with fewer than two <td /> cells (e.g. rows made only of <th />)
    # cannot supply both values; skip them instead of raising IndexError
    if len(cells) < 2:
        continue

    # Second cell: ticker symbol
    tickers.append(cells[1].text)

    # First cell: name of the security (text is kept as-is, whitespace included)
    securities.append(cells[0].text)

## Step 4: Strip the trailing `\n` from the scraped text by using map, lambda and strip

In [5]:
# Inspect the scraped ticker symbols before any cleanup
tickers

['AAN',
 'ACHC',
 'ACIW',
 'ADNT',
 'ATGE',
 'ACM',
 'ACC',
 'AEO',
 'AFG',
 'AGCO',
 'ALE',
 'ALEX',
 'AMED',
 'AM',
 'APY',
 'ATI',
 'AMCX',
 'AN',
 'ARW',
 'ASB',
 'ASGN',
 'ASH',
 'ATO',
 'ATR',
 'AVNS',
 'AVT',
 'AYI',
 'AAXN',
 'BBBY',
 'BC',
 'BCO',
 'BDC',
 'BIO',
 'BKH',
 'BLKB',
 'BOH',
 'BRO',
 'BXS',
 'BYD',
 'BHF',
 'EAT',
 'BRX',
 'CABO',
 'CZR',
 'CAKE',
 'CAR',
 'CACI',
 'CASY',
 'CATY',
 'CFX',
 'CBSH',
 'CBT',
 'CC',
 'CDK',
 'CFR',
 'CGNX',
 'CHE',
 'CHDN',
 'CHK',
 'CIEN',
 'CLB',
 'CLGX',
 'CLH',
 'CLI',
 'CMC',
 'CMD',
 'CMP',
 'CNK',
 'CNO',
 'COHR',
 'CONE',
 'COR',
 'CPT',
 'CR',
 'CREE',
 'CRI',
 'CRL',
 'CRS',
 'CRUS',
 'CNX',
 'CSL',
 'CTLT',
 'CUZ',
 'CVLT',
 'CXW',
 'CVET',
 'CW',
 'CBRL',
 'CY',
 'DAN',
 'DCI',
 'DDS',
 'DECK',
 'DEI',
 'DKS',
 'DLPH',
 'DLX',
 'DNKN',
 'DNOW',
 'DPZ',
 'DY',
 'EGP',
 'EPC',
 'ERI',
 'EHC',
 'EME',
 'ENR',
 'ENS',
 'EPR',
 'EQT',
 'ETRN',
 'WTRG',
 'ETSY',
 'EV',
 'EVR',
 'EWBC',
 'EXEL',
 'EXP',
 'FAF',
 'FDS',
 'FCFS',


In [6]:
# Inspect the scraped security names before cleanup (note the trailing '\n')
securities

["Aaron's Inc.\n",
 'Acadia Healthcare\n',
 'ACI Worldwide\n',
 'Adient plc\n',
 'Adtalem Global Education\n',
 'AECOM\n',
 'American Campus Communities\n',
 'American Eagle Outfitters\n',
 'American Financial Group\n',
 'AGCO\n',
 'ALLETE Inc.\n',
 'Alexander & Baldwin\n',
 'Amedisys\n',
 'Antero Midstream\n',
 'Apergy Corp.\n',
 'Allegheny Technologies\n',
 'AMC Networks\n',
 'AutoNation\n',
 'Arrow Electronics\n',
 'Associated Banc-Corp\n',
 'ASGN\n',
 'Ashland Inc.\n',
 'Atmos Energy\n',
 'AptarGroup Inc\n',
 'Avanos Medical, Inc.\n',
 'Avnet\n',
 'Acuity Brands\n',
 'Axon Enterprise\n',
 'Bed Bath & Beyond\n',
 'Brunswick Corporation\n',
 "Brink's Company (The)\n",
 'Belden Inc\n',
 'Bio-Rad Laboratories\n',
 'Black Hills Corporation\n',
 'Blackbaud\n',
 'Bank of Hawaii\n',
 'Brown & Brown\n',
 'BancorpSouth\n',
 'Boyd Gaming\n',
 'Brighthouse Financial\n',
 'Brinker International Inc\n',
 'Brixmor Property Group\n',
 'Cable One Inc\n',
 'Caesars Entertainment\n',
 'Cheesecake Fac

In [7]:
# Strip leading/trailing whitespace from every entry of both lists.
# The tuple of source lists is evaluated before either name is rebound,
# so each list is cleaned from its own original contents.
tickers, securities = (
    list(map(lambda text: text.strip(), column))
    for column in (tickers, securities)
)

In [8]:
# Check the ticker symbols again after stripping
tickers

['AAN',
 'ACHC',
 'ACIW',
 'ADNT',
 'ATGE',
 'ACM',
 'ACC',
 'AEO',
 'AFG',
 'AGCO',
 'ALE',
 'ALEX',
 'AMED',
 'AM',
 'APY',
 'ATI',
 'AMCX',
 'AN',
 'ARW',
 'ASB',
 'ASGN',
 'ASH',
 'ATO',
 'ATR',
 'AVNS',
 'AVT',
 'AYI',
 'AAXN',
 'BBBY',
 'BC',
 'BCO',
 'BDC',
 'BIO',
 'BKH',
 'BLKB',
 'BOH',
 'BRO',
 'BXS',
 'BYD',
 'BHF',
 'EAT',
 'BRX',
 'CABO',
 'CZR',
 'CAKE',
 'CAR',
 'CACI',
 'CASY',
 'CATY',
 'CFX',
 'CBSH',
 'CBT',
 'CC',
 'CDK',
 'CFR',
 'CGNX',
 'CHE',
 'CHDN',
 'CHK',
 'CIEN',
 'CLB',
 'CLGX',
 'CLH',
 'CLI',
 'CMC',
 'CMD',
 'CMP',
 'CNK',
 'CNO',
 'COHR',
 'CONE',
 'COR',
 'CPT',
 'CR',
 'CREE',
 'CRI',
 'CRL',
 'CRS',
 'CRUS',
 'CNX',
 'CSL',
 'CTLT',
 'CUZ',
 'CVLT',
 'CXW',
 'CVET',
 'CW',
 'CBRL',
 'CY',
 'DAN',
 'DCI',
 'DDS',
 'DECK',
 'DEI',
 'DKS',
 'DLPH',
 'DLX',
 'DNKN',
 'DNOW',
 'DPZ',
 'DY',
 'EGP',
 'EPC',
 'ERI',
 'EHC',
 'EME',
 'ENR',
 'ENS',
 'EPR',
 'EQT',
 'ETRN',
 'WTRG',
 'ETSY',
 'EV',
 'EVR',
 'EWBC',
 'EXEL',
 'EXP',
 'FAF',
 'FDS',
 'FCFS',


In [9]:
# Check the security names again after stripping (the trailing '\n' is gone)
securities

["Aaron's Inc.",
 'Acadia Healthcare',
 'ACI Worldwide',
 'Adient plc',
 'Adtalem Global Education',
 'AECOM',
 'American Campus Communities',
 'American Eagle Outfitters',
 'American Financial Group',
 'AGCO',
 'ALLETE Inc.',
 'Alexander & Baldwin',
 'Amedisys',
 'Antero Midstream',
 'Apergy Corp.',
 'Allegheny Technologies',
 'AMC Networks',
 'AutoNation',
 'Arrow Electronics',
 'Associated Banc-Corp',
 'ASGN',
 'Ashland Inc.',
 'Atmos Energy',
 'AptarGroup Inc',
 'Avanos Medical, Inc.',
 'Avnet',
 'Acuity Brands',
 'Axon Enterprise',
 'Bed Bath & Beyond',
 'Brunswick Corporation',
 "Brink's Company (The)",
 'Belden Inc',
 'Bio-Rad Laboratories',
 'Black Hills Corporation',
 'Blackbaud',
 'Bank of Hawaii',
 'Brown & Brown',
 'BancorpSouth',
 'Boyd Gaming',
 'Brighthouse Financial',
 'Brinker International Inc',
 'Brixmor Property Group',
 'Cable One Inc',
 'Caesars Entertainment',
 'Cheesecake Factory Inc',
 'Avis Budget Group',
 'CACI International',
 "Casey's General Stores",
 'Cat

## Step 5: Creating a Pandas DataFrame from our list of S&P 400 companies

In [10]:
# Assemble the two cleaned lists into a two-column DataFrame
# (column order: ticker, then security)
df = pd.DataFrame(dict(ticker=tickers, security=securities))

# Preview the first rows
df.head()

Unnamed: 0,ticker,security
0,AAN,Aaron's Inc.
1,ACHC,Acadia Healthcare
2,ACIW,ACI Worldwide
3,ADNT,Adient plc
4,ATGE,Adtalem Global Education


## Step 6: Export to csv

**Note**: Don't export the index column from our DataFrame

In [11]:
# Write the table to disk; index=False keeps the row index out of the file
df.to_csv(path_or_buf='../data/sp400.csv', index=False)