# Wiki Table Scraping

In [1]:
import os
import re
import pandas as pd
import numpy as np
import requests
import urllib.parse
from bs4 import BeautifulSoup

In [2]:
# Base URL: used here to build the page address and later to resolve the
# table's relative links into absolute ones.
baseurl = 'https://en.wikipedia.org/wiki'
# urljoin swaps the last path segment of baseurl ('wiki') for the relative
# path, giving https://en.wikipedia.org/wiki/S%26P_100 ('%26' encodes '&').
myurl = urllib.parse.urljoin(baseurl, 'wiki/S%26P_100')
# Without a timeout, requests can block indefinitely on a stalled connection.
r = requests.get(myurl, timeout=30)
# Stop here on a 4xx/5xx response instead of parsing an error page downstream.
r.raise_for_status()
# Raw HTML text of the page (parsed in the next cell).
soup_r = r.text

In [3]:
# Parse the downloaded HTML text using the lxml parser.
soup = BeautifulSoup(markup=soup_r, features='lxml')

In [4]:
# Preview the first 5000 characters of the pretty-printed document.
# The slice starts at index 0 so the opening '<' of the document is kept.
print(soup.prettify()[:5000])

!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   S&amp;P 100 - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgMonthNamesShort":["","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],"wgRequestId":"XlGE5QpAICwAABuEwyEAAABA","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"S\u0026P_100","wgTitle":"S\u0026P 100","wgCurRevisionId":938082694,"wgRevisionId":938082694,"wgArticleId":2658424,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","American stock market indices","S\u00

In [5]:
# Text of the page's <title> element.
Title = soup.find('title').string

In [6]:
# Collect the tables tagged with the 'wikitable sortable' class string and
# work with the first one found on the page.
tables = soup.find_all(name='table', attrs={'class': 'wikitable sortable'})
my_table = tables[0]

In [7]:
# Text of every <td> in the table, as one flat list in document order.
# Each cell's whitespace-stripped text fragments are joined with no separator.
my_cells = my_table.find_all('td')
Cells = [''.join(cell.stripped_strings) for cell in my_cells]

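The text fragments of each cell are concatenated to deal with cells whose text is split across several elements; Google (`Alphabet Inc.` + `(Class C)`) is one example.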

In [8]:
# Header cells of the table; their text becomes the column labels.
my_label = my_table.find_all('th')
# Join ALL text fragments of each header. Appending only the last fragment
# would truncate headers split across several elements, and would reuse a
# stale value for a header that contains no text at all.
Labels = [''.join(header.stripped_strings) for header in my_label]
# Cells is a flat list in document order, so column i is every
# len(Labels)-th entry starting at offset i (assumes each row has exactly
# one <td> per header).
# Each column is published as a module-level global named after its label,
# because later cells look the columns up by name via globals().
for i, n in enumerate(Labels):
    globals()[n] = Cells[i::len(Labels)]

In [9]:
# Every <a> element inside the table.
my_links = my_table.find_all('a')
# The attribute names of the FIRST link decide which lists are created; each
# attribute becomes a module-level global list holding one entry per link
# (None where a link lacks that attribute).
first_link_attrs = my_links[0].attrs
for key in first_link_attrs.keys():
    if key == 'href':
        # Relative hrefs are resolved against baseurl to get absolute URLs.
        values = [urllib.parse.urljoin(baseurl, anchor.get(key)) for anchor in my_links]
    else:
        values = [anchor.get(key) for anchor in my_links]
    globals()[key] = values

`href`: each relative link is resolved against `baseurl`, for example
'/wiki/Apple_Inc.' becomes
'https://en.wikipedia.org/wiki/Apple_Inc.'

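For example, the link `title` and the table `Name` differ for the following companies (found with the snippet below):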

('Capital One', 'Capital One Financial Corp.') <br>
('Emerson Electric', 'Emerson Electric Co.') <br>
('Alphabet Inc.', 'Alphabet Inc.(Class C)') <br>
('Alphabet Inc.', 'Alphabet Inc.(Class A)') <br>
('IBM', 'International Business Machines') <br>

```python
# Print every (link title, table name) pair whose two values differ.
for link_title, company_name in zip(title, Name):
    if link_title != company_name:
        print((link_title, company_name))
```

In [10]:
# Build the frame from the per-column globals (one list per header label),
# keeping the label order, then add the absolute link URLs as a last column.
df = pd.DataFrame({label: globals()[label] for label in Labels})
df['Links'] = href

df

Unnamed: 0,Symbol,Name,Links
0,AAPL,Apple Inc.,https://en.wikipedia.org/wiki/Apple_Inc.
1,ABBV,AbbVie Inc.,https://en.wikipedia.org/wiki/AbbVie_Inc.
2,ABT,Abbott Laboratories,https://en.wikipedia.org/wiki/Abbott_Laboratories
3,ACN,Accenture,https://en.wikipedia.org/wiki/Accenture
4,ADBE,Adobe Inc.,https://en.wikipedia.org/wiki/Adobe_Inc.
...,...,...,...
96,VZ,Verizon Communications,https://en.wikipedia.org/wiki/Verizon_Communic...
97,WBA,Walgreens Boots Alliance,https://en.wikipedia.org/wiki/Walgreens_Boots_...
98,WFC,Wells Fargo,https://en.wikipedia.org/wiki/Wells_Fargo
99,WMT,Walmart,https://en.wikipedia.org/wiki/Walmart


In [11]:
# Save the table as tab-separated, UTF-8 encoded text.
df.to_csv(path_or_buf="S&P100_wiki", sep='\t', encoding='utf-8')