In [8]:
#--Web scraping packages
from bs4 import BeautifulSoup
import requests

#Pandas/numpy for data manipulation
import pandas as pd
import numpy as np

In [9]:
#load URLs we want to scrape into an array
BASE_URL = [
    'https://www.reuters.com/companies/GOOGL.OQ/people',
    'https://www.reuters.com/companies/AMZN.OQ/people',
    'https://www.reuters.com/companies/AAPL.OQ/people'
]

In [10]:
#loading empty array for board members
board_members = []
#Loop through our URLs we loaded above
for b in BASE_URL:
    html = requests.get(b).text
    soup = BeautifulSoup(html, "html.parser")
    #identify table we want to scrape
    officer_table = soup.find('table', {"class" : "MarketsTable-officers-1Yb5u"})

    #try clause to skip any companies with missing/empty board member tables

    try:
        #loop through table, grab each of the 4 columns shown (try one of the links yourself to see the layout)
        for row in officer_table.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) == 4:
               board_members.append((b, cols[0].text.strip(), cols[1].text.strip(), cols[2].text.strip(), cols[3].text.strip()))
    except: pass
print(board_members)

[('https://www.reuters.com/companies/GOOGL.OQ/people', 'John L. HennessyIndependent Chairman of the Board', '68', 'Independent Chairman of the Board', '2018'), ('https://www.reuters.com/companies/GOOGL.OQ/people', 'Sundar PichaiChief Executive Officer, Director', '49', 'Chief Executive Officer, Director', '2019'), ('https://www.reuters.com/companies/GOOGL.OQ/people', 'Sergey BrinCo-Founder, Director', '47', 'Co-Founder, Director', '2019'), ('https://www.reuters.com/companies/GOOGL.OQ/people', 'Ruth M. PoratChief Financial Officer, Senior Vice President', '63', 'Chief Financial Officer, Senior Vice President', '2015'), ('https://www.reuters.com/companies/GOOGL.OQ/people', 'Kent WalkerSenior VicePresident of Global Affairs and Chief Legal Officer, Google, and CorporateSecretary of Alphabet', '60', 'Senior VicePresident of Global Affairs and Chief Legal Officer, Google, and CorporateSecretary of Alphabet', '2020'), ('https://www.reuters.com/companies/GOOGL.OQ/people', 'Prabhakar RaghavanS

In [11]:
officer_table

<table class="table-container MarketsTable-officers-1Yb5u"><thead><tr class="MarketsTable-header-2NLA1"><th cellpadding="1" cellspacing="1" class="MarketsTable-officer_name-AAQuH" scope="column"><span class="TextLabel__text-label___3oCVw TextLabel__gray___1V4fk TextLabel__medium___t9PWg">Name</span></th><th cellpadding="1" cellspacing="1" class="MarketsTable-officer_age-yjWsZ" scope="column"><span class="TextLabel__text-label___3oCVw TextLabel__gray___1V4fk TextLabel__medium___t9PWg">Age</span></th><th cellpadding="1" cellspacing="1" class="MarketsTable-officer_title-1Vc6L" scope="column"><span class="TextLabel__text-label___3oCVw TextLabel__gray___1V4fk TextLabel__medium___t9PWg">Position</span></th><th cellpadding="1" cellspacing="1" class="MarketsTable-officer_since-31qfq" scope="column"><span class="TextLabel__text-label___3oCVw TextLabel__gray___1V4fk TextLabel__medium___t9PWg">Appointed</span></th></tr></thead><tbody><tr class="data"><td cellpadding="1" cellspacing="1" class="Mar

In [12]:
#convert output to new array, check length
board_array = np.asarray(board_members)
len(board_array)

43

In [13]:
#convert new array to dataframe
df = pd.DataFrame(board_array)

In [14]:
df.head()

Unnamed: 0,0,1,2,3,4
0,https://www.reuters.com/companies/GOOGL.OQ/people,John L. HennessyIndependent Chairman of the Board,68,Independent Chairman of the Board,2018
1,https://www.reuters.com/companies/GOOGL.OQ/people,"Sundar PichaiChief Executive Officer, Director",49,"Chief Executive Officer, Director",2019
2,https://www.reuters.com/companies/GOOGL.OQ/people,"Sergey BrinCo-Founder, Director",47,"Co-Founder, Director",2019
3,https://www.reuters.com/companies/GOOGL.OQ/people,"Ruth M. PoratChief Financial Officer, Senior V...",63,"Chief Financial Officer, Senior Vice President",2015
4,https://www.reuters.com/companies/GOOGL.OQ/people,Kent WalkerSenior VicePresident of Global Affa...,60,Senior VicePresident of Global Affairs and Chi...,2020


In [15]:
#rename columns, check output
df.columns = ['URL', 'Name', 'Age','Year_Joined', 'Title']
df.head()


Unnamed: 0,URL,Name,Age,Year_Joined,Title
0,https://www.reuters.com/companies/GOOGL.OQ/people,John L. HennessyIndependent Chairman of the Board,68,Independent Chairman of the Board,2018
1,https://www.reuters.com/companies/GOOGL.OQ/people,"Sundar PichaiChief Executive Officer, Director",49,"Chief Executive Officer, Director",2019
2,https://www.reuters.com/companies/GOOGL.OQ/people,"Sergey BrinCo-Founder, Director",47,"Co-Founder, Director",2019
3,https://www.reuters.com/companies/GOOGL.OQ/people,"Ruth M. PoratChief Financial Officer, Senior V...",63,"Chief Financial Officer, Senior Vice President",2015
4,https://www.reuters.com/companies/GOOGL.OQ/people,Kent WalkerSenior VicePresident of Global Affa...,60,Senior VicePresident of Global Affairs and Chi...,2020


In [16]:
#Saving File as captured
df.to_csv('test.csv')
