## Web Scraping Example (originally made for a financial advisor client)

In [1]:
import numpy as np
import pandas as pd

import requests
from bs4 import BeautifulSoup as bs
import re
from IPython.display import HTML

In [2]:
# Forbes list page to scrape (wealth-management teams, best-in-state).
url = 'https://www.forbes.com/lists/wealth-management-teams-best-in-state/?sh=576dedb376f2'

# requests applies no timeout by default, so set one to avoid hanging forever
# on a stalled connection, and fail loudly on an HTTP error status instead of
# silently parsing an error page into empty columns.
req = requests.get(url, timeout=30)
req.raise_for_status()

# Parse with Python's built-in HTML parser ('lxml' is an alternative parser
# if it is installed).
soup = bs(req.text, 'html.parser')

## Tags to search: 
#### organizationName , parentCompany.name, groupMembers.name, headquarters table-cell main office , qas-0 table-cell min account size, qas-1 table-cell assets under management

In [3]:
# Anchor tags whose class attribute is 'organizationName' (presumably the
# team-name links -- confirm against the page markup).
# find_all is the current name of the method; findAll is a deprecated alias.
href = soup.find_all('a', attrs={'class': 'organizationName'})

#### Extracting the result into a data frame with 5 columns: TEAM, FIRM, TEAM MEMBERS, MAIN OFFICE, MIN ACCOUNT SIZE

In [4]:
def remove_html_tags(text):
    """Return *text* with every ``<...>`` HTML tag removed.

    The match is non-greedy, so each tag is stripped on its own and the
    text between tags is kept. ``re.DOTALL`` lets a tag whose attributes
    wrap onto another line be matched too. Character entities such as
    ``&amp;`` are left untouched.
    """
    return re.sub(r'<.*?>', '', text, flags=re.DOTALL)

In [5]:
# Collect the <div> elements for each field of interest, selected by the
# exact value of their class attribute. find_all is used rather than its
# deprecated alias findAll.
org_name = soup.find_all('div', attrs={'class': 'organizationName'})
firm = soup.find_all('div', attrs={'class': 'parentCompany.name'})
team = soup.find_all('div', attrs={'class': 'groupMembers.name'})
main_office = soup.find_all('div', attrs={'class': 'headquarters table-cell main office'})
min_size = soup.find_all('div', attrs={'class': 'qas-0 table-cell min account size'})
aum = soup.find_all('div', attrs={'class': 'qas-1 table-cell assets under management'})

In [6]:
# Reduce every matched tag to plain text by stripping the markup from its
# string form.
org_name = [remove_html_tags(str(cell)) for cell in org_name]
firm = [remove_html_tags(str(cell)) for cell in firm]
team = [remove_html_tags(str(cell)) for cell in team]

main_office = [remove_html_tags(str(cell)) for cell in main_office]
min_size = [remove_html_tags(str(cell)) for cell in min_size]
# Each entry must come from its own tag: stringifying the whole result set
# here would give every row the same concatenated text.
aum = [remove_html_tags(str(cell)) for cell in aum]

In [7]:
# Map every organization name to None as a placeholder for its page link.
link_dict = dict.fromkeys(org_name)

#### Note: Some entries in the table have no assets under management ($ AUM) value, and Forbes does not bother to put a div tag around those entries. In addition, there is no systematic way (at least none that I could find) to detect those entries. Therefore, I decided not to include AUM in the output HTML file (an inconsistent number of entries would throw an error), but you can access that information on the web pages via the link for each entry.

In [8]:
# Every anchor tag found on the page.
link_tags = soup.find_all('a')

# The Forbes page has additional hyperlinks before and after the full table,
# so the number of hyperlinks has to be matched to the number of
# organizations in the table: skip the first 11 anchors, then take one per
# organization. This offset is subject to re-calibration.
# (Alternative, not enabled: keep only the tags whose 'aria-label' is a key
# of link_dict.)
first_row_link = 11
table_links = link_tags[first_row_link:first_row_link + len(org_name)]
href_list = [str(anchor.get('href')) for anchor in table_links]


In [9]:
# Assemble the scraped columns into one table, one row per team.
columns = {
    'Team': org_name,
    'Firm': firm,
    'Team Member': team,
    'Main Office': main_office,
    'Min Account Size': min_size,
    'Link': href_list,
}
result = pd.DataFrame(columns)

# Render the table as an HTML string, with URLs turned into hyperlinks and
# cell contents left unescaped.
result_html = result.to_html(escape=False, render_links=True)

In [10]:
# Also save the table as a CSV file, leaving out the row index.
result.to_csv(path_or_buf='result.csv', index=False)

In [11]:
# Write the rendered HTML table to disk as UTF-8.
with open('output.html', mode='w', encoding='utf-8') as html_file:
    html_file.write(result_html)