In [7]:
import pandas as pd
import numpy as np

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import requests
from tqdm import tqdm_notebook as tqdm
import time
import random

from bs4 import BeautifulSoup
from copy import deepcopy

In [8]:
# boot up scraper for chrome
driver = webdriver.Chrome()

In [314]:
# go to ameriprise website
driver.get('https://www.ameripriseadvisors.com/')

# Isolate each box/container

In [13]:
# use beautiful soup to load website information
soup = BeautifulSoup(driver.page_source)

In [15]:
# match number of ameriprise advisors
len(soup.find_all('div',{'class':'right-pane'}))

42

In [18]:
# save each container to a list
advisor_list = soup.find_all('div',{'class':'right-pane'})

## Get Name

In [35]:
# code to get name
advisor_list[0].find('h3').text

'Vasso & Associates'

## Get Titles

In [73]:
# find advisor title: some are more important that others
advisor_list[1].find('p', {'class':'title'}).text

'Private Wealth Advisor\n • Vasso & Associates\n'

In [315]:
# # iterate through advisor list to check for mistakes
# # spits out '\n'
# for i in advisor_list:
#     print(i.find('h3').text)
#     print(i.find('p', {'class':'title'}).text)
#     print('\n')

### Team affiliation is after * charachter

In [75]:
# find team that advisor is on
advisor_list[1].find('p', {'class':'title'}).text

'Private Wealth Advisor\n • Vasso & Associates\n'

## Phone

In [76]:
# find phone number
advisor_list[1].find('a', {'class':'phone-link'}).text

'972.692.5069'

## Email

In [92]:
# find email address
advisor_list[1].find('p', {'class':'email'}).find('a', href=True)['href']

'mailto:neal.d.vasso@ampf.com'

In [95]:
# # test 
# for i in advisor_list:
#     print(i.find('p', {'class':'email'}).find('a', href=True)['href'])

## Address

In [101]:
# find address
advisor_list[1].find('a', {'class':'address'}).text

'\n5000 Quorum Dr Ste 375\n\nDallas, TX 75254-7091\n'

In [105]:
# # test 
# for i in advisor_list:
#     print(i.find('a', {'class':'address'}).text)
#     print('\n')

## Link to website

In [107]:
# find link to website
advisor_list[1].find('a', href=True)['href']


'/neal.d.vasso/?awsrc='

In [109]:
# # test 
# for i in advisor_list:
#     print(i.find('a', href=True)['href'])
#     print('\n')

# sample pipeline

In [119]:
## create a list of advisors
# list_of_advisor_dicts = []

In [185]:
# save page information
soup = BeautifulSoup(driver.page_source)

In [186]:
# create list of advisors from page
advisor_list = soup.find_all('div',{'class':'right-pane'})

In [187]:
# match number of advisors to result number
len(advisor_list)

3

In [188]:
# save each part of what we found into the dictionaries
for i in advisor_list:
    advisor = {}
    try:
        advisor['name'] = i.find('h3').text
    except:
        advisor['name'] = None
    try:
        advisor['title'] = i.find('p', {'class':'title'}).text
    except:
        advisor['title'] = None
    
    try:
        advisor['phone'] = i.find('a', {'class':'phone-link'}).text
    except: 
        advisor['phone'] = None
    
    try:
        advisor['email'] = i.find('p', {'class':'email'}).find('a', href=True)['href']
    except:
        advisor['email'] = None
    
    try:
        advisor['address'] = i.find('a', {'class':'address'}).text
    except:
        advisor['address'] = None
    
    try:
        advisor['link_to_website'] = i.find('a', href=True)['href']
    except:
        advisor['link_to_website'] = None
    
    list_of_advisor_dicts.append(advisor)

In [189]:
# how many advisors are on the list now
len(list_of_advisor_dicts)

299

In [191]:
df = pd.DataFrame(list_of_advisor_dicts)

In [194]:
raw_df = deepcopy(df)

In [195]:
raw_df.shape

(299, 6)

In [196]:
raw_df.to_csv('ameriprise_morski.csv')

In [198]:
df.head()

Unnamed: 0,name,title,phone,email,address,link_to_website
0,Vasso & Associates,\n,972.692.5069,mailto:neal.d.vasso@ampf.com,"\n5000 Quorum Dr Ste 375\n\nDallas, TX 75254-7...",/team/vasso-associates/?awsrc=
1,Neal D Vasso,Private Wealth Advisor\n • Vasso & Associates\n,972.692.5069,mailto:neal.d.vasso@ampf.com,"\n5000 Quorum Dr Ste 375\n\nDallas, TX 75254-7...",/neal.d.vasso/?awsrc=
2,"Jones, Cannon and Scofield",\n,214.272.7919,mailto:heidi.a.brown@ampf.com,"\n5910 N Central Expy\nSte 1040\nDallas, TX 75...",/team/jones-cannon-and-scofield/?awsrc=
3,Charles Jones,"Private Wealth Advisor\n • Jones, Cannon and S...",214.272.7919,mailto:charles.c.jones@ampf.com,"\n5910 N Central Expy\nSte 1040\nDallas, TX 75...",/charles.c.jones/?awsrc=
4,Ky L Fiser,Private Wealth Advisor\n • Fiser Wealth Manage...,214.420.0600,mailto:Ky.Fiser@ampf.com,"\n4144 N Central Expy Ste 538\n\nDallas, TX 75...",/ky.fiser/?awsrc=


In [200]:
df = df.sort_values('name').drop_duplicates(subset=['name','email'], keep = 'last')

In [204]:
df.reset_index(inplace=True)

In [212]:
df['correct_title'] = df['title'].str.split('\n', expand = True)[0]

In [214]:
df['team'] = df['title'].str.split('\n', expand = True)[1]

In [220]:
df['title'].str.split('\n', expand = True)[2]

0          
1          
2          
3          
4          
       ... 
153    None
154    None
155    None
156        
157        
Name: 2, Length: 158, dtype: object

In [222]:
df.drop(columns = ['new_title', 'unknow', 'index', 'title', 'misc'], inplace =True)

In [229]:
df['email'] = df['email'].str[7:]

In [230]:
df.head()

Unnamed: 0,name,phone,email,address,link_to_website,team,correct_title
0,Alana Doyle,214.389.8515,Alana.K.Doyle@ampf.com,"\n2435 N Central Expy\nSte 915\nRichardson, TX...",/alana.k.doyle/?awsrc=,• Doyle & Associates,Associate Financial Advisor
1,Alex Swisher,214.272.7919,Alex.Swisher@ampf.com,"\n5910 N Central Expy\nSte 1040\nDallas, TX 75...",/alex.swisher/?awsrc=,"• Jones, Cannon and Scofield",Financial Advisor
2,Amy Legate,214.469.0060,Amy.Legate@ampf.com,\n5300 Town &amp; Country Blvd\nSte 160\nFrisc...,/amy.legate/?awsrc=,• Silverman LeGate and Associates,Financial Advisor
3,Amy Ormsby,469.865.1082,AMY.M.ORMSBY@ampf.com,"\n1308 Village Creek Dr\nSte 2000\nPlano, TX 7...",/amy.m.ormsby/?awsrc=,,Financial Advisor
4,Andee Sloot,469.221.0413,Andee.Sloot@ampf.com,"\n5960 Berkshire Ln Ste 1200\n\nDallas, TX 752...",/andee.sloot/?awsrc=,• LaunchPoint Wealth Advisors,Financial Advisor


In [237]:
df['city, state, zip'] = df['address'].str.split('\n', expand =True)[3]

In [240]:
df['address1'] = df['address'].str.split('\n', expand =True)[1]

In [243]:
df['address2'] = df['address'].str.split('\n', expand =True)[2]

In [248]:
df['link_to_website'] = df['link_to_website'].str[:-8]

In [250]:
df['link'] = 'https://www.ameripriseadvisors.com/'

In [253]:
df['link'] = df['link']+df['link_to_website'].astype(str)

In [254]:
df.head()

Unnamed: 0,name,phone,email,address,link_to_website,team,correct_title,"city, state, zip",address1,address2,link
0,Alana Doyle,214.389.8515,Alana.K.Doyle@ampf.com,"\n2435 N Central Expy\nSte 915\nRichardson, TX...",/alana.k.doyle,• Doyle & Associates,Associate Financial Advisor,"Richardson, TX 75080-2753",2435 N Central Expy,Ste 915,https://www.ameripriseadvisors.com//alana.k.doyle
1,Alex Swisher,214.272.7919,Alex.Swisher@ampf.com,"\n5910 N Central Expy\nSte 1040\nDallas, TX 75...",/alex.swisher,"• Jones, Cannon and Scofield",Financial Advisor,"Dallas, TX 75206-0904",5910 N Central Expy,Ste 1040,https://www.ameripriseadvisors.com//alex.swisher
2,Amy Legate,214.469.0060,Amy.Legate@ampf.com,\n5300 Town &amp; Country Blvd\nSte 160\nFrisc...,/amy.legate,• Silverman LeGate and Associates,Financial Advisor,"Frisco, TX 75034-6888",5300 Town &amp; Country Blvd,Ste 160,https://www.ameripriseadvisors.com//amy.legate
3,Amy Ormsby,469.865.1082,AMY.M.ORMSBY@ampf.com,"\n1308 Village Creek Dr\nSte 2000\nPlano, TX 7...",/amy.m.ormsby,,Financial Advisor,"Plano, TX 75093-4464",1308 Village Creek Dr,Ste 2000,https://www.ameripriseadvisors.com//amy.m.ormsby
4,Andee Sloot,469.221.0413,Andee.Sloot@ampf.com,"\n5960 Berkshire Ln Ste 1200\n\nDallas, TX 752...",/andee.sloot,• LaunchPoint Wealth Advisors,Financial Advisor,"Dallas, TX 75225-6069",5960 Berkshire Ln Ste 1200,,https://www.ameripriseadvisors.com//andee.sloot


In [255]:
df.columns

Index(['name', 'phone', 'email', 'address', 'link_to_website', 'team',
       'correct_title', 'city, state, zip', 'address1', 'address2', 'link'],
      dtype='object')

In [259]:
new_cols = df.columns.to_list()

In [262]:
new_cols.remove('address')

In [266]:
new_cols.remove('link_to_website')

In [269]:
df = df[new_cols]

In [275]:
df['city'] = df['city, state, zip'].str.split(', ', expand=True)[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [278]:
df['zip'] = df['city, state, zip'].str.split(', ', expand=True)[1]

In [283]:
df['zip']= df['zip'].str[3:]

In [287]:
df.columns.to_list()

['name',
 'phone',
 'email',
 'team',
 'correct_title',
 'city, state, zip',
 'address1',
 'address2',
 'link',
 'city',
 'zip']

In [288]:
new_cols = ['name', 'team', 'phone', 'email', 'address1', 'address2', 'city', 'zip', 'link', 'correct_title']

In [290]:
df = df[new_cols]

In [298]:
df['team'] = df['team'].str[3:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [308]:
df = df.sort_values(by=['city', 'address1', 'team'])

In [309]:
df.reset_index(inplace=True)

In [312]:
df.drop(columns = ['level_0','index'], inplace=True)

In [313]:
df.to_csv('morski_ameriprise_fixed.csv')

In [301]:
df.tail()

Unnamed: 0,name,team,phone,email,address1,address2,city,zip,link,correct_title
153,Verus Wealth Advisors,,972.398.8599,charles.c.lee@ampf.com,2435 N Central Expy,Ste 870,Richardson,75080-2737,https://www.ameripriseadvisors.com//team/verus...,
154,Waterfront Wealth Management,,504.889.1704,rufus.p.cressend@ampf.com,5172 Village Creek Drive,Unit 104,Plano,75093,https://www.ameripriseadvisors.com//team/water...,
155,Watters Creek Wealth Management,,972.954.1595,clint.w.wammack@ampf.com,700 Central Expy S,Ste 380,Allen,75013-8110,https://www.ameripriseadvisors.com//team/watte...,
156,William Burmeier,Leading Edge Advisors,214.445.0623,william.g.burmeier@ampf.com,4504 Legacy Dr,Ste 200,Plano,75024-2188,https://www.ameripriseadvisors.com//william.g....,Financial Advisor
157,Zeph Rouquette,Milestone Partners,214.420.7444,zeph.a.rouquette@ampf.com,3610 Shire Blvd Ste 212,,Richardson,75082-2239,https://www.ameripriseadvisors.com//zeph.a.rou...,Financial Advisor
