In [7]:
import pandas as pd
import numpy as np

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import requests
from tqdm import tqdm_notebook as tqdm
import time
import random

from bs4 import BeautifulSoup
from copy import deepcopy

In [8]:
# Start a Chrome browser session through Selenium's webdriver.
# The resulting `driver` is reused by the cells below to load the site and read its HTML.
driver = webdriver.Chrome()

In [314]:
# Open the Ameriprise advisor site in the Selenium-controlled browser;
# later cells parse whatever page the browser is currently showing.
ameriprise_home = 'https://www.ameripriseadvisors.com/'
driver.get(ameriprise_home)

# Isolate each box/container

In [13]:
# Parse the HTML currently rendered in the browser.
# The parser is named explicitly: when it is omitted BeautifulSoup picks whichever
# parser is installed (and warns about it), so the same notebook can build a
# different tree on another machine. 'html.parser' ships with Python.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [15]:
# Sanity check: count the <div class="right-pane"> containers on the page;
# each one is treated as one advisor listing by the cells below.
len(soup.find_all('div', class_='right-pane'))

42

In [18]:
# Keep every advisor container (<div class="right-pane">) in a list for field-by-field probing.
advisor_list = soup.find_all('div', class_='right-pane')

## Get Name

In [35]:
# The advisor / practice name is the text of the container's <h3> heading.
first_container = advisor_list[0]
first_container.find('h3').get_text()

'Vasso & Associates'

## Get Titles

In [73]:
# Find the advisor title: some are more important than others.
# The raw text of <p class="title"> also carries the team after a newline and bullet.
title_tag = advisor_list[1].find('p', class_='title')
title_tag.get_text()

'Private Wealth Advisor\n • Vasso & Associates\n'

In [315]:
# # iterate through advisor list to check for mistakes
# # spits out '\n'
# for i in advisor_list:
#     print(i.find('h3').text)
#     print(i.find('p', {'class':'title'}).text)
#     print('\n')

### Team affiliation is after the • character

In [75]:
# Locate the team the advisor is on: it shares the <p class="title"> text with the
# job title and follows the '•' bullet on the second line.
title_and_team = advisor_list[1].find('p', class_='title')
title_and_team.get_text()

'Private Wealth Advisor\n • Vasso & Associates\n'

## Phone

In [76]:
# The phone number is the text of the <a class="phone-link"> anchor.
phone_anchor = advisor_list[1].find('a', class_='phone-link')
phone_anchor.get_text()

'972.692.5069'

## Email

In [92]:
# The e-mail address lives in the href of the first link inside <p class="email">
# (returned with its 'mailto:' scheme).
email_paragraph = advisor_list[1].find('p', class_='email')
email_paragraph.find('a', href=True)['href']

'mailto:neal.d.vasso@ampf.com'

In [95]:
# # test 
# for i in advisor_list:
#     print(i.find('p', {'class':'email'}).find('a', href=True)['href'])

## Address

In [101]:
# The postal address is the text of the <a class="address"> anchor, newline-separated.
address_anchor = advisor_list[1].find('a', class_='address')
address_anchor.get_text()

'\n5000 Quorum Dr Ste 375\n\nDallas, TX 75254-7091\n'

In [105]:
# # test 
# for i in advisor_list:
#     print(i.find('a', {'class':'address'}).text)
#     print('\n')

## Link to website

In [107]:
# The advisor's site path is the href of the first anchor in the container that has one.
first_anchor = advisor_list[1].find('a', href=True)
first_anchor['href']


'/neal.d.vasso/?awsrc='

In [109]:
# # test 
# for i in advisor_list:
#     print(i.find('a', href=True)['href'])
#     print('\n')

# sample pipeline

In [119]:
## create a list of advisors
# Holds one dict per advisor and is appended to by the scraping loop below.
# It is created only when it does not exist yet: re-running this cell between
# result pages must not throw away advisors already collected, but a fresh
# kernel still needs the list to exist before the loop appends to it.
try:
    list_of_advisor_dicts
except NameError:
    list_of_advisor_dicts = []

In [185]:
# Snapshot the HTML of the results page currently shown in the browser.
# The parser is named explicitly: when it is omitted BeautifulSoup picks whichever
# parser is installed (and warns about it), so the same notebook can build a
# different tree on another machine. 'html.parser' ships with Python.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [186]:
# Collect the advisor containers (<div class="right-pane">) found on this page.
advisor_list = soup.find_all('div', class_='right-pane')

In [187]:
# Sanity check: the number of containers found should equal the result count
# the site reports for this page.
len(advisor_list)

3

In [188]:
# save each part of what we found into the dictionaries
def _scrape_field(getter):
    """Return ``getter()``, or None when the expected tag or attribute is absent.

    A missing tag makes ``find()`` return None, so the following ``.text`` or
    ``['href']`` raises AttributeError / TypeError; a tag without the requested
    attribute raises KeyError. Only those lookup failures are converted to None;
    anything else (including KeyboardInterrupt) propagates instead of being
    silently swallowed.
    """
    try:
        return getter()
    except (AttributeError, KeyError, TypeError):
        return None


for i in advisor_list:
    # Each lambda is evaluated immediately, so it always sees the current container `i`.
    advisor = {
        'name': _scrape_field(lambda: i.find('h3').text),
        'title': _scrape_field(lambda: i.find('p', {'class':'title'}).text),
        'phone': _scrape_field(lambda: i.find('a', {'class':'phone-link'}).text),
        'email': _scrape_field(
            lambda: i.find('p', {'class':'email'}).find('a', href=True)['href']
        ),
        'address': _scrape_field(lambda: i.find('a', {'class':'address'}).text),
        # first anchor with an href inside the container
        'link_to_website': _scrape_field(lambda: i.find('a', href=True)['href']),
    }

    list_of_advisor_dicts.append(advisor)

In [189]:
# Running total of advisors collected so far; the list is appended to by every
# run of the scraping loop, so this grows with each page processed.
len(list_of_advisor_dicts)

299

In [191]:
# Turn the collected advisor dicts into a table: one row per advisor, one column per dict key.
df = pd.DataFrame(data=list_of_advisor_dicts)

## Save raw into csv

In [194]:
# Keep an untouched, independent copy of the scraped table; it is written to CSV below,
# while `df` goes on to be cleaned.
raw_df = df.copy(deep=True)

In [195]:
# (rows, columns) of the raw scrape, shown as a quick check before it is written out.
raw_df.shape

(299, 6)

In [196]:
# Persist the raw, uncleaned scrape.
raw_csv_path = 'ameriprise_morski.csv'
raw_df.to_csv(raw_csv_path)

## Drop duplicates

In [200]:
# Rows that share both name and email are treated as the same advisor.
# The frame is ordered by name first, and keep='last' retains the final row of
# each duplicate group in that order.
sorted_by_name = df.sort_values('name')
df = sorted_by_name.drop_duplicates(subset=['name', 'email'], keep='last')

## Fix Columns

In [212]:
# The scraped title reads like 'Private Wealth Advisor\n • Team\n';
# the job title is the piece before the first newline.
title_pieces = df['title'].str.split('\n', expand=True)
df['correct_title'] = title_pieces[0]

In [214]:
# The team sits on the second line of the scraped title (still prefixed by ' • ' at this point).
title_lines = df['title'].str.split('\n', expand=True)
df['team'] = title_lines[1]

In [222]:
# drop unnecessary columns
# errors='ignore': of these, only 'title' is created by a cell visible in this
# notebook; 'new_title', 'unknow', 'index' and 'misc' appear to be leftovers from
# earlier experiments, so a strict drop raises KeyError on a clean run.
# Reassigning instead of inplace=True keeps the cell safe to re-run.
df = df.drop(columns=['new_title', 'unknow', 'index', 'title', 'misc'], errors='ignore')

In [229]:
# Remove the 'mailto:' scheme from the email column.
# Anchored pattern instead of slicing off 7 characters: values without the prefix
# are left untouched, and re-running the cell no longer eats the start of the address.
df['email'] = df['email'].str.replace(r'^mailto:', '', regex=True)

In [237]:
# The scraped address text is newline-separated, e.g.
# '\n5000 Quorum Dr Ste 375\n\nDallas, TX 75254-7091\n'; piece 3 is 'City, ST zip'.
address_pieces = df['address'].str.split('\n', expand=True)
df['city, state, zip'] = address_pieces[3]

In [240]:
# Piece 1 of the newline-separated address is the street line
# (piece 0 is the empty string before the leading newline).
address_pieces = df['address'].str.split('\n', expand=True)
df['address1'] = address_pieces[1]

In [243]:
# Piece 2 of the newline-separated address is the second address line.
# NOTE(review): in the sample shown earlier this piece is empty and the suite is
# part of the street line; confirm whether any listing actually fills it.
address_pieces = df['address'].str.split('\n', expand=True)
df['address2'] = address_pieces[2]

In [248]:
# Remove the trailing '?awsrc=' query (and the slash in front of it) from the
# scraped href, e.g. '/neal.d.vasso/?awsrc=' -> '/neal.d.vasso'.
# Pattern-based rather than slicing off the last 8 characters: hrefs without the
# suffix are left intact and re-running the cell does not truncate the path further.
df['link_to_website'] = df['link_to_website'].str.replace(r'/?\?awsrc=[^/]*$', '', regex=True)

In [250]:
# Site root used as the prefix of every advisor link.
# No trailing slash: the scraped paths are root-relative and already start with
# '/', so a slash here ended up as 'https://www.ameripriseadvisors.com//<advisor>'.
df['link'] = 'https://www.ameripriseadvisors.com'

In [253]:
# Join the site root in 'link' with the advisor path to get the full web address.
# Slashes are normalised on both sides of the join so exactly one separates host
# and path. Missing paths stay missing: the previous astype(str) turned them into
# bogus URLs ending in 'nan' / 'None'.
advisor_path = df['link_to_website'].str.lstrip('/')
df['link'] = df['link'].str.rstrip('/') + '/' + advisor_path

In [275]:
# 'City, ST zip' -> the city is everything before the ', ' separator.
city_and_rest = df['city, state, zip'].str.split(', ', expand=True)
df['city'] = city_and_rest[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [278]:
# 'City, ST zip' -> the part after ', ' is 'ST zip'; the state prefix is removed in the next cell.
city_and_rest = df['city, state, zip'].str.split(', ', expand=True)
df['zip'] = city_and_rest[1]

In [283]:
# Keep only the ZIP (5 digits, optionally '-' plus 4) from values like 'TX 75254-7091'.
# Extracting by pattern instead of dropping the first 3 characters makes the cell
# safe to re-run (a second run no longer cuts into the ZIP itself); values that
# contain no ZIP become missing rather than garbage.
df['zip'] = df['zip'].str.extract(r'(\d{5}(?:-\d{4})?)\s*$', expand=False)

In [298]:
# Remove the leading ' • ' bullet from the team name.
# An anchored pattern replaces the blind 3-character slice, so teams without the
# bullet are untouched and re-running the cell cannot eat into the name.
# assign() builds a new frame, which avoids the SettingWithCopyWarning that the
# direct column assignment produced here.
df = df.assign(team=df['team'].str.replace(r'^\s*•\s*', '', regex=True))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [288]:
# Final column order for the cleaned table.
new_cols = [
    'name',
    'team',
    'phone',
    'email',
    'address1',
    'address2',
    'city',
    'zip',
    'link',
    'correct_title',
]

In [290]:
# Keep only the final columns, in the chosen order.
# .copy() makes the result an independent frame; without it pandas can treat the
# selection as a slice of the old frame and emit SettingWithCopyWarning when a
# column is assigned afterwards (as seen in the team clean-up cell's output).
df = df[new_cols].copy()

In [308]:
# Order rows by city, then street address, then team.
sort_keys = ['city', 'address1', 'team']
df = df.sort_values(by=sort_keys)

In [309]:
# Renumber the rows after sorting. The previous index is kept as a regular
# column, which the following cell removes.
df = df.reset_index()

In [312]:
# Drop the helper columns that reset_index leaves behind.
# errors='ignore': 'level_0' only exists when reset_index was executed more than
# once, so a strict drop raises KeyError on a clean top-to-bottom run.
# Reassigning instead of inplace=True keeps the cell safe to re-run.
df = df.drop(columns=['level_0', 'index'], errors='ignore')

In [313]:
# Write the cleaned, reordered table to disk.
cleaned_csv_path = 'morski_ameriprise_fixed.csv'
df.to_csv(cleaned_csv_path)