In [12]:
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import StaleElementReferenceException, NoSuchElementException
from bs4 import BeautifulSoup
import re
from time import sleep
import pickle
import bs4
import collections

In [13]:
def search(driver, city, state):
    textbox = driver.find_element_by_xpath('//input[@class="large has-menu"]')
    textbox.click()
    textbox.send_keys(str(city)+', '+str(state), Keys.ENTER)

In [14]:
def get_links_from_page(driver):
    page_source = driver.page_source
    soup = BeautifulSoup(page_source)
    links = soup.findAll('a', {'class': 'result-name'})
    hrefs = []
    for l in links:
        string = str(l)
        entry = re.findall('(?<=href=\")(.*)(?=\" item)', string)
        hrefs.append(entry[0])
    return hrefs

In [15]:
def next_page(driver):
    nextpage=driver.find_element_by_xpath('//a[@class="btn btn-default btn-next"]')
    if nextpage:
        print('Moving to next page ...')
        nextpage.click()
    else:
        print('No more pages ...')

In [16]:
def pagination(driver):
    link_text=get_links_from_page(driver)
    next_page(driver)
    return link_text

In [17]:
def collect_links():
    results=[]
    chrome_option=ChromeOptions()
    chrome_option.add_argument('--headless')
    driver=Chrome(r'/Users/jkrovitz/Documents/LDA_OF_PA_Project/ldapa/driver/chromedriver', options=chrome_option)
    driver.get('https://www.psychologytoday.com/us')
    search(driver, 'Pittsburgh', 'PA')
    while True:
        try:
            entry=pagination(driver)
            results.append(entry)
        except StaleElementReferenceException as e:
            print('Error:', e)
            continue
        except NoSuchElementException as e:
            print('No more pages ...')
            break
    driver.close()
    return results

In [18]:
def check_entry(entry):
    proper_keys=['name', 'phone', 'city', 'state', 'zip_code', 'information', 'personal_website']
    missing_keys=list(set(proper_keys).difference(list(entry.keys())))
    if missing_keys:
        for k in missing_keys:
            entry.update({k:'None'})
    entry_sorted=dict(collections.OrderedDict(entry))
    return entry_sorted

In [19]:
def parse_pages(list_of_links):
    result_list=[]
    chrome_option=ChromeOptions()
    chrome_option.add_argument('--headless')
    driver=Chrome(r'/Users/jkrovitz/Documents/LDA_OF_PA_Project/ldapa/driver/chromedriver', options=chrome_option)
    for l in list_of_links:
        result={}
        driver.get(l)
        soup=BeautifulSoup(driver.page_source)
        name=soup.find('h1', {'itemprop':'name'})
        if name:
            result['name']=name.text
        else:
            result['name']='None'
        phone=soup.find('a',{'data-event-label':'Profile_PhoneLink'})
        if phone:
            result['phone']=phone.text
        else:
            result['phone']='None'
        city=soup.find('span',{'itemprop':'addressLocality'})
        if city:
            result['city']=city.text.replace(',','')
        else:
            result['city']='None'
        state=soup.find('span',{'itemprop':'addressRegion'})
        if state:
            result['state']=state.text
        else:
            result['state']='None'
        zip_code=soup.find('span',{'itemprop':'postalcode'})
        if zip_code:
            result['zip_code']=zip_code.text
        else:
            result['zip_code']='None'
        specalites=soup.findAll('li', {'class':'highlight'})
        if specalites:
            spec_text = [s.text.strip() for s in specalites]
        try:
            issues=soup.findAll('ul',{'class':'attribute-list copy-small'})[1]
            if issues:
                issues_text=[i.text.strip() for i in issues if type(i)==bs4.element.Tag]
        except IndexError:
            issues_text=[]
            continue
        try:
            mental_health=soup.findAll('ul',{'class':'attribute-list copy-small'})[2]
            if mental_health:
                mental_text=[i.text.strip() for i in mental_health if type(i)==bs4.element.Tag]
        except IndexError:
            mental_text=[]
            continue
        issues_2=soup.findAll('ul', {'class':'attribute-list copy-small'})
        if issues_2:
            issues_2_text=[i.text.strip() for i in issues_2 if type(i)==bs4.element.Tag]
            if issues_2_text:
                clean_text=[t.replace('\n', ',') for t in issues_2_text]
                split_text=[t.split(',') for t in clean_text]
                issues_2_text_clean=[]
                for t in split_text:
                    for r in t:
                        if len(r) > 0 and any(c.isalpha() for c in r):
                            issues_2_text_clean.append(r.strip())
        full_specs=list(set(spec_text+issues_text+issues_2_text_clean+mental_text))
        if len(full_specs)==0:
            full_specs=['None']
        result['information']=full_specs
        website_text=soup.find('a', {'data-event-label': 'links-website'})
        if website_text:
            if website_text.has_attr('href')=='True':
                driver.get(website_text.attrs['href'])
                personal_website=driver.current_url
                result['personal_website']=personal_website
        else:
            result['personal_website']='None'
        checked_result=check_entry(result)
        result_list.append(checked_result)
    driver.close()
    return result_list

In [20]:
r=collect_links()

Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
Moving to next page ...
No more pages ...


In [21]:
final_list = []
for sublist in r:
    for item in sublist:
        final_list.append(item)

In [22]:
list_to_parse=list(set(final_list))

In [12]:
data_to_enter=parse_pages(list_to_parse)

In [13]:
len(data_to_enter)

671

In [14]:
with open("../data/pitt_data.txt", "wb") as fp:
    pickle.dump(data_to_enter, fp)