# Step 1: Collect therapist information

In [None]:
# Install a pip package in the current Jupyter kernel.
# pip is invoked as `<sys.executable> -m pip`, i.e. with the interpreter that
# runs this kernel, so the package is installed for that interpreter.
# The leading `!` is IPython shell-escape syntax; this line is not valid in a
# plain Python script.
import sys
!{sys.executable} -m pip install selenium

In [None]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import re

In [None]:
# Search parameters. They are interpolated into the therapie.de search URL
# (query parameters `ort`, `therapieangebot`, `verfahren` and `page`).
#
# Enter your desired post code here (note: by default the therapist search
# finds therapists within a 5 km radius).
# NOTE: the right-hand side is left blank as a fill-in placeholder, so this
# cell raises a SyntaxError until a value is supplied, e.g. post_code = "12345".
post_code = 
therapy_form = 1  # 1 = Einzeltherapie (individual therapy)
therapy_type = 2  # 2 = Verhaltenstherapie (behavioural therapy)


# Result page the search URL starts at (sent as the `page` query parameter).
start_page = 1


In [None]:
def _parse_therapist_name(soup):
    """
    Returns the therapist name found in a parsed profile page.

    Returns "Name not found" when the name span is missing. Raises
    AttributeError when the surrounding "therapist-name" container is missing.
    """
    name_container = soup.find('div', attrs={"class": "therapist-name"})
    name_element = name_container.findChild("span", attrs={"itemprop": "name"})
    return name_element.text.strip() if name_element else "Name not found"


def get_therapist_data(driver, profile_url):
    """
    Extracts data from a single therapist profile page.

    Args:
        driver: Selenium WebDriver used to load the profile and click the
            contact button.
        profile_url: Absolute URL of the therapist profile page.

    Returns:
        A dict with the keys 'name', 'website', 'email' and 'therapist_type'.
        'website' and 'email' are None when they cannot be extracted; 'name'
        and 'therapist_type' fall back to a "... not found" / "Error getting
        ..." placeholder string.
    """
    driver.get(profile_url)
    time.sleep(10)  # give page time to load and to not get rate limited

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    try:
        name = _parse_therapist_name(soup)
    except Exception as e:
        print(f"Error getting name: {e}")
        # If this is the case, cause is most likely rate limiting -- back off,
        # then load the profile again. Re-parsing the page that is already
        # loaded would only yield the same failed response.
        time.sleep(60)

        try:
            driver.get(profile_url)
            time.sleep(10)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            name = _parse_therapist_name(soup)
        except Exception as retry_error:
            name = "Error getting name"
            print(f"Error getting name after retry: {retry_error}")

    # `except Exception` (instead of a bare `except:`) keeps Ctrl-C /
    # "interrupt kernel" working during the long-running scrape.
    try:
        website_element = soup.find('div', class_='contact-web')
        website = website_element.findChild("a")['href'] if website_element else None
    except Exception:
        website = None

    try:
        # Find and click the "schreiben" button using Selenium
        schreiben_button = WebDriverWait(driver, 3).until(
            EC.element_to_be_clickable((By.XPATH, "//*[@id='contact-button']"))
        )
        schreiben_button.click()
        time.sleep(1)  # Wait for email to appear

        # Extract the email address *after* clicking the button
        soup = BeautifulSoup(driver.page_source, 'html.parser')  # refresh soup object after button click
        email_element = soup.find('div', attrs={"id": "email-address-container"}).findChild("a")
        email = email_element.text.strip() if email_element else None
    except Exception as e:
        email = None
        print(f"Error getting email: {e}")

    try:
        type_element = soup.find('div', attrs={"class": "therapist-name"}).findChild("h2", attrs={"itemprop": "description"})
        therapist_type = type_element.text.strip() if type_element else "Type not found"
    except Exception:
        therapist_type = "Error getting type"

    therapist_data = {
        'name': name,
        'website': website,
        'email': email,
        'therapist_type': therapist_type
    }
    print(therapist_data)
    return therapist_data

In [None]:
def _find_next_page_link(soup):
    """
    Returns the <a> tag of the "next page" entry in the bottom page
    navigation, or None when any part of that navigation is missing.
    """
    page_nav = soup.find('ul', attrs={"id": "pagenav-bottom"})
    if page_nav is None:
        return None
    next_item = page_nav.findChild("li", class_="next")
    if next_item is None:
        return None
    return next_item.findChild("a")


def crawl_therapist_listings(start_url, max_pages=100, request_timeout=30):
    """
    Crawls the therapist listing pages and extracts data.

    Args:
        start_url: URL of the first search result page to crawl.
        max_pages: Upper bound on the number of listing pages to visit.
        request_timeout: Seconds to wait for each listing page request before
            giving up (a timeout is reported like any other network error).

    Returns:
        A list with one dict per therapist profile, as returned by
        get_therapist_data. Data collected before an error stopped the crawl
        is still returned.
    """

    options = Options()
    options.add_argument("--headless=new")  # Run Chrome in headless mode
    driver = webdriver.Chrome(options=options)  # Or whichever browser you prefer

    all_therapist_data = []
    current_page_url = start_url
    page_num = 1

    # try/finally guarantees the browser is closed even when the crawl is
    # interrupted (e.g. KeyboardInterrupt) or an error escapes the loop.
    try:
        while current_page_url and page_num <= max_pages:
            print(f"Crawling page: {current_page_url}")
            try:
                # Without a timeout a stalled connection would block forever.
                response = requests.get(current_page_url, timeout=request_timeout)
                response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
                soup = BeautifulSoup(response.content, 'html.parser')

                # Find all therapist entries (adjust selector as needed)
                results_list = soup.find('ul', class_="search-results-list")
                if results_list is None:
                    print("No search results list found on this page.")
                    break
                therapist_entries = results_list.findChildren("li")
                print("Parsing therapists...")

                for entry in therapist_entries:
                    try:
                        # Find the link to the therapist's profile
                        profile_link = entry.find('a', href=re.compile(r'/profil/'))
                        if profile_link:
                            profile_url = "https://www.therapie.de" + profile_link['href']
                            print(f"  Extracting data from: {profile_url}")
                            therapist_data = get_therapist_data(driver, profile_url)
                            all_therapist_data.append(therapist_data)
                        else:
                            print("  Profile link not found in this entry.")
                    except Exception as e:
                        print(f"  Error processing entry: {e}")

                # Find the "next page" link; a missing link ends the crawl
                # normally instead of being reported as an unexpected error.
                next_page_link = _find_next_page_link(soup)
                print(next_page_link)
                if next_page_link is not None:
                    next_page_url = "https://www.therapie.de" + next_page_link['href']
                else:
                    next_page_url = None
                current_page_url = next_page_url
                print("Finished parsing page number " + str(page_num))
                print("Collected therapists:")
                print(all_therapist_data)
                page_num += 1

            except requests.exceptions.RequestException as e:
                print(f"Error during requests to {current_page_url} : {e}")
                break  # stop crawling if there's a network issue

            except Exception as e:
                print(f"An unexpected error occurred: {e}")
                break  # stop crawling if there's an unhandled error
    finally:
        driver.quit()  # Close the browser

    return all_therapist_data

In [None]:
# Assemble the search URL from the configured post code, start page,
# therapy form and therapy type.
start_url = (
    "https://www.therapie.de/therapeutensuche/ergebnisse/"
    f"?ort={post_code}"
    f"&page={start_page}"
    f"&therapieangebot={therapy_form}"
    f"&verfahren={therapy_type}"
)

# Crawl the listing pages, starting at the URL built above.
therapist_data = crawl_therapist_listings(start_url)

# Print the extracted data (or save to a file)
for data in therapist_data:
    print(data)

In [None]:
# export as CSV
import csv

csv_file = "therapists.csv"  # Name of your CSV file

if not therapist_data:
    # Nothing was collected: say so explicitly instead of failing on
    # therapist_data[0] with "list index out of range". No file is written.
    print("No therapist data to export.")
else:
    try:
        # Column names are taken from the keys of the first collected entry
        fieldnames = list(therapist_data[0].keys())

        with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=';')

            # Write the header row, then one row per therapist
            writer.writeheader()
            writer.writerows(therapist_data)

        print(f"Data successfully written to {csv_file}")

    except Exception as e:
        print(f"An error occurred: {e}")


# Step 2: Contact therapists

In [2]:
# Load the therapist list from the semicolon-separated CSV file.
import pandas as pd

df = pd.read_csv('therapists.csv', delimiter=';')

# Rows whose therapist_type contains "Heil", "Kinder" or "Privat"
# (case-insensitive) are excluded; a missing therapist_type counts as
# "no match" and is therefore kept.
excluded_type = df['therapist_type'].str.contains('Heil|Kinder|Privat', case=False, na=False)
# Only therapists with an e-mail address are kept.
has_email = df['email'].notna()

filtered_df = df[(~excluded_type) & has_email]

filtered_df

Unnamed: 0,name,website,email,therapist_type
1,Elisabeth Schreiber,http://www.psychotherapieschreiber.de,kontakt@psychotherapieschreiber.de,Psychologische Psychotherapeutin
3,Danjal Akrami,http://www.praxis-akrami.de,Kontakt@praxis-akrami.de,Psychologischer Psychotherapeut
4,Martin Daume,http://www.mdaume.de,mdaume@me.com,Psychologischer Psychotherapeut / Klinischer O...
6,Vicki Bauer-Bertsch,,info@psychotherapie-vbauer.de,Psychologische Psychotherapeutin
8,Cynthia Quiroga Murcia,https://www.psychotherapie-quiroga.de,frankfurt@praxis-quiroga.de,Psychologische Psychotherapeutin
...,...,...,...,...
251,Tanja Müller,http://www.durchatmen.jetzt,info@durchatmen.jetzt,durchatmen - Praxis für Psychotherapie & Coaching
252,Laura Dold,,kontakt@psychotherapie-dold.de,"Psychologische Psychotherapeutin, Verhaltensth..."
253,Alice Galle,,anfragen@galle-psychotherapie.de,Psychologische Psychotherapeutin
254,Corinna Stünckel,http://www.psychotherapie-in-ffm-unterliederba...,PsychotherapieStuenckel@gmx.de,Psychologische Psychotherapeutin Verhaltensthe...


In [None]:
# send mail with template

"""
Important notes:

You need to use an "App Password" instead of your regular Gmail password. 
This requires enabling 2-Step Verification in your Google Account settings.
"""
import smtplib
from email.mime.text import MIMEText
from email.header import Header

In [None]:
# Mail settings -- replace all values before sending.
# Avoid saving or sharing the notebook with real credentials filled in.

# Address used for the SMTP login and as the "From" header of every mail.
sender_email = "" 
# Password passed to the SMTP login (for Gmail this must be an App Password,
# see the note in the import cell).
sender_password = ""
# Subject line of every mail (German: "Request for an initial consultation").
subject = "Anfrage Erstgespräch"

# Text of the mail; the same body is sent to every recipient. It is an
# f-string, so values can be interpolated with {...}.
body = f"""
"""

In [6]:
# Unique e-mail addresses of the filtered therapists.
emails = set(filtered_df["email"].tolist())
# Add emails here that should not be contacted in case they appear in the dataset
emails_already_contacted = [
]

# Addresses are compared case-insensitively and without surrounding
# whitespace, so "Kontakt@example.de" in the dataset still matches
# "kontakt@example.de" in the do-not-contact list.
contacted_keys = {mail.strip().casefold() for mail in emails_already_contacted}

# Iterating in sorted order makes the send order reproducible between runs;
# addresses that differ only in case are kept once.
emails_filtered = []
seen_keys = set()
for mail in sorted(emails):
    key = mail.strip().casefold()
    if key in contacted_keys or key in seen_keys:
        continue
    seen_keys.add(key)
    emails_filtered.append(mail)

print("Removed mails: " + str(len(emails)-len(emails_filtered)))

Removed mails: 0


In [None]:
# Send the template mail to every address in emails_filtered over one
# authenticated SMTP-over-SSL session.
emails_failed = []  # recipients the server refused

with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
    server.login(sender_email, sender_password)
    for recipient_email in emails_filtered:
        message = MIMEText(body, _charset="utf-8")
        message['Subject'] = Header(subject, 'utf-8')
        message['From'] = sender_email
        message['To'] = recipient_email
        print(recipient_email)
        print(message)
        try:
            server.sendmail(sender_email, recipient_email, message.as_string())
        except smtplib.SMTPRecipientsRefused as e:
            # A single rejected address must not abort the whole batch:
            # report it and continue with the next recipient.
            print(f"Could not send to {recipient_email}: {e}")
            emails_failed.append(recipient_email)

if emails_failed:
    # Keep only the addresses that were actually mailed, so refused
    # recipients are not recorded as contacted afterwards.
    emails_filtered = [mail for mail in emails_filtered if mail not in emails_failed]
    print("Mails that could not be sent: " + str(len(emails_failed)))
    


In [None]:
# export contacted therapists as CSV
import csv

csv_file = "contacted_therapists.csv"  # Name of your CSV file
# Union of the addresses listed as already contacted and the addresses
# mailed in this run; the set removes duplicates.
emails_contacted = set(emails_already_contacted + emails_filtered)
try:
    # The file has a single column holding the e-mail address
    fieldnames = ["emails"]

    with open(csv_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=';')

        # Write the header row
        writer.writeheader()

        # Write the data rows; sorted so the file content is reproducible
        # (iterating the set directly yields an arbitrary order)
        writer.writerows({"emails": mail} for mail in sorted(emails_contacted))

    print(f"Data successfully written to {csv_file}")

except Exception as e:
    print(f"An error occurred: {e}")

