# Part 1 - BACP register scraping
## Collecting profile links

In [2]:
from bs4 import BeautifulSoup as bs # HTML and XML parsing
import requests # making HTTP requests
from lxml import etree # xml data parsing
import pandas as pd # data analytics package
import numpy as np # mathematics
import re # regular expressions

In [3]:
# Creating an empty dictionary that I will add profile links to
profiles = {}

# A function that takes in website url, retrieves embedded links, and adds them to the dictionary
def scraper(url, timeout=30):
    """Fetch one search-results page and record every profile link found on it.

    Each anchor tag with class ``search-result__link`` adds one entry to the
    module-level ``profiles`` dictionary: the key is the anchor's stripped text and
    the value is its ``href`` attribute (an empty string when the attribute is absent).
    Because the dictionary is keyed by the anchor text, two results with identical
    text overwrite one another and only the last link is kept.

    Parameters
    ----------
    url : str
        Address of the search-results page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or returns an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status: parsing an error page would find no links and
    # the missing profiles would go unnoticed.
    response.raise_for_status()
    soup = bs(response.text, 'lxml')

    for link in soup.find_all("a", class_="search-result__link"): # all anchor tags with the result-link class
        profiles[link.text.strip()] = link.get("href", "")

In [4]:
# Scraping the first page separately (as its URL is slightly different) and adding profile links to the dictionary
first_url = 'https://www.bacp.co.uk/search/Therapists?UserLocation=&q=&LocationQuery=UK&Location=&FoundLocation=&SortOrder=0&TherapistSortOrderSelectionMade=false&Distance=100'
scraper(first_url)

In [6]:
# Checking the result
profiles

{'Susie Jamieson - Templand': '/therapists/68319/susie-jamieson/templand-dg11',
 'Clare Elliot - Biggar': '/therapists/167087/clare-elliot/biggar-ml12',
 'Janet Yasities - Dumfries': '/therapists/379760/janet-yasities/dumfries-dg1',
 'Jennifer Collins - Peebles, Scottish Borders': '/therapists/389110/jennifer-collins/peebles-scottish-borders-eh45',
 'Clair Higgon - DUMFRIES': '/therapists/11763/clair-higgon/dumfries-dg1',
 'Belinda Hook - Innerleithen': '/therapists/145037/belinda-hook/innerleithen-eh44',
 'Ros Elphinstone - Dumfries': '/therapists/375603/ros-elphinstone/dumfries-dg1',
 'Ralf Bidder - Newcastleton': '/therapists/191658/ralf-bidder/newcastleton-td9',
 'Alison Connor - EDINBURGH': '/therapists/24342/alison-connor/edinburgh-eh26',
 'Elaine Robb - Glasgow': '/therapists/393806/elaine-robb/glasgow-ml9'}

In [7]:
# Scraping the remaining result pages with the same function. The page number is formatted into the
# `skip` query parameter directly in front of a literal 0, so page 1 requests skip=10, page 2 requests
# skip=20, and so on (10 profiles per page).
rest_url_template = "https://www.bacp.co.uk/search/Therapists?UserLocation=&q=&LocationQuery=UK&Location=&FoundLocation=&SortOrder=0&TherapistSortOrderSelectionMade=false&Distance=100&skip={}0"
for page_number in range(1, 1558): # The website specifies the total number of profiles (15580)
    scraper(rest_url_template.format(page_number))

In [185]:
len(profiles)

14928

In [154]:
# Creating dataframe from the dictionary
profiles_df = pd.DataFrame.from_dict(profiles, orient="index", columns=['link'])
profiles_df

Unnamed: 0,link
Susie Jamieson - Templand,/therapists/68319/susie-jamieson/templand-dg11
Clare Elliot - Biggar,/therapists/167087/clare-elliot/biggar-ml12
Janet Yasities - Dumfries,/therapists/379760/janet-yasities/dumfries-dg1
"Jennifer Collins - Peebles, Scottish Borders",/therapists/389110/jennifer-collins/peebles-sc...
Clair Higgon - DUMFRIES,/therapists/11763/clair-higgon/dumfries-dg1
...,...
Ada Sze Hang Kot - Hong Kong,/therapists/386325/ada-sze-hang-kot/hong-kong-
KinMingHerman Chan - Tsim Sha Tsui,/therapists/384950/kinmingherman-chan/tsim-sha...
Giuseppe Tagliarini - Bangkok,/therapists/326722/giuseppe-tagliarini/bangkok-10
Don Knox - Bangkok,/therapists/7147/don-knox/bangkok-10


In [155]:
# Adding the BACP site address to the beginning of every link. The scraped paths already start
# with '/', so the resulting URLs contain a double slash after the domain.
profiles_df = "https://www.bacp.co.uk/" + profiles_df

In [156]:
profiles_df = profiles_df.reset_index()

In [157]:
profiles_df.columns=['name', 'link']

In [158]:
profiles_df

Unnamed: 0,name,link
0,Susie Jamieson - Templand,https://www.bacp.co.uk//therapists/68319/susie...
1,Clare Elliot - Biggar,https://www.bacp.co.uk//therapists/167087/clar...
2,Janet Yasities - Dumfries,https://www.bacp.co.uk//therapists/379760/jane...
3,"Jennifer Collins - Peebles, Scottish Borders",https://www.bacp.co.uk//therapists/389110/jenn...
4,Clair Higgon - DUMFRIES,https://www.bacp.co.uk//therapists/11763/clair...
...,...,...
14923,Ada Sze Hang Kot - Hong Kong,https://www.bacp.co.uk//therapists/386325/ada-...
14924,KinMingHerman Chan - Tsim Sha Tsui,https://www.bacp.co.uk//therapists/384950/kinm...
14925,Giuseppe Tagliarini - Bangkok,https://www.bacp.co.uk//therapists/326722/gius...
14926,Don Knox - Bangkok,https://www.bacp.co.uk//therapists/7147/don-kn...


## Cleaning - removing duplicate profiles
I know from inspecting the website that some profiles are listed multiple times if the therapist works in different locations,
or offers therapy as well as supervision. I will use addresses and unique URL links to remove duplicates.

In [159]:
# Splitting the name from up to three address parts on ' - '. n=3 caps the split at four pieces and
# reindex pads any missing columns with NaN, so the assignment always receives exactly four columns
# regardless of how many separators the longest entry contains.
name_parts = profiles_df['name'].str.split(' - ', n=3, expand=True).reindex(columns=range(4))
profiles_df[['name', 'address1', 'address2', 'address3']] = name_parts

In [160]:
# Checking if it's worth keeping 3 columns of address
profiles_df['address2'].value_counts()

Online and Phone                   4
London                             4
Central London                     2
online                             2
Kendal                             1
Online                             1
Surrey                             1
Wimbledon                          1
ONLINE                             1
LONDON                             1
West London                        1
Marylebone                         1
St Helens                          1
UK)                                1
face to face, on-line and phone    1
Central                            1
North London                       1
Villaviciosa                       1
Name: address2, dtype: int64

Only a few profiles specify more than one address, so I will drop two of the columns. Instead, I will extract post code later.

In [161]:
# Keeping only the first address part; rebinding the name instead of mutating the frame in place
profiles_df = profiles_df.drop(columns=['address2', 'address3'])

In [162]:
profiles_df

Unnamed: 0,name,link,address1
0,Susie Jamieson,https://www.bacp.co.uk//therapists/68319/susie...,Templand
1,Clare Elliot,https://www.bacp.co.uk//therapists/167087/clar...,Biggar
2,Janet Yasities,https://www.bacp.co.uk//therapists/379760/jane...,Dumfries
3,Jennifer Collins,https://www.bacp.co.uk//therapists/389110/jenn...,"Peebles, Scottish Borders"
4,Clair Higgon,https://www.bacp.co.uk//therapists/11763/clair...,DUMFRIES
...,...,...,...
14923,Ada Sze Hang Kot,https://www.bacp.co.uk//therapists/386325/ada-...,Hong Kong
14924,KinMingHerman Chan,https://www.bacp.co.uk//therapists/384950/kinm...,Tsim Sha Tsui
14925,Giuseppe Tagliarini,https://www.bacp.co.uk//therapists/326722/gius...,Bangkok
14926,Don Knox,https://www.bacp.co.uk//therapists/7147/don-kn...,Bangkok


In [163]:
# Checking for duplicates
profiles_df['name'].duplicated().sum()

2769

In [164]:
# Checking whether the duplicates have the same website links
profiles_df['link'].duplicated().sum()

9

In [165]:
# Having a closer look at one of the profiles
profiles_df[profiles_df['name'] == 'Katie Graves']['link'].values

array(['https://www.bacp.co.uk//therapists/390583/katie-graves/leeds-ls27',
       'https://www.bacp.co.uk//therapists/390583/katie-graves/leeds-ls27'],
      dtype=object)

While some of the links are identical, most are not. I will remove the last part referring to the address.

In [166]:
# creating a new column with the url
profiles_df['link2'] = profiles_df['link']

In [167]:
# splitting the link using '/'
profiles_df['link2'] = profiles_df['link2'].str.split('/')

In [168]:
# Extracting post code
profiles_df['post_code'] = profiles_df['link2'].apply(lambda x: x[-1])

In [170]:
# Grabbing the profile IDs to identify duplicates. The ID is the path segment that follows
# '/therapists/'. It is read from the full URL in the 'link' column with a pattern rather than by
# position in the split list, so the result does not depend on how many slashes precede the path
# and the cell gives the same answer when it is run again.
profiles_df['link2'] = profiles_df['link'].str.extract(r'/therapists/([^/]+)', expand=False)

In [171]:
# Checking the number of identical profiles
profiles_df['link2'].duplicated().sum()

2602

In [172]:
# Dropping duplicate profiles
profiles_df = profiles_df.drop_duplicates(subset='link2')

In [173]:
profiles_df['link2'].duplicated().sum()

0

In [174]:
# Saving
profiles_df.to_csv('bacp_links.csv')

In [175]:
len(profiles_df)

12326

In [176]:
# Creating a list with all urls
profile_links = list(profiles_df['link'])

# Part 2
## Scraping psychotherapists' profiles

In [7]:
## Loading the saved data
# profile_links = pd.read_csv('bacp_links.csv')
# profile_links.drop(columns='Unnamed: 0', inplace=True)
# profile_links = list(profile_links['link'])

In [10]:
# Checking
profile_links[:10]

['https://www.bacp.co.uk//therapists/68319/susie-jamieson/templand-dg11',
 'https://www.bacp.co.uk//therapists/167087/clare-elliot/biggar-ml12',
 'https://www.bacp.co.uk//therapists/379760/janet-yasities/dumfries-dg1',
 'https://www.bacp.co.uk//therapists/389110/jennifer-collins/peebles-scottish-borders-eh45',
 'https://www.bacp.co.uk//therapists/11763/clair-higgon/dumfries-dg1',
 'https://www.bacp.co.uk//therapists/145037/belinda-hook/innerleithen-eh44',
 'https://www.bacp.co.uk//therapists/375603/ros-elphinstone/dumfries-dg1',
 'https://www.bacp.co.uk//therapists/191658/ralf-bidder/newcastleton-td9',
 'https://www.bacp.co.uk//therapists/24342/alison-connor/edinburgh-eh26',
 'https://www.bacp.co.uk//therapists/393806/elaine-robb/glasgow-ml9']

In [132]:
# An empty list that will collect one dictionary of scraped data per profile
therapist_data = []

In [186]:
# Creating a profile-scrape function. It will loop through the list of links, and for each profile grab the headers
# as well as relevant data.

def profile_scrape(url, timeout=30):
    """Download one therapist profile page and append its parsed sections to ``therapist_data``.

    The resulting dictionary always has the keys ``'name'`` (text of the page's first
    ``h1``) and ``'address'`` (text of the ``span`` with ``itemprop='streetAddress'``);
    either is ``None`` when the element is not on the page. When the main content
    section is present, each ``h3`` header text becomes a further key, paired by
    position with the extracted text blocks. Headers without a matching text block are
    left out.

    Parameters
    ----------
    url : str
        Address of the profile page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    dict
        The dictionary that was appended to the module-level ``therapist_data`` list.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out. Nothing is appended in that case.
    """
    info_dict = {} # one entry per section of the profile

    page = requests.get(url, timeout=timeout).text # requesting url
    soup = bs(page, 'lxml') # creating a soup object and parsing

    # Missing elements are handled with explicit None checks instead of a bare `except`,
    # which would also swallow KeyboardInterrupt and hide unrelated errors.
    name_tag = soup.h1
    info_dict['name'] = name_tag.text if name_tag is not None else None

    address_tag = soup.find('span', itemprop='streetAddress')
    info_dict['address'] = address_tag.text if address_tag is not None else None

    main_body = soup.find('div', class_='directory-section__body') # separating main body section
    main_content = main_body.find('div', class_='content') if main_body is not None else None

    if main_content is not None:
        # Extracting headers into a list
        headers = [header.text for header in main_content.find_all('h3')]

        # Extracting main text into a list. Inspecting HTML revealed that some paragraphs are contained
        # within a 'span' tag and are easy to retrieve. Others, however, are not, but are located at the
        # end of the main body, and are limited to one paragraph per header. Those specific headers are
        # used to identify whether these paragraphs exist, and to extract them.

        # Adding text within 'span' tags into a list. The attribute filters are all None, which
        # presumably restricts the search to spans without those attributes - confirm against bs4 docs.
        content = []
        for description in main_content.find_all('span', style=None, class_=None, id=None, lang=None):
            text = re.sub(r'\n|\u200b|\xa0', ' ', description.text) # replacing newlines and special Unicode spaces
            text = re.sub(r'\s+', ' ', text).strip() # collapsing white space
            content.append(text)

        # Identifying if there are headers with text that did not use 'span' tag
        target_headers = {'What I can help with', 'Types of therapy', 'Clients I work with', 'How I deliver therapy'}
        match = set(headers) & target_headers
        if match:
            add_content = [p.text for p in main_content.find_all('p')]
            # taking 'match' number of paragraphs counting from the end of the list
            content = content + add_content[-len(match):]

        # zip stops at the shorter list, so a profile with fewer text blocks than headers
        # keeps the pairs that do exist instead of raising an IndexError.
        for header, text in zip(headers, content):
            info_dict[header] = text

    therapist_data.append(info_dict) # adding dictionary of data to the list
    return info_dict

In [44]:
# Scraping the profiles. profile_links is a plain list of URLs, and profile_scrape appends exactly one
# entry to therapist_data per call, so len(therapist_data) is the position of the next unscraped link:
# if the process gets interrupted, re-running this cell picks up from where it stopped.
for link in profile_links[len(therapist_data):]:
    profile_scrape(link)
    if len(therapist_data) % 100 == 0: # progress report every 100 profiles
        print(len(therapist_data))

500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200


In [226]:
# Checking number of profiles
len(therapist_data)

12325

In [271]:
# Creating data frame
bacp_df = pd.DataFrame(therapist_data)

In [281]:
# Checking null values
bacp_df.isna().sum()

name                                   0
address                                1
Availability                        6607
About me and my therapy practice     196
Practice description                4191
My first session                    4410
What I can help with                 596
Types of therapy                      39
Clients I work with                   32
How I deliver therapy                 22
filt                                   0
dtype: int64

In [276]:
# Checking for duplicates and removing them

bacp_df.duplicated().sum()

112

In [277]:
bacp_df = bacp_df.drop_duplicates()

In [278]:
bacp_df.duplicated().sum()

0

In [279]:
len(bacp_df)

12208

In [288]:
# Saving
bacp_df.to_csv('bacp_df.csv')

In [6]:
## Reading in the dataframe
# bacp_df = pd.read_csv('bacp_df.csv')
# bacp_df.drop(columns='Unnamed: 0', inplace=True)

In [5]:
bacp_df

Unnamed: 0,name,address,Availability,About me and my therapy practice,Practice description,My first session,What I can help with,Types of therapy,Clients I work with,How I deliver therapy
0,Susie Jamieson,Templand,My Practice is online only.I offer appointment...,"When the unexpected arises, or the past just w...",You can choose to work with me short-term or l...,I offer a free no obligation initial 30 minute...,"Abuse, Anger management, Anxiety, Bereavement,...","CBT, Creative therapy, Eclectic, Humanistic, I...","Adults, EAP, Older adults","Online counselling, Telephone counselling"
1,Clare Elliot,Biggar,I am offering telephone and video sessions only.,I provide online (zoom) and telephone counsell...,,I offer an initial 30 minute free no obligatio...,"Abuse, Anger management, Anxiety, Bereavement,...","Humanistic, Integrative, Person centred","Adults, Children, Older adults, Organisations,...","Online counselling, Telephone counselling"
2,Janet Yasities,Dumfries,I am available between the hours of 09:00 and ...,I AM CURRENTLY OFFERING ON-LINE SESSIONS ONLYA...,,,"Abuse, Anger management, Anxiety, Depression, ...","Behavioural, Brief therapy, CBT, Cognitive, Co...","Adults, EAP, Groups, Older adults, Trainee, Yo...","Long-term face-to-face work, Online counsellin..."
3,Jennifer Collins,"Peebles, Scottish Borders",I don’t provide sessions at weekends.Fees are ...,"Hi, I'm Jen and I'm a counsellor/CBT Therapist...",Are things just feeling out of kilter? Know so...,After our initial consultation I will send you...,"Anxiety, Bereavement, Career coaching, Depress...","Behavioural, CBT, Cognitive, Eclectic, Humanis...","Adults, Older adults","Long-term face-to-face work, Online counsellin..."
4,Clair Higgon,DUMFRIES,Thank you for looking at my profile. I am curr...,I am a person centred therapist and in practic...,I work from a consulting room in my home. I of...,The initial session is about 45 minutes in len...,"Abuse, Anxiety, Bereavement, Depression, Healt...",Person centred,"Adults, Children, Couples, EAP, Groups, Organi...","Long-term face-to-face work, Online counsellin..."
...,...,...,...,...,...,...,...,...,...,...
12203,Karen Payne,Stockton-on-Tees,Currently Available. weekdays and Evenings,I trained as a Person Centred Counsellor in 20...,Welcome my name is Karen. I am currently worki...,In the first session with you I would like to ...,"Abuse, ADD / ADHD, Anxiety, Bereavement, Child...","Humanistic, Person centred","Adults, Older adults, Trainee, Young people","Online counselling, Telephone counselling"
12204,Josephine Kerr,HARROGATE,I offer remote and face to face sessions.Face ...,"Anxiety, panic attacks, depression, feeling sa...",,I offer a Free Initial Assessment. This is a p...,"Abuse, Anxiety, Bereavement, Career coaching, ...","Brief therapy, CBT, Cognitive, Creative therap...","Adults, Older adults, Trainee","Long term sessions, Long-term face-to-face wor..."
12205,Deirdre MacNamara,Malpas,I currently provide online and telephone couns...,"I am a qualified person-centred counsellor, fu...",Main Counselling Qualifications:Diploma HE in ...,I offer a free 15-minute telephone session to ...,"Abuse, ADD / ADHD, Addictions, AIDS/HIV, Anger...","Brief therapy, Humanistic, Person centred","Adults, Older adults, Young people","Long term sessions, Online counselling, Short ..."
12206,Ringaile Turonyte,Peterborough,,"Are you feeling lonely, or maybe anxious and n...",,"The initial meeting, for which you will be cha...","Abuse, Anxiety, Bereavement, Cultural issues, ...","Gestalt, Humanistic","Adults, Groups, Young people","Long-term face-to-face work, Online counsellin..."


# Part 3: Identifying Types of Therapy

In [11]:
# Separating the column I need and lower-casing it. Note that str() turns missing values into the
# literal text 'nan'.
types_df = bacp_df['Types of therapy'].map(lambda value: str(value).lower()).to_frame()

In [12]:
types_df

Unnamed: 0,Types of therapy
0,"cbt, creative therapy, eclectic, humanistic, i..."
1,"humanistic, integrative, person centred"
2,"behavioural, brief therapy, cbt, cognitive, co..."
3,"behavioural, cbt, cognitive, eclectic, humanis..."
4,person centred
...,...
12203,"humanistic, person centred"
12204,"brief therapy, cbt, cognitive, creative therap..."
12205,"brief therapy, humanistic, person centred"
12206,"gestalt, humanistic"


In [14]:
# Splitting the types
# creating a new column for each modality

types_split = types_df['Types of therapy'].str.split(',', expand=True).add_prefix('therapy').fillna('')
types_split

Unnamed: 0,therapy0,therapy1,therapy2,therapy3,therapy4,therapy5,therapy6,therapy7,therapy8,therapy9,...,therapy19,therapy20,therapy21,therapy22,therapy23,therapy24,therapy25,therapy26,therapy27,therapy28
0,cbt,creative therapy,eclectic,humanistic,integrative,person centred,transactional analysis,,,,...,,,,,,,,,,
1,humanistic,integrative,person centred,,,,,,,,...,,,,,,,,,,
2,behavioural,brief therapy,cbt,cognitive,cognitive analytic therapy,solution focused brief therapy,,,,,...,,,,,,,,,,
3,behavioural,cbt,cognitive,eclectic,humanistic,integrative,person centred,solution focused brief therapy,,,...,,,,,,,,,,
4,person centred,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12203,humanistic,person centred,,,,,,,,,...,,,,,,,,,,
12204,brief therapy,cbt,cognitive,creative therapy,emotionally focused therapy,existential,gestalt,humanistic,integrative,person centred,...,,,,,,,,,,
12205,brief therapy,humanistic,person centred,,,,,,,,...,,,,,,,,,,
12206,gestalt,humanistic,,,,,,,,,...,,,,,,,,,,


In [15]:
# Creating empty dataframe that I will add data to
modality_count = pd.DataFrame(columns = ['modality', 'count'])
modality_count

Unnamed: 0,modality,count


In [16]:
# Counting the values in every split column and appending all per-column counts to modality_count with
# a single concat, instead of re-copying the growing frame once per column. The same modality can still
# appear on several rows at this point (once per column it occurred in).
per_column_counts = [
    types_split[column].value_counts().rename_axis('modality').reset_index(name='count')
    for column in types_split
]
modality_count = pd.concat([modality_count, *per_column_counts], axis=0, ignore_index=True)

In [17]:
# Stripping white space
modality_count['modality'] = modality_count['modality'].apply(lambda x: x.strip())

In [18]:
# Summing the counts per modality across all columns, then discarding the empty-string label
modality_count = modality_count.groupby('modality').sum().drop(index='')

In [19]:
modality_count = modality_count.sort_values('count', ascending=False)

In [24]:
# Some data did not scrape cleanly. Removing corrupted values (modalities that appear only once)
modality_count = modality_count[modality_count['count']>1]

In [30]:
# dropping nan values
modality_count = modality_count.drop(index='nan')

In [31]:
modality_count

Unnamed: 0_level_0,count
modality,Unnamed: 1_level_1
person centred,8969
integrative,8818
humanistic,7453
psychodynamic,5877
relational,4868
cbt,4647
brief therapy,4027
solution focused brief therapy,3863
transactional analysis,2975
existential,2681


## Conclusion

I have extracted the main types of therapy listed on the BACP register. I will compare this to the result from UKCP before the final count across both registers.