# UKCP Scrape Part 1: Profile Links

In [1]:
from bs4 import BeautifulSoup as bs # HTML and XML parsing
import requests # making HTTP requests
from lxml import etree # xml data parsing
import pandas as pd # data analytics package
import numpy as np # mathematics
import re # regular expressions

In [265]:
# Creating an empty dictionary that I will add profile links to
profiles = {}

UKCP website is using Ajax and is a little trickier to scrape compared to BACP with simple HTML. I will have to provide headers in my request and, most importantly, payload, which in this case contains search parameters aiming to get the maximum results.
One of these parameters is Page Number, and I will use .format to iterate through them all.

In [266]:
# Header information
headers = {
"authority": 'www.psychotherapy.org.uk',
"method": 'POST',
"path": '/umbraco/Surface/SearchSurface/Search',
"scheme": 'https',
"accept": '*/*',
"accept-encoding": "gzip, deflate, br",
"content-type": 'application/x-www-form-urlencoded; charset=UTF-8',
"origin": 'https://www.psychotherapy.org.uk',
"referer": 'https://www.psychotherapy.org.uk/find-a-therapist/?Distance=30&LocationSearchOutsideUK=true',
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36",
"x-requested-with": "XMLHttpRequest",
"cookie": "_gcl_au=1.1.1587263335.1663537845; _ga=GA1.3.1419321980.1663537845; _hjSessionUser_2575114=eyJpZCI6IjYxMjQ0YThkLWQ3Y2YtNWE4OC05NTQ2LTc4YzRiMjliZjQxMCIsImNyZWF0ZWQiOjE2NjM1Mzc4NDUyOTIsImV4aXN0aW5nIjp0cnVlfQ==; _gid=GA1.3.810337034.1666535138; TOrderSeed=-636850756; _hjAbsoluteSessionInProgress=0; _gat_gtag_UA_20343789_1=1; _hjIncludedInSessionSample=0; _hjSession_2575114=eyJpZCI6IjY2NWM1NjZlLWY1NzctNDE2ZS1hYjYxLTBkNmY4NDgyNmQzYyIsImNyZWF0ZWQiOjE2NjY3MTE0MTk0NDEsImluU2FtcGxlIjpmYWxzZX0="
}

# A function that takes in website url, retrieves embedded links, and adds them to the dictionary
def scraper(url):
    source = requests.post(url, json=payload, headers=headers).text
    soup = bs(source, 'lxml')
    
    for link in soup.find_all("a", class_="light-anchor"): # iterate throught blocks of code that hold therapists' information
        name = link.find("h2").string # grab the name for my dictionary's keys
        url=link.get("href", "") # getting links for my dictionary's values
        profiles[name] = url

In [1]:
first_url = "https://www.psychotherapy.org.uk/umbraco/Surface/SearchSurface/Search"

In [267]:
# A loop that will iterate through pages in the website (using .format on payload string), and add profile info to the ductionary
for page in range(1, 646): # Website shows max number of profiles to be 7741 and 12 per page, so I'll go through 646 pages
    
    # Payload containing search parameters
    payload = "HelpWith=&InPerson=false&Remote=false&Location=&Pager.CurrentPage={}&Pager.PageSize=12&KeywordFilter=&Distance=10&LocationSearchOutsideUK=false&OnlyProfilesWithPhotos=false&OnlyWheelchairAccessible=false&OrderSeed=-636850756&X-Requested-With=XMLHttpRequest".format(page)
    # Keeping track of the progress
    if page in range(1, 646, 10):
        print(page)
        
    # Calling the scraper function defined earlier 
    try:
        scraper(first_url)
    except:
        pass

1
11
21
31
41
51
61
71
81
91
101
111
121
131
141
151
161
171
181
191
201
211
221
231
241
251
261
271
281
291
301
311
321
331
341
351
361
371
381
391
401
411
421
431
441
451
461
471
481
491
501
511
521
531
541
551
561
571
581
591
601
611
621
631
641


In [268]:
# Creating dataframe from the dictionary
profiles_df = pd.DataFrame.from_dict(profiles, orient="index", columns=['link'])
profiles_df

Unnamed: 0,link
Naomi Landau,/therapist/naomi-landau-iai1paas/
Siobhan Tinker,/therapist/siobhan-tinker-iaixxaa0/
Rehma Said,/therapist/rehma-said-cnvcuaal/
Yvonne Rose,/therapist/yvonne-rose-iaih5aak/
Suzanna Brown,/therapist/suzanna-brown-iai1oaas/
...,...
Stella Ridley,/therapist/stella-ridley-iajjaaa0/
Lisa Tedeschini,/therapist/lisa-tedeschini-iai1xaac/
Claire Barber,/therapist/claire-barber-nrblzaaa/
Catherine Collins,/therapist/catherine-collins-dgfnqqaz/


In [269]:
# Adding UKCP url to the begining of the link
profiles_df = profiles_df.apply(lambda x: "https://www.psychotherapy.org.uk/"+x)

In [271]:
# Saving to csv
profiles_df.to_csv('ukcp_links.csv')

# UKCP Part 2: Scraping the profile data

In [2]:
## Reading in the file
# profiles_df = pd.read_csv('ukcp_links.csv')
# profiles_df.drop(columns='Unnamed: 0', inplace=True)

In [248]:
therapist_data = [] # an empty list that I will add dictionaries of therapist data to

In [249]:
# Creating a function for retrieving relevant headers and info from therapists' profiles

def profile_scrape(url):
    
    info_dict = {} # an empty dictionary for separate sections of the profiles

    page = requests.get(url).text # requesting url
    soup2 = bs(page, 'lxml') # creating a soup object and parsing
    
    try:
        info_dict['name'] = soup2.h1.text # locating header with the name and adding value to the dictionary
    except:
        info_dict['name'] = None
    try:
        info_dict['address'] = soup2.find('span', class_='profile-intro profile-intro-locations').text # adding address
    except:
        info_dict['address'] = None
    try:
        main_body = soup2.find('div', id='AboutMe') # separating main body section
        for section in main_body.find_all('section'):
            # if a section has a paragraph of text, I will grab the header as dictionary key and text as value
            if section.find('p') is not None:
                info_dict[section.h2.text] = section.p.text
            else:
                if section.find('ul') is not None: # otherwise grabbing items from other sections (unordered list)
                    # here I clean up value a bit by creating a list, filtering out none values, and joining it back to a string
                    info_dict[section.h2.text] = ', '.join(list(filter(None, section.ul.text.split('\n'))))
    except:
        pass
                
    therapist_data.append(info_dict) # adding dictionary of data to the list

In [250]:
# Iterating through the links and scraping profile data
for link in profiles_df['link']:
    profile_scrape(link)
    if len(therapist_data) in range(1, 7741, 100): # tracking the progress
        print(len(therapist_data))

1
101
201
301
401
501
601
701
801
901
1001
1101
1201
1301
1401
1501
1601
1701
1801
1901
2001
2101
2201
2301
2401
2501
2601
2701
2801
2901
3001
3101
3201
3301
3401
3501
3601
3701
3801
3901
4001
4101
4201
4301
4401
4501
4601
4701
4801
4901
5001
5101
5201
5301
5401
5501
5601
5701
5801
5901
6001
6101
6201
6301
6401
6501
6601
6701
6801
6901
7001
7101
7201
7301
7401
7501
7601


In [251]:
# Creating a dataframe
ukcp_df = pd.DataFrame(therapist_data)

In [252]:
# Saving
ukcp_df.to_csv('ukcp_df.csv')

# Part 3: EDA
## Exploring the types of therapies that are listed

In [2]:
# # Reading in the data
# ukcp_df = pd.read_csv('ukcp_df.csv')
# ukcp_df.drop(columns='Unnamed: 0', inplace=True)

In [3]:
# Checking how many profile section columns I have
ukcp_df.columns

Index(['name', 'address', 'Types of Therapies Offered', 'I work with',
       'What I can help with', 'My Approach', 'About Me', 'Special Interests'],
      dtype='object')

In [4]:
# I am mostly interested in 'Types of Therapies Offered', but some profiles might not have that filled in.
ukcp_df['Types of Therapies Offered'].isna().sum()

95

There are only 95 profiles missing the data I need. Most likely they will have it entered in another paragraph, such as 'My Approach' and 'About Me'. It means I might need to look for target data in different sections of the profile.

In [5]:
# Checking for duplicate accounts
ukcp_df['name'].str.strip().duplicated().sum()

0

In [6]:
# Double-checking data types
ukcp_df.dtypes

name                          object
address                       object
Types of Therapies Offered    object
I work with                   object
What I can help with          object
My Approach                   object
About Me                      object
Special Interests             object
dtype: object

I will now separate "Types of Therapies Offered" from other columns and count the values.

In [7]:
# Creating a separate df
ukcp_types = ukcp_df[['name', 'Types of Therapies Offered']]

In [8]:
# Dropping null values
ukcp_types = ukcp_types.dropna(subset='Types of Therapies Offered')

In [9]:
# Checking
ukcp_types['Types of Therapies Offered'].isna().sum()

0

In [10]:
# Creating a new column for each modality
types_matrix = ukcp_types['Types of Therapies Offered'].str.split(', ', expand=True).add_prefix('therapy').fillna('')
types_matrix

Unnamed: 0,therapy0,therapy1,therapy2,therapy3,therapy4,therapy5,therapy6,therapy7
0,Integrative Psychotherapist,,,,,,,
1,Psychosynthesis Psychotherapist,,,,,,,
2,Family and Systemic Psychotherapist,Family Therapist,Systemic Family and Couple Psychotherapist,Systemic Psychotherapist,,,,
3,Family and Systemic Psychotherapist,Family Therapist,Systemic Family and Couple Psychotherapist,Systemic Psychotherapist,,,,
4,Transpersonal Psychotherapist,,,,,,,
...,...,...,...,...,...,...,...,...
7694,Lacanian Analyst,Psychoanalytic Psychotherapist,,,,,,
7695,Analytical Psychologist - Jungian Analyst,,,,,,,
7696,Family and Systemic Psychotherapist,Family Therapist,Systemic Family and Couple Psychotherapist,Systemic Psychotherapist,,,,
7697,Group Analyst,,,,,,,


In [409]:
# Creating an empty dataframe that I will add data to
types_count = pd.DataFrame(columns = ['modality', 'count'])
types_count

Unnamed: 0,modality,count


In [410]:
# Creating a temporary dataframe with value counts for each column and concatenating them all to one
for column in types_matrix:
    temp_df = pd.DataFrame(types_matrix[column].value_counts().reset_index())
    temp_df.columns = ['modality', 'count']
    types_count = pd.concat([types_count, temp_df], axis=0, ignore_index=True)

In [411]:
# Stripping white space
types_count['modality'] = types_count['modality'].apply(lambda x: x.strip())

In [412]:
# Aggregating count values from all columns
types_count = types_count.groupby('modality').sum()
types_count.drop(index = '', inplace=True) # dropping aggregated none-values
types_count = types_count.sort_values('count', ascending=False) # sorting

In [413]:
types_count

Unnamed: 0_level_0,count
modality,Unnamed: 1_level_1
Integrative Psychotherapist,1345
Family and Systemic Psychotherapist,1186
Systemic Psychotherapist,1185
Family Therapist,1183
Systemic Family and Couple Psychotherapist,1181
...,...
Psychosexual Psychotherapist,1
Child Counsellor,1
Compassion based Psychotherapist,1
Constructivist Psychotherapist,1


We have 70 different types of psychotherapy. It is very likely some are listed under different names and making the total number bigger than it actually is.

In [414]:
# Resetting index
types_count = types_count.reset_index()
types_count.columns = ['modality', 'count']
types_count

Unnamed: 0,modality,count
0,Integrative Psychotherapist,1345
1,Family and Systemic Psychotherapist,1186
2,Systemic Psychotherapist,1185
3,Family Therapist,1183
4,Systemic Family and Couple Psychotherapist,1181
...,...,...
65,Psychosexual Psychotherapist,1
66,Child Counsellor,1
67,Compassion based Psychotherapist,1
68,Constructivist Psychotherapist,1


In [73]:
# Prining all values so I can see them better, and sorting alphabetically
for count, row in enumerate(types_count['modality'].sort_values()):
    print(count, row)

0 Adolescent Counsellor
1 Adolescent Psychotherapeutic Counsellor
2 Adolescent Psychotherapist
3 Analytical Psychologist
4 Analytical Psychologist - Jungian Analyst
5 Analytical Psychotherapist
6 Analytical Psychotherapist (Jungian)
7 Attachment-based Psychoanalytic Psychotherapist
8 Biodynamic Psychotherapist
9 Body Psychotherapist
10 Child Counsellor
11 Child Psychotherapeutic Counsellor
12 Child Psychotherapist
13 Child and Adolescent Psychotherapeutic Counsellor
14 Child and Adolescent Psychotherapist
15 Cognitive Analytic Therapist
16 Cognitive and Behavioural Psychotherapist
17 Compassion based Psychotherapist
18 Constructivist Psychotherapist
19 Contemporary Psychoanalyst
20 Contemporary Psychotherapist
21 Core Process Psychotherapist
22 Dance Movement Psychotherapist
23 Educational Psychotherapist
24 Existential Psychotherapist
25 Existential-Analytic Psychotherapist
26 Family Therapist
27 Family and Systemic Psychotherapist
28 Gestalt Group Psychotherapist
29 Gestalt Psychothe

It is clear that some therapies do indeed appear under multiple names, and are likely listed together. I will separate all listings of more than one therapy type and inspect them.

In [74]:
# Aggregating all variations of therapy listings
dup_types = pd.DataFrame(ukcp_types['Types of Therapies Offered'].value_counts())
dup_types = dup_types.reset_index()
dup_types.columns = ['types', 'count']

In [75]:
# Counting number of commas (separator between the types). It will tell me how many types of therapies are listed in each
# profile.
dup_types['comma_count'] = 0
for i in range(0,len(dup_types)):
    dup_types.iloc[i, 2] = dup_types.iloc[i, 0].count(', ')

In [76]:
dup_types['comma_count'].value_counts()

1    142
2     69
0     63
4     12
3      7
5      2
7      1
Name: comma_count, dtype: int64

The numbers are small enough that I can print all variations and inspect them.

In [77]:
for count, row in enumerate(dup_types['types']):
    print(count, row)

0 Family and Systemic Psychotherapist, Family Therapist, Systemic Family and Couple Psychotherapist, Systemic Psychotherapist
1 Integrative Psychotherapist
2 Psychoanalytic Psychotherapist
3 Transpersonal Psychotherapist
4 Transactional Analysis Psychotherapist
5 Psychodynamic Psychotherapist
6 Psychotherapeutic Counsellor
7 Existential Psychotherapist
8 Gestalt Psychotherapist
9 Core Process Psychotherapist
10 Integrative Arts Psychotherapist
11 Group Analyst
12 Humanistic and Integrative Psychotherapist
13 Psychosynthesis Psychotherapist
14 Person Centred Psychotherapist
15 Attachment-based Psychoanalytic Psychotherapist
16 Hypno -Psychotherapist
17 Integrative Child Psychotherapist
18 Sexual and Relationship Psychotherapist
19 Analytical Psychologist - Jungian Analyst
20 Psychodrama Psychotherapist
21 Cognitive Analytic Therapist
22 Gestalt Group Psychotherapist, Gestalt Psychotherapist
23 Humanistic Psychotherapist
24 Child Psychotherapist, Integrative Child Psychotherapist
25 Adol

In [78]:
dup_types

Unnamed: 0,types,count,comma_count
0,"Family and Systemic Psychotherapist, Family Th...",1163,3
1,Integrative Psychotherapist,1126,0
2,Psychoanalytic Psychotherapist,551,0
3,Transpersonal Psychotherapist,411,0
4,Transactional Analysis Psychotherapist,379,0
...,...,...,...
291,"Gestalt Psychotherapist, Person Centred Psycho...",1,1
292,"Existential Psychotherapist, Psychotherapeutic...",1,1
293,"Gestalt Group Psychotherapist, Gestalt Psychot...",1,3
294,"Child Psychotherapist, Integrative Psychothera...",1,2


I also want to see most common variations - those that appear more than once, and have more than one therapy listed.

In [79]:
filt = (dup_types['count']>1) & dup_types['comma_count']>0
dup_type_groups = dup_types[filt]

In [80]:
# Printing variations that appear more than once
for count, row in enumerate(dup_type_groups['types']):
    print(count, row)

0 Family and Systemic Psychotherapist, Family Therapist, Systemic Family and Couple Psychotherapist, Systemic Psychotherapist
1 Gestalt Group Psychotherapist, Gestalt Psychotherapist
2 Child Psychotherapist, Integrative Child Psychotherapist
3 Adolescent Psychotherapist, Child Psychotherapist
4 Body Psychotherapist, Integrative Psychotherapist
5 Integrative Psychotherapist, Psychotherapeutic Counsellor
6 Humanistic and Integrative Psychotherapist, Integrative Psychotherapist
7 Psychoanalytic Psychotherapist, Psychodynamic Psychotherapist
8 Humanistic Psychotherapist, Person Centred Psychotherapist
9 Child Psychotherapist, Integrative Psychotherapist
10 Integrative Psychotherapist, Psychodynamic Psychotherapist
11 Contemporary Psychotherapist, Hypno -Psychotherapist
12 Psychodynamic Psychotherapist, Psychotherapeutic Counsellor
13 Existential Psychotherapist, Integrative Psychotherapist
14 Psychoanalyst, Psychoanalytic Psychotherapist
15 Humanistic Psychotherapist, Integrative Psychothe

# Conclusion:
- Most profiles have "Types of Therapies Offered" listed in a separate section, but it may also be specified in other parts of the profile, such as "About Me" or "My Approach". For an accurate count, I might need to search for listings in multiple profile sections.
- I can reduce number of therapies by:
    i) making the names consistent throughout
    ii) ignoring the subtle differences between 'psychotherapist', 'counsellor', and 'psychotheraputic counsellor'
- I need to be careful with umbrella terms such as Humanistic therapy, which can be either listed by itself or together with more specific types like Gestalt.
- After reducing the number of therapies, I will know what to look for in other sections of the profiles, in case therapies are specified there instead.