# Preparation

In [15]:
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
import numpy as np

## Department of Management

In [2]:
#getting the management webpage
r_mgt=requests.get('https://www.lse.ac.uk/management/people-home')
soup_mgt=BeautifulSoup(r_mgt.content,'lxml')

In [3]:
#getting the name,department,label information as a list
mgt=[]

label1=soup_mgt.find_all('h1')
label1_wanted1=['academic staff',
               'other academic and research staff']
label1_wanted2=['professional services staff']

for label1 in soup_mgt.find_all('h1'):
    department='management'
    
    #getting info under tabs: 'academic staff', 'other academic and research staff'
    #these two have similar structures
    if label1.get_text().lower() in label1_wanted1:
        label=label1.get_text()
        shortcut=label1.find_next('div',attrs={'class':'accordionContainer'})
        for label2 in shortcut.find_all('h2',attrs={'class':'accordion__title'}):
            inlabel2=label2.find_next('div',attrs={'class':'accordion__content'})
            for person in inlabel2.find_all('div',attrs={'class':'accordion__txt'}):
                #2 different cases
                if person.find('p').find('strong'):
                    name=person.find('p').find('strong').get_text()
                    mgt.append([name,department,label]) 
                else:
                    name=person.find('p').find_next('a').get_text()
                    mgt.append([name,department,label])
    
    #getting info under tab: 'professional services staff'
    #this one has a different structure
    if label1.get_text().lower() in label1_wanted2:
        label=label1.get_text()
        shortcut=label1.find_next('div',attrs={'class':'accordionContainer'})
        for label2 in shortcut.find_all('h2',attrs={'class':'accordion__title'}):
            inlabel2=label2.find_next('div',attrs={'class':'accordion__content'})
            for person in inlabel2.find_all('div',attrs={'class':'accordion__txt'}):
                #3 different cases
                if person.find('p').find('strong'):
                    name=person.find('p').find('strong').get_text()
                    mgt.append([name,department,label]) 
                elif person.find('p').find('b'):
                    name=person.find('p').find('b').get_text()
                    mgt.append([name,department,label]) 
                elif person.find('p').find('span'):
                    name=person.find('p').find('span').get_text()
                    mgt.append([name,department,label]) 
                else:
                    print('not all included')
   



In [4]:
#from the extracted name, decide if the person is prof/dr/non

for i in range(len(mgt)):
    namestr=mgt[i][0].lower().split()
    if 'dr' in namestr:
        mgt[i].append('Dr')
    elif 'professor' in namestr:
        mgt[i].append('Professor')
    else:
        mgt[i].append('Non')
    
    #getting rid of other useless strings, only keep the name
    name=mgt[i][0].replace('Dr','').replace('Professor','').replace('Sir','').replace('\xa0',' ').split()
    name=" ".join(name)
    mgt[i][0]=name

In [5]:
#convert list to dataframe

mgt=pd.DataFrame(mgt,columns=['name','department','label','title'])

In [6]:
#some problems with the webscaped information

print('\nSome names called vacancies are accidentally included, as they directly appear on the webpage.')
display(mgt[mgt['name']=='Vacancy'])
print('\n\nThere is one name missing because of the abnormal structure of the webpage')
display(mgt[mgt['name']==''])
display(mgt[148:151])


Some names called vacancies are accidentally included, as they directly appear on the webpage.


Unnamed: 0,name,department,label,title
75,Vacancy,management,Professional services staff,Non
82,Vacancy,management,Professional services staff,Non
100,Vacancy,management,Professional services staff,Non
122,Vacancy,management,Professional services staff,Non
133,Vacancy,management,Professional services staff,Non




There is one name missing because of the abnormal structure of the webpage


Unnamed: 0,name,department,label,title
149,,management,Other academic and research staff,Dr


Unnamed: 0,name,department,label,title
148,Michele Fioretti,management,Other academic and research staff,Dr
149,,management,Other academic and research staff,Dr
150,Dina Rabie,management,Other academic and research staff,Dr


In [7]:
#dealing with the problems

mgt['name'] = mgt['name'].replace('Vacancy', pd.NA)
mgt=mgt.copy().dropna()
mgt.loc[mgt['name']=='','name']='Dr Henry Hang Shen'

display(mgt[mgt['name']=='Vacancy'])
display(mgt[mgt['name']==''])

Unnamed: 0,name,department,label,title


Unnamed: 0,name,department,label,title


In [8]:
#final dataframe
mgt

Unnamed: 0,name,department,label,title
0,Bethania Antunes,management,Academic staff,Dr
1,Sarah Ashwin,management,Academic staff,Professor
2,Jonathan E. Booth,management,Academic staff,Dr
3,Wafaa Elmezraoui,management,Academic staff,Non
4,Karin King,management,Academic staff,Dr
...,...,...,...,...
168,Paul Willman,management,Other academic and research staff,Professor
169,Mohamed Abouaziza,management,Other academic and research staff,Dr
170,Anushri Gupta,management,Other academic and research staff,Dr
171,Philipp Schoenegger,management,Other academic and research staff,Dr


## Department of Government

In [9]:
#getting the government webpage

r_gvt=requests.get('https://www.lse.ac.uk/government/people')
soup_gvt=BeautifulSoup(r_gvt.content,'lxml')

In [10]:
#getting the name,department,label information as a list

gvt=[]

label1=soup_gvt.find_all('h2',attrs={'class':'accordion__title'})
llabel1_wanted1=['academic staff',
               'professional services staff',
                'research staff']
label1_wanted2=['guest teachers and gtas']
label1_wanted3=['emeritus, affiliated & visiting academic staff']

for label1 in soup_gvt.find_all('h2',attrs={'class':'accordion__title'}):
    department='government'
    
    # the following three tabs have the similar structures:
    #'academic staff','professional services staff','research staff'
    if label1.get_text().lower() in llabel1_wanted1:
        label=label1.get_text()
        shortcut=label1.find_next('div',attrs={'class':'accordion__content'})
        for person in shortcut.find_all('div',attrs={'class':'accordion__txt'}):
            if person.find('p').find('strong'):
                name=person.find('p').find('strong').get_text()
                gvt.append([name,department,label])
            elif person.find('p').find('b'):
                name=person.find('p').find('b').get_text()
                gvt.append([name,department,label])
            else:
                print('not all structure considered')
    
    #different structure for 'guest teachers and gtas'
    if label1.get_text().lower() in label1_wanted2:
        label=label1.get_text()
        shortcut=label1.find_next('div',attrs={'class':'accordion__content'})
        for person in shortcut.find_all('p'):
            if person.find('strong'):
                continue
            else:
                name=person.get_text()
                gvt.append([name,department,label])

    #different structure for 'emeritus, affiliated & visiting academic staff'
    if label1.get_text().lower() in label1_wanted3:
        label=label1.get_text()
        shortcut=label1.find_next('div',attrs={'class':'accordion__content'})
        for people in shortcut.find_all('ul'):
            for person in people.find_all('li'):
                if person.find('p'):
                    if person.find('p').find('strong'):
                        name=person.find('p').find('strong').get_text()
                        gvt.append([name,department,label])
                    else:
                        name=person.find('p').find('span').get_text()
                        gvt.append([name,department,label])
                else:
                    if person.find('strong'):
                        name=person.find('strong').get_text()
                        gvt.append([name,department,label])
                    else:
                        print('Not all structures considered')
    

In [11]:
#from the extracted name, decide if the person is prof/dr/non

for i in range(len(gvt)):
    namestr=gvt[i][0].lower().split()
    if '(dr)' in namestr:
        gvt[i].append('Dr')
    elif '(prof)' in namestr:
        gvt[i].append('Professor')
    else:
        gvt[i].append('Non')
    
    #getting rid of other useless strings, only keep the name
    namestrings=[]
    name=gvt[i][0].replace('\xa0',' ').split()
    for stri in name:
        string=stri.strip('()')
        if string.startswith('GV'):
            continue
        elif string in ['Dr','Mr','Prof']:
            continue
        else:
            namestrings.append(stri)  
    name=" ".join(namestrings)
    gvt[i][0]=name

In [12]:
#converting list to dataframe, final dataset
gvt=pd.DataFrame(gvt,columns=['name','department','label','title'])
gvt

Unnamed: 0,name,department,label,title
0,Victor Agboga,government,Academic Staff,Non
1,Elise Antoine,government,Academic Staff,Dr
2,Paul Apostolidis,government,Academic Staff,Professor
3,Tom Bailey,government,Academic Staff,Non
4,Daniel Berliner,government,Academic Staff,Dr
...,...,...,...,...
158,Lukas Slothuus,government,"Emeritus, Affiliated & Visiting Academic Staff",Dr
159,Zeynep Somer Topcu,government,"Emeritus, Affiliated & Visiting Academic Staff",Dr
160,Christine Stedtnitz,government,"Emeritus, Affiliated & Visiting Academic Staff",Dr
161,Jill Stuart,government,"Emeritus, Affiliated & Visiting Academic Staff",Dr


## Department of Economic Histiory

In [13]:
url_eh_staff = 'https://www.lse.ac.uk/Economic-History/People'
r = requests.get(url_eh_staff)
soup = BeautifulSoup(r.content,'lxml')

In [14]:
df_eh = pd.DataFrame(columns=['Name', 'Type'])

# Deal with special "Type"
manage_divs = soup.find_all('div', {'class': 'accordion__txt'})[:4]
for manage_div in manage_divs:
    manage_info = manage_div.find('strong')
    if manage_info:
        name = manage_info.get_text().strip()
        df_eh = pd.concat([df_eh, pd.DataFrame({'Name': [name], 'Type': ['Senior Management Team']})], ignore_index=True)

triggers = soup.find_all('a', {'class': 'accordion__trigger'})[:6]
for trigger in triggers:
    trigger_text = trigger.get_text().strip()
    # Find the sibling <div class="accordion__panel">
    panel_div = trigger.find_next_sibling('div', {'class': 'accordion__panel'})
    if panel_div:
        # Find all <div class="accordion__txt"> within the panel_div
        txt_divs = panel_div.find_all('div', {'class': 'accordion__txt'})
        for txt_div in txt_divs:
            info = txt_div.find('strong')
            if info:
                name = info.get_text().strip()
                df_eh = pd.concat([df_eh, pd.DataFrame({'Name': [name], 'Type': [trigger_text]})], ignore_index=True)
                
df_eh

Unnamed: 0,Name,Type
0,Professor Patrick Wallis - Head of Department,Senior Management Team
1,Professor Neil Cummins - Deputy Head for Teaching,Senior Management Team
2,Professor Sara Horrell - Deputy Head for Research,Senior Management Team
3,Jennie Stayner - Department Manager,Senior Management Team
4,Professor Olivier Accominotti,"Faculty, Fellows and Teachers"
...,...,...
65,Kamilah Hassan,Professional Support Staff
66,Helena Ivins,Professional Support Staff
67,Tracy Keefe,Professional Support Staff
68,Jennie Stayner,Professional Support Staff


In [15]:
def normalize_name(name):
    name = name.split(' - ')[0]
    name = ' '.join(name.split())
    return name

df_eh['Title'] = df_eh['Name'].apply(lambda name: name.split()[0] if name.split()[0] in ['Dr', 'Professor'] else None)

for i, split_name in enumerate(df_eh['Name'].str.split()):
    if split_name[0] in ['Dr', 'Professor', 'Mr']:
        df_eh.loc[i, 'Name'] = ' '.join(split_name[1:]) if split_name[0] in ['Dr', 'Professor'] else ' '.join(split_name[1:])
df_eh

Unnamed: 0,Name,Type,Title
0,Patrick Wallis - Head of Department,Senior Management Team,Professor
1,Neil Cummins - Deputy Head for Teaching,Senior Management Team,Professor
2,Sara Horrell - Deputy Head for Research,Senior Management Team,Professor
3,Jennie Stayner - Department Manager,Senior Management Team,
4,Olivier Accominotti,"Faculty, Fellows and Teachers",Professor
...,...,...,...
65,Kamilah Hassan,Professional Support Staff,
66,Helena Ivins,Professional Support Staff,
67,Tracy Keefe,Professional Support Staff,
68,Jennie Stayner,Professional Support Staff,


## Department of Geography and Environment

In [17]:
url_eh_staff = 'https://www.lse.ac.uk/geography-and-environment/our-people'
r = requests.get(url_eh_staff)
soup = BeautifulSoup(r.content,'lxml')

In [18]:
geo=[]

# Deal with special "Type" - Senior Managment Team
triggers = soup.find_all('a', {'class': 'accordion__trigger'})
for trigger in triggers:
    department='Geography and Environment'
    label=trigger.get_text().replace('\n','')
    # Find the sibling <div> with class 'accordion__panel'
    panel_div = trigger.find_next_sibling('div', {'class': 'accordion__panel'})
    if panel_div:
        # Find all <div class="accordion__txt"> within the panel_div
        txt_divs = panel_div.find_all('div', {'class': 'accordion__txt'})
        for txt_div in txt_divs:
            # Find all <p> tags within the txt_div
            p_tags = txt_div.find_all('p')
            if len(p_tags) > 1:  # Ensure there is at least a second <p> tag
                second_p = p_tags[1]
                manage_info = second_p.find('strong')
                if manage_info:
                    name = manage_info.get_text().strip().replace('\xa0',' ')
                    if name!='':
                        geo.append([name,department,label]) 
                        
# Deal with 'Academic staff','Teaching staff','Affiliate staff'
soup.find_all('h2')
wanted=['Academic staff','Teaching staff','Affiliate staff']
for title in soup.find_all('h2'):
    department='geography and environment'
    if 'class' in title.attrs:
        continue
    else:
        if title.get_text() in wanted:
            section_div = title.find_next('section', {'class': 'accordion'})
            if section_div:
                txt_divs = section_div.find_all('div', {'class': 'accordion__txt'})
                for txt_div in txt_divs:
                    info = txt_div.find('a')
                    strong_info = txt_div.find('strong')
                    if info:
                        name = info.get_text().strip().replace('\xa0',' ')
                        geo.append([name,department,title.get_text()])                 
                    elif strong_info:
                        name = strong_info.get_text().strip().replace('\xa0',' ')
                        geo.append([name,department,title.get_text()])

                                
# Deal with special "Type" - Professional Services Staff
professional_staff_h2 = soup.find('h2', string='Professional Services Staff')
if professional_staff_h2:
    department='geography and environment'
    section_div = professional_staff_h2.find_next('section',{'class': 'accordion'})
    if section_div:
        txt_divs = section_div.find_all('div', {'class': 'accordion__txt'})
        for txt_div in txt_divs:
            info_strong = txt_div.find('strong')
            if info_strong:
                name = info_strong.get_text().strip().replace('\xa0',' ')
                geo.append([name,department,'Professional Services Staff'])
                
# Deal with special "Type" - Visiting Staff                  
visiting_staff_h2 = soup.find('h2', string='Visiting staff')
if visiting_staff_h2:
    department='geography and environment'
    section_div = visiting_staff_h2.find_next('div',{'class': 'accordion__content'})
    for person in section_div.find_all('p'):
        if person.find('a'):
            name=person.find('a').get_text().replace('\xa0',' ')
            geo.append([name,department,'Visiting staff'])
        else:
            name=person.get_text().replace('\xa0',' ')
            geo.append([name,department,'Visiting staff'])

In [19]:
len(geo)

141

In [20]:
for i in range(len(geo)):
    namestr=geo[i][0].lower().split()
    if 'dr' in namestr:
        geo[i].append('Dr')
    elif 'prof' in namestr:
        geo[i].append('Professor')
    elif 'professor' in namestr:
        geo[i].append('Professor')
    else:
        geo[i].append('Non')
        
    if geo[i][0].startswith('Dr.'):
        name=geo[i][0].replace('Dr.','')
        geo[i][0]=name
    else:
        name=geo[i][0].replace('Dr','').replace('.','').replace('Prof','').replace('Professor','').replace('\xa0',' ').split()
        name=" ".join(name)
        geo[i][0]=name

In [23]:
geo=pd.DataFrame(geo,columns=['name','department','label','title'])
geo.head(10)

Unnamed: 0,name,department,label,title
0,Hyun Bang Shin,Geography and Environment,Senior Management Team,Professor
1,Christian Hilber,Geography and Environment,Senior Management Team,Professor
2,Claire Mercer,Geography and Environment,Senior Management Team,Professor
3,Simon Dietz,Geography and Environment,Senior Management Team,Professor
4,Nancy Holman,Geography and Environment,Senior Management Team,Dr
5,Olmo Silva,Geography and Environment,Senior Management Team,Professor
6,Sam Colegate,Geography and Environment,Senior Management Team,Non
7,Cornelia Agyenim – Boateng,geography and environment,Academic staff,Non
8,Laura Antona,geography and environment,Academic staff,Dr
9,Gabriel Ahlfeldt,geography and environment,Academic staff,Professor


By inspection, there is one name missing because of the abnormal structure of the webpage

In [24]:
display(geo[geo['name']==''])

Unnamed: 0,name,department,label,title
11,,geography and environment,Academic staff,Dr


In [25]:
geo.loc[geo['name']=='','name']='Dr Aretousa Bloom'

In [26]:
display(geo[geo['name']==''])

Unnamed: 0,name,department,label,title


In [27]:
geo.head(20)

Unnamed: 0,name,department,label,title
0,Hyun Bang Shin,Geography and Environment,Senior Management Team,Professor
1,Christian Hilber,Geography and Environment,Senior Management Team,Professor
2,Claire Mercer,Geography and Environment,Senior Management Team,Professor
3,Simon Dietz,Geography and Environment,Senior Management Team,Professor
4,Nancy Holman,Geography and Environment,Senior Management Team,Dr
5,Olmo Silva,Geography and Environment,Senior Management Team,Professor
6,Sam Colegate,Geography and Environment,Senior Management Team,Non
7,Cornelia Agyenim – Boateng,geography and environment,Academic staff,Non
8,Laura Antona,geography and environment,Academic staff,Dr
9,Gabriel Ahlfeldt,geography and environment,Academic staff,Professor
