In [1]:
import re
import unicodedata
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [20]:
# Prep: download the BYU Religious Education directory page and parse it.
# `s` is the parsed directory page that the following cell searches for faculty entries.

# A timeout keeps the cell from hanging indefinitely if the server never answers.
og_html = requests.get('https://religion.byu.edu/directory', timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently parsing an error page
# (which would just produce an empty faculty list further down).
og_html.raise_for_status()
# NOTE(review): no parser is named, so bs4 picks the "best" one installed on this
# machine; pin it (e.g. 'html.parser' or 'lxml') once it is confirmed which parser
# produced the recorded output.
s = bs(og_html.text)

In [21]:
# Each faculty entry on the directory page is headed by an <h3> tag with these classes
profs = s.find_all('h3', class_="PromoVerticalImage-title promo-title")


def _entry_text(heading, css_class):
    """Return the text of the `css_class` element found under `heading`'s parent tag.

    Returns None when the entry has no such element.
    """
    node = heading.parent.find(class_=css_class)
    return None if node is None else node.text


# Full names, Unicode-normalized (NFKD) so their formatting is consistent
names = [unicodedata.normalize("NFKD", prof.find('a').text) for prof in profs]

# Links to the individual pages (used for getting more info later)
links = [prof.find('a')['href'] for prof in profs]

# Job titles (Professor, Adjunct Professor, etc.); None when an entry lists none
titles = [_entry_text(prof, "PromoVerticalImage-jobTitle") for prof in profs]

# Teaching areas (ancient scripture, church history and doctrine, etc.) as a list per person.
# Some entries don't have one; those get an empty list so the list operations below
# (and anything downstream that expects a list) keep working instead of failing on None.
area_texts = [_entry_text(prof, "PromoVerticalImage-groups") for prof in profs]
areas = [text.split(', ') if text is not None else [] for text in area_texts]

# Remove "Salt Lake Center" from the area list and turn it into its own boolean variable
slc = []
for entry_areas in areas:
    at_slc = 'Salt Lake Center' in entry_areas
    if at_slc:
        entry_areas.remove('Salt Lake Center')
    slc.append(at_slc)

In [241]:
# Go into each link to get the building and room number of their offices and their phone numbers

offices = [None] * len(profs)
phone_numbers = [None] * len(profs)
for i, link in enumerate(links):                     # One request per faculty member on the page
    # A timeout keeps one unresponsive page from hanging the whole loop.
    page = requests.get(link, timeout=30)
    # Parsed into its own name: `s` must keep holding the directory page so the
    # directory-scraping cell still works if it is re-run after this one.
    prof_soup = bs(page.text)

    # The first line of the rich-text block is the office location.
    # Pages without that block leave the office as None.
    office_block = prof_soup.find(class_="RichTextModule-items RichTextBody")
    if office_block is not None:
        offices[i] = office_block.text.strip(' \n').split(' \n')[0]

    # Phone number; None when the page does not list one.
    # (Searching the page text for r'\d{3}-\d{3}-\d{4}|\(\d{3}\)\d{3}-\d{4}' almost
    # works as an alternative, but it's a little more finicky.)
    phone_tag = prof_soup.find(class_="EmployeePage-phoneNumber description-text")
    if phone_tag is not None:
        phone_numbers[i] = phone_tag.text

# Rewrite the spelled-out building name as '? JSB' so that, for an office given only
# as the building name, the split below yields '?' as the room and 'JSB' as the building.
offices = [
    office if office is None else office.replace('Joseph Smith Building', '? JSB')
    for office in offices
]

# Split into two lists, room number and building ("<room> <building>").
# A missing or incomplete office yields None instead of raising.
room_numbers = []
buildings = []
for office in offices:
    parts = office.split(' ') if office else []
    room_numbers.append(parts[0] if parts else None)
    buildings.append(parts[1] if len(parts) > 1 else None)

In [257]:
# This cell takes about a minute to run

# Get BeautifulSouped HTML of the page searching for each religion professor's name in the BYU section of RMP

rmp_searches = [None] * len(profs)
for i, full_name in enumerate(names):
    name_parts = full_name.split(' ')
    # Search on the first and last word of the name only. Percent-encode both so
    # characters such as '&' or '#' cannot change the meaning of the query string.
    first_q = quote(name_parts[0], safe='')
    last_q = quote(name_parts[-1], safe='')
    # A timeout keeps one unresponsive request from hanging the whole loop.
    r = requests.get(
        f'https://www.ratemyprofessors.com/search/teachers?query={first_q}%20{last_q}&sid=U2Nob29sLTEzNQ==',
        timeout=30,
    )
    rmp_searches[i] = bs(r.text)

In [278]:
# Get the URLs of each professor's RMP page

# The legacy ID is what the professor-page URL is built from. Raw string so that
# `\d` is a regex digit class rather than an (invalid) string escape; compiled once.
legacy_id_pattern = re.compile(r'legacyId":\d+')

# Position of the <script> tag whose text holds the search-result data.
# NOTE(review): this index is tied to RMP's current page layout.
info_script_index = 7

rmp_prof_pages = [None] * len(profs)
for i, search_page in enumerate(rmp_searches):
    scripts = search_page.find_all('script')
    if len(scripts) <= info_script_index:
        continue                                  # page has no such script: leave as None
    # Take the first legacy ID in that script (i.e. the first search result)
    match = legacy_id_pattern.search(scripts[info_script_index].text)
    if match is None:
        continue                                  # no search results: leave as None
    legacy_id = match.group(0).split(':')[1]      # keep only the digits after the colon
    rmp_prof_pages[i] = f'https://www.ratemyprofessors.com/professor?tid={legacy_id}'

In [280]:
# Remove the URLs of the professors who don't actually teach at BYU.
# This is necessary because when no professor at the searched school matches the name,
# RMP suggests professors with similar names from other schools, and the cell above
# simply takes the first search result: a BYU professor if there is a match, but
# someone from another school if there is no BYU match and there is one elsewhere.
for i, page_url in enumerate(rmp_prof_pages):
    if not isinstance(page_url, str):
        continue                                   # no RMP page was found for this person
    # A timeout keeps one unresponsive request from hanging the whole loop.
    response = requests.get(page_url, timeout=30)
    # Parsed into its own name so the directory page held in `s` is not overwritten.
    rmp_page = bs(response.text)
    # Keep the URL only if some text node on the page is exactly the school name.
    # (`string=` is the current name of bs4's former `text=` argument.)
    if rmp_page.find(string='Brigham Young University') is None:
        rmp_prof_pages[i] = None

In [281]:
#Get the BeautifulSouped HTML from every professor's RMP page
#Takes a minute to run, so just run it once and then use the results to get other info

# Entries stay None for people without an RMP page URL
rmp_bs = [None] * len(profs)
for i, page_url in enumerate(rmp_prof_pages):
    if isinstance(page_url, str):
        # A timeout keeps one unresponsive request from hanging the whole loop.
        r = requests.get(page_url, timeout=30)
        rmp_bs[i] = bs(r.text)

In [282]:
#Get available ratings as floats
#If professor doesn't have a page on RMP or has never been rated, his rating is stored as None

ratings = [None] * len(profs)
for i, rmp_page in enumerate(rmp_bs):
    if not isinstance(rmp_prof_pages[i], str) or rmp_page is None:
        continue                                   # no RMP page for this person
    # NOTE(review): this class name looks build-generated; if RMP changes it the
    # lookup returns None, which is treated as "no rating" rather than crashing.
    rating_tag = rmp_page.find('div', class_="RatingValue__Numerator-qw8sqy-2 liyUjw")
    if rating_tag is None:
        continue
    rating_text = rating_tag.text
    if rating_text == 'N/A':                       # page exists but nobody has rated yet
        continue
    try:
        ratings[i] = float(rating_text)
    except ValueError:
        ratings[i] = None                          # unexpected non-numeric text

In [283]:
# Assemble everything scraped above into one table, one row per directory entry.
# The dict's insertion order determines the column order.
columns_by_label = {
    'name': names,
    'position': titles,
    'teaching area': areas,
    'salt lake center': slc,
    'RMP rating': ratings,
    'building': buildings,
    'room number': room_numbers,
    'phone number': phone_numbers,
    'BYU website': links,
    'RMP website': rmp_prof_pages,
}
d = pd.DataFrame(columns_by_label)

In [285]:
# Show the assembled table (the display elides the middle rows of the frame)
d

Unnamed: 0,name,position,teaching area,salt lake center,RMP rating,building,room number,phone number,BYU website,RMP website
0,Ken Alford,Professor,[Church History and Doctrine],False,4.8,JSB,365H,801-422-6497,https://religion.byu.edu/directory/ken-alford,https://www.ratemyprofessors.com/professor?tid...
1,Camey Andersen,Adjunct Instructor,[Ancient Scripture],True,5.0,JSB,?,801-422-0347,https://religion.byu.edu/directory/camey-andersen,https://www.ratemyprofessors.com/professor?tid...
2,Mark Ashurst-McGee,Adjunct Instructor,[Church History and Doctrine],True,,JSB,?,,https://religion.byu.edu/directory/mark-ashurs...,https://www.ratemyprofessors.com/professor?tid...
3,Alex Baugh,Professor,[Church History and Doctrine],False,3.5,JSB,210G,801-422-5164,https://religion.byu.edu/directory/alex-baugh,https://www.ratemyprofessors.com/professor?tid...
4,Daniel Becerra,Assistant Professor,[Ancient Scripture],False,5.0,JSB,270E,,https://religion.byu.edu/directory/daniel-becerra,https://www.ratemyprofessors.com/professor?tid...
...,...,...,...,...,...,...,...,...,...,...
119,Fred Woods,Professor,[Church History and Doctrine],False,3.4,JSB,365E,801-422-3366,https://religion.byu.edu/directory/fred-woods,https://www.ratemyprofessors.com/professor?tid...
120,Guinevere Woolstenhulme,Adjunct Instructor,[Ancient Scripture],False,5.0,JSB,102E,,https://religion.byu.edu/directory/guinevere-w...,https://www.ratemyprofessors.com/professor?tid...
121,Traci Wright,Administrative Assistant,[Ancient Scripture],False,,JSB,375,801-422-2067,https://religion.byu.edu/directory/traci-wright,
122,Wesley Wright,Preservice Trainer,[Seminaries and Institutes],False,4.9,JSB,207A,801-422-6131,https://religion.byu.edu/directory/wesley-wright,https://www.ratemyprofessors.com/professor?tid...
