In [8]:
import requests
from bs4 import BeautifulSoup as bs
from bs4.element import Comment
import pandas as pd
import re
import numpy as np

In [9]:
#functions for getting all txt from a page
#source: https://stackoverflow.com/questions/1936466/beautifulsoup-grab-visible-webpage-text
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def text_from_html(body):
    # soup = bs(body.content)
    texts = soup.find_all(text=True)
    visible_texts = filter(tag_visible, texts)  
    return u" ".join(t.strip() for t in visible_texts)

In [10]:
#set up links that will be appended to w/ counts of each word later
school_link = []
campus_count = []
boarding_count = []
housing_count = []
residence_count = []
dorm_count = []

In [11]:
#example list of school links to scrape
lst_of_schools = [
                  'bad_url.com','https://tjhsst.fcps.edu/', 'https://www2.montgomeryschoolsmd.org/schools/tildenms/','https://www2.montgomeryschoolsmd.org/schools/wjhs/', 'https://www2.montgomeryschoolsmd.org/schools/newportmillms/',
                  'https://www.ssfs.org/', 'https://www.stjames.edu/', 'https://www.standrews-de.org/', 'https://www.taftschool.org/', 'http://mes.madisoncity.k12.al.us/'
                  ]

# lst_of_schools = [
#                   'https://tjhsst.fcps.edu/', 'https://www2.montgomeryschoolsmd.org/schools/tildenms/',
#                   'https://www.ssfs.org/'
#                   ]

failed_requests = 0

In [12]:
#request the website
for idx, link in enumerate(lst_of_schools):
  #skip non-links
  if not re.search('http', link):
    continue
  
  #add '/' at the end of the url
  if link[-1] != '/':
    link += '/'
  
  # print(f'at {idx} link: {link}')

  #put in the request for the link
  try:
    html = requests.get(link)
    soup = bs(html.content)
    txt = text_from_html(html).strip().lower().split()

    #get links with words that potentially has more info on the school
    about_links =[]
    for about in soup.find_all('a'):
      txt = about.get_text().lower()
      if re.search(r'\babout\b', txt):
        about_links.append(about['href'])
      elif re.search(r'\bfaq[s]?\b', txt):
        about_links.append(about['href'])
      elif re.search(r'\bour\b [a-z]+', txt):
        about_links.append(about['href'])

    # #make links that only have the tail end portion into full links
    end_link = re.search(r'\.com|\.net|\.org|\.info|\.gov|\.edu|\.us', link).end()
    base_link = link[:end_link]
    
    full_about_links = [link]
    for link_ in about_links:
      if re.search("http", link_):
          full_about_links.append(link_)
      #if it has no 'http' in the link, add it to the base link and original link
      else:
        new_link1 = f'{base_link}{link_[1:]}'
        new_link2 = f'{link}{link_[1:]}'
        full_about_links.append(new_link1)
        full_about_links.append(new_link2)

    #count words that pertain to boarding schools
    campus = 0
    boarding = 0
    housing = 0
    residence = 0
    dorm = 0

    #remove duplicates from full_about_links
    full_about_links = list(set(full_about_links))

    for link_ in full_about_links:
      #put in the request for the link
      try:
        html = requests.get(link_)
        soup = bs(html.content)
        txt = text_from_html(html).strip().lower().split()
        campus += txt.count('campus')
        boarding += txt.count('boarding')
        housing += txt.count('housing')
        residence += txt.count('residence')
        dorm += txt.count('dorm')
      except:
        continue

# add the counts to the list
    school_link.append(link)
    campus_count.append(campus)
    boarding_count.append(boarding)
    housing_count.append(housing)
    residence_count.append(residence)
    dorm_count.append(dorm)

# add np.nan when the link doesn't work
  except:
    school_link.append(link)
    campus_count.append(np.nan)
    boarding_count.append(np.nan)
    housing_count.append(np.nan)
    residence_count.append(np.nan)
    dorm_count.append(np.nan)

    failed_requests += 1
    

In [13]:
#turn counts into a pandas dataframe
all_counts = {'school': school_link,
              'campus': campus_count,
              'boarding': boarding_count,
              'housing': housing_count,
              'residence': residence_count,
              'dorm': dorm_count}
boarding_school = pd.DataFrame(all_counts, columns = ['school', 'campus', 'boarding', 'housing', 'residence', 'dorm'])

In [14]:
boarding_school

Unnamed: 0,school,campus,boarding,housing,residence,dorm
0,https://tjhsst.fcps.edu/,4,0,2,0,0
1,https://www2.montgomeryschoolsmd.org/schools/t...,0,0,0,0,0
2,https://www2.montgomeryschoolsmd.org/schools/w...,0,0,0,0,0
3,https://www2.montgomeryschoolsmd.org/schools/n...,0,0,0,0,0
4,https://www.ssfs.org/,111,128,0,0,29
5,https://www.stjames.edu/,39,30,0,0,4
6,https://www.standrews-de.org/,16,32,1,0,29
7,https://www.taftschool.org/,302,5,0,2,7
8,http://mes.madisoncity.k12.al.us/,0,0,0,0,0
