# Functions

In [34]:
import unittest
import os
import bs4

def get_files_in_dir(dir_name, extension = None, num_files = None):
    """List the entries of a directory, optionally filtered and truncated.

    Args:
        dir_name: Path of the directory to list.
        extension: If not None, keep only names that end with this string.
            This is a plain suffix match, so "txt" and ".txt" both work.
        num_files: If not None, keep at most this many names. The limit is
            applied after the extension filter.

    Returns:
        A sorted list of entry names (not full paths), or None when
        dir_name does not exist or is not a directory.
    """
    # isdir() is False for missing paths, regular files and special files,
    # so one check replaces the separate exists()/isfile() tests.
    if not os.path.isdir(dir_name):
        return None
    # os.listdir() returns names in arbitrary order; sorting makes the
    # num_files cut-off reproducible between runs and machines.
    files = sorted(os.listdir(dir_name))
    if extension is not None:
        files = [f for f in files if f.endswith(extension)]
    if num_files is not None:
        files = files[:num_files]
    return files

def get_bs4_obj(filename):
    """Parse an HTML file into a BeautifulSoup object.

    Args:
        filename: Path of the HTML file; it is read as UTF-8.

    Returns:
        The parsed bs4.BeautifulSoup document, or None when filename is not
        an existing regular file.
    """
    # isfile() also rejects directories, which open() cannot read.
    if not os.path.isfile(filename):
        return None
    with open(filename, encoding='utf-8') as f:
        # Name the parser explicitly: without it bs4 picks whichever parser
        # happens to be installed, so the tree can differ between machines.
        return bs4.BeautifulSoup(f, "html.parser")


    
def get_alum_list(filename):
    """Collect the list-item texts from the alumni section of an HTML page.

    The section starts at the first <h2> whose text contains "lumni"
    (so both "Alumni" and "alumni" match) and ends at the next sibling
    <h2>. Only <ul> elements that are direct siblings of that heading
    are read.

    Args:
        filename: Path of the HTML file, passed to get_bs4_obj().

    Returns:
        A list with the text of every <li> found in those <ul> elements.
        The list is empty when the file cannot be loaded, when no matching
        heading exists, or when the section has no sibling <ul>.
    """
    dom = get_bs4_obj(filename)
    if dom is None:
        print('dom is none')
        return []

    headings = [h2 for h2 in dom.find_all("h2") if "lumni" in h2.text]
    if not headings:
        return []

    # Walk the heading's following siblings, keeping each <ul> until the
    # next <h2> marks the start of another section.
    uls = []
    sib = headings[0].next_sibling
    while sib is not None:
        if sib.name == "h2":
            break
        if sib.name == "ul":
            uls.append(sib)
        sib = sib.next_sibling

    # NOTE(review): find_all("li") also returns nested <li> elements, so an
    # item inside a nested list is counted on its own and again inside its
    # parent's text -- confirm whether that is intended.
    return [li.text for ul in uls for li in ul.find_all("li")]

# Manual check: run the parser on one page from ./test_files/ and show the result.
get_alum_list('./test_files/uol.html')


len 0


[]

# Unit tests

In [28]:


class TestListFiles(unittest.TestCase):
    """Tests for get_files_in_dir(); most rely on the ./test_files/ fixture directory."""

    def test_list_exists(self):
        """The function is defined in the notebook namespace."""
        self.assertIsNotNone(get_files_in_dir)

    def test_returns_list(self):
        """Listing an existing directory yields a list."""
        res = get_files_in_dir("/")
        self.assertIsInstance(res, list)

    def test_nonexistent(self):
        """A path that does not exist yields None."""
        res = get_files_in_dir("/i/do/not/exist")
        self.assertIsNone(res)

    def test_nofilenames(self):
        """A path that is not a directory yields None."""
        # Presumably test1.txt is a regular file in the fixture directory;
        # the assertion would also pass if the file were missing.
        res = get_files_in_dir("./test_files/test1.txt")
        self.assertIsNone(res)

    def test_get_all(self):
        """Without a filter every entry is returned."""
        res = get_files_in_dir("./test_files/", None)
        # The fixture directory is expected to hold exactly 9 entries.
        self.assertEqual(len(res), 9)

    def test_filter_to_two(self):
        """num_files caps the number of names after filtering."""
        res = get_files_in_dir("./test_files/", "txt", 2)
        self.assertEqual(len(res), 2)

    def test_filter_to_one(self):
        """num_files=1 returns a single name."""
        res = get_files_in_dir("./test_files/", "html", 1)
        self.assertEqual(len(res), 1)

    def test_filter_to_one_ext(self):
        """The returned name carries the requested extension."""
        res = get_files_in_dir("./test_files/", "html", 1)
        # Check the length first so an empty result fails the assertion
        # instead of raising IndexError on res[0].
        self.assertEqual(len(res), 1)
        self.assertTrue(res[0].endswith("html"))

# argv[0] is only a placeholder program name, so unittest does not try to
# parse the kernel's own command line; '-v' prints one line per test, and
# exit=False stops unittest.main() from calling sys.exit() in the notebook.
unittest.main(argv=['ingored', '-v'], exit=False)

test_get_alums_exists (__main__.TestHTMLParse) ... ok
test_load_html_not_none (__main__.TestHTMLParse) ... ERROR
test_ret_4 (__main__.TestHTMLParse) ... ERROR
test_returns_list (__main__.TestHTMLParse) ... ok
test_uni_leeds (__main__.TestHTMLParse) ... ok
test_uni_london_works (__main__.TestHTMLParse) ... FAIL
test_filter_to_one (__main__.TestListFiles) ... ok
test_filter_to_one_ext (__main__.TestListFiles) ... ok
test_filter_to_two (__main__.TestListFiles) ... ok
test_get_all (__main__.TestListFiles) ... ok
test_list_xists (__main__.TestListFiles) ... ok
test_nofilenames (__main__.TestListFiles) ... ok
test_nonexistent (__main__.TestListFiles) ... ok
test_returns_list (__main__.TestListFiles) ... 

dom is none
dom is none
dom is none


ok

ERROR: test_load_html_not_none (__main__.TestHTMLParse)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-25-4bb6c6f30066>", line 10, in test_load_html_not_none
    res = get_bs4_obj("./test_files/uol.html")
  File "<ipython-input-18-d8f50a143a9c>", line 20, in get_bs4_obj
    bs4_obj = bs4.BeautifulSoup(f)
  File "C:\Users\hugho\AppData\Roaming\Python\Python39\site-packages\bs4\__init__.py", line 309, in __init__
    markup = markup.read()
  File "C:\Program Files\Python39\lib\encodings\cp1252.py", line 23, in decode
    return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 31058: character maps to <undefined>

ERROR: test_ret_4 (__main__.TestHTMLParse)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-25-4bb6c6f30066>", line 14, in test_ret_4


<unittest.main.TestProgram at 0x1814ed71910>

In [31]:
class TestHTMLParse(unittest.TestCase):
    """Tests for get_bs4_obj() and get_alum_list()."""

    def test_get_alums_exists(self):
        """The function is defined in the notebook namespace."""
        self.assertIsNotNone(get_alum_list)

    def test_returns_list(self):
        """A list is returned even when the file cannot be loaded."""
        # Presumably something.html does not exist, which exercises the
        # empty-list fallback -- confirm.
        res = get_alum_list("something.html")
        self.assertIsInstance(res, list)

    def test_load_html_not_none(self):
        """An existing HTML file parses into a BeautifulSoup object."""
        res = get_bs4_obj("./test_files/uol.html")
        self.assertIsInstance(res, bs4.BeautifulSoup)

    def test_ret_4(self):
        """The fixture page is expected to yield exactly 4 alumni entries."""
        res = get_alum_list('./test_files/uol.html')
        self.assertEqual(len(res), 4)

    def test_uni_london_works(self):
        """The fixture page yields at least one alumni entry."""
        res = get_alum_list('./test_files/uol.html')
        self.assertGreater(len(res), 0)

    def test_uni_leeds(self):
        """get_alum_list() does not return None for the Leeds page."""
        # NOTE(review): get_alum_list() returns a list on every code path,
        # so this assertion cannot fail. The path also uses
        # ./test_html_files/ while the other tests use ./test_files/ --
        # confirm which directory is intended.
        res = get_alum_list('./test_html_files/leeds.html')
        self.assertIsNotNone(res)
    
    
        
        
# argv[0] is only a placeholder program name, so unittest does not try to
# parse the kernel's own command line; '-v' prints one line per test, and
# exit=False stops unittest.main() from calling sys.exit() in the notebook.
unittest.main(argv=['ingored', '-v'], exit=False)

test_get_alums_exists (__main__.TestHTMLParse) ... ok
test_load_html_not_none (__main__.TestHTMLParse) ... ok
test_ret_4 (__main__.TestHTMLParse) ... FAIL
test_returns_list (__main__.TestHTMLParse) ... ok
test_uni_leeds (__main__.TestHTMLParse) ... ok
test_uni_london_works (__main__.TestHTMLParse) ... 

dom is none
dom is none


FAIL
test_filter_to_one (__main__.TestListFiles) ... ok
test_filter_to_one_ext (__main__.TestListFiles) ... ok
test_filter_to_two (__main__.TestListFiles) ... ok
test_get_all (__main__.TestListFiles) ... ok
test_list_xists (__main__.TestListFiles) ... ok
test_nofilenames (__main__.TestListFiles) ... ok
test_nonexistent (__main__.TestListFiles) ... ok
test_returns_list (__main__.TestListFiles) ... ok

FAIL: test_ret_4 (__main__.TestHTMLParse)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-31-186181c95139>", line 15, in test_ret_4
    self.assertEqual(len(res), 4)
AssertionError: 0 != 4

FAIL: test_uni_london_works (__main__.TestHTMLParse)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-31-186181c95139>", line 19, in test_uni_london_works
    self.assertGreater(len(res), 0)
AssertionError: 0 not greater than 0

----------------

<unittest.main.TestProgram at 0x1814f662640>

# Data analysis

In [36]:
# Scan every saved Wikipedia page and report those with a non-empty alumni list.
dirname = "./wikipedia-university-pages/"
files = get_files_in_dir(dirname)
if files is None:
    # get_files_in_dir() returns None for a missing or non-directory path;
    # fail with a clear message instead of a TypeError from len(None).
    raise FileNotFoundError("Directory not found: " + dirname)
print("Found ", len(files))

for f in files:
    # os.path.join() builds the path whether or not dirname ends with a separator.
    alums = get_alum_list(os.path.join(dirname, f))
    if len(alums) > 0:
        print("Found ", len(alums), "in",  f)
        


Found  1067
len 0
len 0
len 0
len 0
len 0
len 0
Found  13 in Abertay_University.html
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
Found  26 in Anglia_Ruskin_University.html
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
Found  19 in Arts_University_Bournemouth.html
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
Found  9 in Barking_and_Dagenham_College.html
Found  2 in Barnet_and_Southgate_College.html
Found  2 in Barts_and_The_London_School_of_Medicine_and_Dentistry.html
len 0
len 0
len 0
Found  6 in Bexley_College.html
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
len 0
Found  4 in Bromley_College_of_Further_&_Higher_Education.html
Found  62 in Brunel_University.html
Found  62 in Brunel_University_London.html
len 0
len 0
Found  14 in Buckinghamshire_New_University.html
len 0
len 0
le

KeyboardInterrupt: 