# Functions

In [18]:
import unittest
import os
import bs4

def get_files_in_dir(dir_name, extension = None, num_files = None):
   if os.path.exists(dir_name) == False:
      return None
   if os.path.isfile(dir_name):
      return None
   files = os.listdir(dir_name)
   if extension is not None: # we have a filter 
      files = [f for f in files if f.endswith(extension)]
   if num_files is not None: 
      files = files[:num_files]
   return files

def get_bs4_obj(filename):
    if os.path.exists(filename):
        with open(filename) as f:
            bs4_obj = bs4.BeautifulSoup(f)
        return bs4_obj
    else:
        return None


    
def get_alum_list(filename):
   dom = get_bs4_obj(filename)
   if dom == None:
      print('dom is none')
      return []
   h2s_filt = [h2 for h2 in dom.find_all("h2") if "lumni" in h2.text]
    
   if len(h2s_filt) == 0:
      return []
    
   sib = h2s_filt[0].next_sibling
   # traverse, accumulating uls
   # that are siblings of h2
   uls = []
   while sib != None:
      if sib.name == "ul":
         uls.append(sib)
      if sib.name == "h2": # end of the section
         break
      sib = sib.next_sibling
   # dig into the uls
   list_texts = []
   for ul in uls: # iterate all sibling uls
      lis = ul.find_all("li")
      for li in lis: # iterate list items
         list_text = li.text
         list_texts.append(list_text) # add text 
   return list_texts

get_alum_list('./wikipedia-university-pages/ol.html')


dom is none


[]

# Unit tests

In [23]:


class TestListFiles(unittest.TestCase):
    def test_list_xists(self):
        self.assertIsNotNone(get_files_in_dir)
    
    def test_returns_list(self):
        res = get_files_in_dir("/")
        self.assertEqual(  type(res), list)
    
    def test_nonexistent(self):
        res = get_files_in_dir("/i/do/not/exist")
        self.assertIsNone(res)
    
    def test_nofilenames(self):
        res = get_files_in_dir("./test_files/test1.txt")
        self.assertIsNone(res)
        
    def test_get_two(self):
        res = get_files_in_dir("./test_files/", None, 2)
        self.assertEqual(len(res), 3)
    
    def test_filter_to_two(self):
        res = get_files_in_dir("./test_files/", "txt", 2)
        self.assertEqual(len(res), 2)
    
    def test_filter_to_one(self):
        res = get_files_in_dir("./test_files/", "html", 1)
        self.assertEqual(len(res), 1)
    
    def test_filter_to_one_ext(self):
        res = get_files_in_dir("./test_files/", "html", 1)
        
        self.assertTrue(res[0].endswith("html"))  

unittest.main(argv=['ingored', '-v'], exit=False)

test_get_alums_exists (__main__.TestHTMLParse) ... ok
test_load_html_not_none (__main__.TestHTMLParse) ... FAIL
test_ret_4 (__main__.TestHTMLParse) ... FAIL
test_returns_list (__main__.TestHTMLParse) ... ok
test_uni_leeds (__main__.TestHTMLParse) ... ok
test_uni_london_works (__main__.TestHTMLParse) ... FAIL
test_filter_to_one (__main__.TestListFiles) ... FAIL
test_filter_to_one_ext (__main__.TestListFiles) ... ERROR
test_filter_to_two (__main__.TestListFiles) ... ok
test_get_two (__main__.TestListFiles) ... FAIL
test_list_xists (__main__.TestListFiles) ... ok
test_nofilenames (__main__.TestListFiles) ... ok
test_nonexistent (__main__.TestListFiles) ... ok
test_returns_list (__main__.TestListFiles) ... 

dom is none
dom is none
dom is none
dom is none


ok

ERROR: test_filter_to_one_ext (__main__.TestListFiles)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-23-cd9dc42593ea>", line 32, in test_filter_to_one_ext
    self.assertTrue(res[0].endswith("html"))
IndexError: list index out of range

FAIL: test_load_html_not_none (__main__.TestHTMLParse)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-5-48c846018d51>", line 11, in test_load_html_not_none
    self.assertEqual(type(res), bs4.BeautifulSoup)
AssertionError: <class 'NoneType'> != <class 'bs4.BeautifulSoup'>

FAIL: test_ret_4 (__main__.TestHTMLParse)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-5-48c846018d51>", line 15, in test_ret_4
    self.assertEqual(len(res), 4)
AssertionError: 0 != 4

FAIL: test_uni_london_works (__main__.TestHTMLPars

<unittest.main.TestProgram at 0x1814eafa1f0>

In [5]:
class TestHTMLParse(unittest.TestCase):
    def test_get_alums_exists(self):
        self.assertIsNotNone(get_alum_list)
    
    def test_returns_list(self):
        res = get_alum_list("something.html")
        self.assertEqual( type(res), list )
    
    def test_load_html_not_none(self):
        res = get_bs4_obj("./test_html_files/uol.html")
        self.assertEqual(type(res), bs4.BeautifulSoup)
    
    def test_ret_4(self):
        res = get_alum_list('./test_html_files/uol.html')
        self.assertEqual(len(res), 4)
    
    def test_uni_london_works(self):
        res = get_alum_list('./test_html_files/uol_real.html')
        self.assertGreater(len(res), 0)
    
    def test_uni_leeds(self):
        res = get_alum_list('./test_html_files/leeds.html')
        self.assertIsNotNone(res)
    
    
        
        
unittest.main(argv=['ingored', '-v'], exit=False)

test_get_alums_exists (__main__.TestHTMLParse) ... ok
test_load_html_not_none (__main__.TestHTMLParse) ... FAIL
test_ret_4 (__main__.TestHTMLParse) ... FAIL
test_returns_list (__main__.TestHTMLParse) ... ok
test_uni_leeds (__main__.TestHTMLParse) ... ok
test_uni_london_works (__main__.TestHTMLParse) ... FAIL
test_filter_to_one (__main__.TestListFiles) ... ERROR
test_filter_to_one_ext (__main__.TestListFiles) ... ERROR
test_filter_to_two (__main__.TestListFiles) ... ERROR
test_get_two (__main__.TestListFiles) ... ERROR
test_list_xists (__main__.TestListFiles) ... ok
test_nofilenames (__main__.TestListFiles) ... ok
test_nonexistent (__main__.TestListFiles) ... ok
test_returns_list (__main__.TestListFiles) ... ok

ERROR: test_filter_to_one (__main__.TestListFiles)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-4-051bab04a945>", line 27, in test_filter_to_one
    self.assertEqual(len(res), 1)
TypeError: objec

<unittest.main.TestProgram at 0x1814d7d2b50>

# Data analysis

In [6]:
dirname = "../../Data/wikipedia-university-pages/"
files = get_files_in_dir(dirname)
print("Found ", len(files))

for f in files:
    alums = get_alum_list(dirname + f)
    if len(alums) > 0:
        print("Found ", len(alums), "in",  f)
        


TypeError: object of type 'NoneType' has no len()