In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import glob
import re
import time

# Part 1: Retrieve Information

In [13]:
# Download the UTEP CS faculty page and collect the <div> elements that hold
# each faculty member's information (the cards use the 'col-md-6' class).
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook early instead of parsing an error page.
req = requests.get('https://www.utep.edu/cs/people/index.html', timeout=30)
req.raise_for_status()
soup = BeautifulSoup(req.text, 'lxml')
# find_all is the current name of the deprecated findAll alias.
divs = soup.find_all('div', {'class': 'col-md-6'})

In [14]:
# Target directory: a 'professors' folder inside the current working directory.
# Attempt to create it and report one of three outcomes to the user:
# it was created, it was already there, or the creation failed.
path = os.path.join(os.getcwd(), 'professors')
try:
    os.makedirs(path)
    status = "Successfully created the directory %s " % path
except FileExistsError:
    # Must be handled before OSError because it is a subclass of it.
    status = "The directory %s, already exists" % path
except OSError:
    status = "Creation of the directory %s failed" % path
print (status)

The directory /Users/boo/Documents/CS4364/professors, already exists


In [15]:
# Initialize lists to save the information from each professor.
names = []
titles = []
offices = []
emails = []
phones = []
websiteURLs = []

# Iterate through each faculty card.
# If the faculty member is a professor, add their information to the lists.
# A missing field (e.g. no office) is stored as an empty string so that all six
# lists keep the same length and the same index always refers to the same professor.
for div in divs:
    # Only faculty whose title mentions "Professor" are kept.
    facultyTitle = div.find('span', {'class': 'Title'})
    if facultyTitle is None or "Professor" not in facultyTitle.text:
        continue
    titles.append(facultyTitle.text)

    # Save the professor's name.
    facultyName = div.find('h3', {'class': 'name'})
    names.append(facultyName.text if facultyName is not None else '')

    # Save the professor's office.
    facultyOffice = div.find("span", {"class": "address"})
    offices.append(facultyOffice.text if facultyOffice is not None else '')

    # Save the professor's email (surrounding whitespace removed).
    facultyEmail = div.find('span', {'class': 'email'})
    email = facultyEmail.text.strip() if facultyEmail is not None else ''
    emails.append(email)

    # Save the professor's phone; only the text before the first 'F' is kept.
    facultyPhone = div.find('span', {'class': 'phone'})
    phones.append(facultyPhone.text.split('F')[0] if facultyPhone is not None else '')

    # Save the professor's website URL, taken from the last link in the card.
    # The URL is recomputed for every professor so a value from a previous
    # iteration can never leak into this one; a missing href becomes ''.
    facultyWebPage = div.find_all('a')
    facultyURL = (facultyWebPage[-1].get('href') or '') if facultyWebPage else ''
    websiteURLs.append(facultyURL)

    # Save the text of the professor's website in a file named after the part
    # of their email before the '@'. Both a URL and an email are required.
    username = email.split('@')[0]
    if not facultyURL or not username:
        continue

    try:
        webPageContent = requests.get(facultyURL, timeout=30)
        webPageContent.raise_for_status()
    except requests.RequestException as error:
        # One unreachable or malformed link (relative URL, mailto:, HTTP error)
        # should not abort the whole crawl.
        print("Could not retrieve %s: %s" % (facultyURL, error))
    else:
        content = BeautifulSoup(webPageContent.text, 'lxml')
        tempPath = os.path.join(path, username + '.txt')
        # The context manager closes the file even if the write fails.
        with open(tempPath, "wt") as file:
            file.write(content.text)

In [16]:
# Assemble the scraped lists into a table with one column per field,
# then persist that table to disk as a pickle file.
data = dict(
    Name=names,
    Title=titles,
    Office=offices,
    Email=emails,
    Phone=phones,
    Website=websiteURLs,
)
professors = pd.DataFrame(data)
professors.to_pickle("professors.pkl")

# Part 2: Search Engine

In [17]:
# Ask the user to enter a search term.
# Only the first whitespace-separated word is used, converted to lower case.
# If nothing (or only whitespace) is entered, firstTerm becomes an empty string
# instead of raising an IndexError; an empty term can never equal a word, so
# the search simply reports zero results.
val = input("Enter a search term: ")
terms = val.split()
firstTerm = terms[0].lower() if terms else ''
if not terms:
    print("No search term was entered; the search will return no results.")

Enter a search term: Office


In [18]:
# Collect the files containing the contents of the professors' websites.
# Sorting makes the processing order (and therefore the ranking of ties)
# reproducible, since glob returns files in arbitrary order.
# Two parallel lists keep track of the file names and the number of times the term was found.
filePaths = sorted(glob.glob(os.path.join(path, '*.txt')))
fileName = []
times = []

# Keep track of the time it takes to check through the files.
# perf_counter is a monotonic, high-resolution clock intended for measuring intervals.
startTime = time.perf_counter()

# Iterate through the files.
# Save the name of each file without its '.txt' extension; splitext removes only
# the final extension, so a name that itself contains a dot is kept intact.
# Count how many words in the file match the search term, ignoring case.
for file in filePaths:
    fileName.append(os.path.splitext(os.path.basename(file))[0])
    count = 0
    # The context manager closes the file even if reading raises.
    with open(file, "r") as currentFile:
        for line in currentFile:
            # Words are runs of word characters and apostrophes.
            wordList = re.findall(r"[\w']+", line)
            count += sum(1 for word in wordList if word.lower() == firstTerm)
    times.append(count)

# Save the elapsed time.
elapsedTime = time.perf_counter() - startTime

# Save the results of the search into a dataframe with the file name and the number of times the term appeared.
# Sort the dataframe in descending order; mergesort is stable, so files with
# the same count keep their alphabetical order.
data = {'Username' : fileName, 'Times' : times}
search = pd.DataFrame(data=data)
search = search.sort_values(by='Times', ascending=False, kind='mergesort')

In [19]:
# Count the number of files that the term appeared in.
# Print the number of results and the elapsed time.
numResults = int((search['Times'] != 0).sum())
print('{} results found ({:.3f} sec)'.format(numResults, elapsedTime))
print()

# The page files are named after the part of the email before the '@', so the
# same part is extracted from every email in order to map a file back to its professor.
# This avoids assuming a particular email domain.
emailUsernames = professors['Email'].astype(str).str.split('@').str[0].str.strip()

# Iterate through the rows of the search dataframe that have at least one hit.
# For each one, look up the matching professor and print their information.
for i, (index, searchRow) in enumerate(search.head(n=numResults).iterrows(), start=1):
    print('Rank #{}: The search term "{}" appears {} times'.format(i, firstTerm, searchRow['Times']))

    matches = professors[emailUsernames == searchRow['Username'].strip()]
    if matches.empty:
        # The directory may contain files that do not correspond to any
        # professor in the current dataframe; report it instead of crashing.
        print('No professor record found for "{}"'.format(searchRow['Username']))
        print()
        continue

    dfRow = matches.iloc[0]
    print(dfRow['Name'], end = ", ")
    print(dfRow['Title'])
    print('Office: ', dfRow['Office'], end = "  ")
    print('Email: ', dfRow['Email'], end = "  ")
    print('Phone: ', dfRow['Phone'])
    print('Website: ', dfRow['Website'])
    print()

6 results found (0.034 sec)

Rank #1: The search term "office" appears 6 times
Luc Longpré, Associate Professor  Undergraduate Program Director
Office:  CCSB 3.0420  Email:  longpre@utep.edu  Phone:  (915) 747-6804 
Website:  https://www.utep.edu/cs/people/longpre.html

Rank #2: The search term "office" appears 4 times
Eric Freudenthal, Associate Professor
Office:  CCSB 3.0424  Email:  efreudenthal@utep.edu  Phone:  (915) 747-6954 
Website:  https://www.utep.edu/cs/people/Freudenthal.html

Rank #3: The search term "office" appears 2 times
Salamah I. Salamah, Associate Professor and Chair Director of Software Engineering
Office:  CCSB 3.1002  Email:  isalamah@utep.edu  Phone:  (915) 613-3110 
Website:  http://www.cs.utep.edu/isalamah/

Rank #4: The search term "office" appears 1 times
Oscar Adrian Mondragon, Clinical Associate Professor Director of M.S Software Engineering
Office:  CCSB 3.1020  Email:  oamondragon@utep.edu  Phone:  (915)747-8015
Website:  https://expertise.utep.edu/prof