Web Scraping from UTampa Course Catalog

In [1]:
from bs4 import BeautifulSoup
import requests

In [3]:
# UTampa catalog page for the Data Science major; the course tables scraped
# in the following cells come from this page.
url = 'https://ut.smartcatalogiq.com/current/catalog/undergraduate-catalog/college-of-social-sciences-mathematics-and-education/department-of-mathematics/data-science-major/'

# Bound the wait so an unresponsive server cannot hang the notebook, and stop
# on an HTTP error status so an error page is never parsed as catalog data.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Name the parser explicitly: the generic 'html' feature lets Beautiful Soup
# pick whichever HTML parser happens to be installed, so the parse tree could
# differ between machines. 'html.parser' ships with Python itself.
soup = BeautifulSoup(page.text, 'html.parser')


In [4]:
# The first <table> on the page lists the required ("must take") courses.
# Indexing with [0] raises IndexError if the page contains no table at all.
all_tables = soup.find_all('table')
table_core = all_tables[0]
table_core

<table>
<tr>
<td class="sc-coursenumber sc-programtable-column-1"><a class="sc-courselink" href="/en/current/catalog/course-descriptions/csc-computer-science/100/csc-101">CSC 101</a></td><td class="sc-coursetitle sc-programtable-column-2">The Science of Computing I</td><td class="sc-credits sc-programtable-column-3"><p class="credits">4</p></td>
</tr><tr>
<td class="sc-coursenumber sc-programtable-column-1"><a class="sc-courselink" href="/en/current/catalog/course-descriptions/csc-computer-science/100/csc-102">CSC 102</a></td><td class="sc-coursetitle sc-programtable-column-2">The Science of Computing II</td><td class="sc-credits sc-programtable-column-3"><p class="credits">4</p></td>
</tr><tr>
<td class="sc-coursenumber sc-programtable-column-1"><a class="sc-courselink" href="/en/current/catalog/course-descriptions/csc-computer-science/200/csc-201">CSC 201</a></td><td class="sc-coursetitle sc-programtable-column-2">Data Structures and Algorithm Analysis</td><td class="sc-credits sc-pr

In [5]:
# Credit value used when a row has no (or an empty) credits cell; this keeps
# the notebook's original assumption that every course is 4 credits.
DEFAULT_CREDITS = '4'

# Raw per-row values, in page order (may contain repeats when a course is
# listed in more than one table).
course_titles = []
course_numbers = []

# Unique courses keyed by course number -> (title, credits). Extracting all
# fields from the same <tr> keeps number, title and credits aligned; the
# previous approach de-duplicated titles and numbers independently, which
# misaligns the lists as soon as two courses share a title or a row lacks
# one of the cells.
courses_by_number = {}

for row in soup.find_all('tr'):
    number_cell = row.find('td', class_='sc-programtable-column-1')
    title_cell = row.find('td', class_='sc-programtable-column-2')
    if number_cell is None or title_cell is None:
        continue  # header, total or spacer row: not a course entry

    number = number_cell.get_text(strip=True)
    title = title_cell.get_text(strip=True)
    if not number:
        continue  # no course number to key on

    course_numbers.append(number)
    course_titles.append(title)

    credits_cell = row.find('td', class_='sc-credits')
    credit_text = credits_cell.get_text(strip=True) if credits_cell is not None else ''

    # setdefault keeps the first occurrence, so repeats are dropped while the
    # page order is preserved (dicts remember insertion order).
    courses_by_number.setdefault(number, (title, credit_text or DEFAULT_CREDITS))

# Three parallel lists, one entry per unique course, always the same length.
single_course_numbers = list(courses_by_number)
single_course_titles = [title for title, _ in courses_by_number.values()]
credits = [credit for _, credit in courses_by_number.values()]

print(single_course_titles)
print(single_course_numbers)
print(credits)


['The Science of Computing I', 'The Science of Computing II', 'Data Structures and Algorithm Analysis', 'Introduction to Data Science', 'Applied Data Science', 'Data Science Capstone', 'Calculus I', 'Calculus II', 'Calculus III', 'Computational Linear Algebra', 'Applied Statistics', 'Probability', 'Database Management Systems', 'Artificial Intelligence and Machine Learning', 'Applied Regression Analysis', 'Mathematical Statistics']
['CSC 101', 'CSC 102', 'CSC 201', 'DSC 101', 'DSC 201', 'DSC 401', 'MAT 260', 'MAT 261', 'MAT 262', 'MAT 271', 'MAT 272', 'MAT 310', 'CSC 340', 'CSC 410', 'MAT 402', 'MAT 425']
['4', '4', '4', '4', '4', '4', '4', '4', '4', '4', '4', '4', '4', '4', '4', '4']


In [6]:
# Start from the assumption that every scraped course is a requirement:
# one 'Yes' flag per unique course title.
requirements = ['Yes' for _ in single_course_titles]
print(requirements)

['Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes']


In [7]:
# creating dataframe
# importing pandas as pd
import pandas as pd

# Column name -> list of values; the three lists are parallel (one entry per
# unique course). The mapping is deliberately NOT called `dict`: that name
# would shadow the builtin `dict` type for the rest of the kernel session.
course_data = {
    'Course_number': single_course_numbers,
    'Course_title': single_course_titles,
    'Credit_hours': credits,
}

df = pd.DataFrame(course_data)

print(df)

   Course_number                                  Course_title Credit_hours
0        CSC 101                    The Science of Computing I            4
1        CSC 102                   The Science of Computing II            4
2        CSC 201        Data Structures and Algorithm Analysis            4
3        DSC 101                  Introduction to Data Science            4
4        DSC 201                          Applied Data Science            4
5        DSC 401                         Data Science Capstone            4
6        MAT 260                                    Calculus I            4
7        MAT 261                                   Calculus II            4
8        MAT 262                                  Calculus III            4
9        MAT 271                  Computational Linear Algebra            4
10       MAT 272                            Applied Statistics            4
11       MAT 310                                   Probability            4
12       CSC

In [8]:
import numpy as np

# Course numbers that count as electives; every other course is marked 'No'.
courses = ['CSC 340', 'CSC 410', 'MAT 402', 'MAT 425']

# Boolean mask: True for rows whose course number is in the elective list.
is_elective = df['Course_number'].isin(courses)

# Adds (or overwrites) the 'Elective' column on df with 'Yes'/'No' labels.
df['Elective'] = np.where(is_elective, 'Yes', 'No')
print(df)

   Course_number                                  Course_title Credit_hours  \
0        CSC 101                    The Science of Computing I            4   
1        CSC 102                   The Science of Computing II            4   
2        CSC 201        Data Structures and Algorithm Analysis            4   
3        DSC 101                  Introduction to Data Science            4   
4        DSC 201                          Applied Data Science            4   
5        DSC 401                         Data Science Capstone            4   
6        MAT 260                                    Calculus I            4   
7        MAT 261                                   Calculus II            4   
8        MAT 262                                  Calculus III            4   
9        MAT 271                  Computational Linear Algebra            4   
10       MAT 272                            Applied Statistics            4   
11       MAT 310                                   P

In [9]:
# Save the course table as a CSV file in the working directory, without the
# DataFrame index column.
csv_filename = 'DScoursecatalog.csv'
df.to_csv(csv_filename, index=False)

# google.colab is only importable inside Google Colab. Guard the import so
# the notebook still runs to completion elsewhere (the CSV is already on disk
# at this point) instead of stopping with an ImportError.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    # download file to computer
    files.download(csv_filename)
else:
    print(f'Saved {csv_filename}; browser download skipped (not running in Google Colab).')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>