In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from google.colab import files
import time

In [2]:
# Until the catalog links are scraped automatically, work from one hard-coded degree page.
URL = "https://catalog.utdallas.edu/now/undergraduate/programs/bbs/cognitive-science" # grabbed from data
title = "Bachelor of Science in Cognitive Science" # grabbed from data
# A timeout keeps this cell from hanging indefinitely if the server never responds.
page = requests.get(URL, timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

NOTE:
<ul>
  <li>Every course number element has class='coursebook-info'.</li>
  <li>The parent of each 'coursebook-info' element contains the full text.</li>
  <li>Elements with class='xind-3' mark a category or requirement.</li>
  <li>All degree plan-relevant information, such as courses and descriptions, carries a class of the form 'xind-N' with N from 1 to 7.</li>
</ul>

In [3]:
# The following are helper functions that help analyze classes, all functions are passed arrays of classes
def find_xind_val(classes):
  """Return the integer N of the first 'xind-N' entry in `classes`.

  Args:
    classes: iterable of CSS class name strings (e.g. a tag's 'class' list).
      None is treated as an empty list.

  Returns:
    The integer that follows the 'xind-' prefix, or None when no entry
    starts with that prefix.

  Raises:
    ValueError: if an entry starts with 'xind-' but the remainder is not
      an integer.
  """
  prefix = 'xind-'
  for cl in classes or ():
    # Match on the prefix only: a class that merely contains 'xind-'
    # somewhere in the middle is not an indentation marker.
    if cl.startswith(prefix):
      return int(cl[len(prefix):])
  return None
      
def find_cat(classes):
  """Return the text that follows 'cat-' in the first 'cat-*' entry of `classes`.

  Args:
    classes: iterable of CSS class name strings (e.g. a tag's 'class' list).
      None is treated as an empty list.

  Returns:
    The class name with its leading 'cat-' removed (e.g. 'cat-cat1' -> 'cat1'),
    or None when no entry starts with that prefix.
  """
  prefix = 'cat-'
  for cl in classes or ():
    # Match on the prefix only, and strip just that prefix, so names such as
    # 'concat-x' are not mistaken for category markers.
    if cl.startswith(prefix):
      return cl[len(prefix):]
  return None

In [4]:
# Elements tagged as course numbers.
courses = soup.find_all(class_='coursebook-info')
# Elements at indentation level 3.
headers = soup.find_all(class_='xind-3')
# Every element carrying any of the classes 'xind-0' through 'xind-7'.
liners = soup.find_all(class_=['xind-' + str(level) for level in range(8)])

In [5]:
# Column layout of the per-element table.
cols = ['Text Content', 'Tag', 'Classes', 'xind', 'cat', 'style', 'ID']
# Start with one row of empty strings for each element in `liners`.
df = pd.DataFrame(
  data=[[''] * len(cols) for _ in range(len(liners))],
  columns=cols,
)

In [6]:
# Build one record per scraped element and create the frame in a single step.
# Writing cells through chained indexing (df[col][i] = value) is unreliable:
# pandas may assign into a temporary copy, so the frame is rebuilt from
# records instead.
records = []
for line in liners:
  classes = line.get('class', [])
  records.append({
    'Text Content': line.text,
    'Tag': line.name,
    'Classes': classes,
    'xind': find_xind_val(classes),
    'cat': find_cat(classes),
    # Tag.get returns the supplied default when the attribute is absent, so no
    # blanket try/except is needed; np.nan marks "attribute not present".
    'style': line.get('style', np.nan),
    'ID': line.get('id', np.nan),
  })
df = pd.DataFrame(records, columns=cols)

In [7]:
# Preview the first five rows of the table.
df.head(n=5)

Unnamed: 0,Text Content,Tag,Classes,xind,cat,style,ID
0,School of Behavioral and Brain Sciences,h1,"[cat-cat1, line-1, xind-0]",0,cat1,,school-of-behavioral-and-brain-sciences
1,Cognitive Science (BS),h2,"[cat-cat2, line-2, xind-1]",1,cat2,margin-left:15px,cognitive-science-bs-
2,Cognitive Science is the study of complex info...,p,"[cat-catp, line-3, xind-2]",2,catp,margin-left:30px,
3,The Cognitive Science program in the School of...,p,"[cat-catp, line-4, xind-2]",2,catp,margin-left:30px,
4,Cognitive Science Majors select core upper-div...,p,"[cat-catp, line-5, xind-2]",2,catp,margin-left:30px,


In [8]:
# Write the table to 'line_data.csv' without the row index.
df.to_csv(path_or_buf='line_data.csv', index=False)

In [9]:
# Show the raw markup of the first five collected elements.
liners[:5]

[<h1 class="cat-cat1 line-1 xind-0" id="school-of-behavioral-and-brain-sciences">School of Behavioral and Brain Sciences</h1>,
 <h2 class="cat-cat2 line-2 xind-1" id="cognitive-science-bs-" style="margin-left:15px">Cognitive Science (BS)</h2>,
 <p class="cat-catp line-3 xind-2" style="margin-left:30px">Cognitive Science is the study of complex information processing in humans and machines and includes the multidisciplinary study of biological and artificial systems. Important components of cognitive science include areas of research such as: cognitive-neuroscience, brain-imaging studies of perceptual and cognitive processing, Human-Computer Interaction (HCI), Artificial Intelligence (AI) and machine learning, computational neuroscience, and mathematical psychology. The field of cognitive science draws from diverse approaches to understanding complex information processing systems, including research from experimental psychology, neuroscience, linguistics, philosophy, computer science, 

In [10]:
# Look up the inline 'style' attribute of the fourth collected element.
liners[3].attrs['style']

'margin-left:30px'