In [None]:
!pip install requests
!pip install beautifulsoup4
!pip install pyarrow

import requests
from bs4 import BeautifulSoup
import pandas as pd
import ssl
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import re
# Fetch the 'vader_lexicon' resource through NLTK's downloader.
# NOTE(review): nothing in the visible cells uses this resource or the
# SentimentIntensityAnalyzer import above -- confirm it is still needed.
nltk.download('vader_lexicon')

In [None]:
# Settings and Requests
#
# Defines the course-search endpoint, the headers and session reused by the
# scraping loop, and fetches the landing page once so its <option> elements
# can be inspected.
url = "https://my.sa.ucsb.edu/public/curriculum/coursesearch.aspx"

# Headers sent with the form requests; the Content-Type matches a regular
# HTML form submission.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded'
}

# Session shared by the later GET/POST round trips.
session = requests.Session()

# Initial fetch of the landing page. A timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() stops the notebook
# here instead of letting an error page flow into the option parsing below.
data = requests.get(url, timeout=30)
data.raise_for_status()
html = data.text
soup = BeautifulSoup(html, 'html.parser')

In [None]:
# Obtaining all data from the entire webpage
#
# Every <option> value on the landing page is collected into one flat list,
# which is then cut into three groups purely by position.
# NOTE(review): the slice boundaries (99, 108, last 3) are tied to the page
# layout at the time of writing -- confirm they still match the dropdowns.
option_values = [tag['value'] for tag in soup.find_all('option')]

classId = option_values[:99]      # used as department codes below
year = option_values[99:108]      # used as quarter codes below
level = option_values[-3:]        # last three values; not used in this cell

# One [department, quarter] row per combination, quarter-major order.
pairs = [[dept, term] for term in year for dept in classId]

depAndQuarter = pd.DataFrame(pairs).rename(columns={0: "department", 1: "quarter"})

# Drop every row that mentions quarter code '20251', then order by department.
mentions_excluded = depAndQuarter.isin(['20251']).any(axis=1)
depAndQuarter = depAndQuarter[~mentions_excluded].sort_values(by='department')

# One row per department (index = department code, value = number of quarters).
departments = depAndQuarter.groupby(['department']).count()

In [None]:
# Loop through the dropdown choices
#
# For every department / quarter combination:
#   1. GET the search page to obtain fresh ASP.NET hidden form fields,
#   2. POST the search form for that department and quarter,
#   3. read each "CourseInfoRow" table row into a six-field record.
# The records are gathered in a list and turned into `data` once at the end.

COURSE_COLUMNS = ['CourseID', 'Department', 'CourseTitle', 'Days', 'Time', 'Location']
HIDDEN_FIELDS = ['__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION']
REQUEST_TIMEOUT = 30  # seconds; avoids hanging forever on a stalled request

# Each remaining quarter exactly once. The previous `range(9)` +
# `depAndQuarter.iloc[q, 1]` read the first nine rows of a frame sorted by
# department, which can spill into the next department's rows and scrape the
# same quarter twice.
quarters = depAndQuarter['quarter'].unique()

course_rows = []

for d in range(len(departments)):
    for q in quarters:
        try:
            response = session.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print("Error loading initial page:", exc)
            continue
        if response.status_code != 200:
            print("Error loading initial page:", response.status_code)
            # Skip this combination; parsing the previous iteration's soup
            # here would append the same courses a second time.
            continue

        # Parse the initial HTML content
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract the necessary hidden fields for the form submission
        hidden_inputs = {name: soup.find("input", {"id": name}) for name in HIDDEN_FIELDS}
        if any(tag is None for tag in hidden_inputs.values()):
            print("Error loading initial page: missing hidden form fields")
            continue

        # Prepare the form data
        form_data = {name: tag['value'] for name, tag in hidden_inputs.items()}
        form_data.update({
            'ctl00$pageContent1$courseList': departments.index[d],  # Subject area (department code)
            'ctl00$pageContent1$quarterList': q,  # Quarter code
            'ctl00$pageContent1$dropDownCourseLevels': 'Undergraduate',  # Course Level: Undergraduate
            'ctl00$pageContent1$searchButton.x': '0',  # Simulating button click coordinates
            'ctl00$pageContent1$searchButton.y': '0'
        })

        # Send the POST request with form data to simulate clicking the search button
        try:
            response = session.post(url, data=form_data, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print("Error in POST request:", exc)
            continue
        if response.status_code != 200:
            print("Error in POST request:", response.status_code)
            continue

        # Parse the response content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        for course in soup.find_all("tr", {"class": "CourseInfoRow"}):
            class_vec = [
                course.find_all("td", {"id": "CourseTitle"})[0].contents[0].strip(),
                departments.index[d],
                course.find_all("td", {"class": "PrimaryCourse"})[0].find("span").get_text().strip(),
            ]
            # The three cells before the last two are stored as Days, Time, Location.
            class_vec.extend(cell.get_text().strip() for cell in course.find_all("td")[-5:-2])
            course_rows.append(class_vec)

    print(departments.index[d], ' is department ', d, ' of ', len(departments), '\n')

# Build the frame once instead of growing it one row at a time.
data = pd.DataFrame(course_rows, columns=COURSE_COLUMNS)

In [None]:
# Extracting departments
#
# Turns the text of every <option> on the most recently parsed page into a
# (DepartmentCode, DepartmentName) pair and writes the table to depNames.csv.
# NOTE(review): this reads *all* options on the page, so any option that is
# not a department also ends up in the file -- confirm that is acceptable.

nameList = [option.text for option in soup.find_all('option')]

dep_rows = []
for label in nameList:
    # Split on the LAST hyphen so a name that itself contains '-' is kept
    # whole; splitting on every hyphen and taking the first piece would
    # truncate it.
    name, separator, code = label.rpartition('-')
    if not separator:
        # No hyphen at all: both columns carry the full label, as before.
        name = label
    dep_rows.append([code.strip(), name.strip()])

dep_fullnames = pd.DataFrame(dep_rows, columns=["DepartmentCode", "DepartmentName"])

dep_fullnames.to_csv("depNames.csv", index=False)

In [None]:
# Further Cleaning and Processing data.csv
#
# Loads the course table, keeps rows that have a CourseTitle, removes rows
# containing placeholder values, joins room and coordinate lookups on
# 'Location', attaches department names, and splits the "coordinates" text
# into numeric Latitude / Longitude values.
# NOTE(review): the heading mentions data.csv, but the code reads
# cleanedData.csv -- confirm which file is the intended input.

import pandas as pd
import numpy as np
df = pd.read_csv("cleanedData.csv")
# Keep only rows whose CourseTitle is present.
df_lectures = df[df['CourseTitle'].notna()]
locations_to_drop = ['T B A', '', 'ON   ASYNC', 'ON   LINE']
# Drops a row when ANY of its columns equals one of the placeholders,
# not only the Location column.
df_lectures = df_lectures[~df_lectures.isin(locations_to_drop).any(axis=1)]
room_df = pd.read_csv("roomToBuilding.csv")
coordinate_df = pd.read_csv("coordinates.csv")
# Default (inner) merges: rows whose Location is absent from either lookup
# table are dropped.
data = pd.merge(df_lectures, room_df, on='Location')
data = pd.merge(data, coordinate_df, on='Location')
data.rename(columns={'Department': 'DepartmentCode'}, inplace=True)
depNames = pd.read_csv("depNames.csv")
# Left merge: courses without a matching department code are kept.
data = pd.merge(data, depNames, on='DepartmentCode', how='left')
# Positional reorder: moves the last column to position 2 and keeps columns
# 0-7. Presumably the last column is DepartmentName from the merge above and
# the frame has exactly nine columns -- TODO confirm against the CSV schemas.
data = data.iloc[:, [0, 1, -1, 2, 3, 4, 5, 6, 7]]
data['Longitude'] = ""
data['Latitude'] = ""
# Row labels 0..len-1 are used with .loc/.at, which relies on the default
# integer index produced by the merges above.
for r in range(len(data)):
    # "coordinates" is split on ',' with the first piece stored as Latitude
    # and the second as Longitude.
    coord = data.loc[r, "coordinates"].split(',')
    data.at[r, 'Latitude'] = float(coord[0])
    data.at[r, 'Longitude'] = float(coord[1])

# NOTE(review): this rebinds `data` to the file contents, so the frame built
# above is discarded without ever being written anywhere. A
# `data.to_csv(...)` step looks to be missing before this line -- confirm the
# intended input/output file names before changing it.
data = pd.read_csv("cleanedData.csv")

In [None]:
# Creating and Processing Map Data
#
# Step 1: for every department, count its courses per latitude, attach the
#         building found at that latitude, and write mapData.csv.
# Step 2: re-attach coordinates from buildingCoordinates.csv by building name
#         and write newMapData.csv.
#
# Changes from the previous version of this cell:
#   * removed `data.drop(['Location'], axis=1)` -- its result was never
#     assigned, so it had no effect;
#   * removed a bare `data` expression in the middle of the cell (no effect);
#   * the duplicated coordinate-parsing loops now share one helper;
#   * per-department frames are concatenated once instead of inside the loop;
#   * columns are selected by name rather than by position.

MAP_COLUMNS = ['Department', 'building', 'Count', 'Percentage', 'Latitude', 'Longitude']


def _with_lat_lon(frame):
    """Return a copy of `frame` with float Latitude/Longitude columns.

    The values are parsed from the frame's "coordinates" column: each entry is
    split on ',' and the first two pieces are converted with float().
    """
    pieces = [text.split(',') for text in frame['coordinates']]
    return frame.assign(
        Latitude=[float(piece[0]) for piece in pieces],
        Longitude=[float(piece[1]) for piece in pieces],
    )


coordinate_df = _with_lat_lon(coordinate_df)

# Loop-invariant lookup: one row per (Latitude, building) pair.
building_lookup = (
    coordinate_df.loc[:, ['Latitude', 'Longitude', 'building']]
    .drop_duplicates(subset=['Latitude', 'building'])
)

map_frames = []
for d in depNames['DepartmentName']:
    df = data[data['DepartmentName'] == d]
    if df.empty:
        # Nothing to count for this name; skipping keeps empty frames out of
        # the final concat.
        continue

    # Courses per latitude for this department.
    mergeDf = df.groupby('Latitude')['CourseID'].count().rename('Count').reset_index()
    if mergeDf.empty:
        continue
    mergeDf['Latitude'] = pd.to_numeric(mergeDf['Latitude'], errors='coerce')
    mergeDf['Department'] = d
    mergeDf['Percentage'] = (mergeDf['Count'] / len(df) * 100).round(2)

    # NOTE(review): buildings are matched on Latitude alone, so two buildings
    # sharing a latitude would both be attached to the same count.
    mergeDf = mergeDf.merge(building_lookup, on="Latitude", how="left")
    map_frames.append(mergeDf[MAP_COLUMNS])

if map_frames:
    mapData = pd.concat(map_frames, ignore_index=True)
else:
    mapData = pd.DataFrame(columns=MAP_COLUMNS)

mapData = mapData.rename(columns={'building': 'Building'})
mapData.to_csv("mapData.csv", index=False)


# Step 2: replace the latitude/longitude with those listed per building.
data = pd.read_csv("mapData.csv")

coordinate_df = _with_lat_lon(pd.read_csv("buildingCoordinates.csv"))

# Inner merge on Building; the coordinates coming from mapData.csv (suffix _x)
# are dropped in favour of the ones from buildingCoordinates.csv (suffix _y).
data = (
    pd.merge(data, coordinate_df, on='Building')
    .drop(columns=['Latitude_x', 'Longitude_x'])
    .rename(columns={'Latitude_y': 'Latitude', 'Longitude_y': 'Longitude'})
)

data.to_csv("newMapData.csv", index=False)

In [None]:
# Creating and processing Charts data
#
# For every department code in cleanedData.csv this cell builds
#   * daysData  -- number of course meetings per weekday (M, T, W, R, F),
#   * timesData -- number of courses per start time, in chronological order.
# daysData is written to daysData.csv and displayed.
#
# Fixes compared with the previous version of this cell:
#   * a missing Days value was turned into the string 'nan' and counted as the
#     "days" 'n' and 'a'; values such as "MW F" lost everything after the
#     first token. Days are now read character by character and only the
#     five weekday codes are counted;
#   * a start time that cannot be parsed (for example a missing Time) no
#     longer aborts the cell; it is left out of timesData;
#   * per-department frames are concatenated once instead of inside the loop.

import pandas as pd
import numpy as np

data = pd.read_csv("cleanedData.csv")
depNames = data['DepartmentCode'].unique()

days_order = ["M", "T", "W", "R", "F"]


def _meeting_days(days_text):
    """Return the weekday codes (from days_order) contained in one Days value."""
    if pd.isna(days_text):
        return []
    return [char for char in str(days_text) if char in days_order]


def _start_time(time_text):
    """Return the part of a Time value before the first '-', without spaces."""
    return str(time_text).split('-')[0].strip().replace(" ", "")


day_frames = []
time_frames = []

for dep in depNames:
    sub_df = data[data['DepartmentCode'] == dep]

    # ---- weekday counts -------------------------------------------------
    day_codes = pd.Series(
        [code for cell in sub_df['Days'] for code in _meeting_days(cell)],
        dtype=object,
    )
    day_counts = day_codes.value_counts()
    # Only weekdays that actually occur, already in M-T-W-R-F order.
    present_days = [day for day in days_order if day in day_counts.index]
    if present_days:
        day_frames.append(pd.DataFrame({
            'days': pd.Categorical(present_days, categories=days_order, ordered=True),
            'count': [int(day_counts[day]) for day in present_days],
            'dep': [dep] * len(present_days),
        }))

    # ---- start-time counts ----------------------------------------------
    start_times = pd.Series([_start_time(cell) for cell in sub_df['Time']], dtype=object)
    # Unparseable entries become NaT and are dropped before counting.
    parsed_times = pd.to_datetime(start_times, format='%I:%M%p', errors='coerce').dropna()
    time_counts = parsed_times.value_counts().sort_index()
    if len(time_counts):
        time_frames.append(pd.DataFrame({
            'time': time_counts.index.strftime('%I:%M%p'),
            'count': time_counts.values,
            'dep': [dep] * len(time_counts),
        }))

if day_frames:
    daysData = pd.concat(day_frames, ignore_index=True)
else:
    daysData = pd.DataFrame(columns=['days', 'count', 'dep'])

if time_frames:
    timesData = pd.concat(time_frames, ignore_index=True)
else:
    timesData = pd.DataFrame(columns=['time', 'count', 'dep'])

# NOTE(review): timesData is computed but its export is disabled -- confirm
# whether timesData.csv is still wanted.
#timesData.to_csv('timesData.csv', index=False)
daysData.to_csv('daysData.csv', index=False)
daysData
