In [23]:
!pip install feedparser
!pip install bs4
!pip install urllib.parse

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
[31mERROR: Could not find a version that satisfies the requirement urllib.parse (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for urllib.parse[0m[31m
[0m

First using Feedparser to get links

In [24]:
import feedparser
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

feed = feedparser.parse('https://bg.raindrop.io/rss/public/47223284')

print([(e.title, e.link) for e in feed.entries])
print(f"Found {len(feed.entries)} entries")

entries = []
#entries is now our URL list from RSS feed
for e in feed.entries:
    title = getattr(e, "title", "")
    link = getattr(e, "link", "")
    entries.append({"title":title, "link":link})

[('Intro DS Syllabi at 2-year Colleges', 'https://docs.google.com/spreadsheets/d/1wihCoaD5Ei-11kxzowDjp_v0gXX5LNb_DdXwsBswknw/edit?gid=0#gid=0'), ('Introduction to Data Science | University of Stavanger', 'https://www.uis.no/en/course/DAT540_1'), ('Open Access Data Science Resource by Data Science Discovery at the University of Illinois', 'https://discovery.cs.illinois.edu/'), ('Columbia | Foundations of Data Science: Syllabus', 'https://www.columbia.edu/~cs2035/courses/orca2500.S18/syllabus.html'), ('Delta College MTH 225 - Introduction to Data Science - Modern Campus Catalog™', 'https://catalog.delta.edu/preview_course_nopop.php?catoid=15&coid=31564'), ('Data Science (DS) 210 | Johnson County Community College Catalog', 'https://catalog.jccc.edu/coursedescriptions/ds/#DS_210'), ('Undergrad Intro DS Syllabi', 'https://docs.google.com/spreadsheets/d/1HTnAukzyXh0wM411MMeiNjR-O1sXTmIwX8GqzQgPxIY/edit?gid=0#gid=0'), ('C S 8A: INTRODUCTION TO DATA SCIENCE Foothill College', 'https://catalo

In [25]:
from bs4 import BeautifulSoup
import requests
import json


def extract_text_from_url(url) -> dict:
    response = requests.get(url)
    response.raise_for_status()  # Ensure we notice bad responses
    soup = BeautifulSoup(response.text, 'html.parser')
    # Remove script and style elements
    for tag in soup(['script', 'style']):
        tag.decompose()

    # Get text
    text = soup.get_text(separator='\n')
    lines = [line.strip() for line in text.splitlines() if line.strip()]

    return {
        "url": url,
        "text": "\n".join(lines)
    }

In [26]:
urls = [entry['link'] for entry in entries]

with open('rss_extracted_texts.jsonl', 'w', encoding='utf-8') as f:
    for url in urls:
        try:
            extracted = extract_text_from_url(url)
            f.write(json.dumps(extracted, ensure_ascii=False) + '\n')
        except Exception as e:
            print(f"Failed to extract {url}: {e}")

Failed to extract https://www.columbia.edu/~cs2035/courses/orca2500.S18/syllabus.html: 403 Client Error: Forbidden for url: https://www.columbia.edu/~cs2035/courses/orca2500.S18/syllabus.html
Failed to extract https://fan.princeton.edu/fan/classes/525.html: 403 Client Error: Forbidden for url: https://fan.princeton.edu/fan/classes/525.html


In [27]:
docs = []
with open('rss_extracted_texts.jsonl', 'r', encoding='utf-8') as f:
    for line in f:
        docs.append(json.loads(line)["text"])
print(f"Extracted {len(docs)} documents.")

Extracted 30 documents.


In [28]:
import re
def clean_text(t):
    t = t.lower()
    t = re.sub(r'\s+', ' ', t)  # Replace multiple whitespace with single space
    t = re.sub(r'[^a-z0-9\s]', '', t)  # Remove non-alphanumeric characters
    return t.strip()
cleaned_docs = [clean_text(doc) for doc in docs]
print(cleaned_docs[0][:500])  # Print first 500 characters of the first

intro ds syllabi at 2year colleges  google sheets javascript isnt enabled in your browser so this file cant be opened enable and reload this browser version is no longer supported please upgrade to a supported browser intro ds syllabi at 2year colleges tab external share sign in file edit view insert format data tools extensions help accessibility debug unsaved changes to drive accessibility view only loading a b c d e f g h i j k l m n o p q r s t u v w x y z 1 course name department college pr


TF-IDF to see dominant terms per course

In [29]:


tfidf_vectorizer = TfidfVectorizer(
    max_features = 5000,
    stop_words='english',
    ngram_range=(1,2)
)

X = tfidf_vectorizer.fit_transform(cleaned_docs)

#top terms globally
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_means = X.mean(axis=0).A1

top = pd.DataFrame({
    'term': feature_names,
    'tfidf': tfidf_means
}).sort_values(by='tfidf', ascending=False).head(30)

print(top)

                   term     tfidf
1294               data  0.193379
4128            science  0.079902
1223             course  0.076249
1330       data science  0.070027
4586           students  0.053077
1018              class  0.045189
627         assignments  0.041167
3721             python  0.039728
2479                lab  0.036763
4828                use  0.034473
1060               code  0.033890
2573           learning  0.031285
2363       introduction  0.030377
2072           homework  0.029049
3662            project  0.027567
2099              hours  0.027137
918             catalog  0.025885
461            analysis  0.025842
1721              final  0.024896
614          assignment  0.024607
2603            lecture  0.024414
4950               work  0.024231
2155                ids  0.024149
4927               week  0.023997
4843              using  0.023916
3638        programming  0.023884
2366  introduction data  0.023240
1137           computer  0.023019
3581        pr

LDA Topic Modeling

Treating syllabi as multitopic to get a 'topic profile' rather than one explicit label for each syllabi

In [46]:
# build a document-term matrix for LDA
count_vectorizer = CountVectorizer(
    stop_words='english',
    max_df = 0.95, #ignore terms that appear in more than 95% of documents
    min_df = 2     #ignore terms that appear in less than 2 documents
)

X_count = count_vectorizer.fit_transform(cleaned_docs)

#fit LDA model
n_topics = 3
lda = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=42,
    learning_method='batch'
)

lda.fit(X_count)

In [47]:
#inspect topics
def print_top_words(model, feature_names, n_top_words=12):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic #{topic_idx + 1}:")
        top_features_ind = topic.argsort()[:-n_top_words - 1:-1]
        top_terms = [feature_names[i] for i in top_features_ind]
        print(f"Topic {topic_idx+1}: {', '.join(top_terms)}")

count_feature_names = count_vectorizer.get_feature_names_out()
print_top_words(lda, count_feature_names, n_top_words=12)

Topic #1:
Topic 1: university, catalog, course, services, code, analysis, statistical, information, apply, programming, college, hours
Topic #2:
Topic 2: python, lab, lecture, introduction, week, 2022, problem, code, project, course, programming, learning
Topic #3:
Topic 3: course, students, assignments, class, use, learning, code, assignment, hours, work, using, homework


In [48]:
#getting multi topic portions for each syllabus
topic_distributions = lda.transform(X_count)

topic_df = pd.DataFrame(
    topic_distributions,
    columns=[f'Topic_{i+1}' for i in range(n_topics)]
)

print(topic_df.head())

    Topic_1   Topic_2   Topic_3
0  0.000763  0.998552  0.000685
1  0.017932  0.963205  0.018863
2  0.091318  0.610850  0.297832
3  0.998596  0.000738  0.000666
4  0.999731  0.000133  0.000136
