In [3]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Read the cleaned course catalog into a DataFrame.
# Ensure this path matches your cleaned catalog file.
CATALOG_PATH = './cleaned_catalog.csv'
data = pd.read_csv(CATALOG_PATH)

# Function to display the top words for each topic
def display_topics(model, feature_names, no_top_words, subject):
    """Print the highest-weighted terms of every topic in ``model``.

    Args:
        model: Object exposing ``components_``; each row is treated as one
            topic's weights, one weight per term.
        feature_names: Indexable of terms, looked up by weight position.
        no_top_words: Maximum number of terms printed per topic.
        subject: Label shown in the header line.
    """
    print(f"Subject: {subject}")
    for topic_number, weights in enumerate(model.components_, start=1):
        # argsort is ascending, so reverse it to rank the heaviest terms
        # first, then keep only the requested number of positions.
        ranked_positions = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[position] for position in ranked_positions]
        print(f"Topic {topic_number}:")
        print(" ".join(top_terms))
    # Blank lines separate one subject's report from the next.
    print("\n")

# Number of highest-weighted terms printed per topic.
no_top_words = 10
# Number of LDA topics fitted per subject; it does not vary between subjects,
# so it is set once here.  Adjust n_components as necessary.
num_topics = 1
# Subjects with fewer described courses than this are skipped.
min_courses = 5  # Adjust this threshold as necessary

# Visit every subject once, in order of first appearance.  groupby splits the
# frame in a single pass (rather than re-filtering the whole frame for each
# subject) and leaves out rows whose Subject is missing.
for subject, subject_data in data.groupby('Subject', sort=False):
    # CountVectorizer cannot process missing values, so drop courses without
    # a description before counting or vectorizing them.
    subject_data = subject_data.dropna(subset=['Description'])  # Adjust column name if different

    # Ensure there's enough data to perform LDA
    if len(subject_data) < min_courses:
        print(f"Not enough data for subject: {subject}")
        continue

    # Vectorization - CountVectorizer
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    try:
        dtm = vectorizer.fit_transform(subject_data['Description'])
    except ValueError as err:
        # fit_transform raises ValueError when no term survives stop-word
        # removal and the min_df/max_df pruning.  Report the subject and move
        # on instead of aborting the run for all remaining subjects.
        print(f"Skipping subject {subject}: {err}")
        continue

    # LDA Model Training
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=0)
    lda.fit(dtm)

    # Displaying Topics
    display_topics(lda, vectorizer.get_feature_names_out(), no_top_words, subject)


Not enough data for subject: AAS
Subject: ABE
Topic 1:
design engineering water soil principles infrastructure food properties biological development


Subject: ACCY
Topic 1:
accounting professional financial data business tax accountancy reporting complete information


Subject: ACE
Topic 1:
financial agricultural planning food problems economic analysis data management economics


Subject: ACES
Topic 1:
study abroad experience agricultural environmental sciences consumer academic special andor


Subject: ADV
Topic 1:
advertising media research social consumer sales relations brand industry data


Subject: AE
Topic 1:
aerospace engineering design control performance systems techniques registration waves theory


Subject: AFAS
Topic 1:
leadership force cadets air designed afrotc skills seniors overview level


Subject: AFRO
Topic 1:
african american hiphop political americans music studies study specifically understand


Subject: AFST
Topic 1:
african studies supervised research


Subj