In [1]:
import pandas as pd

from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import MaximalMarginalRelevance

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the labelled training data; the cells below use its 'comment number', 'tone' and 'topic' columns.
# NOTE(review): the 'Unnamed: 0*' columns in the output look like index columns saved by earlier
# to_csv calls — confirm whether they are needed before dropping them.
training_data = pd.read_csv("training_data.csv")
training_data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,comment number,tone,topic
0,0,3,comment 3,against,Discretion of the program should be allowed in...
1,1,3,comment 3,against,Parents may not recognize the need for additi...
2,2,3,comment 3,against,Head Start programs may not have the speciali...
3,3,3,comment 3,against,Negative program outcomes when dealing with v...
4,4,4,comment 4,against,Opposition to expansion of bureaucracy


In [4]:
# Documents to cluster: the free-text 'topic' column of the training data.
docs = training_data['topic']

# Normally, the cluster model is HDBSCAN, but I found a large number of outliers (unassigned docs) when
# using this model. KMeans does not have outliers.
# random_state pins the centroid initialisation so the KMeans step is repeatable between runs; without
# it the topic numbers shown in the saved outputs (and any hard-coded topic id) change on every re-run.
# NOTE(review): BERTopic's dimensionality-reduction step is configured separately and is not seeded
# here, so results may still vary slightly between runs — confirm whether that also needs pinning.
cluster_model = KMeans(n_clusters=15, random_state=42)

# Some words appear quite often in every topic but are generally not considered stop words. In this case
# "child", "head", "start" etc. Setting this parameter removes frequent words similar to how stop words
# are removed.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# When we calculate the weights of keywords, we typically do not consider whether we already have
# similar keywords in our topic. Words like "car" and "cars" essentially represent the same information
# and are often redundant. MMR considers the similarity of keywords/keyphrases with the document, along
# with the similarity of already selected keywords and keyphrases. This results in a selection of
# keywords that maximize their diversity with respect to the document.
representation_model = MaximalMarginalRelevance(diversity=0.2)

# Increasing the n-gram range from the default of a 1-gram, so keyphrases of up to four words are kept.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 4))

# The clustering step is passed through the `hdbscan_model` argument even though it is a KMeans instance.
model = BERTopic(hdbscan_model=cluster_model,
                 ctfidf_model=ctfidf_model,
                 vectorizer_model=vectorizer_model,
                 representation_model=representation_model)

# `topics` holds the complete return value of fit_transform (not just the topic ids);
# later cells read the per-document topic ids from it as topics[0].
topics = model.fit_transform(docs)

In [5]:
# Summary table of the fitted topics: topic id, document count, generated name,
# keyword representation and representative documents.
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,47,0_professional development_degree_credit beari...,"[professional development, degree, credit bear...",[ Requiring all training to be credit bearing ...
1,1,41,1_nap time_service time infants toddlers_presc...,"[nap time, service time infants toddlers, pres...",[Half-day sessions allow children to learn soc...
2,2,39,2_families served_35 program families_family s...,"[families served, 35 program families, family ...",[ Importance of having parents from the commun...
3,3,39,3_head start programs_start benefits_head star...,"[head start programs, start benefits, head sta...",[Head Start provides valuable preschool servic...
4,4,38,4_safety_violent behavior_child safety_backgro...,"[safety, violent behavior, child safety, backg...",[ Limited availability of safe and healthy lic...
5,5,34,5_disabilities_special needs_children special_...,"[disabilities, special needs, children special...",[ Recognizing that impulsivity and severe emot...
6,6,34,6_teachers_burnout_curriculum_time teachers,"[teachers, burnout, curriculum, time teachers,...",[ Lack of specific supports for teachers worki...
7,7,31,7_advisory committee_policy council_standards_...,"[advisory committee, policy council, standards...",[ Effectiveness of rigorous standards and expe...
8,8,31,8_homeless_areas_homeless children_compliance ...,"[homeless, areas, homeless children, complianc...",[ Benefit of allowing providers to hold slots ...
9,9,26,9_mental health_health literacy_maternal mood ...,"[mental health, health literacy, maternal mood...",[ Ensuring that children with severe mental he...


In [6]:
# One label string per topic: the topic id followed by its first three representation
# words/phrases, all joined with ", " (e.g. "6, teachers, burnout, curriculum").
topic_labels = model.generate_topic_labels(nr_words=3, separator=", ")
topic_labels

['0, professional development, degree, credit bearing',
 '1, nap time, service time infants toddlers, preschoolers',
 '2, families served, 35 program families, family style',
 '3, head start programs, start benefits, head start benefits',
 '4, safety, violent behavior, child safety',
 '5, disabilities, special needs, children special',
 '6, teachers, burnout, curriculum',
 '7, advisory committee, policy council, standards',
 '8, homeless, areas, homeless children',
 '9, mental health, health literacy, maternal mood depressive',
 '10, parental involvement providing sufficient, parents child absent hour, parental involvement providing',
 '11, staff, meetings, compensation',
 '12, home visits, home visitors, home visiting staff',
 '13, needs maintaining, care children, foster care children',
 '14, rating elimination written program, quality rating elimination written, quality rating variability']

Merging the BERTopic topic assignments with the original inline comments file to create a dataframe we can use for the final product:

In [7]:
# Load the original comments spreadsheet; judging by the merged output further down it
# supplies the 'Document ID' and 'Comment' columns.
com_exl = pd.read_excel("Other comments.xlsx")

In [9]:
# List the training-data column names to check the ones used when building the merged frame.
training_data.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'comment number', 'tone', 'topic'], dtype='object')

In [11]:
# Per-document frame: the source comment id, its tone, the BERTopic topic id and the clustered text.
# `topics` is the full return value of fit_transform, so element 0 is the per-document topic ids.
df_bert = pd.DataFrame({'comment number': training_data['comment number'],
                        'tone': training_data['tone'],
                        'topic': topics[0],
                        'document': docs})

# Up to ten documents assigned to topic 8, kept for ad-hoc inspection
# (the variable name is historical; it does not refer to topic 0).
topic_0 = df_bert[df_bert.topic == 8][:10]

# Build the join key "comment <row position>" for every spreadsheet row in a single column
# assignment. Writing it cell-by-cell via com_exl['comment number'][i] = ... is chained
# assignment: it triggers SettingWithCopyWarning and has no effect under pandas Copy-on-Write.
com_exl['comment number'] = [f"comment {i}" for i in range(len(com_exl))]

# Left join keeps every clustered document even when no spreadsheet row matches its comment id.
df_bert = df_bert.merge(com_exl, how='left', on="comment number")
df_bert.head()

Unnamed: 0,comment number,tone,topic,document,Document ID,Comment
0,comment 3,against,7,Discretion of the program should be allowed in...,ACF-2015-0008-0005,I am writing to comment on the proposed regula...
1,comment 3,against,2,Parents may not recognize the need for additi...,ACF-2015-0008-0005,I am writing to comment on the proposed regula...
2,comment 3,against,3,Head Start programs may not have the speciali...,ACF-2015-0008-0005,I am writing to comment on the proposed regula...
3,comment 3,against,4,Negative program outcomes when dealing with v...,ACF-2015-0008-0005,I am writing to comment on the proposed regula...
4,comment 4,against,14,Opposition to expansion of bureaucracy,ACF-2015-0008-0006,i oppose expansion of this fat cat bureaucracy...


Merging the topic labels into the dataframe:

In [12]:
# Every label reads "<topic id>, <keywords...>". Cut only at the first ", " so the
# keyword part keeps its own commas.
data_tuples = [tuple(label.split(', ', 1)) for label in topic_labels]

# Lookup table with one row per topic: id -> keyword label.
labels_df = pd.DataFrame(data_tuples, columns=['topic_number', 'topic_label'])

# The ids come out of the split as strings; make them numeric (unparseable values become NaN)
# so they can be matched against the numeric 'topic' column.
labels_df['topic_number'] = pd.to_numeric(labels_df['topic_number'], errors='coerce')

# Attach the labels, keep only the columns needed downstream, and give them their final names.
df_bert = (
    df_bert
    .merge(labels_df, how='left', left_on='topic', right_on='topic_number')
    .loc[:, ['Document ID', 'tone', 'topic_number', 'topic_label', 'document', 'Comment']]
    .rename(columns={'topic_number': 'bert_topic_number',
                     'topic_label': 'bert_topic_label',
                     'document': 'chatgpt_topic'})
)
df_bert.head()

Unnamed: 0,Document ID,tone,bert_topic_number,bert_topic_label,chatgpt_topic,Comment
0,ACF-2015-0008-0005,against,7,"advisory committee, policy council, standards",Discretion of the program should be allowed in...,I am writing to comment on the proposed regula...
1,ACF-2015-0008-0005,against,2,"families served, 35 program families, family s...",Parents may not recognize the need for additi...,I am writing to comment on the proposed regula...
2,ACF-2015-0008-0005,against,3,"head start programs, start benefits, head star...",Head Start programs may not have the speciali...,I am writing to comment on the proposed regula...
3,ACF-2015-0008-0005,against,4,"safety, violent behavior, child safety",Negative program outcomes when dealing with v...,I am writing to comment on the proposed regula...
4,ACF-2015-0008-0006,against,14,"rating elimination written program, quality ra...",Opposition to expansion of bureaucracy,i oppose expansion of this fat cat bureaucracy...


In [None]:
# Number of topic rows attached to each original comment.
# The frame no longer has a 'topic' column at this point (the previous cell keeps the topic id
# under the name 'bert_topic_number'), so selecting 'topic' raises KeyError; count the renamed column.
df_bert[['bert_topic_number', 'Comment']].groupby('Comment').count()