# Case Study 4.2 - 02 Generate Names for Clusters

The next part of this case study generates a natural-language name for each cluster in the data we have clustered.

The original script can be found here: [CaseStudy_4.2_02-02.py](CaseStudy_4.2_02-02.py)

In [None]:
from langchain.chat_models import init_chat_model
from pydantic import BaseModel, Field

import pandas as pd
import numpy as np

In [None]:
# Chat model handle for Gemini 2.0 Flash, obtained through the "google_genai"
# provider. The structured-output wrapper created later is derived from it.
# NOTE(review): presumably requires Google API credentials in the environment
# -- confirm before running.
model = init_chat_model("gemini-2.0-flash", model_provider="google_genai")

In [None]:
# Use Pydantic to define the data structure
# `Topic` is the schema handed to `with_structured_output`; it carries a single
# string field, `name`, whose description tells the model what to put in it.
class Topic(BaseModel):
    # The one topic label that covers every essay supplied in the prompt.
    name: str = Field(description="The name of the core topic being discussed in all provided essays.")


In [None]:
# Derive a model bound to the `Topic` schema; `get_topic_label` reads `.name`
# from whatever this wrapper returns.
structured_model = model.with_structured_output(Topic)

In [None]:
# Marker string used in the prompt: it separates consecutive essays and also
# brackets the whole essay section.
DELIMITER = '####'

In [None]:
def generate_prompt_text(essays: list[str]) -> str:
    """Build the prompt that asks the model for one short topic name.

    The essays are joined with the module-level ``DELIMITER`` on a line of its
    own and embedded in a fixed instruction template.

    Args:
        essays: Essay texts to present to the model.

    Returns:
        The complete prompt text, including the template's leading newline
        and indentation.
    """
    separator = f"\n{DELIMITER}\n"
    joined_essays = separator.join(essays)
    # NOTE(review): "Return a just a topic name" below looks like a typo. It is
    # left untouched here because editing it changes the text sent to the model.
    return f'''
    Below is a set of student essays delimited with {DELIMITER}.

    Please identify the single main topic discussed in these essays.
    Return a just a topic name for the complete set.
    The topic name should be short, between one and three words long.

    Student Essays
    {DELIMITER}
    {joined_essays}
    {DELIMITER}
    '''

In [None]:
def get_topic_label(essays):
    """Return the topic name the structured model produces for *essays*.

    The essays are turned into a prompt with ``generate_prompt_text`` and sent
    to ``structured_model``; the ``name`` attribute of the reply is returned.
    """
    response = structured_model.invoke(generate_prompt_text(essays))
    return response.name

In [None]:
# Load the clustered essays. The cells below read the columns 'cluster',
# 'dist_to_centroid' and 'text' from this frame.
df = pd.read_csv("data/clustered_data.csv")

In [None]:
# Distinct cluster identifiers in ascending order.
clusters = sorted(df['cluster'].unique().tolist())

In [None]:
# Summary table schema: cluster ID, number of records in the cluster, the
# generated topic name, and the mean distance to the cluster centroid.
results = pd.DataFrame(columns=('ID', 'Records', 'Topic Name', 'Mean Dist'))

In [None]:
# Name every cluster from the essays that lie closest to its centroid and
# assemble a per-cluster summary table.
#   topics  -- maps cluster ID -> generated topic name
#   results -- one row per cluster: ID, Records, Topic Name, Mean Dist
topics = {}
records = []

for c in clusters:
    # Rank this cluster's rows from nearest to farthest from the centroid.
    # sort_values returns a new frame, so the filtered slice needs no .copy().
    members = df[df['cluster'] == c].sort_values("dist_to_centroid")
    # The five nearest essays are the representative sample shown to the model.
    examples = members['text'].head(5).tolist()
    topic = get_topic_label(examples)
    topics[c] = topic
    records.append({
        'ID': c,
        'Records': len(members),
        'Topic Name': topic,
        'Mean Dist': members["dist_to_centroid"].mean(),
    })

# Build the table once from the collected rows. Growing a DataFrame with
# pd.concat inside the loop recopies it on every iteration, and concatenating
# onto an empty frame raises a FutureWarning in recent pandas releases.
# This assignment replaces whatever `results` held before.
results = pd.DataFrame(records, columns=['ID', 'Records', 'Topic Name', 'Mean Dist'])


In [None]:
# Save the results and Display as a Markdown Table
# The CSV is written first, so it exists even if Markdown rendering fails.
# NOTE(review): the output path "topic_labels" has no ".csv" extension --
# confirm that downstream readers expect exactly this name.
results.to_csv("topic_labels", index=False)
markdown_table = results.to_markdown(index=False)
print(markdown_table)
