In [21]:
import os
import pandas as pd
import numpy as np
import asyncio
from dotenv import load_dotenv

load_dotenv()

from text_lloom.src.text_lloom import workbench as wb

In [11]:
print("Starting lloom demo")
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

Starting lloom demo


In [12]:
df = pd.read_excel("data/2024_election.xlsx")
print(df.columns)
df= df[['commentID', 'recommendations', 'commentBody']]
print(df.head())

Index(['commentID', 'status', 'commentSequence', 'userID', 'userDisplayName',
       'userLocation', 'userTitle', 'userURL', 'picURL', 'commentTitle',
       'commentBody', 'createDate', 'updateDate', 'approveDate',
       'recommendations', 'replyCount', 'replies', 'editorsSelection',
       'parentID', 'parentUserDisplayName', 'depth', 'commentType', 'trusted',
       'recommendedFlag', 'permID', 'isAnonymous', 'text'],
      dtype='object')
   commentID  recommendations  \
0          0                8   
1          1                9   
2          2               22   
3          3                9   
4          4               29   

                                                                                                                                                                                                                                                                                                                                                                 

In [13]:
def print_concepts(lloom_instance, name_only=False, examples=True):
    for concept_id, concept in lloom_instance.concepts.items():
        if name_only:
            print(concept.name + " (Size: " + str(len(concept.members)) + ")")
        else:
            # print(concept)
            # print(df['commentID'])
            comments = lloom_instance.in_df[lloom_instance.in_df['id'].isin(map( lambda x: int(x), concept.members))]
            comments = comments['commentBody'].tolist()
            # print(f"Comments: {comments}")
            res = f"""
            Concept ID: {concept.id}.
            Concept Name: {concept.name}.
            Concept Prompt: {concept.prompt}.\n
            Concept Size: {len(concept.members)}.\n
            """


            if examples:
                # res += f"Concept Examples: \n\t {('\n\t').join(comments)}"
                comments = [x.replace("\n", " ") for x in comments]
                res += "Members: \n\t"
                res += ('\n\t*').join(comments)
                # res += f"Members: \n\t {comments}"

                # get the claim from the claim ids in members
                

            res += "\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
            
            print(res)


In [14]:
def save_concepts_to_json(lloom_instance, filename):
    import json
    import datetime
    time = datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
    with open(filename + time, 'w') as f:
        concepts_and_comments = {}
        for concept_id, concept in lloom_instance.concepts.items():
            comments = lloom_instance.in_df[lloom_instance.in_df['id'].isin(map( lambda x: int(x), concept.members))]
            comments = comments['commentBody'].tolist()
            concepts_and_comments[concept.name] = comments
        json.dump(concepts_and_comments, f, indent=4)

In [15]:
lloom_nofilter_newsummarize = wb.lloom(
    df=df,
    text_col="commentBody",
    # id_col="commentID",  # Optional
    debug=True,

    # # Model specification
    distill_model_name = "gpt-3.5-turbo-0125",
    embed_model_name = "text-embedding-3-small",
    synth_model_name = "gpt-3.5-turbo-0125",
    score_model_name = "gpt-3.5-turbo-0125",
)
custom_prompts = {
    "distill_filter": None,
    "distill_summarize": lloom_nofilter_newsummarize.show_prompt("distill_summarize"),
    "synthesize": None,
}
params = {
    "filter_n_quotes": 3, #shouldn't matter
    "summ_n_bullets": 2,
    "synth_n_concepts": 0,
}
cur_seed = "explicit or implied claims or arguments"  # Optionally replace with string
await lloom_nofilter_newsummarize.gen(seed=cur_seed, custom_prompts=custom_prompts, params=params)
print(lloom_nofilter_newsummarize.summary())

No `id_col` provided. Created an ID column named 'id'.


[1mEstimated cost[0m: $0.05
**Please note that this is only an approximate cost estimate**


[1m[48;5;228mAction required[0m[0m


[48;5;117mDistill-summarize[0m
⠋ LoadingBatched version
processing distill summarize llm results
all_ex_ids: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119]
bullet_df:       id                                                         commentBody
0      0                       Biden stalls in primaries with filler phrases
1      0        Advisors urge attacking 

In [16]:
save_concepts_to_json(lloom_nofilter_newsummarize, "concept_logs/2024_election_concepts_")

In [17]:
print_concepts(lloom_nofilter_newsummarize, name_only=False, examples=True)


            Concept ID: 6fbf58cf-aaf7-4498-ad22-2f83db76202c.
            Concept Name: Preference for Stronger Candidate.
            Concept Prompt: Does the text example suggest replacing a candidate with a stronger alternative?.

            Concept Size: 77.

            Members: 
	Biden will do what he did last time in the primaries: stall for time to run out the clock by repeating filler phrases -- no, I really mean that -- it's not just hyperbole -- because he has nothing to say. His advisors are urging him not to focus on his accomplishments and instead attack Trump, which is the only advice they can give, because he has no accomplishments. Of course, his whole strategy will be to try to get under Trump's skin, so he will repeat the word "loser" at every opportunity. I'll be watching the moderators, who hate Trump as much as Biden does, to see if they cut his mike if they don't like his answers.
	*Do you think anyone will really talk about how a 30 year old in this country ca

In [18]:
print_concepts(lloom_nofilter_newsummarize, name_only=True)

Preference for Stronger Candidate (Size: 77)
Encouragement boosts performance (Size: 11)
Managing Expectations (Size: 14)
Trump's Economic Policies Impact (Size: 7)
Evaluating Candidates Based on Skills (Size: 3)
Impact of Election Outcome on Civil Unrest (Size: 4)
Effectiveness of Cutting Trump's Mic (Size: 4)
Importance of Democracy (Size: 3)
Discontent with Political System (Size: 7)
Criticism of Biden's Campaign Strategy (Size: 4)
Voter preference and intention for Democratic candidates (Size: 7)
Partisan Bias in Coverage (Size: 6)
Need for Representative Candidates (Size: 2)
Political strategy in elections (Size: 2)
Evaluation of Biden's Speeches (Size: 5)
Economic Challenges (Size: 3)
Assessment of Biden's Performance (Size: 4)


In [19]:
lloom_nofilter_newsummarize.saved_dfs.keys()

dict_keys([('Distill-summarize', '2024-07-05-17-05-40'), ('Cluster', '2024-07-05-17-05-44'), ('Synthesize', '2024-07-05-17-05-45'), ('Review-remove', '2024-07-05-17-05-47'), ('Review-merge', '2024-07-05-17-05-48')])

In [20]:
synth_df = lloom_nofilter_newsummarize.saved_dfs[('Synthesize', '2024-07-05-14-58-47')]
synth_df

KeyError: ('Synthesize', '2024-07-05-14-58-47')

In [None]:
grouped_synth_df = synth_df.groupby('concept_name').size().sort_values(ascending=False)
grouped_synth_df

concept_name
Analogical Reasoning                                1
Assessment of Biden's Health                        1
Trump's Mic Control Impact                          1
Trump's Impact on Elections                         1
Support for Biden's Campaign                        1
Questioning Trump's legitimacy                      1
Negative impact of Trump's presidency               1
Media Bias Towards Biden                            1
Media Bias Criticism                                1
Joe's Performance Concerns                          1
Importance of November Elections                    1
Housing vs. Assisted Living Decision                1
Focus on Biden's Answering Ability                  1
Expectation vs Reality                              1
Dismissal of Complaints                             1
Debates as Entertainment                            1
Debate preparation focus on Biden's advantage       1
Critique of Trump's Mental State                    1
Critique of Tru