In [1]:
import os

# Step up one directory so that the relative "data/..." paths used by later
# cells resolve. This is relative to the current working directory, so
# re-running this cell climbs one more level each time.
os.chdir(os.pardir)

In [2]:
from litellm import completion



def get_completion(prompt,
                   model="openai/gpt-4.1-2025-04-14",
                   temperature=0, max_tries=5, retry_delay=10):
    """Send ``prompt`` as a single user message and return the reply text.

    The request is retried whenever ``completion`` (or unpacking its response)
    raises; the error is printed and the function waits ``retry_delay`` seconds
    before the next attempt.

    Args:
        prompt: Text used as the ``content`` of the one user message.
        model: Model identifier forwarded to ``completion``.
        temperature: Sampling temperature forwarded to ``completion``.
        max_tries: Maximum number of attempts before giving up.
        retry_delay: Seconds to wait after a failed attempt (default 10).

    Returns:
        The ``content`` of the first choice's message, or the sentinel string
        ``"SOMETHING_WRONG"`` when every attempt failed. Callers that store the
        result should check for this sentinel.
    """
    # Imported here because ``sleep`` is not imported in the visible cells: the
    # previous bare ``sleep(10)`` raised NameError inside the handler, so the
    # first API error aborted the call instead of triggering a retry.
    import time

    for attempt in range(1, max_tries + 1):
        try:
            # Call the completion function with the provided parameters
            response = completion(
                model=model, temperature=temperature,
                messages=[{"role": "user", "content": prompt}]
            )
            return response["choices"][0]["message"]["content"]

        except Exception as e:
            print(f"Error: {e}")
            # Waiting after the final attempt would only delay the sentinel.
            if attempt < max_tries:
                time.sleep(retry_delay)

    return "SOMETHING_WRONG"

### Make it less colloquial

In [28]:
from string import Template

# Prompt asking the model to rewrite a spoken-dialogue excerpt as formal written
# text. The trailing backslashes join the physical lines of the instruction
# paragraph into one line; ``${excerpt}`` is the placeholder filled in through
# ``Template.substitute``.
prompt_tmp = Template(
    '''
You are given an excerpt from a transcribed, spontaneous conversation between two individuals. \
Your task is to revise the excerpt to produce a clear, polished version of the dialogue that reads like formal written text. \
Transform any standalone words or phrases into complete, grammatically correct sentences where appropriate. Do not add any \
additional information or context or change the meaning of the text. Do not output anything other than the revised excerpt.

Here is the excerpt:
${excerpt}

Revised Excerpt:
'''.strip()
)

In [29]:
import pandas as pd
from tqdm import tqdm

def revise_df(df, template=None):
    """Return a copy of ``df`` whose excerpt cells are replaced by model output.

    Every column from the fourth onward is treated as an excerpt column. Each
    non-missing cell in those columns is inserted into the prompt template,
    sent through ``get_completion``, and the reply is written to the same
    row/column of the copy. Missing cells are left untouched, and ``df`` itself
    is not modified.

    Args:
        df: Source frame; the first three columns are copied through unchanged.
        template: Object with a ``substitute(excerpt=...)`` method (for example
            a ``string.Template``). When omitted, the notebook-level
            ``prompt_tmp`` is looked up at call time. Passing it explicitly
            avoids depending on whichever prompt cell was executed last.

    Returns:
        The revised copy of ``df``. A cell holds whatever ``get_completion``
        returned, including its failure sentinel if all retries failed.
    """
    # Fall back to the global only when no template is given, so existing
    # ``revise_df(df)`` calls behave exactly as before.
    active_template = prompt_tmp if template is None else template

    df_revised = df.copy()
    pairs = df_revised.columns.tolist()[3:]

    # Read from the untouched source frame and write into the copy.
    for i, row in tqdm(df.iterrows(), total=len(df)):
        for pair in pairs:
            excerpt = row[pair]

            if pd.isna(excerpt):
                continue

            prompt = active_template.substitute(excerpt=excerpt)
            df_revised.at[i, pair] = get_completion(prompt)

    return df_revised


def combine_original_and_revision(df, df_revised):
    """Build a long-format table pairing each original cell with its revision.

    For every row of ``df`` and every column from the fourth onward, one output
    record is produced with the row's ``"Trial"`` value, the column name, the
    cell from ``df`` and the cell at the same index label and column in
    ``df_revised``. Missing cells are included as-is.

    Returns:
        A DataFrame with the columns ``Trial``, ``Pair``, ``Original`` and
        ``Revised``, ordered row by row, then column by column.
    """
    pair_columns = df.columns[3:].tolist()
    records = []

    for row_label, source_row in df.iterrows():
        trial_id = source_row["Trial"]
        records.extend(
            {
                "Trial": trial_id,
                "Pair": pair_name,
                "Original": source_row[pair_name],
                "Revised": df_revised.at[row_label, pair_name],
            }
            for pair_name in pair_columns
        )

    return pd.DataFrame(records, columns=["Trial", "Pair", "Original", "Revised"])

# Rewrite every excerpt of the dogs dataset and save the result next to the input.
# revise_df issues one model request per non-missing excerpt cell; the recorded
# run of this cell took roughly ten minutes per file.
# df_dogs and df_revised are reused by the comparison cell further down.
df_dogs = pd.read_excel("data/dogs-matching-data.xlsx")
df_revised = revise_df(df_dogs)
df_revised.to_excel("data/dogs-matching-data-revised.xlsx", index=False)
print("Revised data saved to data/dogs-matching-data-revised.xlsx")

# Same steps for the baskets dataset.
df_baskets = pd.read_excel("data/baskets-matching-data.xlsx")
df_baskets_revised = revise_df(df_baskets)
df_baskets_revised.to_excel("data/baskets-matching-data-revised.xlsx", index=False)
print("Revised data saved to data/baskets-matching-data-revised.xlsx")

100%|██████████| 40/40 [09:55<00:00, 14.88s/it]


Revised data saved to data/dogs-matching-data-revised.xlsx


100%|██████████| 40/40 [09:23<00:00, 14.10s/it]

Revised data saved to data/baskets-matching-data-revised.xlsx





In [None]:
from IPython.display import display, HTML


def pretty_print(df):
    """Display ``df`` as an HTML table with escaped newlines shown as line breaks.

    The frame is rendered with ``to_html`` and every literal two-character
    sequence backslash + ``n`` in the markup is replaced by ``<br>``
    (presumably how embedded newlines appear in the rendered cells; verify
    against the pandas version in use). Returns whatever ``display`` returns.
    """
    markup = df.to_html()
    markup = markup.replace("\\n", "<br>")
    return display(HTML(markup))


# Long-format table (Trial, Pair, Original, Revised) for the dogs dataset,
# pairing each source cell with the matching cell of df_revised.
combined = combine_original_and_revision(df_dogs, df_revised)

In [26]:
# Spot-check five randomly chosen rows. No random_state is passed, so a
# different sample is drawn on each run and the stored output is not reproduced.
pretty_print(combined.sample(5))

Unnamed: 0,Trial,Pair,Original,Revised
152,2,Pair_22,"D: Yeah (laughs) And his tail is sticking up Number six is the one with the golden necklace Not the necklace but a leash? M: The circle? D: Yeah, the circle M: Ok","D: Yes. (laughs) And his tail is sticking up. Number six is the one with the golden necklace—no, not the necklace, but a leash. M: The circle? D: Yes, the circle. M: Okay."
347,4,Pair_37,D: Number five is the Doberman pinscher with the...the black…dog with the...looks like there's grease on 'im M: Number seven…,"D: Number five is the Doberman Pinscher, the black dog that looks like there is grease on him. M: Number seven."
221,3,Pair_21,D: Third is the Siberian husky M: Yup,D: The third is the Siberian husky. M: Yes.
142,2,Pair_22,"D: It's in the grass background Number five is the, um--remember the last one we had? Uh, the wooshy--uh, he--he has like a mustache or *something?* M: *A mustache?*","D: It is in the grass background. Number five is the, um—do you remember the last one we had? The wooshy one—he has like a mustache or something. M: A mustache?"
132,2,Pair_22,"D: Number four is the one with the leash Um, he--his body's towards the right, but he's toward--his head is towards the left M: In the grass, *right?* D: *Yeah* M: Ok","D: Number four is the one with the leash. His body is facing towards the right, but his head is turned towards the left. M: He is in the grass, correct? D: Yes. M: Okay."


### No Interaction, Just a Summary

In [9]:
from string import Template

# Prompt asking the model to summarize the object descriptions found in an
# excerpt. This rebinds ``prompt_tmp``, replacing the rewriting prompt defined
# earlier in the notebook, so any later use of the global ``prompt_tmp`` sees
# this template. ``${excerpt}`` is filled in through ``Template.substitute``.
prompt_tmp = Template(
    '''
You are given an excerpt from a transcribed, spontaneous conversation between two individuals. \
Your task is to extract and concisely summarize all descriptions used to characterize a specific object mentioned in the excerpt. \
You must follow the instructions below:

1. Preserve all relevant descriptive details.
2. Do not alter the meaning, add context, or introduce new information.
3. Your response must only include the final summary—do not include the original excerpt or any explanatory text.

Excerpt:

${excerpt}

Summary of Object Descriptions:
'''.strip()
)

In [11]:
import pandas as pd
from tqdm import tqdm

def revise_df(df, template=None):
    """Return a copy of ``df`` whose excerpt cells are replaced by model output.

    Every column from the fourth onward is treated as an excerpt column. Each
    non-missing cell in those columns is inserted into the prompt template,
    sent through ``get_completion``, and the reply is written to the same
    row/column of the copy. Missing cells are left untouched, and ``df`` itself
    is not modified.

    Args:
        df: Source frame; the first three columns are copied through unchanged.
        template: Object with a ``substitute(excerpt=...)`` method (for example
            a ``string.Template``). When omitted, the notebook-level
            ``prompt_tmp`` is looked up at call time. Passing it explicitly
            avoids depending on whichever prompt cell was executed last.

    Returns:
        The revised copy of ``df``. A cell holds whatever ``get_completion``
        returned, including its failure sentinel if all retries failed.
    """
    # Fall back to the global only when no template is given, so existing
    # ``revise_df(df)`` calls behave exactly as before.
    active_template = prompt_tmp if template is None else template

    df_revised = df.copy()
    pairs = df_revised.columns.tolist()[3:]

    # Read from the untouched source frame and write into the copy.
    for i, row in tqdm(df.iterrows(), total=len(df)):
        for pair in pairs:
            excerpt = row[pair]

            if pd.isna(excerpt):
                continue

            prompt = active_template.substitute(excerpt=excerpt)
            df_revised.at[i, pair] = get_completion(prompt)

    return df_revised


def combine_original_and_revision(df, df_revised):
    """Build a long-format table pairing each original cell with its revision.

    Re-declares the function of the same name defined earlier in this notebook.
    For every row of ``df`` and every column from the fourth onward, one output
    record is produced with the row's ``"Trial"`` value, the column name, the
    cell from ``df`` and the cell at the same index label and column in
    ``df_revised``. Missing cells are included as-is.

    Returns:
        A DataFrame with the columns ``Trial``, ``Pair``, ``Original`` and
        ``Revised``, ordered row by row, then column by column.
    """
    pair_columns = df.columns[3:].tolist()
    records = []

    for row_label, source_row in df.iterrows():
        trial_id = source_row["Trial"]
        records.extend(
            {
                "Trial": trial_id,
                "Pair": pair_name,
                "Original": source_row[pair_name],
                "Revised": df_revised.at[row_label, pair_name],
            }
            for pair_name in pair_columns
        )

    return pd.DataFrame(records, columns=["Trial", "Pair", "Original", "Revised"])

# Summarize every excerpt of the dogs dataset and save the result.
# This rebinds df_dogs and df_revised, so after this cell df_revised holds the
# output of this run rather than that of any earlier revise_df call.
# The recorded run took roughly seven minutes per file.
df_dogs = pd.read_excel("data/dogs-matching-data.xlsx")
df_revised = revise_df(df_dogs)
df_revised.to_excel("data/dogs-matching-data-summarized.xlsx", index=False)
print("Summarized data saved to data/dogs-matching-data-summarized.xlsx")

# Same steps for the baskets dataset.
# NOTE(review): the stored output of this cell reports
# "data/baskets-matching-data-revised.xlsx" for this file, which does not match
# the message printed below. The output appears to predate an edit; re-run to
# refresh it and confirm which file was actually written.
df_baskets = pd.read_excel("data/baskets-matching-data.xlsx")
df_baskets_revised = revise_df(df_baskets)
df_baskets_revised.to_excel("data/baskets-matching-data-summarized.xlsx", index=False)
print("Summarized data saved to data/baskets-matching-data-summarized.xlsx")

100%|██████████| 40/40 [06:53<00:00, 10.33s/it]


Summarized data saved to data/dogs-matching-data-summarized.xlsx


100%|██████████| 40/40 [06:57<00:00, 10.44s/it]

Summarized data saved to data/baskets-matching-data-revised.xlsx





In [12]:
from IPython.display import display, HTML


def pretty_print(df):
    """Display ``df`` as an HTML table with escaped newlines shown as line breaks.

    Re-declares the helper of the same name defined earlier in this notebook.
    The frame is rendered with ``to_html`` and every literal two-character
    sequence backslash + ``n`` in the markup is replaced by ``<br>``
    (presumably how embedded newlines appear in the rendered cells; verify
    against the pandas version in use). Returns whatever ``display`` returns.
    """
    markup = df.to_html()
    markup = markup.replace("\\n", "<br>")
    return display(HTML(markup))


# Long-format table (Trial, Pair, Original, Revised) for the dogs dataset.
# Uses whatever df_revised currently holds; when the cells of this section are
# run in order, that is the summarized frame.
combined = combine_original_and_revision(df_dogs, df_revised)

In [14]:
# Spot-check five randomly chosen rows. No random_state is passed, so a
# different sample is drawn on each run and the stored output is not reproduced.
pretty_print(combined.sample(5))

Unnamed: 0,Trial,Pair,Original,Revised
340,4,Pair_20,D: The Doberman?  M: Gotcha,No descriptive details about the object were provided.
361,4,Pair_21,"D: Uh, seventh is the hotdog M: Yup","The object is referred to as ""the hotdog."" No additional descriptive details are provided."
306,4,Pair_35,"D: One Uh, tail sticking up in the air M: Ok",Tail sticking up in the air
136,2,Pair_35,"D: Ok Dog number four was, um...again that dog which--his leash is sort of tied to the ground Its tail is laying--laying down on the grass Green grass behind it? All brown It's, uh, facing in front of you, but you could see its head is tilted to the side M: Ok D: You got it? M: Yeah","Dog number four has its leash tied to the ground, its tail laying down on all brown grass with green grass behind it, and it is facing in front with its head tilted to the side."
366,4,Pair_35,"D: Dog number seven Short legs, long body M: Ok","Short legs, long body"
