In [None]:
import ipywidgets as widgets
from IPython.display import display, clear_output
import pandas as pd
import numpy as np
from datetime import datetime
import os
import textwrap

def create_llm_comparison_widget(df, csv_name):
    """
    Creates an interactive Jupyter Notebook widget for comparing human and LLM sentiment/relevance classifications.

    Posts are shown one at a time together with the LLM's verdict and reasoning.
    The reviewer either agrees with the LLM, assigns their own label, or skips
    the post. Posts are served in a subreddit rotation: the next post always
    comes from the subreddit with the fewest labelled posts so far, highest
    upvotes first within that subreddit.

    Labelled posts are kept in memory and written to ``data/human_llm_eval.csv``
    when "Save Progress" is pressed. Skipped posts are not stored and are not
    shown again during the session.

    Args:
        df: pandas DataFrame with Reddit data and LLM classifications ('sentiment', 'is_relevant', 'reasoning').
            The widget also reads the columns 'title', 'subreddit', 'category', 'upvotes', 'date',
            'post_flair', 'user_flair' and 'text'.
        csv_name: Original CSV file name. Currently unused; kept for interface compatibility.

    Returns:
        None. The widget is rendered via ``display`` as a side effect.
    """

    # --- Data Prep & Initialization ---
    df = df.sort_values(by=['upvotes', 'subreddit'], ascending=[False, True]).reset_index(drop=True)

    start_index = 0
    if start_index >= len(df):
        print("All data has already been processed.")
        return

    # Column layout of the saved evaluation file: original columns plus the human verdict.
    eval_df_columns = df.columns.tolist() + ['human_sentiment', 'human_is_relevant', 'evaluation_type']
    # One dict per labelled post. A plain list avoids repeatedly concatenating onto an empty
    # DataFrame (deprecated in recent pandas) and keeps the df row labels out of the result.
    eval_rows = []
    # Row labels of `df` that were already labelled or skipped. This is the single source of
    # truth for "what is left to show".
    seen_indices = set(range(start_index))

    current_index = start_index
    already_done = df['subreddit'].iloc[:start_index].value_counts()
    subreddit_counts = {sub: int(already_done.get(sub, 0)) for sub in df['subreddit'].unique()}

    # --- UI Elements ---
    post_display = widgets.Output()
    llm_reasoning_display = widgets.Output()  # Output widget for LLM reasoning
    main_container = widgets.VBox(layout={'border': '1px solid black'})

    agree_button = widgets.Button(description="Agree with LLM")
    positive_button = widgets.Button(description="Positive")
    negative_button = widgets.Button(description="Negative")
    neutral_button = widgets.Button(description="Neutral")
    irrelevant_button = widgets.Button(description="Irrelevant")
    skip_button = widgets.Button(description="Skip")
    save_button = widgets.Button(description="Save Progress")
    # Feedback from button callbacks is shown in a label: a bare print() inside a widget
    # callback is not guaranteed to appear in the cell output.
    save_status_label = widgets.Label(value="")

    button_box = widgets.HBox([agree_button, positive_button, negative_button, neutral_button, irrelevant_button, skip_button])
    save_box = widgets.HBox([save_button, save_status_label])

    total_processed_count = widgets.Label(value=f"Processed: {start_index}/{len(df)}")
    sentiment_counts = {
        'positive': widgets.Label(value="Positive: 0"),
        'negative': widgets.Label(value="Negative: 0"),
        'neutral':  widgets.Label(value="Neutral: 0"),
    }
    relevance_counts = {
        'relevant': widgets.Label(value="Relevant: 0"),
        'irrelevant': widgets.Label(value="Irrelevant: 0"),
    }
    subreddit_rotation_label = widgets.Label(value="Subreddit Rotation: ")

    sentiment_box = widgets.HBox(list(sentiment_counts.values()))
    relevance_box = widgets.HBox(list(relevance_counts.values()))

    # (button description) -> (human_sentiment, human_is_relevant) for the manual label buttons.
    human_labels = {
        "Positive": ('positive', 'relevant'),
        "Negative": ('negative', 'relevant'),
        "Neutral": ('neutral', 'relevant'),
        "Irrelevant": (None, 'irrelevant'),
    }

    # --- Helper Functions ---
    def build_eval_df():
        """Returns the evaluations collected so far as a DataFrame with the output column layout."""
        return pd.DataFrame(eval_rows, columns=eval_df_columns)

    def get_next_index():
        """Gets the next index, prioritizing subreddit rotation.

        Returns the label of the first not-yet-seen row belonging to the subreddit with the
        fewest labelled posts, or None when every row has been labelled or skipped.
        """
        unseen = df.loc[~df.index.isin(seen_indices), 'subreddit']
        if unseen.empty:
            return None

        # Only subreddits that still have unseen rows take part in the rotation.
        min_sub = min(unseen.unique(), key=subreddit_counts.get)
        # `df` is sorted by upvotes (descending), so the first match is the top remaining post.
        return unseen.index[unseen == min_sub][0]

    def update_display():
        """Renders the current post and the LLM verdict, or the completion message."""
        if current_index >= len(df):
            llm_reasoning_display.clear_output()
            with post_display:
                clear_output(wait=True)
                print("All data has been processed!")
            return

        row = df.loc[current_index]

        post_lines = [
            "---- Post Details ----",
            f"Title: {row['title']}",
            f"Subreddit: {row['subreddit']} | Category: {row['category']}",
            f"Upvotes: {row['upvotes']} | Date: {row['date']}",
        ]
        if not pd.isna(row['post_flair']):
            post_lines.append(f"Post Flair: {row['post_flair']}")
        if not pd.isna(row['user_flair']):
            post_lines.append(f"User Flair: {row['user_flair']}")
        post_lines.append("\n---- Content ----")
        post_lines.append(textwrap.fill(str(row['text']), width=170))

        llm_lines = [
            "---- LLM Classification ----",
            f"LLM Sentiment: {row['sentiment']}",
            f"LLM Relevance: {row['is_relevant']}",
            "\n---- LLM Reasoning ----",
            textwrap.fill(str(row['reasoning']), width=170),
        ]

        # wait=True defers the clear until the new text arrives, which avoids flicker.
        with post_display:
            clear_output(wait=True)
            print("\n".join(post_lines))
        with llm_reasoning_display:
            clear_output(wait=True)
            print("\n".join(llm_lines))

    def update_counters():
        """Refreshes the progress, sentiment, relevance and subreddit rotation labels."""
        eval_df = build_eval_df()
        total_processed_count.value = f"Processed: {len(eval_df)}/{len(df)}"
        sentiment_counts['positive'].value = f"Positive: {eval_df['human_sentiment'].eq('positive').sum()}"
        sentiment_counts['negative'].value = f"Negative: {eval_df['human_sentiment'].eq('negative').sum()}"
        sentiment_counts['neutral'].value = f"Neutral: {eval_df['human_sentiment'].eq('neutral').sum()}"
        relevance_counts['relevant'].value = f"Relevant: {eval_df['human_is_relevant'].eq('relevant').sum()}"
        relevance_counts['irrelevant'].value = f"Irrelevant: {eval_df['human_is_relevant'].eq('irrelevant').sum()}"
        subreddit_list = [f'{key}:{subreddit_counts[key]}' for key in subreddit_counts]
        wrapped_subreddit_rotation = textwrap.fill(", ".join(subreddit_list), width=170)
        subreddit_rotation_label.value = f"Subreddit Rotation: {wrapped_subreddit_rotation}"

    def advance():
        """Marks the current post as seen and moves on to the next one (or the end state)."""
        nonlocal current_index
        seen_indices.add(int(current_index))
        next_index = get_next_index()
        # Parking the cursor past the end turns later label/skip clicks into no-ops, so the
        # last post cannot be recorded more than once.
        current_index = len(df) if next_index is None else next_index
        update_display()

    def on_button_clicked(button):
        """Records the verdict that belongs to the clicked button and shows the next post."""
        if current_index >= len(df):
            return

        row = df.loc[current_index]
        if button.description == "Agree with LLM":
            evaluation_type = "agree_llm"
            # NOTE(review): the LLM values are stored as-is. The relevance counters only match
            # the strings 'relevant'/'irrelevant', so agreements are counted there only if the
            # 'is_relevant' column uses the same encoding - confirm against the input data.
            human_sentiment = row['sentiment']
            human_is_relevant = row['is_relevant']
        elif button.description in human_labels:
            evaluation_type = "human_label"
            human_sentiment, human_is_relevant = human_labels[button.description]
        else:
            return

        new_row = row.to_dict()
        new_row['human_sentiment'] = human_sentiment
        new_row['human_is_relevant'] = human_is_relevant
        new_row['evaluation_type'] = evaluation_type
        eval_rows.append(new_row)

        subreddit_counts[row['subreddit']] += 1
        update_counters()
        advance()

    def on_skip_button_clicked(button):
        """Moves to the next post without recording an evaluation for the current one."""
        if current_index >= len(df):
            return
        advance()

    def on_save_button_clicked(button):
        """Writes all evaluations collected so far to data/human_llm_eval.csv."""
        os.makedirs('data', exist_ok=True)
        output_filename = "data/human_llm_eval.csv"
        build_eval_df().to_csv(output_filename, index=False)
        save_status_label.value = f"Evaluation data saved to {output_filename}"

    # --- Event Handling ---
    for label_button in (agree_button, positive_button, negative_button, neutral_button, irrelevant_button):
        label_button.on_click(on_button_clicked)
    skip_button.on_click(on_skip_button_clicked)
    save_button.on_click(on_save_button_clicked)

    # --- Layout and Display ---
    main_container.children = [
        post_display,
        llm_reasoning_display,  # LLM reasoning is shown directly below the post
        button_box,
        total_processed_count,
        save_box,
        sentiment_box,
        relevance_box,
        subreddit_rotation_label
    ]

    display(main_container)

    # --- Initial Display and counter setup ---
    update_counters()
    update_display()

In [None]:
if __name__ == '__main__':
    # Location of the LLM-annotated posts CSV (swap in your own LLM data CSV here).
    samsung_data_name = 'data/Samsung/qwen2.5-7b-instruct_cleaned_company_reputation_data_samsung_posts.csv'

    try:
        df = pd.read_csv(samsung_data_name)
        # The widget reads the LLM verdict from these three columns, so refuse to start
        # when any of them is absent.
        if not {'sentiment', 'is_relevant', 'reasoning'}.issubset(df.columns):
            raise ValueError("CSV must contain 'sentiment', 'is_relevant', and 'reasoning' columns for LLM data.")
        create_llm_comparison_widget(df, samsung_data_name)
    except ValueError as e:
        print(f"Data Error: {e}")
    except FileNotFoundError:
        print("Error: specified CSV file not found. Please ensure the CSV file exists at the specified path.")
    except Exception as e:
        # Last-resort handler for this script entry point: report the problem and stop.
        print(f"An unexpected error occurred: {e}")
        print("Please check the CSV file and script.")

VBox(children=(Output(), Output(), HBox(children=(Button(description='Agree with LLM', style=ButtonStyle()), B…