Objectives for the code below:
1) A UI to display the post content and column content.
2) Should have buttons for classification, i.e. positive, negative, neutral, relevant and irrelevant.
3)Should display counters for how many posts completed classification, how many marked as each positive, negative, neutral and relevant and irrelevant, and what the subreddit rotation counter is.
4)Sort posts based on upvotes and date, and shuffle posts based on subreddit, i.e. rotate posts displayed in each subreddit so that equal amounts of each can be labelled.
expected dataframe structure: text,title,upvotes,type,date,post_flair,user_flair,parent_text,subreddit,category
5) There should be a save button that exports a CSV to data/labeled_*original_csv_name*.
6) While the user is clicking on the buttons, entries should be saved into columns for labelling, the columns should be called 'sentiment' and 'isRelevant'
7)When the user imports a csv that already has two columns called 'sentiment' and 'isRelevant' the code should continue after the last labelled value and should continue labelling values
8) Error-checking measures should be added for a CSV that has already been fully labelled.

In [None]:
%pip install ipywidgets

In [23]:
import ipywidgets as widgets
from IPython.display import display, clear_output
import pandas as pd
import numpy as np
from datetime import datetime
import os
import textwrap

def _unlabeled_mask(df):
    """Return a boolean Series that is True for rows carrying no label yet.

    A row counts as labeled as soon as either 'sentiment' or 'isRelevant' is
    set, because rows marked "Irrelevant" deliberately keep an empty sentiment.
    """
    return df['sentiment'].isna() & df['isRelevant'].isna()


def _labeled_counts_by_subreddit(df):
    """Return ``{subreddit: number of labeled rows}`` for every subreddit in *df*."""
    labeled = df.loc[~_unlabeled_mask(df), 'subreddit'].value_counts()
    return {sub: int(labeled.get(sub, 0)) for sub in df['subreddit'].unique()}


def _next_unlabeled_index(df, subreddit_counts, exclude=None):
    """Return the index label of the next row to show, or None when done.

    The subreddit with the fewest labeled rows (per *subreddit_counts*) that
    still has unlabeled rows is chosen, and its first unlabeled row is
    returned. Ties are broken by the order in which subreddits first appear
    in *df*.

    Args:
        df: DataFrame with 'subreddit', 'sentiment' and 'isRelevant' columns.
        subreddit_counts: dict mapping subreddit -> rows labeled so far.
        exclude: optional index label to avoid (e.g. a row that was just
            skipped). It is only avoided while other unlabeled rows exist.
    """
    unlabeled = df.loc[_unlabeled_mask(df), 'subreddit']
    if exclude is not None and len(unlabeled) > 1:
        unlabeled = unlabeled.drop(exclude, errors='ignore')
    if unlabeled.empty:
        return None

    pending = set(unlabeled.unique())
    candidates = [sub for sub in df['subreddit'].unique() if sub in pending]
    min_sub = min(candidates, key=lambda sub: subreddit_counts.get(sub, 0))
    return unlabeled[unlabeled == min_sub].index[0]


def create_labeling_widget(df, csv_name):
    """
    Creates an interactive Jupyter Notebook widget for labeling Reddit data.

    Posts are shown one at a time. The Positive/Negative/Neutral buttons store
    that sentiment and mark the row 'relevant'; Irrelevant stores 'irrelevant'
    with an empty sentiment; Skip moves the post to the end of the queue.
    The next post always comes from the subreddit with the fewest labeled rows
    so that subreddits are labeled evenly.

    Resuming: rows that already have a value in 'sentiment' or 'isRelevant'
    are treated as done wherever they sit in the file, and labeling continues
    with the remaining rows. If nothing is left to label, a message is printed
    and no widget is created.

    Args:
        df: pandas DataFrame with Reddit data. The caller's frame is not
            modified; labels are written to an internal sorted copy.
        csv_name: Original CSV file name. Used by the save button: a file
            whose base name already starts with "labeled_" is overwritten in
            place, anything else is written to data/labeled_<base name>.

    Returns:
        None.
    """

    # --- Data Prep & Initialization ---
    # Sort by upvotes (descending) and then by subreddit.
    # NOTE(review): the notebook objectives also mention sorting by date;
    # confirm the intended direction before adding it as a sort key.
    df = df.sort_values(by=['upvotes', 'subreddit'], ascending=[False, True]).reset_index(drop=True)

    for column in ('sentiment', 'isRelevant'):
        if column not in df.columns:
            df[column] = None
        # A label column that is entirely empty in the CSV is read as float64;
        # force object dtype so string labels can be stored in it.
        df[column] = df[column].astype(object)

    # Seed the rotation from the rows that are really labeled. Labeled rows
    # are not contiguous (rotation jumps between subreddits), so "last labeled
    # index + 1" would skip unlabeled rows when resuming.
    subreddit_counts = _labeled_counts_by_subreddit(df)
    current_index = _next_unlabeled_index(df, subreddit_counts)

    if current_index is None:
        print("All data has already been labeled.")
        return

    # Button description -> (sentiment, isRelevant) written to the current row.
    label_values = {
        "Positive": ('positive', 'relevant'),
        "Negative": ('negative', 'relevant'),
        "Neutral": ('neutral', 'relevant'),
        "Irrelevant": (None, 'irrelevant'),  # sentiment is cleared if irrelevant
    }

    # --- UI Elements ---
    post_display = widgets.Output()
    main_container = widgets.VBox(layout={'border': '1px solid black'})
    positive_button = widgets.Button(description="Positive")
    negative_button = widgets.Button(description="Negative")
    neutral_button = widgets.Button(description="Neutral")
    irrelevant_button = widgets.Button(description="Irrelevant")
    skip_button = widgets.Button(description="Skip")
    save_button = widgets.Button(description="Save Progress")
    # Save feedback goes into a widget: text printed from a button callback is
    # not reliably shown in the cell output.
    status_label = widgets.Label(value="")

    button_box = widgets.HBox(
        [positive_button, negative_button, neutral_button, irrelevant_button, skip_button]
    )

    total_labeled_count = widgets.Label(value="")
    sentiment_counts = {
        'positive': widgets.Label(value="Positive: 0"),
        'negative': widgets.Label(value="Negative: 0"),
        'neutral': widgets.Label(value="Neutral: 0"),
    }
    relevance_counts = {
        'relevant': widgets.Label(value="Relevant: 0"),
        'irrelevant': widgets.Label(value="Irrelevant: 0"),
    }
    subreddit_rotation_label = widgets.Label(value="Subreddit Rotation: ")

    sentiment_box = widgets.HBox(list(sentiment_counts.values()))
    relevance_box = widgets.HBox(list(relevance_counts.values()))

    # --- Helper Functions ---
    def show_text(text):
        """Replace the contents of the post area with *text*."""
        with post_display:
            clear_output(wait=True)
            print(text)

    def update_display():
        """Render the current post, or the completion message when done."""
        if current_index is None:
            show_text("All data has been labeled!")
            return

        row = df.loc[current_index]
        output = [
            "---- Post Details ----",
            f"Title: {row['title']}",
            f"Subreddit: {row['subreddit']} | Category: {row['category']}",
            f"Upvotes: {row['upvotes']} | Date: {row['date']}",
        ]
        if not pd.isna(row['post_flair']):
            output.append(f"Post Flair: {row['post_flair']}")
        if not pd.isna(row['user_flair']):
            output.append(f"User Flair: {row['user_flair']}")
        output.append("\n---- Content ----")
        output.append(textwrap.fill(str(row['text']), width=170))
        show_text("\n".join(output))

    def update_counters():
        """Refresh every counter label from the current state of the data."""
        labeled_total = int((~_unlabeled_mask(df)).sum())
        total_labeled_count.value = f"Labeled: {labeled_total}/{len(df)}"
        sentiment_counts['positive'].value = f"Positive: {df['sentiment'].eq('positive').sum()}"
        sentiment_counts['negative'].value = f"Negative: {df['sentiment'].eq('negative').sum()}"
        sentiment_counts['neutral'].value = f"Neutral: {df['sentiment'].eq('neutral').sum()}"
        relevance_counts['relevant'].value = f"Relevant: {df['isRelevant'].eq('relevant').sum()}"
        relevance_counts['irrelevant'].value = f"Irrelevant: {df['isRelevant'].eq('irrelevant').sum()}"
        subreddit_list = [f'{key}:{subreddit_counts[key]}' for key in subreddit_counts]
        wrapped_subreddit_rotation = textwrap.fill(", ".join(subreddit_list), width=170)
        subreddit_rotation_label.value = f"Subreddit Rotation: {wrapped_subreddit_rotation}"

    def on_button_clicked(button):
        """Store the label for the current post and advance to the next one."""
        nonlocal current_index
        # current_index is None once everything is labeled; ignoring further
        # clicks keeps the last row from being relabeled and double-counted.
        if current_index is None:
            return

        sentiment, relevance = label_values[button.description]
        df.at[current_index, 'sentiment'] = sentiment
        df.at[current_index, 'isRelevant'] = relevance
        subreddit_counts[df.at[current_index, 'subreddit']] += 1

        current_index = _next_unlabeled_index(df, subreddit_counts)
        update_counters()
        update_display()

    def on_skip_button_clicked(button):
        """Move the current post to the end of the queue without labeling it."""
        nonlocal current_index, df
        if current_index is None:
            return

        # Slice with a list so the moved row keeps its column dtypes; building
        # it from a Series would turn every column into object on concat.
        skipped_row = df.loc[[current_index]]
        df = pd.concat([df.drop(index=current_index), skipped_row], ignore_index=True)

        # The skipped row now sits at the last position; avoid re-showing it
        # immediately while other unlabeled posts remain.
        current_index = _next_unlabeled_index(df, subreddit_counts, exclude=len(df) - 1)
        update_display()

    def on_save_button_clicked(button):
        """Write the current labels to CSV and report the result."""
        base_filename = os.path.basename(csv_name)
        if base_filename.startswith("labeled_"):
            output_filename = csv_name  # Already a labeled file: overwrite in place
        else:
            output_filename = f"data/labeled_{base_filename}"

        try:
            output_dir = os.path.dirname(output_filename)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
            df.to_csv(output_filename, index=False)
        except OSError as error:
            status_label.value = f"Save failed: {error}"
        else:
            status_label.value = f"Data saved to {output_filename}"

    # --- Event Handling ---
    for label_button in (positive_button, negative_button, neutral_button, irrelevant_button):
        label_button.on_click(on_button_clicked)
    skip_button.on_click(on_skip_button_clicked)
    save_button.on_click(on_save_button_clicked)

    # --- Layout and Display ---
    main_container.children = [
        post_display,
        button_box,
        total_labeled_count,
        save_button,
        status_label,
        sentiment_box,
        relevance_box,
        subreddit_rotation_label,
    ]

    display(main_container)

    # --- Initial Display and counter setup ---
    update_counters()
    update_display()

In [26]:

if __name__ == '__main__':
    # Candidate input files; point `data_name` at the one to label.
    apple_data_name = 'data/cleaned_company_reputation_data_apple_posts.csv'
    samsung_data_name = 'data/labeled_cleaned_company_reputation_data_samsung_posts.csv'
    data_name = samsung_data_name

    # Only the file read is guarded, so an error raised while building the
    # widget is not misreported as a missing CSV.
    try:
        df = pd.read_csv(data_name)
    except FileNotFoundError:
        print("Error: specified CSV file not found. Creating dummy data.")
        # Small stand-in dataset with the expected column structure.
        data = {
            'text': ['This is a great post about Samsung! ...',
                     'I hate this Samsung product.',
                     'Neutral comment.',
                     'Another good Samsung post with length...',
                     'Bad post.',
                     'Samsung is ok.',
                     'I like apples. Not the company.',
                     'Galaxy is great.'],
            'title': ['Title 1', 'Title 2', 'Title 3', 'Title 4', 'Title 5', 'Title 6', 'Title 7', 'Title 8'],
            'upvotes': [10, 2, 5, 20, 1, 7, 3, 12],
            'type': ['post'] * 8,
            'date': pd.to_datetime(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04', '2024-01-05',
                                   '2024-01-06', '2024-01-07', '2024-01-08']),
            'post_flair': [None, 'flair1', None, 'flair2', None, None, None, 'flair1'],
            'user_flair': [None, None, 'user_flair1', None, None, None, 'user_flair2', None],
            'parent_text': [None] * 8,
            'subreddit': ['Samsung', 'Android', 'Samsung', 'Android', 'Samsung', 'Apple', 'Apple', 'Samsung'],
            'category': ['cat1', 'cat2', 'cat1', 'cat2', 'cat1', 'cat3', 'cat3', 'cat1']
        }
        df = pd.DataFrame(data)
        data_name = 'dummy_reddit_data.csv'

    create_labeling_widget(df, data_name)

VBox(children=(Output(), HBox(children=(Button(description='Positive', style=ButtonStyle()), Button(descriptio…